In [None]:
import json
import sqlite3
import pandas as pd
from tqdm.notebook import tqdm
import utilities

In [None]:
# Locations of the crawl database (input) and of the JSON file this notebook
# writes (output); both live directly under base_directory.
base_directory = 'khaleesi/data/'
db_dir = f'{base_directory}crawl.sqlite'
json_representation_dir = f'{base_directory}crawl-http.json'

In [None]:
# Open the crawl SQLite database located at db_dir.
con = sqlite3.connect(db_dir)
# Have cursors on this connection return sqlite3.Row objects instead of plain tuples.
con.row_factory = sqlite3.Row

In [None]:
# Load the HTTP requests, indexed by (browser_id, visit_id, request_id) and with
# time_stamp parsed as datetimes.
# NOTE: replace browser_id with crawl_id for older schemas.
requests = pd.read_sql(
    sql=(
        "SELECT id, browser_id, visit_id, request_id, url, top_level_url, "
        "referrer, headers, method, resource_type, time_stamp "
        "FROM http_requests"
    ),
    con=con,
    index_col=['browser_id', 'visit_id', 'request_id'],
    parse_dates=['time_stamp'],
)

In [None]:
# Load the HTTP redirects, indexed by (browser_id, visit_id, old_request_id) and
# with time_stamp parsed as datetimes.
# NOTE: replace browser_id with crawl_id for older schemas.
redirects = pd.read_sql(
    sql=(
        "SELECT id, browser_id, visit_id, old_request_id, old_request_url, "
        "response_status, headers, time_stamp "
        "FROM http_redirects"
    ),
    con=con,
    index_col=['browser_id', 'visit_id', 'old_request_id'],
    parse_dates=['time_stamp'],
)

In [None]:
def _pair_requests_with_redirects(chain_requests, chain_redirects):
    """Build the ordered list of hops for one redirect chain.

    Parameters
    ----------
    chain_requests : pandas.DataFrame
        Rows of ``requests`` that share one (browser_id, visit_id, request_id)
        key, already sorted by ``time_stamp``.
    chain_redirects : pandas.DataFrame
        Rows of ``redirects`` whose index matches that same key.

    Returns
    -------
    list of dict
        One item per request, in order, up to and including the first request
        that has no redirect response (the end of the chain). The list is
        empty when the very first request has no redirect response, i.e. the
        group is not a redirect chain at all.
    """
    # Sort once; the per-URL filter below keeps this order.
    ordered_redirects = chain_redirects.sort_values(by='time_stamp')
    considered_ids = set()
    chain_items = []

    for redirect_id, (_, request) in enumerate(chain_requests.iterrows()):
        candidates = ordered_redirects[ordered_redirects.old_request_url == request['url']]

        # Defaults describe "no redirect response was recorded for this request".
        response_status = 0
        response_headers = []
        response_time_stamp = ''

        # The same URL can occur several times in one chain (A -> B -> A), so
        # there may be several redirect rows for it. Each request takes only the
        # earliest row that no previous request has used; consuming all of them
        # at once would leave later occurrences of the URL without a response
        # and cut the chain short.
        for _, candidate in candidates.iterrows():
            if candidate['id'] in considered_ids:
                continue
            considered_ids.add(candidate['id'])
            response_status = candidate['response_status']
            response_headers = json.loads(candidate['headers'])
            response_time_stamp = candidate['time_stamp'].isoformat()
            break

        chain_item = {
            'url': request['url'],
            'referrer': request['referrer'],
            'request_headers': json.loads(request['headers']),
            'resource_type': request['resource_type'],
            'method': request['method'],
            'time_stamp': request['time_stamp'].isoformat(),
            'response_status': response_status,
            'response_headers': response_headers,
            'response_time_stamp': response_time_stamp,
            'redirect_id': redirect_id,
        }

        if response_headers == [] and response_status == 0:
            if not chain_items:
                # The first request was never redirected: drop the whole chain.
                return []
            # Last hop of the chain: keep it and stop.
            chain_items.append(chain_item)
            return chain_items

        chain_items.append(chain_item)

    return chain_items


# chain_id -> {'top_url': ..., 'content': [hop, ...], 'length': number of hops}
json_representation = {}
seen_chains = set()

# Group the redirects once by the same three index levels used for the requests,
# so each chain is a dictionary lookup instead of a scan over all redirects.
redirects_by_chain = {key: group for key, group in redirects.groupby(level=[0, 1, 2])}

# The context manager closes the progress bar even if a chain raises.
with tqdm(total=len(requests), position=0, leave=True) as pbar:
    for index, requests_chain_row in requests.groupby(level=[0, 1, 2]):
        pbar.update(len(requests_chain_row))

        # ignoring non redirects
        if len(requests_chain_row) < 2:
            continue

        crawl_browser_id, visit_id, request_id = index
        chain_id = 'H|' + str(crawl_browser_id) + '|' + str(visit_id) + '|' + str(request_id)

        # Defensive only: group keys are unique, so this is not expected to fire.
        if chain_id in seen_chains:
            continue
        seen_chains.add(chain_id)

        current_redirects = redirects_by_chain.get(index)
        if current_redirects is None:
            # No redirect rows for this key, so the first request cannot have a
            # response and the chain would be dropped anyway.
            continue

        requests_chain = requests_chain_row.sort_values(by='time_stamp')
        chain_items = _pair_requests_with_redirects(requests_chain, current_redirects)
        if not chain_items:
            continue

        json_representation[chain_id] = {
            'top_url': requests_chain.iloc[0]['top_level_url'],
            'content': chain_items,
            'length': len(chain_items),
        }

In [None]:
# Hand the collected chains and the output path to the utilities helper;
# presumably it serializes the dict as JSON to that path — confirm in utilities.
utilities.write_json(json_representation_dir, json_representation)