In [None]:
import json
import sqlite3
import pandas as pd
from tqdm.notebook import tqdm
import utilities
from dateutil.parser import parse

In [None]:
# Root folder for the crawl database and every JSON file this notebook reads or writes.
base_directory = 'khaleesi/data/'
# NOTE: despite the `_dir` suffix, every name below holds a file path, not a directory.
# SQLite crawl database that the callstacks / http_requests / http_responses tables are read from.
db_dir = base_directory + 'crawl.sqlite'

# Snapshots of the chain representation, one per processing stage:
#   1. chains built from the requests table and ordered by time stamp
json_representation_dir = base_directory + 'crawl-js.json'
#   2. after response data has been attached to each request
json_representation_dir_with_responses = base_directory + 'crawl-js-with-responses.json'
#   3. after requests without a recorded response have been removed
json_representation_dir_without_empty_response = base_directory + 'crawl-js-non-empty.json'
#   4. after requests that share no identifier with the rest of their chain have been removed
json_representation_dir_connected = base_directory + 'crawl-js-connected.json'

# URL id -> script URL map, and 'browser_id|visit_id|request_id' -> URL id map.
url_id_map_dir = base_directory + 'js-url-id-map.json'
identifier_url_id_dir = base_directory + 'identifier-url-id-map.json'

In [None]:
# Open the crawl database. sqlite3.Row lets rows fetched through this
# connection be addressed by column name as well as by position.
# NOTE(review): sqlite3.connect creates an empty database file when db_dir does
# not exist, so a wrong path only surfaces later as a "no such table" error.
con = sqlite3.connect(db_dir)
con.row_factory = sqlite3.Row

## Querying the call stack for each request at processing time is slow.
#### We therefore precompute and store two maps beforehand: (browser_id, visit_id, request_id) → URL_id, and URL_id → URL.
#### See the next few cells to build these maps, or to load the saved map from JSON.

In [None]:
# Load every recorded call stack, keyed by (browser_id, visit_id, request_id).
# replace browser_id with crawl_id for older schemas
callstacks = pd.read_sql(
    "SELECT browser_id, visit_id, request_id, call_stack FROM callstacks",
    con,
    index_col=['browser_id', 'visit_id', 'request_id'],
)

In [None]:
def find_2nd(string, substring):
    """Return the index of the second occurrence of ``substring`` in ``string``.

    Follows ``str.find`` conventions: for a non-empty ``substring`` the result
    is -1 when it occurs fewer than two times.
    """
    first_hit = string.find(substring)
    return string.find(substring, first_hit + 1)
    
def get_call_stack_url(current_stack):
    """Pick the URL of the script responsible for a request from its call stack.

    ``current_stack`` is the group of ``callstacks`` rows for one request; only
    the ``call_stack`` value of its first row is used. The stack is split into
    newline-separated frames, and each frame is read as
    ``<function>@<location>:<line>:<column>`` (assumed frame layout -- TODO
    confirm against the crawler's schema documentation).

    Search order:
      1. the first frame, taking the text after its first ``@``;
      2. the first frame again, taking the text after its second ``@`` (if any);
      3. every remaining frame, top to bottom, after the first ``@``.
    The first candidate that starts with ``http`` is returned. If none does,
    the candidate from step 1 is returned as a fallback. ``''`` is returned
    when the call stack is missing (NULL/NaN) or empty.
    """
    call_stack = current_stack['call_stack'].iloc[0]

    # A NULL value arrives as None/NaN; without this guard .split() would raise.
    if not isinstance(call_stack, str) or call_stack == '':
        return ''

    def frame_location(frame, start):
        # The location is taken to end at the frame's second ':' (the first one
        # belongs to the URL scheme, e.g. "http:").
        # NOTE(review): a URL with an explicit port (http://host:8080/...) is
        # therefore cut at the port's ':' -- confirm whether that matters here.
        stop = find_2nd(frame, ':')
        if stop == -1:
            # Fewer than two ':' -- keep the rest of the frame. Slicing with
            # -1 would silently drop the last character.
            return frame[start:]
        return frame[start:stop]

    frames = call_stack.split('\n')
    top_frame = frames[0]

    # str.find returns -1 when there is no '@', so the slice then starts at 0.
    first_candidate = frame_location(top_frame, top_frame.find('@') + 1)
    if first_candidate.startswith('http'):
        return first_candidate

    # Some frames carry a second '@'; retry from there.
    if top_frame.count('@') > 1:
        candidate = frame_location(top_frame, find_2nd(top_frame, '@') + 1)
        if candidate.startswith('http'):
            return candidate

    for frame in frames[1:]:
        candidate = frame_location(frame, frame.find('@') + 1)
        if candidate.startswith('http'):
            return candidate

    # No frame points at an http(s) script: fall back to the top frame's location.
    return first_candidate

In [None]:
URL_ids = {}            # script URL -> numeric id
ids_URL = {}            # numeric id -> script URL (inverse of URL_ids)
identifier_url_id = {}  # 'browser_id|visit_id|request_id' -> numeric id
id_counter = 0          # never read in this cell; ids are derived from len(URL_ids)

pbar = tqdm(total=len(callstacks), position=0, leave=True)

# One group per (browser_id, visit_id, request_id) triple.
for index, current_stack in callstacks.groupby(level=[0,1,2]):
    pbar.update(len(current_stack))

    identifier = '|'.join(str(part) for part in index)
    executing_script_url = get_call_stack_url(current_stack)

    # Requests whose call stack yields no URL are left out of the map.
    if executing_script_url == '':
        continue

    # A URL seen before keeps its id; a new URL gets the next id, starting at 1.
    if executing_script_url not in URL_ids:
        url_id = len(URL_ids) + 1
        URL_ids[executing_script_url] = url_id
        ids_URL[url_id] = executing_script_url
    else:
        url_id = URL_ids[executing_script_url]

    identifier_url_id[identifier] = url_id

In [None]:
# Save the request-identifier -> URL-id map and the URL-id -> URL map.
# (utilities.write_json is defined outside this notebook; it is called with
# the target path first and the object to store second.)
utilities.write_json(identifier_url_id_dir, identifier_url_id)
utilities.write_json(url_id_map_dir, ids_URL)

# OR: load the previously saved identifier → URL_id map instead of recomputing it

In [None]:
# Reload the saved map; its keys are 'browser_id|visit_id|request_id' strings.
identifier_url_id = utilities.read_json(identifier_url_id_dir)

# Traverse requests table to create chains

In [None]:
# Load all HTTP requests, keyed by (browser_id, visit_id, request_id);
# time_stamp is parsed into datetimes.
# replace browser_id with crawl_id for older schemas
requests = pd.read_sql(
    "SELECT browser_id, visit_id, request_id, url, top_level_url, referrer, headers, method, resource_type, time_stamp FROM http_requests",
    con,
    index_col=['browser_id', 'visit_id', 'request_id'],
    parse_dates=['time_stamp'],
)

In [None]:
# chain id -> {'top_url': ..., 'content': [request dicts], 'length': number of requests}
json_representation = {}

pbar = tqdm(total=len(requests), position=0, leave=True)

for index, requests_chain in requests.groupby(level=[0,1,2]):
    pbar.update(len(requests_chain))

    # we are ignoring HTTP chains here
    # (an identifier that appears on more than one row is skipped)
    if len(requests_chain) > 1:
        continue

    crawl_browser_id, visit_id, request_id = index

    # Only requests whose initiating script URL is known are kept.
    request_key = '|'.join(str(part) for part in index)
    if request_key not in identifier_url_id:
        continue
    url_id = identifier_url_id[request_key]

    # change request_id to get a new id, according to script URL:
    # all requests issued by the same script during one visit share a chain.
    chain_id = '|'.join(['J', str(crawl_browser_id), str(visit_id), str(url_id)])

    request = requests_chain.iloc[0]
    chain_item = {
        'url': request['url'],
        'request_id': request_id,
        'referrer': request['referrer'],
        'request_headers': json.loads(request['headers']),
        'resource_type': request['resource_type'],
        'method': request['method'],
        'time_stamp': request['time_stamp'].isoformat(),
    }

    # The first request of a chain also creates the chain record.
    if chain_id not in json_representation:
        json_representation[chain_id] = {
            'top_url': request['top_level_url'],
            'content': [],
            'length': 0,
        }

    chain = json_representation[chain_id]
    chain['content'].append(chain_item)
    chain['length'] += 1

In [None]:
def order_json(json_representation):
    """Sort every chain chronologically and renumber its entries, in place.

    For each chain the ``content`` list is sorted by the parsed ``time_stamp``
    of its entries, ``length`` is reset to the number of entries, and every
    entry gets a ``redirect_id`` equal to its position in the sorted list.

    Returns the same (mutated) ``json_representation`` object.
    """
    for chain in json_representation.values():
        entries = chain['content']
        entries.sort(key=lambda entry: parse(entry['time_stamp']))
        chain['length'] = len(entries)
        for position, entry in enumerate(entries):
            entry['redirect_id'] = position
    return json_representation

In [None]:
# Sort each chain by request time and number its entries
# (order_json mutates the dict and returns the same object).
json_representation = order_json(json_representation)

In [None]:
# Save the ordered chains (stage 1) so the request processing need not be repeated.
utilities.write_json(json_representation_dir, json_representation)

# If the requests were already processed in an earlier run, load the saved JSON here instead of processing them again

In [None]:
# Load the ordered chains saved by the request-processing step.
json_representation = utilities.read_json(json_representation_dir)

In [None]:
# Load all HTTP responses, keyed by (browser_id, visit_id, request_id);
# time_stamp is parsed into datetimes.
# replace browser_id with crawl_id for older schemas
responses = pd.read_sql(
    "SELECT browser_id, visit_id, request_id, url, response_status, headers, time_stamp FROM http_responses",
    con,
    index_col=['browser_id', 'visit_id', 'request_id'],
    parse_dates=['time_stamp'],
)

In [None]:
# Identifiers ('browser_id|visit_id|request_id') of every request that ended up in a chain.
js_requests = set()
for key, chain in json_representation.items():
    # Chain keys are built as 'J|<browser_id>|<visit_id>|<url_id>'.
    key_parts = key.split('|')
    crawl_browser_id = str(key_parts[1])
    visit_id = str(key_parts[2])
    for chain_item in chain['content']:
        request_id = str(chain_item['request_id'])
        js_requests.add('|'.join((crawl_browser_id, visit_id, request_id)))

In [None]:
pbar = tqdm(total=len(responses), position=0, leave=True)
# 'browser_id|visit_id|request_id' -> response status, headers and time stamp
responses_data = {}

for index, response in responses.iterrows():
    pbar.update(1)

    identifier = '|'.join(str(part) for part in index)

    # Keep only responses to requests that are part of a chain; when a request
    # has several response rows, the first one encountered wins.
    if identifier not in js_requests or identifier in responses_data:
        continue

    responses_data[identifier] = {
        'response_status': response['response_status'],
        'response_headers': json.loads(response['headers']),
        'response_time_stamp': response['time_stamp'].isoformat(),
    }

In [None]:
indexes_to_remove = {}   # chain key -> positions of requests without a recorded response
less_than_2 = set()      # chain keys with at most one request
pbar = tqdm(total=len(json_representation), position=0, leave=True)

for key in json_representation:
    pbar.update(1)
    chain_content = json_representation[key]['content']

    # Chains with a single request are only recorded here; they get no response fields.
    if len(chain_content) <= 1:
        less_than_2.add(key)
        continue

    # Chain keys are built as 'J|<browser_id>|<visit_id>|<url_id>'.
    key_parts = key.split('|')
    crawl_browser_id = str(key_parts[1])
    visit_id = str(key_parts[2])

    for idx, chain_item in enumerate(chain_content):
        request_id = str(chain_item['request_id'])
        response_key = '|'.join((crawl_browser_id, visit_id, request_id))

        if response_key in responses_data:
            response = responses_data[response_key]
            chain_item['response_status'] = response['response_status']
            chain_item['response_headers'] = response['response_headers']
            chain_item['response_time_stamp'] = response['response_time_stamp']
            continue

        # No response recorded: fill in placeholders and remember the position.
        chain_item['response_status'] = 0
        chain_item['response_headers'] = []
        chain_item['response_time_stamp'] = ''
        indexes_to_remove.setdefault(key, []).append(idx)

In [None]:
# Drop the chains that had at most one request; they were skipped when
# responses were attached.
for item in less_than_2:
    del json_representation[item]

In [None]:
# Save the chains with response data attached (stage 2). Requests without a
# recorded response are still present here, carrying placeholder values.
utilities.write_json(json_representation_dir_with_responses, json_representation)

In [None]:
# Remove, by position, the requests that have no recorded response and note
# which chains are left with at most one request.
# NOTE: removal is positional, so this cell is not idempotent -- re-running it
# without rebuilding indexes_to_remove drops further, unrelated entries.
# NOTE(review): the chain's 'length' field and the entries' 'redirect_id'
# values are not refreshed after filtering -- confirm whether consumers rely on them.
less_than_2 = set()
for key in indexes_to_remove:
    # A set gives O(1) membership tests; the list made the filter quadratic.
    positions_to_drop = set(indexes_to_remove[key])
    content = [i for j, i in enumerate(json_representation[key]['content']) if j not in positions_to_drop]

    json_representation[key]['content'] = content
    if len(content) <= 1:
        less_than_2.add(key)

In [None]:
# Drop the chains left with at most one request after the removal step.
for item in less_than_2:
    del json_representation[item]

In [None]:
# Save the chains that only contain requests with a recorded response (stage 3).
utilities.write_json(json_representation_dir_without_empty_response, json_representation)

# Only keeping the connected ones

In [None]:
# Start the connectivity filtering from the saved stage-3 chains.
json_representation = utilities.read_json(json_representation_dir_without_empty_response)

In [None]:
import re
from urllib import parse as URLparse
import base64
import hashlib

In [None]:
def get_identifier_cookies(cookie_string, cookie_length = 8):
    """Collect identifier-like tokens from a cookie header value.

    ``cookie_string`` is split into lines; for each line only the part before
    the first ``;`` is used. If that part contains ``=``, it is split once into
    name and value: the name is kept whole, and the value is broken into tokens
    at every character outside ``[a-zA-Z0-9_=-]``. Without ``=`` the whole part
    is tokenised the same way.

    Returns the set of names/tokens that are at least ``cookie_length``
    characters long.

    NOTE(review): because everything after the first ``;`` is discarded, a
    request ``Cookie`` header listing several ``name=value`` pairs on one line
    contributes only its first pair -- confirm that this is intended.
    """
    token_separator = '[^a-zA-Z0-9_=-]'
    candidates = set()

    for line in cookie_string.split('\n'):
        first_pair = line.split(';')[0]
        if '=' in first_pair:
            name, value = first_pair.split('=', 1)
            candidates.update(re.split(token_separator, value))
            candidates.add(name)
        else:
            candidates.update(re.split(token_separator, first_pair))

    # keep only the candidates that are long enough to be an identifier
    return {token for token in candidates if len(token) >= cookie_length}


def get_identifiers_from_qs(url, qs_item_length = 8):
    """Collect identifier-like tokens from the query string of ``url``.

    Every parameter name and value returned by ``parse_qsl`` is broken into
    tokens at each character outside ``[a-zA-Z0-9_=-]``.

    Returns the set of tokens that are at least ``qs_item_length`` characters
    long.
    """
    query_string = URLparse.urlsplit(url).query
    tokens = set()

    for name, value in URLparse.parse_qsl(query_string):
        for text in (name, value):
            tokens.update(re.split('[^a-zA-Z0-9_=-]', text))

    return {token for token in tokens if len(token) >= qs_item_length}


def get_identifiers_from_uncommon_headers(header_prop, item_length = 8):
    """Collect identifier-like tokens from a header value.

    ``header_prop`` is broken into tokens at every character outside
    ``[a-zA-Z0-9_=-]``.

    Returns the set of tokens that are at least ``item_length`` characters long.
    """
    pieces = re.split('[^a-zA-Z0-9_=-]', header_prop)
    return {piece for piece in pieces if len(piece) >= item_length}

In [None]:
def check_csync_events(identifiers, next_url, next_identifiers):
    """Tell whether any identifier shows up in another request.

    An identifier matches when it is a substring of ``next_url`` or a member of
    ``next_identifiers``, either as-is or in one of three derived forms:
    base64 encoding, MD5 hex digest, or SHA-1 hex digest of its UTF-8 bytes.

    Returns True on the first match, False when nothing matches.
    """
    def appears(token):
        return token in next_url or token in next_identifiers

    for identifier in identifiers:
        if appears(identifier):
            return True

        raw_bytes = identifier.encode('utf-8')
        derived_forms = (
            base64.b64encode(raw_bytes).decode('utf8'),
            hashlib.md5(raw_bytes).hexdigest(),
            hashlib.sha1(raw_bytes).hexdigest(),
        )
        if any(appears(form) for form in derived_forms):
            return True

    return False

# Non standard headers

In [None]:
# Lower-cased names of the headers listed in common_headers.txt. Headers whose
# name is NOT in this set are treated as non-standard and mined for identifiers.
# The file is looked up under base_directory like every other data file,
# rather than through a second hard-coded copy of that path.
known_http_headers_raw = utilities.read_file_newline_stripped(base_directory + 'common_headers.txt')
known_http_headers = set()
for item in known_http_headers_raw:
    header_name = item.strip()
    # blank lines carry no header name
    if header_name != '':
        known_http_headers.add(header_name.lower())

In [None]:
# For every chain, find the requests that share no identifier with any other
# request of the same chain; their positions are collected for removal.
indexes_to_remove = {}   # chain key -> positions of unconnected requests
pbar = tqdm(total=len(json_representation), position=0, leave=True)

for key in json_representation:
    pbar.update(1)
    chain_content = json_representation[key]['content']

    # Request-side identifiers of every item (all Cookie headers plus all
    # non-standard request headers). They do not depend on which item is being
    # examined, so they are computed once per chain instead of once per pair.
    request_side_identifiers = []
    for item_1 in chain_content:
        next_identifiers = set()
        for s_item in item_1['request_headers']:
            header_name = s_item[0].lower()
            if header_name == 'cookie':
                next_identifiers |= get_identifier_cookies(s_item[1])
            elif header_name not in known_http_headers:
                next_identifiers |= get_identifiers_from_uncommon_headers(s_item[1])
        request_side_identifiers.append(next_identifiers)

    indexes_to_remove[key] = []
    for idx, item in enumerate(chain_content):
        current_url = item['url']
        current_referrer = item['referrer']
        current_identifiers = set()

        # Identifiers this request sent: non-standard request headers and the
        # Cookie header (if the header appears more than once, the last wins).
        sent_cookies = ''
        for s_item in item['request_headers']:
            header_name = s_item[0].lower()
            if header_name == 'cookie':
                sent_cookies = s_item[1]
            elif header_name not in known_http_headers:
                current_identifiers |= get_identifiers_from_uncommon_headers(s_item[1])

        # Identifiers this request received: non-standard response headers and
        # the Set-Cookie header (again, the last occurrence wins).
        received_cookies = ''
        for s_item in item['response_headers']:
            header_name = s_item[0].lower()
            if header_name == 'set-cookie':
                received_cookies = s_item[1]
            elif header_name not in known_http_headers:
                current_identifiers |= get_identifiers_from_uncommon_headers(s_item[1])

        current_identifiers |= get_identifier_cookies(sent_cookies)
        current_identifiers |= get_identifier_cookies(received_cookies)
        current_identifiers |= get_identifiers_from_qs(current_url)
        current_identifiers |= get_identifiers_from_qs(current_referrer)

        # We need to traverse from the start of the chain, not only the
        # requests after this one, because we never add the redirects that
        # receive the identifiers. any() stops at the first match.
        found = any(
            check_csync_events(current_identifiers, item_1['url'], request_side_identifiers[idx_1])
            for idx_1, item_1 in enumerate(chain_content)
            if idx_1 != idx
        )

        if not found:
            indexes_to_remove[key].append(idx)

pbar.close()

In [None]:
# Remove, by position, the requests that share no identifier with the rest of
# their chain and note which chains are left with at most one request.
# NOTE: removal is positional, so this cell is not idempotent -- re-running it
# without rebuilding indexes_to_remove drops further, unrelated entries.
# NOTE(review): the chain's 'length' field and the entries' 'redirect_id'
# values are not refreshed after filtering -- confirm whether consumers rely on them.
less_than_2 = set()
for key in indexes_to_remove:
    # A set gives O(1) membership tests; the list made the filter quadratic.
    positions_to_drop = set(indexes_to_remove[key])
    content = [i for j, i in enumerate(json_representation[key]['content']) if j not in positions_to_drop]

    json_representation[key]['content'] = content
    if len(content) <= 1:
        less_than_2.add(key)

In [None]:
# Drop the chains left with at most one connected request, then report how
# many chains remain.
for item in less_than_2:
    del json_representation[item]
print(len(json_representation))

In [None]:
# Save the final chains, containing only connected requests (stage 4).
utilities.write_json(json_representation_dir_connected, json_representation)