In [None]:
import json
from tqdm.notebook import tqdm
import utilities
import datetime
from dateutil.parser import parse
import re
from urllib import parse as URLparse
from openwpm_utils import domain as du
import base64
import hashlib
from collections import OrderedDict
from prettytable import PrettyTable
from tqdm.notebook import tqdm

In [None]:
# Root folder that holds the labeled crawl data.
base_directory = 'khaleesi/data/'

# Labeled request chains: plain HTTP crawl and JS-connected crawl.
non_interactive_http_dir = f'{base_directory}crawl-http-labeled.json'
non_interactive_js_dir = f'{base_directory}crawl-js-connected-labeled.json'

In [None]:
# Load the labeled request chains for both crawls.
# The code below reads each chain as chains[key]['content'], a list of request
# records with 'url', 'referrer', 'request_headers' and 'response_headers'.
http_chains = utilities.read_json(non_interactive_http_dir)
js_chains = utilities.read_json(non_interactive_js_dir)

# Helper functions for extracting identifiers

In [None]:
def get_identifier_cookies(cookie_string, cookie_length = 8):
    """Extract identifier-like tokens from a cookie header value.

    The value is processed line by line (split on newlines). On each line only
    the text before the first ';' is inspected. If that text contains '=', the
    part before the first '=' is kept whole as the cookie name and the rest is
    split on every character outside ``[a-zA-Z0-9_=-]``. Otherwise the whole
    text is split the same way.

    Args:
        cookie_string: Raw cookie header value.
        cookie_length: Minimum token length to keep (default 8).

    Returns:
        Set of tokens whose length is at least ``cookie_length``.
    """
    delimiter = re.compile('[^a-zA-Z0-9_=-]')
    tokens = set()

    for line in cookie_string.split('\n'):
        # Anything after the first ';' on the line is ignored.
        name_value = line.split(';')[0]
        name, separator, value = name_value.partition('=')
        if separator:
            tokens.update(delimiter.split(value))
            tokens.add(name)
        else:
            tokens.update(delimiter.split(name_value))

    # Short tokens are dropped.
    return {token for token in tokens if len(token) >= cookie_length}

In [None]:
def get_identifiers_from_qs(url, qs_item_length = 8):
    """Extract identifier-like tokens from the query string of a URL.

    Every parameter name and value returned by ``parse_qsl`` is split on
    characters outside ``[a-zA-Z0-9_=-]``.

    Args:
        url: URL whose query string is inspected.
        qs_item_length: Minimum token length to keep (default 8).

    Returns:
        Set of tokens whose length is at least ``qs_item_length``.
    """
    query = URLparse.urlsplit(url).query
    candidates = set()

    for name, value in URLparse.parse_qsl(query):
        for part in (name, value):
            candidates.update(re.split('[^a-zA-Z0-9_=-]', part))

    return {candidate for candidate in candidates if len(candidate) >= qs_item_length}

In [None]:
def get_identifiers_from_uncommon_headers(header_prop, item_length = 8):
    """Extract identifier-like tokens from a header value.

    Args:
        header_prop: Header value, split on characters outside ``[a-zA-Z0-9_=-]``.
        item_length: Minimum token length to keep (default 8).

    Returns:
        Set of tokens whose length is at least ``item_length``.
    """
    pieces = re.split('[^a-zA-Z0-9_=-]', header_prop)
    return {piece for piece in pieces if len(piece) >= item_length}

In [None]:
def get_domain_or_hostname(url):
    """Resolve a URL to the name used when linking requests to each other.

    The result of ``du.get_ps_plus_1`` is preferred. When it is empty or None,
    the hostname from ``du.urlparse`` is used instead.

    Returns:
        ``(True, name)`` when a name was found, otherwise ``(False, '')``.
        Callers stop linking when the first element is False, because an empty
        or unavailable name cannot be matched against anything.
    """
    unavailable = ('', None)

    name = du.get_ps_plus_1(url)
    if name in unavailable:
        # Fall back to the hostname of the URL.
        name = du.urlparse(url).hostname

    if name in unavailable:
        return False, ''
    return True, name

In [None]:
# Lower-cased, whitespace-stripped entries of common_headers.txt (blank entries skipped).
# Headers whose lower-cased name is not in this set are scanned for identifiers.
known_http_headers_raw = utilities.read_file_newline_stripped('common_headers.txt')
known_http_headers = {
    line.strip().lower()
    for line in known_http_headers_raw
    if line.strip() != ''
}

In [None]:
def check_csync_events(identifiers, next_identifiers, key, current_domain_or_hostname, next_url, csync_domains):
    """Record identifiers of the current request that reappear in a later request.

    An identifier counts as shared when it (or its base64 / MD5 / SHA-1 form)
    occurs inside ``next_url`` or is contained in ``next_identifiers``. Only the
    first matching form is recorded, checked in the order plain, base64, MD5,
    SHA-1.

    Args:
        identifiers: Iterable of identifier strings from the current request.
        next_identifiers: Collection of identifiers from the later request.
        key: Key of the chain both requests belong to; stored with each event.
        current_domain_or_hostname: Name of the sending side.
        next_url: URL of the later request.
        csync_domains: Dict keyed by 'sender|receiver'; updated in place.

    Returns:
        ``csync_domains``. It is left untouched when ``identifiers`` is empty or
        when no domain/hostname can be resolved for ``next_url``.
    """
    def appears_in_next(token):
        return token in next_url or token in next_identifiers

    # Resolved on the first identifier only: the receiving domain and the pair
    # key are the same for every identifier, and resolving lazily keeps
    # csync_domains untouched when there is nothing to check.
    pair_events = None

    for identifier in identifiers:
        if pair_events is None:
            resolved, next_domain_or_hostname = get_domain_or_hostname(next_url)
            if not resolved:
                break

            domain_domain = current_domain_or_hostname + '|' + next_domain_or_hostname
            pair_events = csync_domains.setdefault(domain_domain, {
                'chains': [],
                'b64_chains': [],
                'md5_chains': [],
                'sha1_chains': [],
            })

        if appears_in_next(identifier):
            pair_events['chains'].append({'chain': key, 'identifier': identifier})
            continue

        # Encoded forms are only computed once the cheaper checks have failed.
        raw_identifier = identifier.encode('utf-8')

        base64_identifier = base64.b64encode(raw_identifier).decode('utf8')
        if appears_in_next(base64_identifier):
            pair_events['b64_chains'].append({'chain': key, 'identifier': identifier, 'encoded': base64_identifier})
            continue

        md5_identifier = hashlib.md5(raw_identifier).hexdigest()
        if appears_in_next(md5_identifier):
            pair_events['md5_chains'].append({'chain': key, 'identifier': identifier, 'encoded': md5_identifier})
            continue

        sha1_identifier = hashlib.sha1(raw_identifier).hexdigest()
        if appears_in_next(sha1_identifier):
            pair_events['sha1_chains'].append({'chain': key, 'identifier': identifier, 'encoded': sha1_identifier})

    return csync_domains

# Cookie syncing identification code

In [None]:
def _collect_header_identifiers(headers, cookie_header_name, known_http_headers):
    """Collect identifier tokens from a list of (name, value) header pairs."""
    collected = set()
    for header in headers:
        header_name = header[0].lower()
        if header_name == cookie_header_name:
            collected |= get_identifier_cookies(header[1])
        # Deliberately not `elif`: a cookie header that is missing from
        # known_http_headers is scanned by both extractors.
        if header_name not in known_http_headers:
            collected |= get_identifiers_from_uncommon_headers(header[1])
    return collected


def run_csync_heuristic(json_representation, known_http_headers, csync_domains):
    """Scan request chains for identifiers passed from one request to a later one.

    For every request whose domain/hostname can be resolved, identifiers are
    gathered from its request headers ('cookie' plus headers not in
    ``known_http_headers``), its response headers ('set-cookie' plus headers
    not in ``known_http_headers``), and the query strings of its URL and
    referrer. They are then compared against later requests of the same chain:
    all later requests when the chain key starts with 'J|', otherwise only the
    immediately following request.

    Args:
        json_representation: Dict of chains; each value has a 'content' list of
            request records with 'url', 'referrer', 'request_headers' and
            'response_headers'.
        known_http_headers: Collection of lower-cased header names that are
            not scanned for identifiers.
        csync_domains: Dict of already recorded events; extended and returned.

    Returns:
        The updated ``csync_domains``.
    """
    # The context manager closes the progress bar even if a chain raises.
    with tqdm(total=len(json_representation), position=0, leave=True) as pbar:
        for key in json_representation:
            pbar.update(1)
            content = json_representation[key]['content']

            for idx, item in enumerate(content):
                current_url = item['url']
                current_referrer = item['referrer']

                current_domain_or_hostname = get_domain_or_hostname(current_url)
                if not current_domain_or_hostname[0]:
                    continue
                current_domain_or_hostname = current_domain_or_hostname[1]

                current_identifiers = _collect_header_identifiers(
                    item['request_headers'], 'cookie', known_http_headers)
                current_identifiers |= _collect_header_identifiers(
                    item['response_headers'], 'set-cookie', known_http_headers)
                current_identifiers |= get_identifiers_from_qs(current_url)
                current_identifiers |= get_identifiers_from_qs(current_referrer)

                if key.startswith('J|'):
                    end = len(content)
                else:
                    # Only the direct successor; the slice is empty for the last request.
                    end = idx + 2

                for item_1 in content[idx + 1:end]:
                    next_url = item_1['url']
                    next_identifiers = _collect_header_identifiers(
                        item_1['request_headers'], 'cookie', known_http_headers)

                    csync_domains = check_csync_events(
                        current_identifiers, next_identifiers, key,
                        current_domain_or_hostname, next_url, csync_domains)
    return csync_domains

In [None]:
# Run the heuristic over both crawls, accumulating events in a single dict.
# run_csync_heuristic takes exactly (chains, known_http_headers, csync_domains).
current_csync = {}
current_csync = run_csync_heuristic(http_chains, known_http_headers, current_csync)
current_csync = run_csync_heuristic(js_chains, known_http_headers, current_csync)

# Clean up csync events

In [None]:
def cysnc_clean_up(csync_domains):
    """Remove domain pairs that have no recorded event of any kind.

    A pair is removed when its 'chains', 'b64_chains', 'md5_chains' and
    'sha1_chains' lists are all empty. The dict is modified in place.

    Returns:
        The same ``csync_domains`` object.
    """
    buckets = ('chains', 'b64_chains', 'md5_chains', 'sha1_chains')

    # Collect first: a dict must not change size while it is being iterated.
    empty_pairs = [
        pair
        for pair, events in csync_domains.items()
        if all(len(events[bucket]) == 0 for bucket in buckets)
    ]
    for pair in empty_pairs:
        del csync_domains[pair]

    return csync_domains

In [None]:
# Print the number of domain pairs before and after removing pairs without events.
print(len(current_csync))
current_csync = cysnc_clean_up(current_csync)
print(len(current_csync))

## Helper functions for cookie syncing statistics

In [None]:
def count_csync_events(_from, _to, sending_json_obj, receiving_json_obj):
    """Register one sync event from ``_from`` to ``_to`` in both tallies.

    Each tally maps a domain to ``{'count': int, 'domains': set}``. The sender
    tally gains ``_to`` as a partner of ``_from``, and the receiver tally gains
    ``_from`` as a partner of ``_to``. Both dicts are updated in place.

    Returns:
        ``(sending_json_obj, receiving_json_obj)``.
    """
    def tally(table, domain, partner):
        if domain in table:
            table[domain]['count'] += 1
            table[domain]['domains'].add(partner)
        else:
            table[domain] = {'count': 1, 'domains': {partner}}

    tally(sending_json_obj, _from, _to)
    tally(receiving_json_obj, _to, _from)

    return sending_json_obj, receiving_json_obj

In [None]:
def get_csynced_chains(chains, chains_synced):
    """Count recorded events per chain key.

    Args:
        chains: Iterable of event dicts, each with a 'chain' entry.
        chains_synced: Dict mapping chain key to ``{'count': int}``; updated in place.

    Returns:
        The same ``chains_synced`` object.
    """
    for event in chains:
        chain_key = event['chain']
        if chain_key in chains_synced:
            chains_synced[chain_key]['count'] += 1
        else:
            chains_synced[chain_key] = {'count': 1}
    return chains_synced

In [None]:
def get_unique_domains_in_chains(json_representation, khaleesi_detections):
    """Collect the distinct domains/hostnames seen in a subset of chains.

    Only chains whose key is contained in ``khaleesi_detections`` are visited.
    Requests whose URL cannot be resolved by ``get_domain_or_hostname`` are
    skipped.

    Returns:
        Set of resolved domain/hostname strings.
    """
    domains = set()
    selected_keys = (key for key in json_representation if key in khaleesi_detections)

    for key in selected_keys:
        for item in json_representation[key]['content']:
            resolved, domain = get_domain_or_hostname(item['url'])
            if resolved:
                domains.add(domain)

    return domains

# Finding cookie syncing stats

In [None]:
def compute_csync_stats(csync_domains, no_of_chains=None, no_of_domains=None):
    """Summarise pair-wise cookie syncing events into per-domain tallies.

    Pairs whose sender and receiver are the same name are ignored. For every
    other pair, each non-empty event category ('chains', 'b64_chains',
    'md5_chains', 'sha1_chains') adds one event to the overall sender/receiver
    tallies, so a pair can contribute up to four events.

    Args:
        csync_domains: Dict keyed by 'sender|receiver' whose values hold the
            four event lists named above.
        no_of_chains: Not used by the computation. Optional so that
            ``compute_csync_stats(csync_domains)`` is a valid call.
        no_of_domains: Not used by the computation. Optional for the same reason.

    Returns:
        ``(domains, sending_to, recieved_from)`` where ``domains`` is the set
        of all names taking part in a sync, and the two dicts map a name to
        ``{'count': int, 'domains': set}`` as built by ``count_csync_events``.
    """
    categories = ('chains', 'b64_chains', 'md5_chains', 'sha1_chains')
    encoded_categories = categories[1:]

    sending_to = {}
    recieved_from = {}
    encoded_sending_to = {category: {} for category in encoded_categories}
    encoded_recieved_from = {category: {} for category in encoded_categories}
    chains_synced = {category: {} for category in categories}

    for domain_domain, events in csync_domains.items():
        endpoints = domain_domain.split('|')
        _from, _to = endpoints[0], endpoints[1]

        if _from == _to:
            continue

        for category in categories:
            if len(events[category]) == 0:
                continue

            sending_to, recieved_from = count_csync_events(_from, _to, sending_to, recieved_from)

            if category in encoded_sending_to:
                encoded_sending_to[category], encoded_recieved_from[category] = count_csync_events(
                    _from, _to, encoded_sending_to[category], encoded_recieved_from[category])

            chains_synced[category] = get_csynced_chains(events[category], chains_synced[category])

    # csync domain statistics
    syncing_domains = set(sending_to) | set(recieved_from)
    for category in encoded_categories:
        syncing_domains |= set(encoded_sending_to[category])
        syncing_domains |= set(encoded_recieved_from[category])

    # csync chain statistics (not part of the return value)
    csync_chains = set().union(*chains_synced.values())

    # csync encoded statistics (not part of the return value)
    csync_encoded = set().union(*encoded_sending_to.values(), *encoded_recieved_from.values())

    # encoded cookie syncing stats can also be returned
    return syncing_domains, sending_to, recieved_from

In [None]:
# Aggregate the pair-wise events into the set of syncing domains and the
# per-sender / per-receiver tallies used by the tables below.
csync_domains, sending_to, recieved_from = compute_csync_stats(current_csync)

# Print top csync domains

In [None]:
def print_table(json_obj, count_limit = 20):
    """Print the first ``count_limit`` entries of a tally as a two-column table.

    Args:
        json_obj: Mapping of domain to a dict with a 'count' entry; rows are
            emitted in the mapping's iteration order.
        count_limit: Maximum number of rows to print (default 20).
    """
    table = PrettyTable(['Domains', 'Csync count'])

    for position, domain in enumerate(json_obj):
        if position >= count_limit:
            break
        table.add_row([domain, json_obj[domain]['count']])

    print(table)
    
def average_sharing(syncing_domains):
    """Print the mean 'count' over all entries of a tally.

    Args:
        syncing_domains: Mapping of domain to a dict with a 'count' entry.

    An empty tally prints ``0.0`` instead of raising ZeroDivisionError.
    """
    if not syncing_domains:
        print(0.0)
        return

    total = sum(entry['count'] for entry in syncing_domains.values())
    print(total / len(syncing_domains))

In [None]:
def get_top_csyncs(sending_to, recieved_from):
    """Print ranked tables and averages for senders, then for receivers.

    Each tally is sorted by its 'count' in descending order and printed with
    ``print_table``, followed by ``average_sharing`` of the unsorted tally.
    """
    def ranked_by_count(tally):
        ordered = sorted(tally.items(), key=lambda pair: pair[1]['count'], reverse=True)
        return OrderedDict(ordered)

    # Both rankings are built before anything is printed.
    sending_to_sorted = ranked_by_count(sending_to)
    recieved_from_sorted = ranked_by_count(recieved_from)

    for ranked, tally in ((sending_to_sorted, sending_to), (recieved_from_sorted, recieved_from)):
        print_table(ranked)
        average_sharing(tally)

In [None]:
# Print the ranked sender and receiver tables, each followed by its average count.
get_top_csyncs(sending_to, recieved_from)