In [2]:
import json
import utilities
from dateutil.parser import parse
from urllib import parse as URLparse
from openwpm_utils import domain as du
from collections import OrderedDict
from prettytable import PrettyTable
from tqdm.notebook import tqdm

In [10]:
# Root folder that holds the crawl exports and the site ranking list.
base_directory = 'khaleesi/data/'

# Labeled crawl exports. Despite the `_dir` suffix, these are file paths.
fx_interactive_http_dir = f'{base_directory}crawl-http-labeled.json'
fx_interactive_js_dir = f'{base_directory}crawl-js-connected-labeled.json'

# CSV with the top-10K site list.
top_10k_dir = f'{base_directory}top-10K.csv'

In [11]:
# Read the top-10K list file into memory.
# NOTE(review): going by the helper's name this yields one newline-stripped
# string per line of the file - confirm against `utilities`.
top_10k_raw = utilities.read_file_newline_stripped(top_10k_dir)

In [12]:
# Normalise every entry of the ranking list to the value returned by
# `du.get_ps_plus_1`, so it can be compared with the domains extracted from
# the crawl. The site name is taken from the second comma-separated field.
top_10k = {
    du.get_ps_plus_1('http://www.' + row.split(',')[1])
    for row in top_10k_raw
}

In [6]:
# Load the two labeled crawl exports (HTTP and JavaScript variants).
# NOTE(review): later cells index each of these as
# {key: {'top_url': ..., 'content': [{'url': ..., ...}, ...]}} - confirm the
# schema against the crawl export.
fx_interactive_http = utilities.read_json(fx_interactive_http_dir)
fx_interactive_js = utilities.read_json(fx_interactive_js_dir)

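### Heuristic
A page visit is counted as a bounce-tracking chain when the following happen in order:
1. The browser navigates away from the first-party page to a third-party page.
2. A third party in that chain sets a cookie.
3. The browser navigates back to the first-party page.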

In [13]:
def get_domain_or_hostname(url):
    """Resolve ``url`` to a name that can be used to link requests together.

    The value of ``du.get_ps_plus_1(url)`` is preferred; when that is empty
    or ``None`` the hostname parsed from the URL is used instead.

    Returns:
        ``(True, name)`` when a non-empty name was found, otherwise
        ``(False, '')``. Callers must skip the URL in the latter case, since
        domains/hostnames cannot be linked when they are empty or unavailable.
    """
    resolved = du.get_ps_plus_1(url)

    # Fall back to the plain hostname when no domain could be derived.
    if resolved is None or resolved == '':
        resolved = du.urlparse(url).hostname

    # Neither lookup produced anything usable: signal failure to the caller.
    if resolved is None or resolved == '':
        return False, ''

    return True, resolved

In [14]:
def _has_header(headers, header_name):
    """Return True if any ``[name, value]`` pair in ``headers`` is named ``header_name`` (case-insensitive)."""
    return any(header[0].lower() == header_name for header in headers)


def _record_cookie_activity(request, domain, accessing, setting):
    """Add ``domain`` to ``accessing``/``setting`` if ``request`` sends a Cookie / receives a Set-Cookie header."""
    if _has_header(request['request_headers'], 'cookie'):
        accessing.add(domain)
    if _has_header(request['response_headers'], 'set-cookie'):
        setting.add(domain)


def find_bounce_trackers(json_representation):
    """Find visits whose request chain leaves the first party and returns to it.

    For every visit the request list is scanned for the first third-party
    ``main_frame`` request that is later followed by a first-party
    ``main_frame`` request. Cookie activity of the non-first-party requests
    between those two points is collected.

    Args:
        json_representation: mapping ``key -> {'top_url': str, 'content': [...]}``
            where every content item has ``'url'``, ``'resource_type'``,
            ``'request_headers'`` and ``'response_headers'`` (headers being
            sequences of ``[name, value]`` pairs).

    Returns:
        dict mapping each matching key to
        ``{'last_redirect': bool, 'accessing_cookies': set, 'setting_cookies': set}``.
        ``last_redirect`` is True when the very last request of the visit was
        a first-party request reached during the scan.
    """
    bounce_tracking_candidates = {}

    # The context manager guarantees the progress bar is closed, even on error.
    with tqdm(total=len(json_representation), position=0, leave=True) as pbar:
        for key in json_representation:
            pbar.update(1)
            visit = json_representation[key]

            found, top_domain = get_domain_or_hostname(visit['top_url'])
            if not found:
                continue

            requests = visit['content']
            for idx, item in enumerate(requests):
                found, current_domain = get_domain_or_hostname(item['url'])
                if not found or current_domain == top_domain:
                    continue

                # A chain can only start at a third-party top-level navigation.
                if item['resource_type'] != 'main_frame':
                    continue

                third_parties_accessing_cookies = set()
                third_parties_setting_cookies = set()
                _record_cookie_activity(
                    item, current_domain,
                    third_parties_accessing_cookies, third_parties_setting_cookies)

                last_redirect = False
                returned_to_first_party = False

                for next_idx in range(idx + 1, len(requests)):
                    next_item = requests[next_idx]
                    found, next_domain = get_domain_or_hostname(next_item['url'])
                    if not found:
                        continue

                    if next_domain == top_domain:
                        # We should also check if this is the last redirect.
                        if next_idx + 1 == len(requests):
                            last_redirect = True

                        if next_item['resource_type'] == 'main_frame':
                            returned_to_first_party = True
                            break
                    else:
                        # NOTE(review): every non-first-party request in the
                        # chain is inspected here, not only main_frame
                        # navigations - confirm this is intended.
                        _record_cookie_activity(
                            next_item, next_domain,
                            third_parties_accessing_cookies, third_parties_setting_cookies)

                if returned_to_first_party:
                    bounce_tracking_candidates[key] = {
                        'last_redirect': last_redirect,
                        'accessing_cookies': third_parties_accessing_cookies,
                        'setting_cookies': third_parties_setting_cookies,
                    }
                    # One chain per visit is enough; move on to the next key.
                    break

    return bounce_tracking_candidates
            

In [15]:
# Run the bounce-tracking heuristic over the HTTP crawl.
bounce_tracking_candidates = find_bounce_trackers(fx_interactive_http)

HBox(children=(IntProgress(value=0, max=166018), HTML(value='')))

In [16]:
# Number of crawl entries flagged as bounce-tracking candidates.
print(len(bounce_tracking_candidates))

178


In [17]:
# Count the candidate chains in which at least one third party set a cookie,
# and index those chains by the domain that set the cookie.
cookie_setting_domains = {}
bounce_tracking_chains_count = 0
for key, candidate in bounce_tracking_candidates.items():
    setters = candidate['setting_cookies']
    if setters:
        bounce_tracking_chains_count += 1
    for domain in setters:
        cookie_setting_domains.setdefault(domain, {'chains': set()})['chains'].add(key)

print('Bounce tracking chains:', bounce_tracking_chains_count)

Bounce tracking chains: 161


In [18]:
def print_table(json_obj, count_limit = 20):
    """Print a two-column table of domains and their number of top domains.

    Args:
        json_obj: mapping ``domain -> {'top_domains': <sized collection>, ...}``;
            rows are emitted in the mapping's iteration order.
        count_limit: maximum number of rows to print.
    """
    table = PrettyTable(['Domains', 'count'])
    for position, domain in enumerate(json_obj, start=1):
        # Everything past the limit is ignored, so stop as soon as it is hit.
        if position > count_limit:
            break
        table.add_row([domain, len(json_obj[domain]['top_domains'])])

    print(table)
    
def get_top_bouncers(third_parties):
    """Print the third parties ranked by how many top domains they were seen on.

    Args:
        third_parties: mapping ``domain -> {'top_domains': <sized collection>, ...}``.
    """
    # Largest 'top_domains' collection first; ties keep their original order.
    ranked_domains = sorted(
        third_parties,
        key=lambda domain: len(third_parties[domain]['top_domains']),
        reverse=True,
    )
    ranking = OrderedDict((domain, third_parties[domain]) for domain in ranked_domains)

    print_table(ranking)

In [19]:
# Keep only the cookie-setting domains that are absent from the top-10K list.
# The per-domain records are shared with `cookie_setting_domains`, not copied.
cookie_setting_domains_non_10k = {
    domain: record
    for domain, record in cookie_setting_domains.items()
    if domain not in top_10k
}

len(cookie_setting_domains_non_10k)

14

In [20]:
def count_bounce_prevalence(json_representation, cookie_setting_domains_non_10k):
    """Record, per candidate domain, the top-level sites on which it was requested.

    Args:
        json_representation: mapping ``key -> {'top_url': str, 'content': [...]}``
            where every content item has a ``'url'``.
        cookie_setting_domains_non_10k: mapping ``domain -> dict``. It is
            updated in place: each domain seen in a visit gets the visit's
            top domain added to its ``'top_domains'`` set (created on demand),
            so results accumulate across repeated calls.

    Returns:
        The same ``cookie_setting_domains_non_10k`` object that was passed in.
    """
    # The context manager guarantees the progress bar is closed, even on error.
    with tqdm(total=len(json_representation), position=0, leave=True) as pbar:
        for key in json_representation:
            pbar.update(1)
            visit = json_representation[key]

            found, top_domain_or_hostname = get_domain_or_hostname(visit['top_url'])
            if not found:
                continue

            for item in visit['content']:
                # An unresolvable URL yields '', which is never a tracked domain.
                current_domain = get_domain_or_hostname(item['url'])[1]

                if current_domain in cookie_setting_domains_non_10k:
                    record = cookie_setting_domains_non_10k[current_domain]
                    record.setdefault('top_domains', set()).add(top_domain_or_hostname)
                    # NOTE(review): stopping here credits only the first
                    # tracked domain found in each visit - confirm intended.
                    break

    return cookie_setting_domains_non_10k

In [21]:
# Accumulate, for each non-top-10K cookie-setting domain, the top-level sites
# it appears on - first across the HTTP crawl, then across the JS crawl.
# The function updates the dict in place and returns the same object.
cookie_setting_domains_non_10k = count_bounce_prevalence(fx_interactive_http, cookie_setting_domains_non_10k)
cookie_setting_domains_non_10k = count_bounce_prevalence(fx_interactive_js, cookie_setting_domains_non_10k)

HBox(children=(IntProgress(value=0, max=166018), HTML(value='')))

HBox(children=(IntProgress(value=0, max=130187), HTML(value='')))

In [22]:
# Print the domains ranked by the number of distinct top-level sites recorded for each.
get_top_bouncers(cookie_setting_domains_non_10k)

+----------------------+-------+
|       Domains        | count |
+----------------------+-------+
| googleadservices.com |  3073 |
|      adsrvr.org      |  1377 |
|      adform.net      |  322  |
|   flashtalking.com   |  141  |
|     queue-it.net     |   9   |
|  elsevierhealth.com  |   3   |
|      bngpt.com       |   2   |
|      optrck.com      |   2   |
|      olsvc.com       |   1   |
|     editorx.com      |   1   |
|   bridgetrack.com    |   1   |
|    surfconext.nl     |   1   |
| depositaccounts.com  |   1   |
|      kodeks.ru       |   1   |
+----------------------+-------+
