In [1]:
import pandas as pd
from os.path import join
from nb_utils import value_counts_and_percentage
# show one decimal place for floats
pd.options.display.float_format = '{:,.1f}'.format
from collections import defaultdict
import pickle

In [2]:
# Read the pickled `successed_hostnames` object from the working directory.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('successed_hostnames.pkl', mode='rb') as hostnames_file:
    successed_hostnames = pickle.load(hostnames_file)

In [3]:
# Load the three lookup objects pickled under LeakDetector/.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('LeakDetector/tracker_category_dict.pkl', mode='rb') as category_file:
    tracker_category_dict = pickle.load(category_file)

# Iterated below as category -> collection of domains.
with open('LeakDetector/category_domains.pkl', mode='rb') as domains_file:
    category_domains_dict = pickle.load(domains_file)

with open('LeakDetector/tracker_owner_dict.pkl', mode='rb') as owner_file:
    tracker_owner_dict = pickle.load(owner_file)

In [4]:
# Main DataFrame used by the cells below. They read its `hostname`,
# `site_domain`, `request_url_domain`, `is_third_party` and `is_tracker`
# columns plus one column per header name listed in the constants cell.
df = pd.read_pickle("../pickles/100k_nyc_all_reqs.pkl")

In [5]:
# DataFrame used by the delegation section at the end of the notebook, which
# reads `site_domain`, `delegateCHOnEquiv`, `delegateCHOnName`,
# `acceptCHOnEquiv`, `acceptCHOnName`, `frameClientHints` and `frameSrcs`.
delegation_df = pd.read_pickle("../pickles/100k_nyc_delegation_df.pkl")

In [10]:
# CONSTANTS:

# Each list holds column names of `df`; the analysis cells below iterate over
# them to count the rows/sites where the column carries a value.

DEFAULT_SAVE_HEADERS = ["accept_ch"]

RESPONSE_HEADERS = ["permissions_policy", "feature_policy", "critical_ch", "vary"]

# User-agent hint columns that appear, in this order, in both request lists.
_UA_CLIENT_HINT_HEADERS = [
    "sec_ch_ua",
    "sec_ch_ua_arch",
    "sec_ch_ua_bitness",
    "sec_ch_ua_full_version",
    "sec_ch_ua_full_version_list",
    "sec_ch_ua_mobile",
    "sec_ch_ua_model",
    "sec_ch_ua_platform",
    "sec_ch_ua_platform_version",
    "sec_ch_ua_wow64",
]

CLIENT_HINT_REQUEST_HEADERS = [
    "save_data",
    "sec_ch_dpr",
    "sec_ch_width",
    "sec_ch_viewport_width",
    "sec_ch_viewport_height",
    "sec_ch_device_memory",
    "rtt",
    "downlink",
    "ect",
    "sec_ch_prefers_color_scheme",
    "sec_ch_prefers_reduced_motion",
    *_UA_CLIENT_HINT_HEADERS,
]

CRITICAL_CLIENT_HINT_REQUEST_HEADERS = ["save_data", *_UA_CLIENT_HINT_HEADERS]

# Crawl Statistics:

In [11]:
# Hard-coded denominator for the per-site fractions printed below (passed as
# `total_page_num` and read directly by check_third_party_domains).
# NOTE(review): presumably the number of successfully crawled sites, i.e. the
# size of `successed_hostnames` -- confirm and consider deriving it.
TOTAL_NUM_SUCC_SITES = 89763

# A specific (non-default) UA client hint (CH) is sent

In [12]:
def check_header_prominence_all(df, header_values, total_page_num):
    """Print, per header column, how many distinct hostnames carry a value.

    A row counts when the column is neither the empty string nor null. For
    every name in ``header_values`` one line is printed: the column name, the
    number of distinct ``hostname`` values among the counted rows, and that
    number divided by ``total_page_num`` rounded to five decimals.

    Args:
        df: DataFrame with a ``hostname`` column and one column per header.
        header_values: iterable of column names to inspect.
        total_page_num: denominator used for the printed fraction.

    Returns:
        None; the results are only printed.
    """
    for header_value in header_values:
        column = df[header_value]
        has_value = (column != '') & column.notnull()
        site_count = len(df.loc[has_value].drop_duplicates(subset='hostname'))
        print(header_value, site_count, round(site_count / total_page_num, 5))

In [13]:
# Print the per-site prevalence of every header column, one group at a time.
header_groups = (
    DEFAULT_SAVE_HEADERS,
    RESPONSE_HEADERS,
    CRITICAL_CLIENT_HINT_REQUEST_HEADERS,
)
for header in header_groups:
    check_header_prominence_all(df, header, TOTAL_NUM_SUCC_SITES)

accept_ch 33346 0.37149
permissions_policy 31236 0.34798
feature_policy 1586 0.01767
critical_ch 587 0.00654
vary 83722 0.9327
save_data 226 0.00252
sec_ch_ua 89141 0.99307
sec_ch_ua_arch 667 0.00743
sec_ch_ua_bitness 491 0.00547
sec_ch_ua_full_version 581 0.00647
sec_ch_ua_full_version_list 696 0.00775
sec_ch_ua_mobile 89141 0.99307
sec_ch_ua_model 886 0.00987
sec_ch_ua_platform 89141 0.99307
sec_ch_ua_platform_version 886 0.00987
sec_ch_ua_wow64 401 0.00447


In [14]:
def check_header_prominence_third_party(df, header_values):
    """Count, per header column, the distinct hostnames seen on third-party rows.

    A row counts when ``is_third_party`` is truthy and the header column is
    neither the empty string nor null.

    Args:
        df: DataFrame with ``hostname``, ``is_third_party`` and header columns.
        header_values: iterable of column names to inspect.

    Returns:
        dict mapping each column name to its number of distinct hostnames.
    """
    counts = {}
    for header_value in header_values:
        is_third_party = df['is_third_party']
        column = df[header_value]
        matching = df.loc[is_third_party & (column != '') & column.notnull()]
        counts[header_value] = len(matching.drop_duplicates(subset=['hostname']))
    return counts

In [None]:
# Concatenate the three header lists, then count third-party prevalence for
# every column in the combined list.
headers = [
    *DEFAULT_SAVE_HEADERS,
    *RESPONSE_HEADERS,
    *CRITICAL_CLIENT_HINT_REQUEST_HEADERS,
]
ch_values_third_party = check_header_prominence_third_party(df, header_values=headers)

In [None]:
# (header, site count) pairs, largest count first.
sorted(
    ch_values_third_party.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# For the listed UA hint columns, gather the request domains of third-party
# rows that carry a value (third_domains) and the hostnames of those rows
# (third_p_sites).
h_ent_headers = [
    "sec_ch_ua_arch",
    "sec_ch_ua_bitness",
    "sec_ch_ua_full_version",
    "sec_ch_ua_full_version_list",
    "sec_ch_ua_model",
    "sec_ch_ua_platform_version",
    "sec_ch_ua_wow64",
]
third_domains = set()
third_p_sites = set()
for header_value in h_ent_headers:
    # Third-party rows whose hint column is neither '' nor null.
    hint_rows = df[
        (df['is_third_party'])
        & (df[header_value] != '')
        & (df[header_value].notnull())
    ]
    third_domains.update(hint_rows.request_url_domain.unique())
    third_p_sites.update(hint_rows.hostname.unique())

In [None]:
# Display the size of third_p_sites followed by the full set.
len(third_p_sites), third_p_sites

In [None]:
# Display the size of third_domains followed by the full set.
len(third_domains),third_domains

In [None]:
# Number of distinct request domains among all third-party rows.
df.loc[df.is_third_party, 'request_url_domain'].nunique()

In [None]:
def check_header_prominence_tracker(df, header_values):
    """Count, per header column, the distinct hostnames seen on tracker rows.

    A row counts when ``is_third_party`` is truthy, ``is_tracker`` equals the
    string ``'1'`` and the header column is neither the empty string nor null.

    Args:
        df: DataFrame with ``hostname``, ``is_third_party``, ``is_tracker``
            and header columns.
        header_values: iterable of column names to inspect.

    Returns:
        dict mapping each column name to its number of distinct hostnames.
    """
    counts = {}
    for header_value in header_values:
        is_third_party = df['is_third_party']
        is_tracker = df['is_tracker'] == '1'
        column = df[header_value]
        matching = df.loc[is_third_party & is_tracker & (column != '') & column.notnull()]
        counts[header_value] = len(matching.drop_duplicates(subset=['hostname']))
    return counts

In [None]:
# Per-header count of distinct hostnames on third-party tracker rows, for every
# column in `headers`.
ch_values_tracker = check_header_prominence_tracker(df, headers)

In [None]:
# (header, site count) pairs for tracker rows, largest count first.
sorted(
    ch_values_tracker.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# A CH is sent to a third-party domain

In [None]:
def check_sites_send_ch(df, header_values):
    """Collect the hostnames that have a third-party row carrying any header.

    A row counts when ``is_third_party`` is truthy and the header column is
    neither the empty string nor null; the hostnames of the counted rows are
    merged across all columns in ``header_values``.

    Args:
        df: DataFrame with ``hostname``, ``is_third_party`` and header columns.
        header_values: iterable of column names to inspect.

    Returns:
        set of ``hostname`` values.
    """
    sites = set()
    for header_value in header_values:
        is_third_party = df['is_third_party']
        column = df[header_value]
        matching = df.loc[is_third_party & (column != '') & column.notnull()]
        sites.update(matching.drop_duplicates(subset='hostname').hostname.tolist())
    return sites

In [None]:
# Hostnames with at least one third-party row carrying any of the columns in
# CRITICAL_CLIENT_HINT_REQUEST_HEADERS.
sites_send_ch_to_third_parties = check_sites_send_ch(df, CRITICAL_CLIENT_HINT_REQUEST_HEADERS)

# Number of sites where a CH is sent to a third party:

In [None]:
# Size of the set built by check_sites_send_ch.
len(sites_send_ch_to_third_parties)

# Sites where a CH is sent to a third party:

In [None]:
# Display the full set of hostnames.
sites_send_ch_to_third_parties

# Top domains that receive CH headers (for each type of CH)

In [None]:
def check_third_party_domains(df, header_values):
    """Tally third-party request domains that receive each header.

    For every column in ``header_values`` the third-party rows whose column is
    neither the empty string nor null are reduced to distinct
    (``request_url_domain``, ``hostname``) pairs, and the pairs are counted
    per request domain. One line is printed per column: the column name, the
    number of distinct request domains, and that number divided by the
    module-level ``TOTAL_NUM_SUCC_SITES`` rounded to five decimals.

    Args:
        df: DataFrame with ``is_third_party``, ``request_url_domain``,
            ``hostname`` and header columns.
        header_values: iterable of column names to inspect.

    Returns:
        A pair ``(third_party_domains, categories_num_dict)``:
        ``third_party_domains`` maps a request domain to its pair count summed
        over all columns; ``categories_num_dict`` maps a category of the
        module-level ``category_domains_dict`` to the number of
        (column, domain) hits whose domain belongs to that category.
    """
    third_party_domains = defaultdict(int)
    categories_num_dict = defaultdict(int)
    for header_value in header_values:
        is_third_party = df['is_third_party']
        column = df[header_value]
        distinct_pairs = df.loc[
            is_third_party & (column != '') & column.notnull()
        ].drop_duplicates(subset=['request_url_domain', 'hostname'])
        sites_per_domain = distinct_pairs.request_url_domain.value_counts()

        domain_count = len(sites_per_domain)
        print(header_value, domain_count, round(domain_count / TOTAL_NUM_SUCC_SITES, 5))

        for domain, site_count in sites_per_domain.items():
            for category, domains in category_domains_dict.items():
                if domain in domains:
                    categories_num_dict[category] += 1
            third_party_domains[domain] += site_count
    return third_party_domains, categories_num_dict

In [None]:
# check_third_party_domains returns a (third_party_domains, categories_num_dict)
# pair; unpack it so each name is bound to its own dict instead of binding the
# whole tuple to `categories_num_dict`.
third_party_domains, categories_num_dict = check_third_party_domains(df, headers)

In [None]:
# For the listed UA hint columns, map every category of category_domains_dict
# to the set of site_domain values on which a third-party request domain of
# that category carried a value.
h_ent_headers = [
    "sec_ch_ua_arch",
    "sec_ch_ua_bitness",
    "sec_ch_ua_full_version",
    "sec_ch_ua_full_version_list",
    "sec_ch_ua_model",
    "sec_ch_ua_platform_version",
    "sec_ch_ua_wow64",
]
categories_num_dict = defaultdict(set)
for header_value in h_ent_headers:
    # Third-party rows with a value, one per (request domain, hostname) pair.
    distinct_reqs = df.loc[
        (df['is_third_party'])
        & (df[header_value] != '')
        & (df[header_value].notnull())
    ].drop_duplicates(subset=['request_url_domain', 'hostname'])
    domain_site_pairs = zip(distinct_reqs['request_url_domain'], distinct_reqs['site_domain'])
    for script_domain, site_domain in domain_site_pairs:
        for category, domains in category_domains_dict.items():
            if script_domain in domains:
                categories_num_dict[category].add(site_domain)

In [None]:
# (category, sites) pairs ordered by number of sites, largest first.
sorted_list = sorted(
    categories_num_dict.items(),
    key=lambda pair: len(pair[1]),
    reverse=True,
)

In [None]:
# One line per category: its name and the number of sites collected for it.
for category_name, category_sites in sorted_list:
    print(category_name, len(category_sites))

# CHs are delegated by a Permissions Policy header or HTML tags

# CHs are delegated with equiv attr:

In [None]:
# Rows whose delegateCHOnEquiv is not the empty string, shown with site_domain.
delegation_df.loc[
    delegation_df.delegateCHOnEquiv != '',
    ['site_domain', 'delegateCHOnEquiv'],
]

# CHs are delegated with name attr:

In [None]:
# Rows whose delegateCHOnName is not the empty string, shown with site_domain.
delegation_df.loc[
    delegation_df.delegateCHOnName != '',
    ['site_domain', 'delegateCHOnName'],
]

# CHs are accepted with equiv attr:

In [None]:
# Rows whose acceptCHOnEquiv is not the empty string, shown with site_domain.
delegation_df.loc[
    delegation_df.acceptCHOnEquiv != '',
    ['site_domain', 'acceptCHOnEquiv'],
]

In [None]:
# Same selection as the preceding cell: rows with a non-empty acceptCHOnEquiv.
delegation_df.loc[
    delegation_df.acceptCHOnEquiv != '',
    ['site_domain', 'acceptCHOnEquiv'],
]

In [None]:
# Persist the rows with a non-empty acceptCHOnEquiv (site_domain plus that
# column) to ../pickles/accept_ch_html.pkl.
accept_ch_html_rows = delegation_df.loc[
    delegation_df.acceptCHOnEquiv != '',
    ['site_domain', 'acceptCHOnEquiv'],
]
accept_ch_html_rows.to_pickle('../pickles/accept_ch_html.pkl')

# CHs are accepted with name attr:

In [None]:
# Rows whose acceptCHOnName is not the empty string, shown with site_domain.
delegation_df.loc[
    delegation_df.acceptCHOnName != '',
    ['site_domain', 'acceptCHOnName'],
]

# CHs are delegated by an iframe:

In [None]:
# Rows whose frameClientHints is not the empty string, shown with site_domain
# and frameSrcs.
delegation_df.loc[
    delegation_df.frameClientHints != '',
    ['site_domain', 'frameClientHints', 'frameSrcs'],
]