In [1]:
import pandas as pd
from os.path import join
# show two decimal places for floats
pd.options.display.float_format = '{:,.1f}'.format
from collections import defaultdict
import json
import pickle

In [2]:
# Opening JSON file
f = open('data/category_domains.json')
category_domains_json = json.load(f)
category_domains_dict = json.loads(category_domains_json)

In [4]:
df = pd.read_csv("data/100k_nyc_all_reqs.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
delegation_df = pd.read_csv('data/100k_nyc_delegation_df.csv')

In [37]:
# CONSTANTS:

DEFAULT_SAVE_HEADERS = [
    "accept_ch",
]


RESPONSE_HEADERS = [
    "permissions_policy",
    "feature_policy",
    "critical_ch",
    "vary",
]

CLIENT_HINT_REQUEST_HEADERS = [
   "save_data",
    "sec_ch_dpr",
    "sec_ch_width",
    "sec_ch_viewport_width",
    "sec_ch_viewport_height",
    "sec_ch_device_memory",
    "rtt",
    "downlink",
    "ect",
    "sec_ch_prefers_color_scheme",
    "sec_ch_prefers_reduced_motion",
    "sec_ch_ua",
    "sec_ch_ua_arch",
    "sec_ch_ua_bitness",
    "sec_ch_ua_full_version",
    "sec_ch_ua_full_version_list",
    "sec_ch_ua_mobile",
    "sec_ch_ua_model",
    "sec_ch_ua_platform",
    "sec_ch_ua_platform_version",
    "sec_ch_ua_wow64",
]

CRITICAL_CLIENT_HINT_REQUEST_HEADERS = [
   "save_data",
    "sec_ch_ua",
    "sec_ch_ua_arch",
    "sec_ch_ua_bitness",
    "sec_ch_ua_full_version",
    "sec_ch_ua_full_version_list",
    "sec_ch_ua_mobile",
    "sec_ch_ua_model",
    "sec_ch_ua_platform",
    "sec_ch_ua_platform_version",
    "sec_ch_ua_wow64"
]

# Crawl Statistics:

In [38]:
TOTAL_NUM_SUCC_SITES = 89763

# A specific (non-default) UA client hint (CH) is sent

In [39]:
def check_header_prominence_all(df, header_values, total_page_num):
    for header_value in header_values:
        res = df.loc[ (df[header_value] != '') & (~df[header_value].isnull())].drop_duplicates('hostname')
        print(header_value, len(res), round(len(res)/total_page_num,5))

In [40]:
for header in [DEFAULT_SAVE_HEADERS, RESPONSE_HEADERS, CRITICAL_CLIENT_HINT_REQUEST_HEADERS]:
    check_header_prominence_all(df,header,TOTAL_NUM_SUCC_SITES)

accept_ch 33346 0.37149
permissions_policy 31236 0.34798
feature_policy 1586 0.01767
critical_ch 587 0.00654
vary 83722 0.9327
save_data 226 0.00252
sec_ch_ua 89141 0.99307
sec_ch_ua_arch 667 0.00743
sec_ch_ua_bitness 492 0.00548
sec_ch_ua_full_version 582 0.00648
sec_ch_ua_full_version_list 696 0.00775
sec_ch_ua_mobile 89142 0.99308
sec_ch_ua_model 887 0.00988
sec_ch_ua_platform 89141 0.99307
sec_ch_ua_platform_version 886 0.00987
sec_ch_ua_wow64 401 0.00447


In [41]:
def check_header_prominence_third_party(df, header_values):
    ch_values = dict()
    for header_value in header_values:
        res = len(df.loc[ (df['is_third_party']) &(df[header_value] != '') & (~df[header_value].isnull())].drop_duplicates(['hostname']))
        ch_values[header_value] = res
    return ch_values

In [42]:
headers = DEFAULT_SAVE_HEADERS + RESPONSE_HEADERS + CRITICAL_CLIENT_HINT_REQUEST_HEADERS
ch_values_third_party = check_header_prominence_third_party(df, headers)

In [43]:
sorted(ch_values_third_party.items(), key=lambda x:x[1], reverse=True)

[('sec_ch_ua', 78476),
 ('sec_ch_ua_mobile', 78476),
 ('sec_ch_ua_platform', 78476),
 ('vary', 75602),
 ('accept_ch', 32489),
 ('permissions_policy', 29495),
 ('feature_policy', 612),
 ('critical_ch', 519),
 ('sec_ch_ua_platform_version', 331),
 ('sec_ch_ua_model', 329),
 ('sec_ch_ua_full_version_list', 261),
 ('sec_ch_ua_arch', 257),
 ('save_data', 225),
 ('sec_ch_ua_bitness', 217),
 ('sec_ch_ua_full_version', 217),
 ('sec_ch_ua_wow64', 210)]

In [44]:
h_ent_headers = ["sec_ch_ua_arch",
    "sec_ch_ua_bitness",
    "sec_ch_ua_full_version",
    "sec_ch_ua_full_version_list",
    "sec_ch_ua_model",
    "sec_ch_ua_platform_version",
    "sec_ch_ua_wow64"]
third_domains = set()
third_p_sites = set()
for header_value in h_ent_headers:
    domains = df[(df['is_third_party']) &(df[header_value] != '') & (~df[header_value].isnull())].request_url_domain.unique()
    for domain in domains:
        third_domains.add(domain)
    sites = df[(df['is_third_party']) &(df[header_value] != '') & (~df[header_value].isnull())].hostname.unique()
    for site in sites:
        third_p_sites.add(site)

In [45]:
len(third_p_sites), third_p_sites

(331,
 {'9now.nine.com.au',
  'accounts.google.com',
  'admanager.google.com',
  'admin.google.com',
  'adssettings.google.com',
  'ai2.appinventor.mit.edu',
  'aldine.schoology.com',
  'amp.fitforfun.de',
  'amp.focus.de',
  'amphi.schoology.com',
  'andrews.schoology.com',
  'annaisd.schoology.com',
  'aopcatholicschools.schoology.com',
  'appo.schoology.com',
  'apps.admob.com',
  'augprep.schoology.com',
  'autot.tori.fi',
  'bard.google.com',
  'baseball.yahoo.co.jp',
  'bathwildcats.schoology.com',
  'bboed.schoology.com',
  'beauty.yahoo.co.jp',
  'blountboe.schoology.com',
  'bryanisd.schoology.com',
  'burlesonisd.schoology.com',
  'card.yahoo.co.jp',
  'carview.yahoo.co.jp',
  'centennial.schoology.com',
  'chat.google.com',
  'chiebukuro.yahoo.co.jp',
  'chisd.schoology.com',
  'clarence.schoology.com',
  'classical.schoology.com',
  'cloudsearch.google.com',
  'comicsflix.com',
  'connect.cargotec.com',
  'consent.youtube.com',
  'console.cloud.google.com',
  'console.fireb

In [46]:
len(third_domains),third_domains

(378,
 {'10kysymysta.fi',
  '1rx.io',
  '2cnt.net',
  '2mdn.net',
  '33across.com',
  '360yield-basic.com',
  '360yield.com',
  '3lift.com',
  '4dex.io',
  '9cdn.net',
  'a-mpd.com',
  'a2d.io',
  'acuityplatform.com',
  'ad-delivery.net',
  'ad-stir.com',
  'adentifi.com',
  'adform.net',
  'adgrx.com',
  'adhese.com',
  'adingo.jp',
  'adkernel.com',
  'admanmedia.com',
  'adnami.io',
  'adnxs-simple.com',
  'adnxs.com',
  'adobedtm.com',
  'ads-twitter.com',
  'adsrvr.org',
  'adtdp.com',
  'advertising.com',
  'aftonbladet-cdn.se',
  'agrvt.com',
  'aimtell.com',
  'aimtell.io',
  'ajax.googleapis.com',
  'akamaized.net',
  'akstat.io',
  'amazon-adsystem.com',
  'amazon.dev',
  'amplitude.com',
  'ampproject.org',
  'app-us1.com',
  'app.link',
  'appier.net',
  'aptoma.no',
  'atp.fox',
  'auth0.com',
  'bendapp.co',
  'betweendigital.com',
  'bf-ad.net',
  'bf-tools.net',
  'bfops.io',
  'bidr.io',
  'bidswitch.net',
  'bidtheatre.com',
  'bing.com',
  'bizographics.com',
  'bra

In [47]:
df[df.is_third_party].request_url_domain.nunique()

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [None]:
def check_header_prominence_tracker(df, header_values):
    ch_values = dict()
    for header_value in header_values:
        res = len(df.loc[ (df['is_third_party']) &(df['is_tracker']==1) &(df[header_value] != '') & (~df[header_value].isnull())].drop_duplicates(['hostname']))
        ch_values[header_value] = res
    return ch_values

In [None]:
ch_values_tracker = check_header_prominence_tracker(df, headers)

In [None]:
sorted(ch_values_tracker.items(), key=lambda x:x[1], reverse=True)

# A CH is sent to a third-party domain

In [None]:
def check_sites_send_ch(df, header_values):
    sites = set()
    for header_value in header_values:
        res = list(df.loc[ (df['is_third_party']) &(df[header_value] != '') & (~df[header_value].isnull())].drop_duplicates('hostname').hostname)
        for domain in res:
            sites.add(domain)
    return sites

In [None]:
sites_send_ch_to_third_parties = check_sites_send_ch(df, CRITICAL_CLIENT_HINT_REQUEST_HEADERS)

# Number of sites where a CH is sent to a third party:

In [None]:
len(sites_send_ch_to_third_parties)

# Sites where a CH is sent to a third party:

In [None]:
sites_send_ch_to_third_parties

# Top domains that receive CH headers (for each type of CH)

In [None]:
def check_third_party_domains(df, header_values):
    third_party_domains = defaultdict(int)
    categories_num_dict = defaultdict(int)
    for header_value in header_values:
        res = df.loc[ (df['is_third_party']) &(df[header_value] != '') & (~df[header_value].isnull())].drop_duplicates(['request_url_domain','hostname']).request_url_domain.value_counts()
        print(header_value, len(res), round(len(res)/TOTAL_NUM_SUCC_SITES,5))
        for key, value in res.items():
            for category, domains in category_domains_dict.items():
                if key in  domains:
                    categories_num_dict[category]+=1
            third_party_domains[key]+=value
    return third_party_domains, categories_num_dict

In [None]:
categories_num_dict = check_third_party_domains(df,headers)

In [None]:
h_ent_headers = ["sec_ch_ua_arch",
    "sec_ch_ua_bitness",
    "sec_ch_ua_full_version",
    "sec_ch_ua_full_version_list",
    "sec_ch_ua_model",
    "sec_ch_ua_platform_version",
    "sec_ch_ua_wow64"]
categories_num_dict = defaultdict(set)
for header_value in h_ent_headers:
    distinct_reqs = df.loc[ (df['is_third_party']) &(df[header_value] != '') & (~df[header_value].isnull())].drop_duplicates(['request_url_domain','hostname'])
    for index, row in distinct_reqs.iterrows():
        script_domain = row['request_url_domain']
        site_domain = row['site_domain']
        for category, domains in category_domains_dict.items():
            if script_domain in  domains:
                categories_num_dict[category].add(site_domain)

In [None]:
for category, domains in category_domains_dict.items():
    print(category)

In [None]:
sorted_list = sorted(categories_num_dict.items(), key=lambda x:len(x[1]),reverse=True)

In [None]:
for i in sorted_list:
    print(i[0],len(i[1]))

# CHs are delegated by a Permissions Policy header or HTML tags

# CHs are delegated by delegation with equiv attr:

In [None]:
delegation_df[delegation_df.delegateCHOnEquiv!=''][['site_domain','delegateCHOnEquiv']]

# CHs are delegated by delegation with name attr:

In [None]:
delegation_df[delegation_df.delegateCHOnName!=''][['site_domain','delegateCHOnName']]

# CHs are accepted with euqiv attr:

In [None]:
delegation_df[delegation_df.acceptCHOnEquiv!=''][['site_domain','acceptCHOnEquiv']]

In [None]:
delegation_df[delegation_df.acceptCHOnEquiv!=''][['site_domain','acceptCHOnEquiv']]

In [None]:
delegation_df[delegation_df.acceptCHOnEquiv!=''][['site_domain','acceptCHOnEquiv']].to_pickle('../pickles/accept_ch_html.pkl')

# CHs are accepted with name attr:

In [None]:
delegation_df[delegation_df.acceptCHOnName!=''][['site_domain','acceptCHOnName']]

# CHs are delegated by an iframe:

In [None]:
delegation_df[delegation_df.frameClientHints!=''][['site_domain','frameClientHints', 'frameSrcs']]