In [43]:
import glob
from os.path import basename
import pandas as pd
import json
from collections import defaultdict

In [44]:
high_entropy_values = set(['full_version', 'platform_version'])

In [45]:
# Opening JSON file
f = open('data/tracker_category.json')
tracker_category_json = json.load(f)
tracker_category_dict = json.loads(tracker_category_json)

In [46]:
# Opening JSON file
f = open('data/category_domains.json')
category_domains_json = json.load(f)
category_domains_dict = json.loads(category_domains_json)

In [47]:
# Opening JSON file
f = open('data/tracker_owner.json')
tracker_owner_json = json.load(f)
tracker_owner_dict = json.loads(tracker_owner_json)

In [11]:
api_calls = pd.read_csv("data/100k_nyc_api_calls.csv")

In [12]:
df = pd.read_csv("data/100k_nyc_leaky_reqs_with_hashes.csv")

In [13]:
df = df[df.search_type.isin(high_entropy_values)]

In [14]:
df.encoding.value_counts()

unencoded             470993
urlencode             352486
urlencode-base64       51426
base64                  3068
lzstring                   1
urlencode-lzstring         1
Name: encoding, dtype: int64

In [15]:
df[df.search_type.isin(['full_version','platform_version'])].drop_duplicates(['initial_url','hostname']).encoding.value_counts()

urlencode           42980
unencoded            3325
urlencode-base64     1992
base64                 58
Name: encoding, dtype: int64

In [16]:
third_party_leaks = df[df.third_party_req]

In [50]:
tracker_leaks = df[(df.third_party_req)&(df.is_tracker==1)]

In [51]:
tracker_leaks.head()

Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_url_domain,third_party_req,req_has_third_party_initiator,...,request,request_type,category,rank_of_site,req_initiators,is_same_party,req_domain_entity,req_domain_category,is_tracker,hostname
0,113.0.5672.63,full_version,urlencode,url_leaks,https://selcuksportshd78.com/,selcuksportshd78.com,https://selcuksportshd78.com/,selcuksportshd78.com,True,True,...,{'url': 'https://www.google-analytics.com/g/co...,ping,,50000,https://www.googletagmanager.com/gtag/js?id=G-...,False,Google,Advertising Analytics Audience Measurement Thi...,1,selcuksportshd78.com
1,5.15.0,platform_version,urlencode,url_leaks,https://selcuksportshd78.com/,selcuksportshd78.com,https://selcuksportshd78.com/,selcuksportshd78.com,True,True,...,{'url': 'https://www.google-analytics.com/g/co...,ping,,50000,https://www.googletagmanager.com/gtag/js?id=G-...,False,Google,Advertising Analytics Audience Measurement Thi...,1,selcuksportshd78.com
2,5.15.0,platform_version,unencoded,url_leaks,https://selcuksportshd78.com/,selcuksportshd78.com,https://selcuksportshd78.com/,selcuksportshd78.com,True,True,...,{'url': 'https://www.google-analytics.com/g/co...,ping,,50000,https://www.googletagmanager.com/gtag/js?id=G-...,False,Google,Advertising Analytics Audience Measurement Thi...,1,selcuksportshd78.com
4,113.0.5672.63,full_version,unencoded,url_leaks,https://selcuksportshd78.com/,selcuksportshd78.com,https://selcuksportshd78.com/,selcuksportshd78.com,True,True,...,{'url': 'https://www.google-analytics.com/g/co...,ping,,50000,https://www.googletagmanager.com/gtag/js?id=G-...,False,Google,Advertising Analytics Audience Measurement Thi...,1,selcuksportshd78.com
5,113.0.5672.63,full_version,urlencode,url_leaks,https://selcuksportshd78.com/,selcuksportshd78.com,https://selcuksportshd78.com/,selcuksportshd78.com,True,True,...,{'url': 'https://www.google-analytics.com/g/co...,ping,,50000,https://www.googletagmanager.com/gtag/js?id=G-...,False,Google,Advertising Analytics Audience Measurement Thi...,1,selcuksportshd78.com


# Crawl Statistics:

In [52]:
TOTAL_NUM_SUCC_SITES = 89763

## Leaks to first parties

In [53]:
df.hostname.nunique(), df.hostname.nunique()/TOTAL_NUM_SUCC_SITES

(48355, 0.5386963448191349)

# Percentage of sites where we detected a leak to third-party domains

In [54]:
third_party_leaks.hostname.nunique(),third_party_leaks.hostname.nunique()/TOTAL_NUM_SUCC_SITES

(47691, 0.5312990875973397)

# Sample leaks where we detected UA-leaks to third parties (sorted by rank)

In [55]:
third_party_leaks.sort_values(by='rank_of_site', ascending=True).drop_duplicates('hostname').head()

Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_url_domain,third_party_req,req_has_third_party_initiator,...,request,request_type,category,rank_of_site,req_initiators,is_same_party,req_domain_entity,req_domain_category,is_tracker,hostname
694633,113.0.5672.63,full_version,urlencode,url_leaks,https://yadongkorea.org/,yadongkorea.org,https://yadongkorea.org/,yadongkorea.org,True,True,...,{'url': 'https://www.google-analytics.com/g/co...,ping,,1000,https://www.googletagmanager.com/gtag/js?id=G-...,False,Google,Advertising Analytics Audience Measurement Thi...,1,yadongkorea.org
1337322,113.0.5672.63,full_version,urlencode,response_location_leaks,https://www.chinatimes.com/?chdtv,chinatimes.com,https://www.chinatimes.com/,chinatimes.com,True,True,...,{'url': 'https://feed.pghub.io/tag?gdpr=0&gdpr...,document,General News,1000,https://pghub.io/js/pandg-sdk.js,False,,,1,www.chinatimes.com
194182,113.0.5672.63,full_version,unencoded,post_leaks,https://www.google.com.mx/,google.com.mx,https://www.google.com.mx/,google.com.mx,True,False,...,{'url': 'https://play.google.com/log?format=js...,xhr,Search Engines,1000,https://www.gstatic.com/og/_/js/k=og.qtm.en_US...,True,Google,Ad Motivated Tracking Advertising Online Payme...,1,www.google.com.mx
595636,113.0.5672.63,full_version,unencoded,url_leaks,https://girlschannel.net/,girlschannel.net,https://girlschannel.net/,girlschannel.net,True,True,...,{'url': 'https://www.google-analytics.com/g/co...,ping,Forum/Bulletin Boards,1000,https://www.googletagmanager.com/gtag/js?id=G-...,False,Google,Advertising Analytics Audience Measurement Thi...,1,girlschannel.net
276377,113.0.5672.63,full_version,urlencode,url_leaks,https://news.detik.com/,detik.com,https://news.detik.com/,detik.com,True,True,...,{'url': 'https://analytics.google.com/g/collec...,ping,Portal Sites,1000,https://www.googletagmanager.com/gtag/js?id=G-...,False,Google,Ad Motivated Tracking Advertising Online Payme...,1,news.detik.com


# Most popular third-party domains

In [56]:
popular_third_party_domains = third_party_leaks.drop_duplicates(['hostname', 'request_url']).request_url_domain.value_counts()

In [57]:
popular_third_party_domains.head()

doubleclick.net          76953
google-analytics.com     55639
google.com               45943
googlesyndication.com    36431
crwdcntrl.net             5197
Name: request_url_domain, dtype: int64

# Third-Party Domain Categories

In [58]:
for i, v in popular_third_party_domains.head().items():
    if i in tracker_category_dict:
        print('Domain: ', i,'# Distinct Req:', v,'# Distinct Sites',len(third_party_leaks[third_party_leaks.request_url_domain==i].drop_duplicates('initial_url')),'Category: ', tracker_category_dict[i][0:30], 'Owner: ', tracker_owner_dict[i][0:20],'\n')
    else:
        print('Domain', i,'# Distinct Req:', v,'# Distinct Sites',len(third_party_leaks[third_party_leaks.request_url_domain==i].drop_duplicates('initial_url')),'\n')

Domain:  doubleclick.net # Distinct Req: 76953 # Distinct Sites 14999 Category:  Ad Motivated Tracking Advertis Owner:  Google 

Domain:  google-analytics.com # Distinct Req: 55639 # Distinct Sites 27225 Category:  Advertising Analytics Audience Owner:  Google 

Domain:  google.com # Distinct Req: 45943 # Distinct Sites 17058 Category:  Ad Motivated Tracking Advertis Owner:  Google 

Domain:  googlesyndication.com # Distinct Req: 36431 # Distinct Sites 12271 Category:  Ad Motivated Tracking Advertis Owner:  Google 

Domain:  crwdcntrl.net # Distinct Req: 5197 # Distinct Sites 4819 Category:  Ad Motivated Tracking Advertis Owner:  Lotame Solutions 



# Percentage of sites where we detected a leak to tracker domains

In [59]:
tracker_leaks.hostname.nunique(),tracker_leaks.hostname.nunique()/TOTAL_NUM_SUCC_SITES

(47285, 0.5267760658623264)

# Sites where we detected UA-leak to tracker domains (sorted by rank)

In [60]:
tracker_leaks.sort_values(by='rank_of_site', ascending=True).drop_duplicates('initial_url_domain').initial_url_domain.head(10)

39703         tabelog.com
1642479    serviporno.com
1380609       pornhub.com
901171        discogs.com
679713            sex.com
1645024            999.md
586024              o2.pl
1637606       baji999.com
1386520     tmohentai.com
1636046          tenki.jp
Name: initial_url_domain, dtype: object

# Most popular tracker domains

In [61]:
popular_tracker_domains = tracker_leaks.drop_duplicates(['hostname']).request_url_domain.value_counts()

In [62]:
popular_tracker_domains.head()

google-analytics.com     22517
google.com                9325
doubleclick.net           8853
googlesyndication.com     2017
crwdcntrl.net              985
Name: request_url_domain, dtype: int64

# Tracker Categories

In [63]:
for i, v in popular_tracker_domains.head().items():
    if i in tracker_category_dict:
        print('Domain: ', i,'# Distinct Req:', v,'# Distinct Sites',len(tracker_leaks[tracker_leaks.request_url_domain==i].drop_duplicates('initial_url')),'Category: ', tracker_category_dict[i][0:30], 'Owner: ', tracker_owner_dict[i][0:30],'\n')
    else:
        print('Domain', i,'# Distinct Req:', v,'# Distinct Sites',len(tracker_leaks[tracker_leaks.request_url_domain==i].drop_duplicates('initial_url')),'\n')

Domain:  google-analytics.com # Distinct Req: 22517 # Distinct Sites 27225 Category:  Advertising Analytics Audience Owner:  Google 

Domain:  google.com # Distinct Req: 9325 # Distinct Sites 17058 Category:  Ad Motivated Tracking Advertis Owner:  Google 

Domain:  doubleclick.net # Distinct Req: 8853 # Distinct Sites 14999 Category:  Ad Motivated Tracking Advertis Owner:  Google 

Domain:  googlesyndication.com # Distinct Req: 2017 # Distinct Sites 12271 Category:  Ad Motivated Tracking Advertis Owner:  Google 

Domain:  crwdcntrl.net # Distinct Req: 985 # Distinct Sites 4819 Category:  Ad Motivated Tracking Advertis Owner:  Lotame Solutions 



# Most popular categories that we detected UA-leak

In [64]:
tracker_leaks.drop_duplicates(['hostname', 'request_url_domain']).category.value_counts().head()

 General News           7671
 Entertainment          3320
 Business               2460
 Education/Reference    2364
 Online Shopping        2164
Name: category, dtype: int64

# Most popular tracker categories that we detected UA-leak

In [65]:
distinct_reqs = third_party_leaks.drop_duplicates(['hostname','request_url_domain'])

In [66]:
categories_num_dict = defaultdict(set)
for index, row in distinct_reqs.iterrows():
    script_domain = row['request_url_domain']
    site_domain = row['initial_url_domain']
    for category, domains in category_domains_dict.items():
        if script_domain in  domains:
            categories_num_dict[category].add(site_domain)

In [67]:
sorted_list = sorted(categories_num_dict.items(), key=lambda x:len(x[1]),reverse=True)

In [68]:
for i in sorted_list:
    print(i[0],len(i[1]))

Advertising 41124
Audience Measurement 27526
Ad Motivated Tracking 27253
Third-Party Analytics Marketing 26220
Analytics 25509
CDN 15512
Online Payment 15084
Social - Share 2471
Embedded Content 2154
Action Pixels 625
Ad Fraud 337
Session Replay 205
Federated Login 138
Social Network 126
Badge 95
Social - Comment 84
SSO 12
Non-Tracking 1


# Which information was leaked most?

In [69]:
tracker_leaks.drop_duplicates(['hostname','search_type']).search_type.value_counts().head()

full_version        47046
platform_version    45404
Name: search_type, dtype: int64

In [70]:
tracker_leaks.sort_values(by='rank_of_site', ascending=True).drop_duplicates('hostname').head()

Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_url_domain,third_party_req,req_has_third_party_initiator,...,request,request_type,category,rank_of_site,req_initiators,is_same_party,req_domain_entity,req_domain_category,is_tracker,hostname
39703,113.0.5672.63,full_version,urlencode-base64,url_leaks,https://tabelog.com/,tabelog.com,https://s.tabelog.com/,tabelog.com,True,True,...,{'url': 'https://securepubads.g.doubleclick.ne...,fetch,"Restaurants, Blogs/Wiki",1000,https://www.googletagservices.com/activeview/j...,False,Google,Ad Motivated Tracking Advertising,1,s.tabelog.com
1642479,5.15.0,platform_version,unencoded,url_leaks,https://www.serviporno.com/,serviporno.com,https://www.serviporno.com/,serviporno.com,True,True,...,{'url': 'https://www.google-analytics.com/g/co...,ping,Pornography,1000,https://www.googletagmanager.com/gtag/js?id=G-...,False,Google,Advertising Analytics Audience Measurement Thi...,1,www.serviporno.com
1380609,113.0.5672.63,full_version,urlencode,url_leaks,https://it.pornhub.com/,pornhub.com,https://it.pornhub.com/,pornhub.com,True,True,...,{'url': 'https://www.google-analytics.com/g/co...,ping,Pornography,1000,https://www.googletagmanager.com/gtag/js?id=G-...,False,Google,Advertising Analytics Audience Measurement Thi...,1,it.pornhub.com
901171,5.15.0,platform_version,unencoded,post_leaks,https://www.discogs.com/,discogs.com,https://www.discogs.com/,discogs.com,True,False,...,"{'url': 'https://id5-sync.com/g/v2/488.json', ...",xhr,Entertainment,1000,https://st.discogs.com/b2de82f03ebb0f5acd8e2ec...,False,ID5,Ad Motivated Tracking Advertising,1,www.discogs.com
679713,113.0.5672.63,full_version,unencoded,url_leaks,https://www.sex.com/,sex.com,https://www.sex.com/,sex.com,True,True,...,{'url': 'https://analytics.cdn.live/matomo.php...,ping,Pornography,1000,https://analytics.cdn.live/matomo.js,False,,,1,www.sex.com


In [71]:
leak_set = set(df.hostname.unique())

In [72]:
api_call_set = set(api_calls[(api_calls.description=='NavigatorUAData.getHighEntropyValues')].hostname.unique())

In [73]:
len(leak_set),len(api_call_set)

(48355, 53148)

In [74]:
len(leak_set.difference(api_call_set))

72

In [75]:
len(api_call_set.difference(leak_set))

4865