In [1]:
import pandas as pd
import janitor

from utilities import pandas_to_tex, load_visit_data
import warnings

from constants import filepaths

warnings.filterwarnings("ignore")
# pd.set_option("display.max_columns", 150)

Checking that all paths exist:
{'web_mobile': True, 'web_desktop': True, 'web': True, 'yg_profile': True, 'blacklight': True, 'who': True}


In [2]:
# Load the visit-level data via the project helper; the frame is reused by
# the aggregation cells further down.
df_visits = load_visit_data()
# Report the row count with thousands separators (self-documenting f-string).
print(f"{len(df_visits)=:,}")
# Preview the first rows to confirm the expected columns are present.
df_visits.head()

len(df_visits)=6,297,382
len(df_visits)=6,297,382


Unnamed: 0,caseid,private_domain,category,visit_time_local,visit_duration,page_views,source
0,205323077,google.com,Search Engines and Portals,2022-05-31 23:52:37,2,1,mobile_web
1,205323077,coupons.com,"Business, Shopping",2022-06-01 01:07:35,457,10,mobile_web
2,205323077,google.com,Business,2022-06-01 01:15:12,55,1,mobile_web
3,205323077,coupons.com,"Business, Shopping",2022-06-01 01:16:07,2225,4,mobile_web
4,205323077,google.com,Search Engines and Portals,2022-06-01 04:38:10,10,1,mobile_web


In [3]:
# Map every domain to exactly one category: the category under which the
# domain received the largest number of visits.
df_domain_cat = (
    df_visits.groupby(["private_domain", "category"])
    .size()
    .rename("count")
    .reset_index()
    # Within each domain, bring the most-visited category to the top ...
    .sort_values(by=["private_domain", "count"], ascending=[True, False])
    # ... then keep only that top row per domain.
    .drop_duplicates(subset="private_domain")
    .drop(columns="count")
)
df_domain_cat

Unnamed: 0,private_domain,category
0,007james.com,"Business, Entertainment"
1,0123movie.net,"Entertainment, Illegal Content"
2,08liter.com,Business
3,09myuser.com,Business
4,0redird.com,Parked
...,...,...
30038,zyn.com,"Alcohol and Tobacco, Education"
30041,zynga.com,Games
30042,zyngaplayerforums.com,Games
30043,zype.com,"Adult, Information Technology"


In [4]:
# Traffic per domain = number of distinct `caseid` values that visited it,
# ordered from the most- to the least-visited domain.
df_domain_traffic = (
    df_visits.groupby("private_domain", as_index=False)
    .agg(traffic=("caseid", "nunique"))
    .sort_values(by="traffic", ascending=False, ignore_index=True)
)

# The same numbers as a plain {domain: traffic} lookup table.
dict_domain_traffic = (
    df_domain_traffic.set_index("private_domain").loc[:, "traffic"].to_dict()
)

df_domain_traffic

Unnamed: 0,private_domain,traffic
0,google.com,1080
1,facebook.com,886
2,amazon.com,827
3,youtube.com,766
4,yahoo.com,570
...,...,...
64069,1007bobfm.com,1
64070,1-800-shaved-ice.com,1
64071,zooskoolvideos.com,1
64072,zyratalk.com,1


In [5]:
# Read the Blacklight results and key them by domain. The file stores each
# domain in a `filename` column with "_" in place of "."; every remaining
# column gets a "bl_" prefix.
df_blacklight = (
    pd.read_csv(filepaths["blacklight"])
    .rename(columns={"filename": "private_domain"})
    # Turn the file-name form back into a domain (underscores -> dots).
    .assign(
        private_domain=lambda raw: raw["private_domain"].str.replace(
            "_", ".", regex=False
        )
    )
    # Park the key in the index so that only the measure columns are prefixed.
    .set_index("private_domain")
    .add_prefix("bl_")
    .reset_index()
)

df_blacklight

Unnamed: 0,private_domain,bl_ddg_join_ads,bl_third_party_cookies,bl_canvas_fingerprinting,bl_session_recording,bl_key_logging,bl_fb_pixel,bl_google_analytics
0,costarmanager.com,5,10,0,1,0,0,0
1,teasource.com,11,11,0,0,0,1,1
2,1800tequila.com,8,6,0,0,0,0,0
3,mazon.com,1,0,0,0,0,0,0
4,theancestorhunt.com,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
34073,methowtrails.org,1,0,0,0,0,0,0
34074,mistystoyaussies.com,4,6,0,0,0,0,0
34075,hellostarz.com,0,0,0,0,0,0,0
34076,kbdfans.com,5,7,0,0,0,0,1


In [6]:
# Combine per-domain audience counts with the Blacklight tracker measures.
tracker_cols = [
    "bl_ddg_join_ads",
    "bl_third_party_cookies",
    "bl_fb_pixel",
    "bl_google_analytics",
    "bl_session_recording",
    "bl_key_logging",
    "bl_canvas_fingerprinting",
]

df = (
    # One row per (caseid, domain) pair, however many visits were made ...
    df_visits[["caseid", "private_domain"]]
    .drop_duplicates()
    # ... so the group size is the number of distinct caseids per domain.
    .groupby("private_domain")
    .size()
    .rename("visits")
    .reset_index()
    .sort_values(by="visits", ascending=False, ignore_index=True)
    # Attach tracker measures; domains missing from Blacklight stay NaN.
    .merge(df_blacklight, how="left", on="private_domain", validate="m:1")
    # Weight every tracker measure by the domain's audience size.
    .pipe(
        lambda merged: merged.assign(
            **merged[tracker_cols].mul(merged["visits"], axis="index")
        )
    )
    # Key columns first, then the trackers, then anything left over.
    .pipe(
        lambda weighted: weighted[
            pd.Index(["private_domain", "visits", *tracker_cols])
            .append(weighted.columns)
            .drop_duplicates()
        ]
    )
)
df.head(30)

Unnamed: 0,private_domain,visits,bl_ddg_join_ads,bl_third_party_cookies,bl_fb_pixel,bl_google_analytics,bl_session_recording,bl_key_logging,bl_canvas_fingerprinting
0,google.com,1080,3240.0,4320.0,0.0,0.0,0.0,0.0,0.0
1,facebook.com,886,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,amazon.com,827,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,youtube.com,766,1532.0,766.0,0.0,0.0,0.0,0.0,0.0
4,yahoo.com,570,17670.0,15960.0,0.0,0.0,0.0,570.0,0.0
5,yougov.com,544,,,,,,,
6,paypal.com,541,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,walmart.com,518,518.0,0.0,0.0,0.0,0.0,0.0,0.0
8,bing.com,461,0.0,2305.0,0.0,0.0,0.0,0.0,0.0
9,wikipedia.org,456,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Reshape to one row per (domain, tracker type) and keep only strictly
# positive weighted exposures; zero and missing values are dropped.
df_long = (
    pd.melt(
        df,
        id_vars="private_domain",
        value_vars=tracker_cols,
        var_name="tracker_type",
        value_name="weighted_exposure",
    )
    .loc[lambda melted: melted["weighted_exposure"] > 0]
    .reset_index(drop=True)
)
df_long

Unnamed: 0,private_domain,tracker_type,weighted_exposure
0,google.com,bl_ddg_join_ads,3240.0
1,youtube.com,bl_ddg_join_ads,1532.0
2,yahoo.com,bl_ddg_join_ads,17670.0
3,walmart.com,bl_ddg_join_ads,518.0
4,twitter.com,bl_ddg_join_ads,440.0
...,...,...,...
57857,zipzipzipper.com,bl_canvas_fingerprinting,1.0
57858,zip-hudforeclosures.com,bl_canvas_fingerprinting,1.0
57859,zonehaven.com,bl_canvas_fingerprinting,1.0
57860,zombiesrungame.com,bl_canvas_fingerprinting,1.0


In [8]:
# Total weighted exposure for every (tracker type, domain) pair.
df_domain_exposure = (
    df_long.groupby(["tracker_type", "private_domain"])["weighted_exposure"]
    .sum()
    .rename("tt_exposure")
    .reset_index()
)
df_domain_exposure

Unnamed: 0,tracker_type,private_domain,tt_exposure
0,bl_canvas_fingerprinting,1031thewave.com,1.0
1,bl_canvas_fingerprinting,1040now.net,1.0
2,bl_canvas_fingerprinting,10tv.com,2.0
3,bl_canvas_fingerprinting,11alive.com,1.0
4,bl_canvas_fingerprinting,12newsnow.com,1.0
...,...,...,...
57857,bl_third_party_cookies,zwift.com,65.0
57858,bl_third_party_cookies,zyratalk.com,6.0
57859,bl_third_party_cookies,zyvr.com,9.0
57860,bl_third_party_cookies,zzounds.com,50.0


In [9]:
# Number of domains to keep per tracker type.
_top_n = 50
# Rank domains inside each tracker type by total exposure (largest first),
# then keep the first `_top_n` rows of every tracker type.
df_top_domains = df_domain_exposure.sort_values(
    by=["tracker_type", "tt_exposure"], ascending=[True, False], ignore_index=True
).loc[lambda ranked: ranked.groupby("tracker_type").cumcount() < _top_n]
df_top_domains

Unnamed: 0,tracker_type,private_domain,tt_exposure
0,bl_canvas_fingerprinting,microsoft.com,439.0
1,bl_canvas_fingerprinting,linkedin.com,362.0
2,bl_canvas_fingerprinting,live.com,288.0
3,bl_canvas_fingerprinting,capitalone.com,242.0
4,bl_canvas_fingerprinting,homedepot.com,217.0
...,...,...,...
39775,bl_third_party_cookies,timeanddate.com,3096.0
39776,bl_third_party_cookies,medallia.com,3087.0
39777,bl_third_party_cookies,homedepot.com,3038.0
39778,bl_third_party_cookies,civicscience.com,2997.0


In [10]:
# Build the ranking table: one column per tracker type, one row per rank,
# each cell formatted as "domain (traffic)".

# Attach traffic and build the labels ONCE for all tracker types, instead of
# repeating the same merge inside the loop. `validate="m:1"` guards against
# duplicate domains in `df_domain_traffic` silently multiplying rows.
_annotated_top = df_top_domains.merge(
    df_domain_traffic, how="left", on="private_domain", validate="m:1"
).assign(
    # Go through nullable Int64 so that counts always print as "570", never
    # "570.0", even if an unmatched domain turns the column into floats
    # (an unmatched count then prints as "<NA>").
    annotate=lambda df_: df_["private_domain"]
    + " ("
    + df_["traffic"].astype("Int64").astype(str)
    + ")"
)

# Labels per tracker type; the left merge keeps the row order of
# `df_top_domains`, so each list preserves that order.
_labels_by_tracker = _annotated_top.groupby("tracker_type", sort=False)[
    "annotate"
].agg(list)

summary = {}

for tracker in tracker_cols:
    # A tracker type with no rows in `df_top_domains` yields an empty column.
    tracker_data = _labels_by_tracker.get(tracker, [])
    summary[tracker] = tracker_data

df_summary = (
    # orient="index" + transpose tolerates lists of unequal length
    # (shorter columns are padded with missing values).
    pd.DataFrame.from_dict(summary, orient="index")
    .transpose()
    # 1-based rank column, moved to the front.
    .assign(ix=lambda df_: range(1, 1 + len(df_)))
    .reorder_columns(["ix"])
)
df_summary

Unnamed: 0,ix,bl_ddg_join_ads,bl_third_party_cookies,bl_fb_pixel,bl_google_analytics,bl_session_recording,bl_key_logging,bl_canvas_fingerprinting
0,1,yahoo.com (570),weather.com (194),ebay.com (379),force.com (144),capitalone.com (242),yahoo.com (570),microsoft.com (439)
1,2,imdb.com (312),microsoft.com (439),usps.com (309),kohls.com (124),homedepot.com (217),weather.com (194),linkedin.com (362)
2,3,weather.com (194),yahoo.com (570),homedepot.com (217),evergage.com (61),attn.tv (157),yelp.com (160),live.com (288)
3,4,cnn.com (207),imdb.com (312),ups.com (196),narvar.com (61),medallia.com (147),attn.tv (157),capitalone.com (242)
4,5,microsoft.com (439),cnn.com (207),chase.com (191),gerberlife.com (45),cmix.com (142),capitaloneshopping.com (154),homedepot.com (217)
5,6,kohls.com (124),live.com (288),rakuten.com (185),coupons.com (43),att.com (132),dropbox.com (99),tiktok.com (216)
6,7,live.com (288),kohls.com (124),netflix.com (177),factor75.com (43),kohls.com (124),qvc.com (96),target.com (214)
7,8,usps.com (309),cbsnews.com (86),attn.tv (157),adp.com (40),emi-rs.com (121),activemeasure.com (71),rakuten.com (185)
8,9,nytimes.com (202),xfinity.com (120),capitaloneshopping.com (154),everyplate.com (38),xfinity.com (120),kroger.com (69),capitaloneshopping.com (154)
9,10,forbes.com (87),huffpost.com (82),hulu.com (146),priceline.com (37),discover.com (111),mapquest.com (67),washingtonpost.com (147)


In [11]:
# Export the ranking table as a LaTeX fragment for the paper.
# NOTE(review): `escape=False` presumably makes the helper write cell text
# without LaTeX escaping — confirm in utilities.pandas_to_tex.
pandas_to_tex(
    df_summary,
    escape=False,
    texfile="../tables/bl_top_contributors_domain.tex",
)

In [12]:
# Display the exported LaTeX file to eyeball the result (IPython shell
# escape; relies on a `cat` executable being available).
!cat ../tables/bl_top_contributors_domain.tex

1 & yahoo.com (570) & weather.com (194) & ebay.com (379) & force.com (144) & capitalone.com (242) & yahoo.com (570) & microsoft.com (439) \\
2 & imdb.com (312) & microsoft.com (439) & usps.com (309) & kohls.com (124) & homedepot.com (217) & weather.com (194) & linkedin.com (362) \\
3 & weather.com (194) & yahoo.com (570) & homedepot.com (217) & evergage.com (61) & attn.tv (157) & yelp.com (160) & live.com (288) \\
4 & cnn.com (207) & imdb.com (312) & ups.com (196) & narvar.com (61) & medallia.com (147) & attn.tv (157) & capitalone.com (242) \\
5 & microsoft.com (439) & cnn.com (207) & chase.com (191) & gerberlife.com (45) & cmix.com (142) & capitaloneshopping.com (154) & homedepot.com (217) \\
6 & kohls.com (124) & live.com (288) & rakuten.com (185) & coupons.com (43) & att.com (132) & dropbox.com (99) & tiktok.com (216) \\
7 & live.com (288) & kohls.com (124) & netflix.com (177) & factor75.com (43) & kohls.com (124) & qvc.com (96) & target.com (214) \\
8 & usps.com (309) & cbsn