In [1]:
import os
import pandas as pd
import janitor

from utilities import pandas_to_tex
import warnings

warnings.filterwarnings("ignore")

# Locations of the input CSVs, relative to the notebook's working directory.
FP_WEB_MOBILE = "../data/yg/realityMine_web_mobile_2022-06-01_2022-06-30.csv"
FP_WEB_DESKTOP = "../data/yg/realityMine_web_desktop_2022-06-01_2022-06-30.csv"
FP_WEB = "../data/yg/realityMine_web_2022-06-01_2022-06-30.csv"
FP_BLACKLIGHT = "../data/blacklight_domain.csv"

# Short label -> path; later cells look files up by label.
filepaths = {
    "web_mobile": FP_WEB_MOBILE,
    "web_desktop": FP_WEB_DESKTOP,
    "web": FP_WEB,
    "blacklight": FP_BLACKLIGHT,
}

# Report, per label, whether the file is present (displayed as the cell output).
print("Checking that all paths exist:")
path_status = {}
for label, location in filepaths.items():
    path_status[label] = os.path.exists(location)
path_status

Checking that all paths exist:


{'web_mobile': True, 'web_desktop': True, 'web': True, 'blacklight': True}

## Prep data

In [2]:
# (fold cell) Load and prep web browsing data
# output = df_visits: the rows of the three browsing exports (mobile web,
# desktop web, web) stacked into one frame with a shared set of columns and a
# `source` column naming the export each row came from. No rows are filtered.
usecols_webmobile_webdesktop = [
    "caseid",
    "category",
    "private_domain",
    "visit_duration",
    "visit_time_local",
    "page_views",
]
usecols_web = [
    "caseid",
    "category",
    "private_domain",
    "page_duration",
    "session_start_time",
]


def _read_visits(path, usecols, source):
    """Read one browsing export and label its rows.

    Parameters
    ----------
    path : str
        CSV file to load.
    usecols : list of str
        Columns to keep from the file.
    source : str
        Value written to the added ``source`` column.

    Returns
    -------
    pandas.DataFrame
        The selected columns plus a constant ``source`` column.
    """
    return pd.read_csv(path, usecols=usecols, low_memory=False).assign(source=source)


df_visits = pd.concat(
    [
        # ===============================================
        # Get web_mobile
        _read_visits(
            filepaths["web_mobile"], usecols_webmobile_webdesktop, "mobile_web"
        ),
        # ===============================================
        # Get web_desktop
        _read_visits(
            filepaths["web_desktop"], usecols_webmobile_webdesktop, "desktop_web"
        ),
        # ===============================================
        # Get web
        (
            _read_visits(filepaths["web"], usecols_web, "web")
            # Renaming columns to be consistent w/ web_mobile & web_desktop
            .rename(
                columns={
                    "session_start_time": "visit_time_local",
                    "page_duration": "visit_duration",
                }
            )
            # This export is read without a page_views column; add a constant 1
            # so the column exists for every source
            .assign(page_views=1)
        ),
    ],
    # Every file is read with its own 0..n-1 index; renumber the stacked rows
    # so that row labels are unique in the combined frame.
    ignore_index=True,
)
print(f"{len(df_visits)=:,}")
df_visits.head()

len(df_visits)=6,297,382


Unnamed: 0,caseid,private_domain,category,visit_time_local,visit_duration,page_views,source
0,205323077,google.com,Search Engines and Portals,2022-05-31 23:52:37,2,1,mobile_web
1,205323077,coupons.com,"Business, Shopping",2022-06-01 01:07:35,457,10,mobile_web
2,205323077,google.com,Business,2022-06-01 01:15:12,55,1,mobile_web
3,205323077,coupons.com,"Business, Shopping",2022-06-01 01:16:07,2225,4,mobile_web
4,205323077,google.com,Search Engines and Portals,2022-06-01 04:38:10,10,1,mobile_web


In [3]:
# One category per domain: the category attached to the most rows of df_visits.
# Step 1: count rows for every (domain, category) pair.
domain_category_counts = (
    df_visits.groupby(["private_domain", "category"]).size().reset_index(name="count")
)

# Step 2: within each domain put the largest count first and keep only that row.
df_domain_cat = (
    domain_category_counts.sort_values(
        by=["private_domain", "count"], ascending=[True, False]
    )
    .drop_duplicates(subset="private_domain", keep="first")
    .drop(columns="count")
)
df_domain_cat

Unnamed: 0,private_domain,category
0,007james.com,"Business, Entertainment"
1,0123movie.net,"Entertainment, Illegal Content"
2,08liter.com,Business
3,09myuser.com,Business
4,0redird.com,Parked
...,...,...
30038,zyn.com,"Alcohol and Tobacco, Education"
30041,zynga.com,Games
30042,zyngaplayerforums.com,Games
30043,zype.com,"Adult, Information Technology"


In [4]:
# Traffic of a domain = number of distinct caseid values observed on it.
distinct_users_per_domain = df_visits.groupby(["private_domain"])["caseid"].nunique()

df_domain_traffic = (
    distinct_users_per_domain.rename("traffic")
    .reset_index()
    .sort_values(by="traffic", ascending=False, ignore_index=True)
)

# Plain lookup table: domain -> traffic
traffic_by_domain = df_domain_traffic.set_index("private_domain")["traffic"]
dict_domain_traffic = traffic_by_domain.to_dict()

df_domain_traffic

Unnamed: 0,private_domain,traffic
0,google.com,1080
1,facebook.com,886
2,amazon.com,827
3,youtube.com,766
4,yahoo.com,570
...,...,...
64069,1007bobfm.com,1
64070,1-800-shaved-ice.com,1
64071,zooskoolvideos.com,1
64072,zyratalk.com,1


In [5]:
# Load the Blacklight measurements, one row per scanned file.
blacklight_raw = pd.read_csv(filepaths["blacklight"])

# `filename` holds the domain with "_" where the dots belong; turn it back into
# a domain name so it can be joined on `private_domain`.
domain_from_filename = blacklight_raw["filename"].str.replace("_", ".", regex=False)

df_blacklight = (
    blacklight_raw.assign(private_domain=domain_from_filename)
    .drop(columns="filename")
    # Park the key in the index so that only the measurement columns get the
    # "bl_" prefix; reset_index then puts the key back as the first column.
    .set_index("private_domain")
    .add_prefix("bl_")
    .reset_index()
)

df_blacklight

Unnamed: 0,private_domain,bl_ddg_join_ads,bl_third_party_cookies,bl_canvas_fingerprinting,bl_session_recording,bl_key_logging,bl_fb_pixel,bl_google_analytics
0,costarmanager.com,5,10,0,1,0,0,0
1,teasource.com,11,11,0,0,0,1,1
2,1800tequila.com,8,6,0,0,0,0,0
3,mazon.com,1,0,0,0,0,0,0
4,theancestorhunt.com,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
34073,methowtrails.org,1,0,0,0,0,0,0
34074,mistystoyaussies.com,4,6,0,0,0,0,0
34075,hellostarz.com,0,0,0,0,0,0,0
34076,kbdfans.com,5,7,0,0,0,0,1


### Combine

In [6]:
# Combine
# Blacklight columns to weight, in the order they should appear in the output.
tracker_cols = [
    "bl_ddg_join_ads",
    "bl_third_party_cookies",
    "bl_fb_pixel",
    "bl_google_analytics",
    "bl_session_recording",
    "bl_key_logging",
    "bl_canvas_fingerprinting",
]

# ================================================
# visits = number of distinct caseid values per domain
# (each caseid/domain pair is counted once)
domain_visits = (
    df_visits[["caseid", "private_domain"]]
    .drop_duplicates()
    .groupby(["private_domain"])
    .size()
    .reset_index(name="visits")
    .sort_values(by="visits", ascending=False, ignore_index=True)
)

# ================================================
# Attach the domain category; domains without a category are dropped
domain_with_category = domain_visits.merge(
    df_domain_cat, how="left", on="private_domain", validate="m:1"
).dropna(subset=["category"])

# ================================================
# Attach trackers; domains absent from df_blacklight keep NaN in tracker columns
domain_with_trackers = domain_with_category.merge(
    df_blacklight, how="left", on="private_domain", validate="m:1"
)

# Weighted exposure = tracker value * visits, computed for all tracker columns
weighted_trackers = domain_with_trackers[tracker_cols].mul(
    domain_with_trackers["visits"], axis=0
)

df = domain_with_trackers.assign(
    **{name: weighted_trackers[name] for name in tracker_cols}
).reorder_columns(["private_domain", "visits", "category", *tracker_cols])
df.head(30)

Unnamed: 0,private_domain,visits,category,bl_ddg_join_ads,bl_third_party_cookies,bl_fb_pixel,bl_google_analytics,bl_session_recording,bl_key_logging,bl_canvas_fingerprinting
0,google.com,1080,Search Engines and Portals,3240.0,4320.0,0.0,0.0,0.0,0.0,0.0
1,facebook.com,886,"Business, Social Networking",0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,amazon.com,827,Shopping,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,youtube.com,766,"Entertainment, Streaming Media",1532.0,766.0,0.0,0.0,0.0,0.0,0.0
4,yahoo.com,570,Chat and Instant Messaging,17670.0,15960.0,0.0,0.0,0.0,570.0,0.0
5,yougov.com,544,"Business, Education",,,,,,,
6,paypal.com,541,"Business, Economy and Finance",0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,walmart.com,518,Shopping,518.0,0.0,0.0,0.0,0.0,0.0,0.0
8,bing.com,461,"News and Media, Search Engines and Portals",0.0,2305.0,0.0,0.0,0.0,0.0,0.0
9,wikipedia.org,456,Education,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Table: top contributing categories and domains by tracker type

In [7]:
# Long format: one row per (domain, tracker type) with its weighted exposure.
exposure_long = pd.melt(
    df,
    id_vars=["category", "private_domain"],
    value_vars=tracker_cols,
    var_name="tracker_type",
    value_name="weighted_exposure",
)

# Keep strictly positive exposures only (this also drops NaN rows).
has_exposure = exposure_long["weighted_exposure"] > 0
df_long = exposure_long[has_exposure].reset_index(drop=True)
df_long

Unnamed: 0,category,private_domain,tracker_type,weighted_exposure
0,Search Engines and Portals,google.com,bl_ddg_join_ads,3240.0
1,"Entertainment, Streaming Media",youtube.com,bl_ddg_join_ads,1532.0
2,Chat and Instant Messaging,yahoo.com,bl_ddg_join_ads,17670.0
3,Shopping,walmart.com,bl_ddg_join_ads,518.0
4,Social Networking,twitter.com,bl_ddg_join_ads,440.0
...,...,...,...,...
25960,"Business, Education",yourvasurvey.info,bl_canvas_fingerprinting,1.0
25961,Adult,zeenite.com,bl_canvas_fingerprinting,1.0
25962,"Business, Education",zhihu.com,bl_canvas_fingerprinting,1.0
25963,"Business, Education",zipzipzipper.com,bl_canvas_fingerprinting,1.0


In [8]:
# Total weighted exposure for every (tracker type, category) pair
df_category_exposure = (
    df_long.groupby(["tracker_type", "category"])["weighted_exposure"]
    .sum()
    .reset_index(name="tt_exposure")
)
df_category_exposure

Unnamed: 0,tracker_type,category,tt_exposure
0,bl_canvas_fingerprinting,Adult,84.0
1,bl_canvas_fingerprinting,"Adult, Business",1.0
2,bl_canvas_fingerprinting,"Adult, Entertainment",20.0
3,bl_canvas_fingerprinting,"Adult, Entertainment, Shopping",1.0
4,bl_canvas_fingerprinting,"Adult, Entertainment, Streaming Media",2.0
...,...,...,...
1062,bl_third_party_cookies,Translation Sites,4.0
1063,bl_third_party_cookies,Travel,2774.0
1064,bl_third_party_cookies,Uncategorized,1672.0
1065,bl_third_party_cookies,Vehicles,1651.0


In [9]:
# Number of categories to keep per tracker type
_top_n = 5

# Rank categories inside each tracker type by total exposure (largest first) ...
ranked_categories = df_category_exposure.sort_values(
    by=["tracker_type", "tt_exposure"],
    ascending=[True, False],
    ignore_index=True,
)

# ... and keep the first _top_n rows of every tracker type.
df_top_categories = ranked_categories.groupby("tracker_type").head(_top_n)
df_top_categories

Unnamed: 0,tracker_type,category,tt_exposure
0,bl_canvas_fingerprinting,Shopping,1690.0
1,bl_canvas_fingerprinting,Business,1422.0
2,bl_canvas_fingerprinting,"Business, Economy and Finance",880.0
3,bl_canvas_fingerprinting,"Business, Shopping",599.0
4,bl_canvas_fingerprinting,Information Technology,569.0
113,bl_ddg_join_ads,Business,131742.0
114,bl_ddg_join_ads,Shopping,52765.0
115,bl_ddg_join_ads,"Business, Education",51039.0
116,bl_ddg_join_ads,Entertainment,46705.0
117,bl_ddg_join_ads,"Business, Information Technology",37926.0


In [10]:
# Number of domains to keep per (tracker type, category)
_top_n = 5

# Restrict the domain-level rows to the top categories of each tracker type;
# the inner merge also brings in the category total (tt_exposure).
domains_in_top_categories = pd.merge(
    df_long, df_top_categories, on=["tracker_type", "category"]
)

# Largest weighted exposure first inside every (tracker type, category) group.
ranked_domains = domains_in_top_categories.sort_values(
    by=["tracker_type", "category", "weighted_exposure"],
    ascending=[True, True, False],
)

df_top_domains = ranked_domains.groupby(["tracker_type", "category"]).head(_top_n)
df_top_domains

Unnamed: 0,category,private_domain,tracker_type,weighted_exposure,tt_exposure
11784,Business,capitalone.com,bl_canvas_fingerprinting,242.0,1422.0
11789,Business,adobe.com,bl_canvas_fingerprinting,139.0,1422.0
11791,Business,hp.com,bl_canvas_fingerprinting,122.0,1422.0
11793,Business,xfinity.com,bl_canvas_fingerprinting,120.0,1422.0
11808,Business,comenity.net,bl_canvas_fingerprinting,41.0,1422.0
...,...,...,...,...,...
4852,Shopping,kohls.com,bl_third_party_cookies,9300.0,60571.0
4859,Shopping,qvc.com,bl_third_party_cookies,6624.0,60571.0
4832,Shopping,ebay.com,bl_third_party_cookies,5685.0,60571.0
4837,Shopping,homedepot.com,bl_third_party_cookies,3038.0,60571.0


In [11]:
# tracker type -> list of LaTeX snippets, one per top category, of the form
#   \textbf{<category>}: <domain> (<traffic>), <domain> (<traffic>), ...
summary = {}

for tracker_name in tracker_cols:
    is_tracker = df_top_categories["tracker_type"] == tracker_name
    lines = []

    # Categories arrive in the row order of df_top_categories.
    for category_name in df_top_categories.loc[is_tracker, "category"]:
        in_group = (df_top_domains["tracker_type"] == tracker_name) & (
            df_top_domains["category"] == category_name
        )
        ranked_domains = (
            df_top_domains.loc[in_group]
            .sort_values("weighted_exposure", ascending=False)["private_domain"]
            .tolist()
        )

        # Each domain is followed by its traffic in parentheses.
        labelled_domains = ", ".join(
            f"{name} ({dict_domain_traffic[name]})" for name in ranked_domains
        )
        lines.append(f"\\textbf{{{category_name}}}: {labelled_domains}")

    summary[tracker_name] = lines

# One column per tracker type, one row per category rank.
df_summary = pd.DataFrame.from_dict(summary, orient="index").T
df_summary

Unnamed: 0,bl_ddg_join_ads,bl_third_party_cookies,bl_fb_pixel,bl_google_analytics,bl_session_recording,bl_key_logging,bl_canvas_fingerprinting
0,"\textbf{Business}: usps.com (309), xfinity.com...","\textbf{Business}: xfinity.com (120), adobe.co...","\textbf{Business}: usps.com (309), ups.com (19...","\textbf{Business}: force.com (144), narvar.com...","\textbf{Business}: capitalone.com (242), attn....","\textbf{Business}: yelp.com (160), attn.tv (15...","\textbf{Shopping}: homedepot.com (217), target..."
1,"\textbf{Shopping}: kohls.com (124), ebay.com (...","\textbf{Entertainment}: imdb.com (312), looper...","\textbf{Shopping}: ebay.com (379), homedepot.c...","\textbf{Shopping}: kohls.com (124), hobbylobby...","\textbf{Business, Information Technology}: att...",\textbf{Chat and Instant Messaging}: yahoo.com...,"\textbf{Business}: capitalone.com (242), adobe..."
2,"\textbf{Business, Education}: weather.com (194...","\textbf{Business, Education}: weather.com (194...","\textbf{Business, Information Technology}: exp...","\textbf{Business, Information Technology}: eve...","\textbf{Shopping}: homedepot.com (217), kohls....","\textbf{Business, Education}: weather.com (194...","\textbf{Business, Economy and Finance}: discov..."
3,"\textbf{Entertainment}: imdb.com (312), looper...","\textbf{Shopping}: kohls.com (124), qvc.com (9...","\textbf{Business, Economy and Finance}: chase....","\textbf{Business, Education}: hubapi.com (20),...","\textbf{Business, Education}: gofundme.com (72...",\textbf{Shopping}: capitaloneshopping.com (154...,"\textbf{Business, Shopping}: rakuten.com (185)..."
4,"\textbf{Business, Information Technology}: pcm...","\textbf{Business, Information Technology}: spo...","\textbf{Business, Education}: gofundme.com (72...","\textbf{Business, Shopping}: coupons.com (43),...","\textbf{Business, Economy and Finance}: discov...","\textbf{Business, Information Technology}: spo...",\textbf{Information Technology}: microsoft.com...


In [12]:
# Write the summary table to a .tex file with the project helper.
# NOTE(review): escape=False presumably keeps the \textbf{...} markup verbatim
# in the output -- confirm against utilities.pandas_to_tex.
pandas_to_tex(
    df_summary,
    escape=False,
    texfile="../tables/bl_top_contributors_domain_category.tex",
)

In [13]:
# Print the .tex file to the cell output to inspect its contents (shell `cat`).
!cat ../tables/bl_top_contributors_domain_category.tex

\textbf{Business}: usps.com (309), xfinity.com (120), adobe.com (139), ups.com (196), hp.com (122) & \textbf{Business}: xfinity.com (120), adobe.com (139), attn.tv (157), nielseniq.com (136), gfk.com (73) & \textbf{Business}: usps.com (309), ups.com (196), attn.tv (157), adobe.com (139), nielseniq.com (136) & \textbf{Business}: force.com (144), narvar.com (61), adp.com (40), quantilope.com (31), epsilon.com (26) & \textbf{Business}: capitalone.com (242), attn.tv (157), medallia.com (147), cmix.com (142), emi-rs.com (121) & \textbf{Business}: yelp.com (160), attn.tv (157), activemeasure.com (71), mapquest.com (67), doceree.com (53) & \textbf{Shopping}: homedepot.com (217), target.com (214), capitaloneshopping.com (154), kohls.com (124), samsclub.com (120) \\
\textbf{Shopping}: kohls.com (124), ebay.com (379), qvc.com (96), homedepot.com (217), hsn.com (67) & \textbf{Entertainment}: imdb.com (312), looper.com (35), slashfilm.com (25), knowyourmeme.com (23), comicbook.com (37) & \textbf{