In [2]:
import pandas as pd
import janitor

from utilities import pandas_to_tex, load_visit_data
from IPython.display import display
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation notices raised by the
# libraries used below are hidden as well.
warnings.filterwarnings("ignore")
# Render up to 150 columns when a wide frame is displayed.
pd.set_option("display.max_columns", 150)

from constants import filepaths

Checking that all paths exist:
{'web_mobile': True, 'web_desktop': True, 'web': True, 'yg_profile': True, 'blacklight': True, 'who': True}


## Visit logs

In [3]:
# Preview the raw mobile web log: parse only its first three rows.
mobile_log_path = filepaths["web_mobile"]
pd.read_csv(mobile_log_path, nrows=3)

Unnamed: 0,caseid,group_name,os_name,os_version,device_manufacturer,device_model,device_type,private_domain,category,visit_time_utc,visit_time_local,visit_duration,page_views
0,205323077,YouGov.USA,Android,9,LGE,VS988,Smartphone,google.com,Search Engines and Portals,2022-06-01 03:52:37,2022-05-31 23:52:37,2,1
1,205323077,YouGov.USA,Android,9,LGE,VS988,Smartphone,coupons.com,"Business, Shopping",2022-06-01 05:07:35,2022-06-01 01:07:35,457,10
2,205323077,YouGov.USA,Android,9,LGE,VS988,Smartphone,google.com,Business,2022-06-01 05:15:12,2022-06-01 01:15:12,55,1


In [4]:
# Preview the raw desktop web log: parse only its first three rows.
desktop_log_path = filepaths["web_desktop"]
pd.read_csv(desktop_log_path, nrows=3)

Unnamed: 0,caseid,group_name,os_name,os_version,device_manufacturer,device_model,device_type,private_domain,category,visit_time_utc,visit_time_local,visit_duration,page_views
0,214407333,YouGov.USA,Windows,10,LENOVO,20BFS02S01,Laptop/Desktop,facebook.com,"Business, Social Networking",2022-06-01 00:29:31,2022-05-31 18:29:31,200,2
1,214407333,YouGov.USA,Windows,10,LENOVO,20BFS02S01,Laptop/Desktop,tryitsampling.com,"Business, Education",2022-06-01 00:32:51,2022-05-31 18:32:51,16,1
2,214407333,YouGov.USA,Windows,10,LENOVO,20BFS02S01,Laptop/Desktop,facebook.com,"Business, Social Networking",2022-06-01 00:48:34,2022-05-31 18:48:34,54,1


In [5]:
# Preview the raw page-level web log: parse only its first three rows.
web_log_path = filepaths["web"]
pd.read_csv(web_log_path, nrows=3)

Unnamed: 0,caseid,group_name,client_id,client_key,os_name,os_version,device_manufacturer,device_model,device_type,session_start_time,start_time_utc,date,time,page_domain,referer,browser_vendor,browser_version,ref_domain,content_type,content_length,search_term,page_duration,private_domain,category,page_url_anonymized,predecessor_url_anonymized,succesor_url_anonymized
0,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:32:42.634,2022-06-08 00:32:42.634,2022-06-07,19:32:42,www.heb.com,,ChromeBrowserPlugin,,,,,,5,heb.com,"Food and Recipes, Shopping",https://www.heb.com/,,https://www.heb.com/weekly-ads/weekly-deals/
1,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:32:47.937,2022-06-08 00:32:47.937,2022-06-07,19:32:47,www.heb.com,,ChromeBrowserPlugin,,,,,,9,heb.com,"Food and Recipes, Shopping",https://www.heb.com/weekly-ads/weekly-deals/,https://www.heb.com/,https://www.kroger.com/savings/weeklyad/
2,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:35:00.322,2022-06-08 00:35:00.322,2022-06-07,19:35:00,www.kroger.com,,ChromeBrowserPlugin,,,,,,40,kroger.com,"Business, Shopping",https://www.kroger.com/savings/weeklyad/,https://www.heb.com/weekly-ads/weekly-deals/,https://www.google.com/search?ANONYMIZED


In [6]:
# Load the visit log through the project helper, report how many rows it has
# (with thousands separators), then show the first five rows.
df_visits = load_visit_data()
print(f"{len(df_visits)=:,}")
df_visits.head(n=5)

len(df_visits)=6,297,382
len(df_visits)=6,297,382


Unnamed: 0,caseid,private_domain,category,visit_time_local,visit_duration,page_views,source
0,205323077,google.com,Search Engines and Portals,2022-05-31 23:52:37,2,1,mobile_web
1,205323077,coupons.com,"Business, Shopping",2022-06-01 01:07:35,457,10,mobile_web
2,205323077,google.com,Business,2022-06-01 01:15:12,55,1,mobile_web
3,205323077,coupons.com,"Business, Shopping",2022-06-01 01:16:07,2225,4,mobile_web
4,205323077,google.com,Search Engines and Portals,2022-06-01 04:38:10,10,1,mobile_web


### Basic check

In [7]:
# Number of distinct respondents (missing ids are not counted).
df_visits["caseid"].nunique(dropna=True)

1135

In [8]:
# Number of visit rows with a missing respondent id.
# isna() states the intent directly; the previous `caseid != caseid` query
# relied on NaN being unequal to itself.
int(df_visits["caseid"].isna().sum())

0

In [9]:
# Number of visit rows without a private domain. These rows are removed by the
# dropna(subset=["private_domain"]) step before the tracker merge.
# isna() replaces the NaN self-comparison trick (`private_domain != private_domain`).
int(df_visits["private_domain"].isna().sum())

60548

In [10]:
# Which data sources contribute the rows that lack a private domain?
missing_domain = df_visits["private_domain"].isna()
df_visits.loc[missing_domain, "source"].unique()

array(['web'], dtype=object)

In [11]:
# Respondents with at least one visit that has a private domain. Comparing this
# with the total number of respondents shows how many have only empty
# private_domain records.
has_domain = df_visits["private_domain"].notna()
df_visits.loc[has_domain, "caseid"].nunique()

1134

In [12]:
# Number of distinct private domains in the visit log (missing values are not
# counted); reused below as the denominator for domain coverage.
n_domains = df_visits["private_domain"].nunique(dropna=True)
n_domains

64074

## Individuals

In [13]:
# Value labels for the coded survey variables, per the codebook:
# https://github.com/themains/bad_domains/blob/main/data/codebook.pdf
gender_labels = {1: "Male", 2: "Female"}
race_labels = {
    1: "White",
    2: "Black",
    3: "Hispanic",
    4: "Asian",
    # codes 5 through 8 are collapsed into one bucket
    5: "Other",
    6: "Other",
    7: "Other",
    8: "Other",
}
educ_labels = {
    1: "HS or Below",
    2: "HS or Below",
    3: "Some college",
    4: "Some college",
    5: "College",
    6: "Postgrad",
}
# Birth-year cut points. pd.cut builds right-closed intervals by default, so
# (1929, 1958] -> "65+", (1958, 1973] -> "50-64", ..., (1998, 2003] -> "<25";
# birth years outside (1929, 2003] receive no label (NaN).
# Author's generation notes for the five bins, oldest first: early baby
# boomers; late baby boomers / early Gen X; Gen X / early millennials;
# millennials; Gen Z.
birthyr_edges = [1929, 1958, 1973, 1988, 1998, 2003]
agegroup_labels = ["65+", "50-64", "35-49", "25-34", "<25"]

# Respondent profiles with human-readable label columns added next to the codes.
df_ind = (
    pd.read_csv(filepaths["yg_profile"])
    .astype({"caseid": int, "birthyr": int})
    .assign(
        gender_lab=lambda df_: df_["gender"].replace(gender_labels),
        race_lab=lambda df_: df_["race"].replace(race_labels),
        educ_lab=lambda df_: df_["educ"].replace(educ_labels),
        agegroup_lab=lambda df_: pd.cut(
            df_["birthyr"], bins=birthyr_edges, labels=agegroup_labels
        ),
    )
)
print(f"{len(df_ind)=:,}")
df_ind.head()

len(df_ind)=1,200


Unnamed: 0,caseid,birthyr,gender,race,educ,pid3,pid7,presvote20post,inputstate,region,gender_lab,race_lab,educ_lab,agegroup_lab
0,200661421,1963,2,1,4,3,3,1,39,2,Female,White,Some college,50-64
1,200686597,1992,2,6,5,5,8,-1,48,3,Female,Other,College,25-34
2,200953869,1959,2,1,5,2,7,2,42,1,Female,White,College,50-64
3,201302005,1966,2,2,3,5,8,1,12,3,Female,Black,Some college,50-64
4,201590505,1977,1,4,5,3,3,1,6,4,Male,Asian,College,35-49


In [14]:
# Demographic summary table: one row per category with its count and its share
# of all respondents, exported to LaTeX.

# Row order of the final table (gender, race, education, age group).
custom_order = [
    "Female",
    "Male",
    "White",
    "Hispanic",
    "Black",
    "Other",
    "Asian",
    "HS or Below",
    "Some college",
    "College",
    "Postgrad",
    "<25",
    "25-34",
    "35-49",
    "50-64",
    "65+",
]
# Longer display labels for the age and education rows; categories that are
# not listed here keep their short label.
demo_cat_labels = {
    "<25": "$<$ 25 years old",
    "25-34": "25--34 years old",
    "35-49": "35--49 years old",
    "50-64": "50--64 years old",
    "65+": "65+ years old",
    "HS or Below": "High school diploma or below",
    "Some college": "Some College education",
    "College": "College Graduate",
    "Postgrad": "Postgraduate",
}

df_demo_summ = (
    # Stack the per-variable counts into one long series.
    pd.concat(
        [
            df_ind[label_col].value_counts()
            for label_col in ["gender_lab", "race_lab", "educ_lab", "agegroup_lab"]
        ]
    )
    .reset_index(name="n")
    .rename_column("index", "cat")
    # An ordered categorical makes sort_values follow custom_order.
    .assign(
        cat=lambda df_: pd.Categorical(
            df_["cat"], categories=custom_order, ordered=True
        )
    )
    .sort_values("cat")
    .reset_index(drop=True)
    .assign(
        # Share of all respondents, formatted for LaTeX as e.g. "(52.9\%)".
        # The backslash is written as "\\" because a bare "\%" is an invalid
        # escape sequence in a Python string literal.
        perc=lambda df_: (100 * df_["n"] / len(df_ind))
        .round(1)
        .astype(str)
        .map(lambda pct: f"({pct}\\%)")
    )
    # Rename the categories in place (keeps the ordered categorical dtype);
    # using replace() on a categorical column for this is deprecated.
    .assign(cat=lambda df_: df_["cat"].cat.rename_categories(demo_cat_labels))
)
pandas_to_tex(df_demo_summ, "../tables/demo_summary")
df_demo_summ

Unnamed: 0,cat,n,perc
0,Female,635,(52.9\%)
1,Male,565,(47.1\%)
2,White,762,(63.5\%)
3,Hispanic,176,(14.7\%)
4,Black,152,(12.7\%)
5,Other,61,(5.1\%)
6,Asian,49,(4.1\%)
7,High school diploma or below,427,(35.6\%)
8,Some College education,350,(29.2\%)
9,College Graduate,272,(22.7\%)


## Trackers

### BL

In [15]:
# Blacklight scan results, one row per domain.
bl_raw = pd.read_csv(filepaths["blacklight"])
df_blacklight = (
    bl_raw
    # Rebuild the domain from the scan's filename by turning every underscore
    # into a dot, then discard the filename itself.
    .assign(private_domain=bl_raw["filename"].str.replace("_", ".", regex=False))
    .drop(columns="filename")
    # Park the key in the index so that only the metric columns get the prefix.
    .set_index("private_domain")
    .add_prefix("bl_")
    .reset_index()
)

df_blacklight

Unnamed: 0,private_domain,bl_ddg_join_ads,bl_third_party_cookies,bl_canvas_fingerprinting,bl_session_recording,bl_key_logging,bl_fb_pixel,bl_google_analytics
0,costarmanager.com,5,10,0,1,0,0,0
1,teasource.com,11,11,0,0,0,1,1
2,1800tequila.com,8,6,0,0,0,0,0
3,mazon.com,1,0,0,0,0,0,0
4,theancestorhunt.com,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
34073,methowtrails.org,1,0,0,0,0,0,0
34074,mistystoyaussies.com,4,6,0,0,0,0,0
34075,hellostarz.com,0,0,0,0,0,0,0
34076,kbdfans.com,5,7,0,0,0,0,1


In [16]:
# Sanity check: domains with nothing detected still have a row. Sum the metric
# columns per domain and report the (rows, columns) shape of the subset whose
# total is zero.
bl_totals = df_blacklight.set_index("private_domain")
bl_totals["tt"] = bl_totals.sum(axis=1)
bl_totals[bl_totals["tt"] == 0].shape

(8097, 8)

In [17]:
# Blacklight rows as a percentage of the distinct domains in the visit log.
100 * df_blacklight.shape[0] / n_domains

53.185379405062896

In [18]:
# Percentage of visits that land on a domain with Blacklight data.
# The join is an inner join so that only matched visits are counted; a right
# join would additionally emit one NaN-filled row for every Blacklight domain
# that never appears in the visit log, inflating the numerator.
(
    df_visits.merge(
        df_blacklight, on="private_domain", how="inner", validate="m:1"
    ).shape[0]
    / len(df_visits)
    * 100
)

75.69969552426706

In [19]:
# Number of visits that land on a domain with Blacklight data.
# The join is an inner join so that only matched visits are counted; a right
# join would additionally emit one NaN-filled row for every Blacklight domain
# that never appears in the visit log.
n_bl_visits = df_visits.merge(
    df_blacklight, on="private_domain", how="inner", validate="m:1"
).shape[0]
n_bl_visits

4767099

In [20]:
# Blacklight-covered visits as a percentage of all visits.
100 * n_bl_visits / df_visits.shape[0]

75.69969552426707

### Who

In [21]:
# "who" tracker data, one row per domain, with alphabetically ordered metric
# columns that all carry a who_ prefix.
who_by_domain = (
    pd.read_csv(filepaths["who"])
    .rename_column("domain_name", "private_domain")
    .set_index("private_domain")
)
df_who = (
    who_by_domain
    # Drop domains for which every metric is missing.
    .dropna(how="all")
    # Fill the remaining gaps with 0. Original author's note: the minimum count
    # before this step was 1, so a 0 here stands for "none" rather than for a
    # genuinely missing value.
    .fillna(0)
    .sort_index(axis=1)
    .add_prefix("who_")
    .reset_index()
)
display(df_who.head())
df_who.info()

Unnamed: 0,private_domain,who_Adult Advertising,who_Advertising,who_Audio/Video Player,who_Consent Management,who_Customer Interaction,who_Data Saved,who_Hosting,who_Misc,who_Site Analytics,who_Social Media,who_Trackers Per Page Load,who_Trackers Requests / All Requests,who_Tracking Requests Per Page Load,who_Utilities
0,tennis-warehouse.com,0.0,7.0,1.0,1.0,2.0,0.63,5.0,1.0,1.0,1.0,4.95,12.5,3.89,1.0
1,dazn.com,0.0,12.0,1.0,1.0,1.0,138.38,11.0,1.0,10.0,4.0,7.39,0.87,12.72,0.0
2,saashr.com,0.0,1.0,0.0,0.0,0.0,0.15,2.0,0.0,0.0,0.0,2.75,6.17,0.69,0.0
3,24hourcampfire.com,0.0,13.0,2.0,0.0,1.0,1.57,5.0,2.0,4.0,2.0,7.83,10.55,3.06,1.0
4,therealreal.com,0.0,52.0,1.0,1.0,3.0,0.15,3.0,0.0,9.0,1.0,10.71,29.33,11.83,4.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4261 entries, 0 to 4260
Data columns (total 15 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   private_domain                        4261 non-null   object 
 1   who_Adult Advertising                 4261 non-null   float64
 2   who_Advertising                       4261 non-null   float64
 3   who_Audio/Video Player                4261 non-null   float64
 4   who_Consent Management                4261 non-null   float64
 5   who_Customer Interaction              4261 non-null   float64
 6   who_Data Saved                        4261 non-null   float64
 7   who_Hosting                           4261 non-null   float64
 8   who_Misc                              4261 non-null   float64
 9   who_Site Analytics                    4261 non-null   float64
 10  who_Social Media                      4261 non-null   float64
 11  who_Trackers Per 

In [22]:
# Summary statistics for the numeric who_ metrics.
df_who.select_dtypes(include="number").describe()

Unnamed: 0,who_Adult Advertising,who_Advertising,who_Audio/Video Player,who_Consent Management,who_Customer Interaction,who_Data Saved,who_Hosting,who_Misc,who_Site Analytics,who_Social Media,who_Trackers Per Page Load,who_Trackers Requests / All Requests,who_Tracking Requests Per Page Load,who_Utilities
count,4261.0,4261.0,4261.0,4261.0,4261.0,4261.0,4261.0,4261.0,4261.0,4261.0,4261.0,4261.0,4261.0,4261.0
mean,0.099038,16.398498,0.854494,0.552687,1.464914,11.688723,5.960103,1.183056,4.615114,0.961511,7.072502,10.108132,7.658977,1.00751
std,0.474764,19.134705,0.855771,0.777836,1.477532,59.708296,3.067979,1.946419,3.561426,1.211611,4.498857,8.526858,14.655546,1.326841
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.02,0.01,0.0,0.0
25%,0.0,4.0,0.0,0.0,0.0,0.36,4.0,0.0,2.0,0.0,3.77,3.94,1.31,0.0
50%,0.0,8.0,1.0,0.0,1.0,0.93,6.0,1.0,4.0,0.0,5.94,7.96,3.65,0.0
75%,0.0,22.0,1.0,1.0,2.0,2.84,8.0,1.0,7.0,2.0,9.3,13.83,8.55,2.0
max,5.0,125.0,5.0,5.0,10.0,1042.76,17.0,18.0,21.0,8.0,31.85,80.4,353.96,7.0


In [23]:
# "who" rows as a percentage of the distinct domains in the visit log.
100 * df_who.shape[0] / n_domains

6.650123294940226

In [24]:
# Number of visits that land on a domain with "who" tracker data.
# The join is an inner join so that only matched visits are counted; a right
# join would additionally emit one NaN-filled row for every "who" domain that
# never appears in the visit log.
n_who_visits = df_visits.merge(
    df_who, on="private_domain", how="inner", validate="m:1"
).shape[0]
n_who_visits

4426771

In [25]:
# "who"-covered visits as a percentage of all visits.
100 * n_who_visits / df_visits.shape[0]

70.29541800068664

## Merge

In [26]:
# Tracker measures for which per-respondent rates and threshold flags are
# derived, grouped by the prefix of the data set they come from.
bl_tracker_cols = [
    "bl_ddg_join_ads",
    "bl_third_party_cookies",
    "bl_canvas_fingerprinting",
    "bl_session_recording",
    "bl_key_logging",
    "bl_fb_pixel",
    "bl_google_analytics",
]
who_tracker_cols = [
    "who_trackers_per_page_load",
    "who_tracking_requests_per_page_load",
    "who_trackers_requests_all_requests",
    "who_data_saved",
    "who_advertising",
    "who_audio_video_player",
    "who_customer_interaction",
    "who_hosting",
    "who_consent_management",
    "who_site_analytics",
    "who_misc",
    "who_utilities",
    "who_social_media",
    "who_adult_advertising",
]
tracker_cols = [*bl_tracker_cols, *who_tracker_cols]

In [34]:
# Build one row per respondent: tracker measures summed over all of the
# respondent's visits, joined with the survey profile, plus derived rates and
# threshold flags. The result is written to ../data/combined_yg_bl_who.csv.

# Visit-level columns that must not take part in the per-respondent sum.
visit_level_cols = [
    "private_domain",
    "category",
    "visit_time_local",
    "visit_duration",
    "page_views",
    "source",
]

df = (
    df_visits.dropna(subset=["private_domain"])
    .merge(df_blacklight, how="left", on="private_domain", validate="m:1")
    .merge(df_who, how="left", on="private_domain", validate="m:1")
    # Totals per respondent, conditional on private_domain not being NA.
    .assign(
        tt_visits=lambda df_: df_.groupby("caseid")["caseid"].transform("count"),
        tt_domains=lambda df_: df_.groupby("caseid")["private_domain"].transform(
            "nunique"
        ),
    )
    # ===============================================================
    # Cumulative exposure: every visit counts, i.e. visits are NOT reduced to
    # unique (caseid, private_domain) pairs before summing.
    .remove_columns(visit_level_cols)
    # tt_visits / tt_domains are constant within a respondent, so grouping on
    # them simply carries them through. sum() skips NaN, so visits to domains
    # without tracker data contribute 0 to the totals.
    .groupby(["caseid", "tt_visits", "tt_domains"], as_index=False)
    .sum()
    .clean_names()
    # ===============================================================
    # Attach the individual profile.
    .merge(df_ind, how="left", on="caseid", validate="1:1")
)

# Derived measures per tracker column:
#   <col>_rate  cumulative count divided by the respondent's total visits
#   <col>_al<k> True when the cumulative count is at least k
exposure_thresholds = [1, 3, 5, 10]
derived_cols = {}
for col in tracker_cols:
    derived_cols[f"{col}_rate"] = df[col] / df["tt_visits"]

    # At least x number of tracker encountered
    for threshold in exposure_thresholds:
        derived_cols[f"{col}_al{threshold}"] = df[col] >= threshold

# Attach all derived columns with a single concat instead of inserting them one
# at a time, which fragments the frame; dict order keeps the column order.
df = pd.concat([df, pd.DataFrame(derived_cols)], axis=1)

df.to_csv("../data/combined_yg_bl_who.csv", index=False)
display(df.head())
df.info(verbose=True, show_counts=True)

Unnamed: 0,caseid,tt_visits,tt_domains,bl_ddg_join_ads,bl_third_party_cookies,bl_canvas_fingerprinting,bl_session_recording,bl_key_logging,bl_fb_pixel,bl_google_analytics,who_adult_advertising,who_advertising,who_audio_video_player,who_consent_management,who_customer_interaction,who_data_saved,who_hosting,who_misc,who_site_analytics,who_social_media,who_trackers_per_page_load,who_trackers_requests_all_requests,who_tracking_requests_per_page_load,who_utilities,birthyr,gender,race,educ,pid3,pid7,presvote20post,inputstate,region,gender_lab,race_lab,educ_lab,agegroup_lab,bl_ddg_join_ads_rate,bl_ddg_join_ads_al1,bl_ddg_join_ads_al3,bl_ddg_join_ads_al5,bl_ddg_join_ads_al10,bl_third_party_cookies_rate,bl_third_party_cookies_al1,bl_third_party_cookies_al3,bl_third_party_cookies_al5,bl_third_party_cookies_al10,bl_canvas_fingerprinting_rate,bl_canvas_fingerprinting_al1,bl_canvas_fingerprinting_al3,bl_canvas_fingerprinting_al5,bl_canvas_fingerprinting_al10,bl_session_recording_rate,bl_session_recording_al1,bl_session_recording_al3,bl_session_recording_al5,bl_session_recording_al10,bl_key_logging_rate,bl_key_logging_al1,bl_key_logging_al3,bl_key_logging_al5,bl_key_logging_al10,bl_fb_pixel_rate,bl_fb_pixel_al1,bl_fb_pixel_al3,bl_fb_pixel_al5,bl_fb_pixel_al10,bl_google_analytics_rate,bl_google_analytics_al1,bl_google_analytics_al3,bl_google_analytics_al5,bl_google_analytics_al10,who_trackers_per_page_load_rate,who_trackers_per_page_load_al1,who_trackers_per_page_load_al3,who_trackers_per_page_load_al5,who_trackers_per_page_load_al10,who_tracking_requests_per_page_load_rate,who_tracking_requests_per_page_load_al1,who_tracking_requests_per_page_load_al3,who_tracking_requests_per_page_load_al5,who_tracking_requests_per_page_load_al10,who_trackers_requests_all_requests_rate,who_trackers_requests_all_requests_al1,who_trackers_requests_all_requests_al3,who_trackers_requests_all_requests_al5,who_trackers_requests_all_requests_al10,who_data_saved_rate,who_data_saved_al1,who_data_saved_al
3,who_data_saved_al5,who_data_saved_al10,who_advertising_rate,who_advertising_al1,who_advertising_al3,who_advertising_al5,who_advertising_al10,who_audio_video_player_rate,who_audio_video_player_al1,who_audio_video_player_al3,who_audio_video_player_al5,who_audio_video_player_al10,who_customer_interaction_rate,who_customer_interaction_al1,who_customer_interaction_al3,who_customer_interaction_al5,who_customer_interaction_al10,who_hosting_rate,who_hosting_al1,who_hosting_al3,who_hosting_al5,who_hosting_al10,who_consent_management_rate,who_consent_management_al1,who_consent_management_al3,who_consent_management_al5,who_consent_management_al10,who_site_analytics_rate,who_site_analytics_al1,who_site_analytics_al3,who_site_analytics_al5,who_site_analytics_al10,who_misc_rate,who_misc_al1,who_misc_al3,who_misc_al5,who_misc_al10,who_utilities_rate,who_utilities_al1,who_utilities_al3,who_utilities_al5,who_utilities_al10,who_social_media_rate,who_social_media_al1,who_social_media_al3,who_social_media_al5,who_social_media_al10,who_adult_advertising_rate,who_adult_advertising_al1,who_adult_advertising_al3,who_adult_advertising_al5,who_adult_advertising_al10
0,47541,17194,553,161861.0,172095.0,1451.0,1173.0,3909.0,2715.0,50.0,0.0,213560.0,11359.0,3216.0,16254.0,55455.31,75118.0,17844.0,43904.0,16419.0,62103.68,101685.62,95196.59,9853.0,1955,2,1,2,2,6,2,12,3,Female,White,HS or Below,65+,9.413807,True,True,True,True,10.009015,True,True,True,True,0.08439,True,True,True,True,0.068221,True,True,True,True,0.227347,True,True,True,True,0.157904,True,True,True,True,0.002908,True,True,True,True,3.611939,True,True,True,True,5.536617,True,True,True,True,5.914018,True,True,True,True,3.225271,True,True,True,True,12.420612,True,True,True,True,0.660637,True,True,True,True,0.94533,True,True,True,True,4.36885,True,True,True,True,0.187042,True,True,True,True,2.553449,True,True,True,True,1.037804,True,True,True,True,0.573049,True,True,True,True,0.954926,True,True,True,True,0.0,False,False,False,False
1,56565,11479,334,97694.0,103287.0,222.0,185.0,1625.0,585.0,36.0,0.0,137218.0,8382.0,2435.0,6814.0,42170.22,49408.0,11746.0,24816.0,10572.0,44027.76,53020.08,56047.07,3397.0,1940,2,1,3,3,5,2,17,2,Female,White,Some college,65+,8.510672,True,True,True,True,8.997909,True,True,True,True,0.01934,True,True,True,True,0.016116,True,True,True,True,0.141563,True,True,True,True,0.050963,True,True,True,True,0.003136,True,True,True,True,3.835505,True,True,True,True,4.882574,True,True,True,True,4.618876,True,True,True,True,3.673684,True,True,True,True,11.953829,True,True,True,True,0.730203,True,True,True,True,0.593606,True,True,True,True,4.304208,True,True,True,True,0.212126,True,True,True,True,2.161861,True,True,True,True,1.02326,True,True,True,True,0.295932,True,True,True,True,0.920986,True,True,True,True,0.0,False,False,False,False
2,203271,6540,152,11317.0,22787.0,137.0,786.0,10.0,24.0,2.0,0.0,18808.0,2614.0,130.0,2320.0,47961.31,23955.0,9812.0,4640.0,2382.0,11919.72,20579.55,14833.83,946.0,1980,2,1,6,1,2,-1,54,3,Female,White,Postgrad,35-49,1.730428,True,True,True,True,3.484251,True,True,True,True,0.020948,True,True,True,True,0.120183,True,True,True,True,0.001529,True,True,True,True,0.00367,True,True,True,True,0.000306,True,False,False,False,1.822587,True,True,True,True,2.26817,True,True,True,True,3.14672,True,True,True,True,7.333534,True,True,True,True,2.875841,True,True,True,True,0.399694,True,True,True,True,0.35474,True,True,True,True,3.662844,True,True,True,True,0.019878,True,True,True,True,0.70948,True,True,True,True,1.500306,True,True,True,True,0.144648,True,True,True,True,0.36422,True,True,True,True,0.0,False,False,False,False
3,216457,1770,49,19793.0,5050.0,22.0,10.0,2.0,10.0,28.0,0.0,30640.0,1180.0,120.0,1489.0,3502.56,6751.0,1204.0,5692.0,1243.0,8770.35,13329.55,26357.05,226.0,1976,2,1,2,2,7,2,27,2,Female,White,HS or Below,35-49,11.182486,True,True,True,True,2.853107,True,True,True,True,0.012429,True,True,True,True,0.00565,True,True,True,True,0.00113,True,False,False,False,0.00565,True,True,True,True,0.015819,True,True,True,True,4.955,True,True,True,True,14.890989,True,True,True,True,7.530819,True,True,True,True,1.978847,True,True,True,True,17.310734,True,True,True,True,0.666667,True,True,True,True,0.841243,True,True,True,True,3.814124,True,True,True,True,0.067797,True,True,True,True,3.215819,True,True,True,True,0.680226,True,True,True,True,0.127684,True,True,True,True,0.70226,True,True,True,True,0.0,False,False,False,False
4,257495,10012,284,73761.0,77189.0,1516.0,445.0,2036.0,1221.0,18.0,0.0,131407.0,6504.0,4164.0,9856.0,28534.64,44383.0,9164.0,28161.0,8168.0,42553.97,67794.69,63233.1,9064.0,1952,2,7,1,1,1,1,15,4,Female,Other,HS or Below,65+,7.367259,True,True,True,True,7.709648,True,True,True,True,0.151418,True,True,True,True,0.044447,True,True,True,True,0.203356,True,True,True,True,0.121954,True,True,True,True,0.001798,True,True,True,True,4.250297,True,True,True,True,6.315731,True,True,True,True,6.771343,True,True,True,True,2.850044,True,True,True,True,13.12495,True,True,True,True,0.64962,True,True,True,True,0.984419,True,True,True,True,4.43298,True,True,True,True,0.415901,True,True,True,True,2.812725,True,True,True,True,0.915302,True,True,True,True,0.905314,True,True,True,True,0.815821,True,True,True,True,0.0,False,False,False,False


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1134 entries, 0 to 1133
Data columns (total 142 columns):
 #    Column                                    Non-Null Count  Dtype   
---   ------                                    --------------  -----   
 0    caseid                                    1134 non-null   int64   
 1    tt_visits                                 1134 non-null   int64   
 2    tt_domains                                1134 non-null   int64   
 3    bl_ddg_join_ads                           1134 non-null   float64 
 4    bl_third_party_cookies                    1134 non-null   float64 
 5    bl_canvas_fingerprinting                  1134 non-null   float64 
 6    bl_session_recording                      1134 non-null   float64 
 7    bl_key_logging                            1134 non-null   float64 
 8    bl_fb_pixel                               1134 non-null   float64 
 9    bl_google_analytics                       1134 non-null   float64 
 10   who_adult_

In [35]:
# Distribution of every numeric column whose name contains "bl_" or "who_":
# describe() with the 10th/25th/50th/75th percentiles, rounded to three
# decimals and transposed so that each variable becomes a row.
tracker_summary = df.filter(regex="bl_|who_").describe(
    percentiles=[0.1, 0.25, 0.5, 0.75]
)
tracker_summary.round(3).transpose().reset_index(names="var")

Unnamed: 0,var,count,mean,std,min,10%,25%,50%,75%,max
0,bl_ddg_join_ads,1134.0,27407.428,48278.835,0.0,618.6,2620.5,9738.0,29240.0,517968.0
1,bl_third_party_cookies,1134.0,32325.168,55184.294,0.0,770.4,3133.0,11757.0,35647.0,700142.0
2,bl_canvas_fingerprinting,1134.0,319.799,696.847,0.0,2.0,18.0,84.0,287.75,7643.0
3,bl_session_recording,1134.0,155.432,353.368,0.0,0.0,10.0,53.5,165.0,5788.0
4,bl_key_logging,1134.0,309.1,935.195,0.0,0.0,4.0,26.0,147.75,10315.0
5,bl_fb_pixel,1134.0,383.327,657.458,0.0,6.0,40.0,147.0,463.0,5808.0
6,bl_google_analytics,1134.0,35.059,104.464,0.0,0.0,0.0,8.0,29.0,1619.0
7,who_adult_advertising,1134.0,64.959,287.648,0.0,0.0,0.0,0.0,2.0,4283.0
8,who_advertising,1134.0,45899.937,79784.283,0.0,981.1,4122.0,16899.0,52593.25,1008990.0
9,who_audio_video_player,1134.0,3358.057,5516.094,0.0,69.6,283.25,1264.0,3987.25,42896.0
