In [26]:
import pandas as pd
import janitor
import warnings
from typing import List, Dict

import sidetable
warnings.filterwarnings("ignore")

from utilities import process_bl_json_files, pandas_to_tex


FP_US_DOMAINS = "../data/us_gov_domain_list.csv"
FP_US_BL = "../data/us_blacklight_json"
FP_IN_DOMAINS = "../data/in_gov_domain_list.csv"
FP_IN_BL = "../data/in_blacklight_json"
FP_GDIR_DOMAINS = "../data/combined_tsv_data.tsv"
FP_GDIR_BL = "../data/blacklight_json/"

In [2]:
bl_labels = {
    "ddg_join_ads": "Ad Trackers",
    "third_party_cookies": "Third-Party Cookies",
    "fb_pixel": "Facebook Pixel",
    "google_analytics": "Google Analytics",
    "key_logging": "Keylogging",
    "session_recording": "Session Recording",
    "canvas_fingerprinting": "Canvas Fingerprinting",
}
bl_measures = list(bl_labels.keys())
bl_measures_al1 = ["".join([var, "_al1"]) for var in bl_measures]
bl_measures_al2 = ["".join([var, "_al2"]) for var in bl_measures]
bl_measures

['ddg_join_ads',
 'third_party_cookies',
 'fb_pixel',
 'google_analytics',
 'key_logging',
 'session_recording',
 'canvas_fingerprinting']

In [3]:
def summarize_bl(
    df,
    bl_measures: List[str] = bl_measures,
    bl_measures_al1: List[str] = bl_measures_al1,
    bl_measures_al2: List[str] = bl_measures_al2,
    bl_labels: Dict[str, str] = bl_labels,
    percentiles: List[float] = [0.25, 0.5, 0.75, 0.9],
    float_format: str = "{:,.1f}"
) -> pd.DataFrame:
    """
    Summarize tracking metrics with descriptive statistics and prevalence indicators.

    Parameters
    ----------
    df : DataFrame-like
        The input DataFrame containing tracking-related columns.
    bl_measures : list of str
        Column names for the primary tracking measures (e.g., cookies, ads).
    bl_measures_al1 : list of str
        Columns indicating whether each measure was present at least once (binary indicators).
    bl_measures_al2 : list of str
        Columns indicating whether each measure was present at least twice.
    bl_labels : dict of str to str
        Dictionary mapping original variable names to display-friendly labels.
    percentiles : list of float, optional
        Percentiles to include in summary statistics (default is [0.25, 0.5, 0.75, 0.9]).
    float_format : str, optional
        Format string to apply to numeric summary values (default is "{:,.1f}").

    Returns
    -------
    pd.DataFrame
        A tidy DataFrame with transposed summary stats, and columns for share of domains
        with ≥1 and ≥2 hits per measure. Variable names are labeled according to `bl_labels`.
    """
    return (
        df.select_columns(bl_measures)
        .describe(percentiles=percentiles)
        .T.reset_index(names="var")
        .remove_columns("count")
        .applymap(lambda x: float_format.format(x) if isinstance(x, (float, int)) else x)
        .merge(
            df.select_columns(bl_measures_al1)
              .mean()
              .multiply(100)
              .round(1)
              .astype(str)
              .add("%")
              .reset_index(name="atleast1")
              .rename_column("index", "var")
              .replace("_al1", "", regex=True),
            how="left", on="var", validate="1:1"
        )
        .merge(
            df.select_columns(bl_measures_al2)
              .mean()
              .multiply(100)
              .round(1)
              .astype(str)
              .add("%")
              .reset_index(name="atleast2")
              .rename_column("index", "var")
              .replace("_al2", "", regex=True),
            how="left", on="var", validate="1:1"
        )
        .assign(var=lambda df_: df_["var"].replace(bl_labels))
    )

## US

In [4]:
df_us = (
    pd.read_csv(FP_US_DOMAINS)
    .clean_names()
    .rename_column("domain_name", "domain")
)
df_us.head(3)

Unnamed: 0,domain,domain_type,agency,organization_name,city,state,security_contact_email
0,36thdistrictcourtmi.gov,City,Non-Federal Agency,36th District Court,Detroit,MI,govdnssecurity@36thdistrictcourt.org
1,59dcmi.gov,City,Non-Federal Agency,59th District Court,Walker,MI,soc@walker.city
2,abbevillecitymarshal.gov,City,Non-Federal Agency,Abbeville City Marshal Office,Abbeville,LA,(blank)


In [5]:
df_us_bl = (
    pd.DataFrame(process_bl_json_files(FP_US_BL))
    .merge(df_us, on="domain", how="left", validate="1:1")
    .assign(
        **{f"{col}_al1": (lambda df, k=col: df[k] >= 1) for col in bl_measures},
        **{f"{col}_al2": (lambda df, k=col: df[k] >= 2) for col in bl_measures},
    )
)

df_us_bl

Unnamed: 0,domain,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics,domain_type,agency,...,key_logging_al1,session_recording_al1,canvas_fingerprinting_al1,ddg_join_ads_al2,third_party_cookies_al2,fb_pixel_al2,google_analytics_al2,key_logging_al2,session_recording_al2,canvas_fingerprinting_al2
0,fonda-fultonvillewastewater.gov,0,0,0,0,0,0,0,Special district,Non-Federal Agency,...,False,False,False,False,False,False,False,False,False,False
1,johnsoncountyso-ne.gov,0,0,0,0,0,0,0,County,Non-Federal Agency,...,False,False,False,False,False,False,False,False,False,False
2,scatdhhs.gov,2,8,0,0,0,0,0,Tribal,Non-Federal Agency,...,False,False,False,True,True,False,False,False,False,False
3,jeffersontown-ny.gov,0,0,0,0,0,0,0,City,Non-Federal Agency,...,False,False,False,False,False,False,False,False,False,False
4,floridasspeechaudiology.gov,1,0,0,0,0,0,0,State or territory,Non-Federal Agency,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7785,ehpsmt.gov,2,1,0,0,0,0,0,School district,Non-Federal Agency,...,False,False,False,True,False,False,False,False,False,False
7786,coopercohealth.gov,3,0,0,0,0,0,0,County,Non-Federal Agency,...,False,False,False,True,False,False,False,False,False,False
7787,randolphcountyil.gov,1,4,0,0,0,0,0,County,Non-Federal Agency,...,False,False,False,False,True,False,False,False,False,False
7788,boveymn.gov,1,0,0,0,0,0,0,City,Non-Federal Agency,...,False,False,False,False,False,False,False,False,False,False


In [6]:
df_us_summ = summarize_bl(df_us_bl)
pandas_to_tex(
    df_us_summ,
    "../tables/bl_summ_us_domains",
    escape=True,
)
df_us_summ

Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,1.7,2.8,0.0,0.0,1.0,2.0,4.0,37.0,71.7%,38.7%
1,Third-Party Cookies,1.2,4.5,0.0,0.0,0.0,1.0,4.0,86.0,26.0%,15.0%
2,Facebook Pixel,0.0,0.2,0.0,0.0,0.0,0.0,0.0,1.0,3.0%,0.0%
3,Google Analytics,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.7%,0.0%
4,Keylogging,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,1.8%,0.0%
5,Session Recording,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,1.5%,0.0%
6,Canvas Fingerprinting,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,1.6%,0.0%


In [7]:
!cat ../tables/bl_summ_us_domains.tex

Ad Trackers & 1.7 & 2.8 & 0.0 & 0.0 & 1.0 & 2.0 & 4.0 & 37.0 & 71.7\% & 38.7\% \\
Third-Party Cookies & 1.2 & 4.5 & 0.0 & 0.0 & 0.0 & 1.0 & 4.0 & 86.0 & 26.0\% & 15.0\% \\
Facebook Pixel & 0.0 & 0.2 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 1.0 & 3.0\% & 0.0\% \\
Google Analytics & 0.0 & 0.1 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 1.0 & 0.7\% & 0.0\% \\
Keylogging & 0.0 & 0.1 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 1.0 & 1.8\% & 0.0\% \\
Session Recording & 0.0 & 0.1 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 1.0 & 1.5\% & 0.0\% \\
Canvas Fingerprinting & 0.0 & 0.1 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 1.0 & 1.6\% & 0.0\% \\

## IN

In [8]:
df_in = (
    pd.read_csv(FP_IN_DOMAINS)
    .rename_column("url", "domain")
    .dropna(subset=["domain"])
    .drop_duplicates("domain")
)
df_in.head(3)

Unnamed: 0,sector,name,domain
0,Agriculture & Cooperation,"State Wetland Authority, Haryana",https://swa.haryana.gov.in/
1,Agriculture & Cooperation,Krishi Vigyan Kendra Knowledge Network,https://kvk.icar.gov.in/
2,Agriculture & Cooperation,Bihar State Election Authority (BSEA),https://bsea.bihar.gov.in/


In [9]:
df_in[df_in.duplicated("domain", keep=False)].sort_values("domain")

Unnamed: 0,sector,name,domain


In [10]:
df_in_bl = (
    pd.DataFrame(process_bl_json_files(FP_IN_BL))
    .merge(df_in, on="domain", how="left", validate="1:1")
    .assign(
        **{f"{col}_al1": (lambda df, k=col: df[k] >= 1) for col in bl_measures},
        **{f"{col}_al2": (lambda df, k=col: df[k] >= 2) for col in bl_measures},
    )
)

df_in_bl

Error processing yoga.ayush.gov.in.json: list index out of range
Error processing wbkanyashree.gov.in.json: list index out of range
Error processing rural.assam.gov.in.json: list index out of range
Error processing www.odishaone.gov.in.json: list index out of range
Error processing fcsca.assam.gov.in.json: list index out of range
Error processing rdsdemaharashtra.dgt.gov.in.json: list index out of range
Error processing www.ocac.in.json: list index out of range
Error processing nhm.utl.gov.in.json: list index out of range
Error processing nehrutrophy.nic.in.json: list index out of range
Error processing www.hyderabadpolice.gov.in.json: list index out of range
Error processing cidwestbengal.gov.in.json: list index out of range
Error processing bcasindia.gov.in.json: list index out of range
Error processing www.nmdfc.org.json: list index out of range
Error processing geoportal.natmo.gov.in.json: list index out of range
Error processing asacs.assam.gov.in.json: list index out of range
Err

Error processing nests.tribal.gov.in.json: list index out of range
Error processing cpri.in.json: list index out of range
Error processing usof.gov.in.json: list index out of range
Error processing svnirtar.nic.in.json: list index out of range
Error processing dlrar.assam.gov.in.json: list index out of range
Error processing asbtc.assam.gov.in.json: list index out of range
Error processing www.plrs.org.in.json: list index out of range
Error processing vscrap.parivahan.gov.in_vehiclescrap_vahan_welcome.xhtml.json: list index out of range
Error processing www.nwapune.gov.in.json: list index out of range
Error processing handlooms.nic.in.json: list index out of range
Error processing nbcfdc.gov.in.json: list index out of range
Error processing fda.assam.gov.in.json: list index out of range
Error processing ehrms.meghalaya.gov.in.json: list index out of range


Unnamed: 0,domain,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics,sector,name,...,key_logging_al1,session_recording_al1,canvas_fingerprinting_al1,ddg_join_ads_al2,third_party_cookies_al2,fb_pixel_al2,google_analytics_al2,key_logging_al2,session_recording_al2,canvas_fingerprinting_al2
0,brns.res.in,0,0,0,0,0,0,0,,,...,False,False,False,False,False,False,False,False,False,False
1,ayodhya.cantt.gov.in,0,0,0,0,0,0,0,,,...,False,False,False,False,False,False,False,False,False,False
2,ociservices.gov.in,0,0,0,0,0,0,0,,,...,False,False,False,False,False,False,False,False,False,False
3,pngrb.gov.in,0,0,0,0,0,0,0,,,...,False,False,False,False,False,False,False,False,False,False
4,www.rites.com,3,2,0,0,0,0,0,,,...,False,False,False,True,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2470,bhavishya.nic.in,0,0,0,0,0,0,0,,,...,False,False,False,False,False,False,False,False,False,False
2471,vizagcustoms.gov.in,0,0,0,0,0,0,0,,,...,False,False,False,False,False,False,False,False,False,False
2472,powerfoundation.org.in,1,0,0,0,0,0,0,,,...,False,False,False,False,False,False,False,False,False,False
2473,darbhangadivision.bih.nic.in,0,0,0,0,0,0,0,,,...,False,False,False,False,False,False,False,False,False,False


In [11]:
df_in_summ = summarize_bl(df_in_bl)

pandas_to_tex(
    df_in_summ,
    "../tables/bl_summ_in_domains",
    escape=True,
)
df_in_summ

Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,0.7,2.1,0.0,0.0,0.0,1.0,3.0,42.0,28.5%,13.3%
1,Third-Party Cookies,0.5,3.5,0.0,0.0,0.0,0.0,0.0,84.0,8.3%,5.5%
2,Facebook Pixel,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.8%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.2%,0.0%
4,Keylogging,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.9%,0.0%
5,Session Recording,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0%,0.0%
6,Canvas Fingerprinting,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.8%,0.0%


## GDir

* will have missing/uncleaned data, eg [https://www.govdirectory.org/singapore/Q1687545/](https://www.govdirectory.org/singapore/Q1687545/)

In [15]:
df_gdir = (
    pd.read_csv(FP_GDIR_DOMAINS, sep="\t")
    .clean_names()
    .rename_column("website", "domain")
    .rename_column("source_file", "country")
    .dropna(subset=["domain"])
    .drop_duplicates("domain")
    # ========================================================================
    # clean domain string
    .assign(
        domain=lambda df: df["domain"]
            .str.replace(r"^https?://", "", regex=True)
            .str.rstrip("/")
    )
    .drop_duplicates("domain")
)
df_gdir.head()

Unnamed: 0,name,govdirectory_url,type,domain,country
0,Cherkasy Oblast,https://www.govdirectory.org/ukraine/Q161808/,oblast of Ukraine,www.oblradack.gov.ua,Ukraine
1,Chernihiv Oblast,https://www.govdirectory.org/ukraine/Q167874/,oblast of Ukraine,cg.gov.ua,Ukraine
2,Chernivtsi Oblast,https://www.govdirectory.org/ukraine/Q168856/,oblast of Ukraine,bukoda.gov.ua,Ukraine
3,Dnipropetrovsk Oblast,https://www.govdirectory.org/ukraine/Q170672/,oblast of Ukraine,www.adm.dp.gov.ua,Ukraine
4,Donetsk Oblast,https://www.govdirectory.org/ukraine/Q2012050/,oblast of Ukraine,dn.gov.ua,Ukraine


In [16]:
df_gdir.stb.freq(["country"], sort_cols=True)

Unnamed: 0,country,count,percent,cumulative_count,cumulative_percent
0,Austria,2174,21.008891,2174,21.008891
1,Belgium,956,9.2385,3130,30.247391
2,Bermuda,13,0.125628,3143,30.373019
3,Cameroon,28,0.270584,3171,30.643603
4,Canada,33,0.318902,3204,30.962505
5,Czech Republic,233,2.251643,3437,33.214148
6,Denmark,124,1.198299,3561,34.412447
7,East Timor,15,0.144956,3576,34.557402
8,Finland,339,3.275995,3915,37.833398
9,Germany,262,2.53189,4177,40.365288


In [17]:
df_gdir_bl = (
    pd.DataFrame(process_bl_json_files(FP_GDIR_BL))
    # clean domain string
    .assign(domain=lambda df_: df_["domain"].str.rstrip('_'))
    .merge(df_gdir, on="domain", how="inner", validate="1:1")
    .assign(
        **{f"{col}_al1": (lambda df, k=col: df[k] >= 1) for col in bl_measures},
        **{f"{col}_al2": (lambda df, k=col: df[k] >= 2) for col in bl_measures},
    )
)
df_gdir_bl

Unnamed: 0,domain,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics,name,govdirectory_url,...,key_logging_al1,session_recording_al1,canvas_fingerprinting_al1,ddg_join_ads_al2,third_party_cookies_al2,fb_pixel_al2,google_analytics_al2,key_logging_al2,session_recording_al2,canvas_fingerprinting_al2
0,www.arbetsformedlingen.se,0,1,0,0,0,0,0,Arbetsförmedlingen,https://www.govdirectory.org/sweden/Q3440237/,...,False,False,False,False,False,False,False,False,False,False
1,www.vaux-sur-sure.be,0,0,0,0,0,0,0,Vaux-sur-Sûre,https://www.govdirectory.org/belgium/Q713864/,...,False,False,False,False,False,False,False,False,False,False
2,www.la-tour-de-peilz.ch,1,0,0,0,0,0,0,La Tour-de-Peilz,https://www.govdirectory.org/switzerland/Q69721/,...,False,False,False,False,False,False,False,False,False,False
3,www.grafenegg.gv.at,0,0,0,0,0,0,0,Grafenegg,https://www.govdirectory.org/austria/Q665801/,...,False,False,False,False,False,False,False,False,False,False
4,www.adliswil.ch,0,0,0,0,0,0,0,Adliswil,https://www.govdirectory.org/switzerland/Q68210/,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3596,www.niederwil.ch,0,0,0,0,0,0,0,Niederwil,https://www.govdirectory.org/switzerland/Q65660/,...,False,False,False,False,False,False,False,False,False,False
3597,www.vogar.is,1,1,0,0,0,0,0,Vogar,https://www.govdirectory.org/iceland/Q3482077/,...,False,False,False,False,False,False,False,False,False,False
3598,www.communeduchenit.ch,0,0,0,0,0,0,0,Le Chenit,https://www.govdirectory.org/switzerland/Q52743/,...,False,False,False,False,False,False,False,False,False,False
3599,www.grindelwald.com,1,0,1,0,0,0,0,Grindelwald,https://www.govdirectory.org/switzerland/Q64004/,...,False,False,True,False,False,False,False,False,False,False


In [18]:
df_gdir_bl.stb.freq(["country"], sort_cols=True)

Unnamed: 0,country,count,percent,cumulative_count,cumulative_percent
0,Austria,387,10.747015,387,10.747015
1,Belgium,444,12.329908,831,23.076923
2,Bermuda,2,0.05554,833,23.132463
3,Cameroon,14,0.388781,847,23.521244
4,East Timor,6,0.16662,853,23.687864
5,Ghana,51,1.416273,904,25.104138
6,Iceland,58,1.610664,962,26.714801
7,Latvia,18,0.499861,980,27.214663
8,Malaysia,29,0.805332,1009,28.019994
9,Nepal,88,2.443766,1097,30.46376


In [19]:
df_gdir_summ = summarize_bl(df_gdir_bl)
pandas_to_tex(
    df_gdir_summ,
    "../tables/bl_summ_gdir_domains",
    escape=True,
)
df_gdir_summ

Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,0.4,0.8,0.0,0.0,0.0,0.0,1.0,8.0,23.2%,6.4%
1,Third-Party Cookies,0.2,1.0,0.0,0.0,0.0,0.0,0.0,15.0,8.0%,2.9%
2,Facebook Pixel,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.7%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1%,0.0%
4,Keylogging,0.1,0.3,0.0,0.0,0.0,0.0,0.0,1.0,9.1%,0.0%
5,Session Recording,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.5%,0.0%
6,Canvas Fingerprinting,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.4%,0.0%


In [25]:
for ix, ctry in enumerate(list(df_gdir_bl["country"].unique())):
    _df_ctry = df_gdir.query(f"country=='{ctry}'")
    print(f"{1+ix}: {ctry} (n = {len(_df_ctry)})")
    
    _df_ctry_bl = (
        pd.DataFrame(process_bl_json_files(FP_GDIR_BL))
        # clean domain string
        .assign(domain=lambda df_: df_["domain"].str.rstrip('_'))
        .merge(_df_ctry, on="domain", how="right", validate="1:1")
        .assign(
            **{f"{col}_al1": (lambda df, k=col: df[k] >= 1) for col in bl_measures},
            **{f"{col}_al2": (lambda df, k=col: df[k] >= 2) for col in bl_measures},
        )
    )
    _df_summ = summarize_bl(_df_ctry_bl)
    pandas_to_tex(
        _df_summ,
        f"../tables/bl_summ_{ctry}_domains",
        escape=True,
    )
    display(_df_summ)

1: Sweden (n = 748)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,0.2,0.7,0.0,0.0,0.0,0.0,1.0,7.0,9.0%,2.7%
1,Third-Party Cookies,0.2,1.0,0.0,0.0,0.0,0.0,0.0,15.0,6.4%,1.7%
2,Facebook Pixel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
4,Keylogging,0.0,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.7%,0.0%
5,Session Recording,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.4%,0.0%
6,Canvas Fingerprinting,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.4%,0.0%


2: Belgium (n = 956)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,0.4,0.9,0.0,0.0,0.0,1.0,1.0,7.0,12.3%,2.9%
1,Third-Party Cookies,0.2,1.1,0.0,0.0,0.0,0.0,0.0,12.0,2.4%,1.3%
2,Facebook Pixel,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.9%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
4,Keylogging,0.0,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.4%,0.0%
5,Session Recording,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.6%,0.0%
6,Canvas Fingerprinting,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.4%,0.0%


3: Switzerland (n = 2149)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,0.3,0.7,0.0,0.0,0.0,0.0,1.0,8.0,21.3%,4.2%
1,Third-Party Cookies,0.2,0.9,0.0,0.0,0.0,0.0,0.0,11.0,6.9%,2.2%
2,Facebook Pixel,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.4%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1%,0.0%
4,Keylogging,0.1,0.4,0.0,0.0,0.0,0.0,1.0,1.0,13.7%,0.0%
5,Session Recording,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.2%,0.0%
6,Canvas Fingerprinting,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.2%,0.0%


4: Austria (n = 2174)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,0.2,0.5,0.0,0.0,0.0,0.0,0.0,4.0,1.7%,0.7%
1,Third-Party Cookies,0.2,1.0,0.0,0.0,0.0,0.0,0.0,14.0,1.1%,0.6%
2,Facebook Pixel,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.1%,0.0%
3,Google Analytics,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.0%,0.0%
4,Keylogging,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,0.1%,0.0%
5,Session Recording,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
6,Canvas Fingerprinting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%


5: Nepal (n = 97)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,1.7,1.5,0.0,0.0,1.0,3.0,4.0,4.0,62.9%,42.3%
1,Third-Party Cookies,0.5,1.9,0.0,0.0,0.0,0.0,0.3,8.0,9.3%,6.2%
2,Facebook Pixel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
4,Keylogging,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
5,Session Recording,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
6,Canvas Fingerprinting,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,1.0%,0.0%


6: Iceland (n = 81)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,1.0,1.4,0.0,0.0,1.0,1.0,3.0,5.0,38.3%,14.8%
1,Third-Party Cookies,0.4,1.1,0.0,0.0,0.0,0.0,1.0,5.0,14.8%,3.7%
2,Facebook Pixel,0.1,0.3,0.0,0.0,0.0,0.0,0.0,1.0,6.2%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
4,Keylogging,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
5,Session Recording,0.0,0.1,0.0,0.0,0.0,0.0,0.0,1.0,1.2%,0.0%
6,Canvas Fingerprinting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%


7: Cameroon (n = 28)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,0.2,0.4,0.0,0.0,0.0,0.0,1.0,1.0,10.7%,0.0%
1,Third-Party Cookies,0.3,1.1,0.0,0.0,0.0,0.0,0.0,4.0,3.6%,3.6%
2,Facebook Pixel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
4,Keylogging,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
5,Session Recording,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
6,Canvas Fingerprinting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%


8: Ghana (n = 60)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,0.7,1.4,0.0,0.0,0.0,1.0,3.0,6.0,23.3%,13.3%
1,Third-Party Cookies,0.3,1.6,0.0,0.0,0.0,0.0,0.0,11.0,6.7%,5.0%
2,Facebook Pixel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
4,Keylogging,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
5,Session Recording,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
6,Canvas Fingerprinting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%


9: Ukraine (n = 43)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,1.5,1.5,0.0,1.0,1.0,1.8,4.0,5.0,46.5%,16.3%
1,Third-Party Cookies,0.5,1.4,0.0,0.0,0.0,0.0,1.0,5.0,9.3%,4.7%
2,Facebook Pixel,0.0,0.2,0.0,0.0,0.0,0.0,0.0,1.0,2.3%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
4,Keylogging,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
5,Session Recording,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
6,Canvas Fingerprinting,0.0,0.2,0.0,0.0,0.0,0.0,0.0,1.0,2.3%,0.0%


10: Malaysia (n = 55)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,0.9,1.1,0.0,0.0,1.0,1.0,2.2,4.0,30.9%,7.3%
1,Third-Party Cookies,0.7,1.8,0.0,0.0,0.0,0.0,2.2,8.0,10.9%,9.1%
2,Facebook Pixel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
4,Keylogging,0.0,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.8%,0.0%
5,Session Recording,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
6,Canvas Fingerprinting,0.0,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.8%,0.0%


11: Latvia (n = 65)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,0.3,0.7,0.0,0.0,0.0,0.0,1.3,2.0,6.2%,3.1%
1,Third-Party Cookies,0.3,0.5,0.0,0.0,0.0,1.0,1.0,1.0,9.2%,0.0%
2,Facebook Pixel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
4,Keylogging,0.1,0.3,0.0,0.0,0.0,0.0,0.3,1.0,3.1%,0.0%
5,Session Recording,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
6,Canvas Fingerprinting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%


12: Russia (n = 21)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,0.4,0.5,0.0,0.0,0.0,1.0,1.0,1.0,19.0%,0.0%
1,Third-Party Cookies,0.7,1.3,0.0,0.0,0.0,1.0,1.6,4.0,14.3%,4.8%
2,Facebook Pixel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
4,Keylogging,0.1,0.3,0.0,0.0,0.0,0.0,0.2,1.0,4.8%,0.0%
5,Session Recording,0.4,0.5,0.0,0.0,0.0,1.0,1.0,1.0,19.0%,0.0%
6,Canvas Fingerprinting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%


13: East Timor (n = 15)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
1,Third-Party Cookies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
2,Facebook Pixel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
4,Keylogging,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
5,Session Recording,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
6,Canvas Fingerprinting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%


14: Bermuda (n = 13)


Unnamed: 0,var,mean,std,min,25%,50%,75%,90%,max,atleast1,atleast2
0,Ad Trackers,3.0,1.4,2.0,2.5,3.0,3.5,3.8,4.0,15.4%,15.4%
1,Third-Party Cookies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
2,Facebook Pixel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
3,Google Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
4,Keylogging,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
5,Session Recording,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%
6,Canvas Fingerprinting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0%,0.0%


## CC

In [None]:
FP_CC_DOMAINS = "../data/common_crawl_sample.csv"
FP_CC_BL = "../data/cc_blacklight_json"


In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
df_cc = (
    pd.read_csv("../data/common_crawl_sample.csv")
    .rename_column("url", "domain")
)
df_cc.head(50)

In [None]:
df_cc.head(50)["domain"].tolist()

In [None]:
for x in df_cc["domain"].unique():
    if "http://gov.sr" in x:
        print(x)

In [None]:
df_cc[df_cc.duplicated("domain", keep=False)].sort_values("domain")

In [None]:
df_cc_bl = (
    pd.DataFrame(process_bl_json_files(FP_CC_BL))
#     .merge(df_cc, on="domain", how="left", validate="1:1")
#     .assign(
#         **{f"{col}_al1": (lambda df, k=col: df[k] >= 1) for col cc bl_measures},
#         **{f"{col}_al2": (lambda df, k=col: df[k] >= 2) for col cc bl_measures},
#     )
)

df_cc_bl