In [1]:
import json
import os
import pandas as pd
import zipfile

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [3]:
input_folder = "privacy_scraper/blacklight_json"

In [4]:
def process_json_files(input_folder: str) -> list:
    """
    Process JSON files to extract tracking metrics from DuckDuckGo privacy data.
    
    Args:
        input_folder (str): Path to folder containing JSON files
        
    Returns:
        list: List of dictionaries containing extracted metrics
    """
    rows = []
    
    for filename in os.listdir(input_folder):
        if not filename.endswith(".json"):
            continue
            
        file_path = os.path.join(input_folder, filename)
        
        try:
            with open(file_path, 'r') as f:
                data = json.load(f)
            
            domain_name = filename.replace(".json", "")
            cards = data.get("groups", [])[0].get("cards", [])
            
            metrics = {
                "filename": domain_name,
                "ddg_join_ads": 0,
                "third_party_cookies": 0,
                "canvas_fingerprinting": 0,
                "session_recording": 0,
                "key_logging": 0,
                "fb_pixel": 0,
                "google_analytics": 0
            }
            
            for card in cards:
                card_type = card.get("cardType", "")
                if card_type == "ddg_join_ads":
                    metrics["ddg_join_ads"] = card.get("bigNumber", 0)
                elif card_type == "cookies":
                    metrics["third_party_cookies"] = card.get("bigNumber", 0)
                elif card_type in ["canvas_fingerprinters", "session_recorders", 
                                 "key_logging", "fb_pixel_events"]:
                    metric_key = {
                        "canvas_fingerprinters": "canvas_fingerprinting",
                        "session_recorders": "session_recording",
                        "key_logging": "key_logging",
                        "fb_pixel_events": "fb_pixel"
                    }[card_type]
                    metrics[metric_key] = 1 if card.get("testEventsFound", False) else 0
                elif card_type == "ga":
                    metrics["google_analytics"] = 1 if card.get("testEventsFound", False) else 0
            
            rows.append(metrics)
        
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            
    return rows

In [5]:
govdir_blacklight_df = pd.DataFrame(process_json_files("../data/blacklight_json"))
govdir_blacklight_df.shape

(3877, 8)

In [6]:
govdir_blacklight_df.head()

Unnamed: 0,filename,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics
0,www.freienbach.ch_,0,0,0,0,1,0,0
1,drangsnes.is_,0,0,0,0,0,0,0
2,www.wald.zh.ch_,0,0,0,0,1,0,0
3,www.bournens.ch_,1,0,0,0,0,0,0
4,hagneck.ch_,3,0,0,0,0,0,0


In [7]:
govdir_blacklight_df.to_csv("../data/blacklight_domain.csv", index = False)

In [8]:
govdir_blacklight_df.describe()

Unnamed: 0,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics
count,3877.0,3877.0,3877.0,3877.0,3877.0,3877.0,3877.0
mean,0.388703,0.207893,0.004127,0.004643,0.084859,0.006706,0.000774
std,0.854362,0.989683,0.064117,0.067988,0.278708,0.081627,0.02781
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8.0,15.0,1.0,1.0,1.0,1.0,1.0


In [9]:
govdir = pd.read_csv("../data/combined_tsv_data.tsv", sep = "\t")
govdir.head()

Unnamed: 0,Name,Govdirectory URL,Type,Website,source_file
0,Cherkasy Oblast,https://www.govdirectory.org/ukraine/Q161808/,oblast of Ukraine,https://www.oblradack.gov.ua/,Ukraine
1,Chernihiv Oblast,https://www.govdirectory.org/ukraine/Q167874/,oblast of Ukraine,https://cg.gov.ua/,Ukraine
2,Chernivtsi Oblast,https://www.govdirectory.org/ukraine/Q168856/,oblast of Ukraine,https://bukoda.gov.ua/,Ukraine
3,Dnipropetrovsk Oblast,https://www.govdirectory.org/ukraine/Q170672/,oblast of Ukraine,http://www.adm.dp.gov.ua/,Ukraine
4,Donetsk Oblast,https://www.govdirectory.org/ukraine/Q2012050/,oblast of Ukraine,https://dn.gov.ua/,Ukraine


In [10]:
# Keys
govdir_blacklight_df['filename'] = govdir_blacklight_df['filename'].str.rstrip('_')
govdir['Website'] = govdir['Website'].str.replace(r'^https?://', '', regex=True)
govdir['Website'] = govdir['Website'].str.rstrip('/')

In [11]:
fin_df = govdir.merge(govdir_blacklight_df, 
            left_on = "Website",
            right_on = "filename",
                     how = "left")

In [12]:
fin_df.shape

(12255, 13)

In [13]:
fin_df.head()

Unnamed: 0,Name,Govdirectory URL,Type,Website,source_file,filename,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics
0,Cherkasy Oblast,https://www.govdirectory.org/ukraine/Q161808/,oblast of Ukraine,www.oblradack.gov.ua,Ukraine,www.oblradack.gov.ua,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Chernihiv Oblast,https://www.govdirectory.org/ukraine/Q167874/,oblast of Ukraine,cg.gov.ua,Ukraine,cg.gov.ua,4.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Chernivtsi Oblast,https://www.govdirectory.org/ukraine/Q168856/,oblast of Ukraine,bukoda.gov.ua,Ukraine,bukoda.gov.ua,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Dnipropetrovsk Oblast,https://www.govdirectory.org/ukraine/Q170672/,oblast of Ukraine,www.adm.dp.gov.ua,Ukraine,,,,,,,,
4,Donetsk Oblast,https://www.govdirectory.org/ukraine/Q2012050/,oblast of Ukraine,dn.gov.ua,Ukraine,dn.gov.ua,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
fin_df.groupby('source_file').describe().reset_index().reset_index().dropna()

Unnamed: 0_level_0,index,source_file,ddg_join_ads,ddg_join_ads,ddg_join_ads,ddg_join_ads,ddg_join_ads,ddg_join_ads,ddg_join_ads,ddg_join_ads,third_party_cookies,third_party_cookies,third_party_cookies,third_party_cookies,third_party_cookies,third_party_cookies,third_party_cookies,third_party_cookies,canvas_fingerprinting,canvas_fingerprinting,canvas_fingerprinting,canvas_fingerprinting,canvas_fingerprinting,canvas_fingerprinting,canvas_fingerprinting,canvas_fingerprinting,session_recording,session_recording,session_recording,session_recording,session_recording,session_recording,session_recording,session_recording,key_logging,key_logging,key_logging,key_logging,key_logging,key_logging,key_logging,key_logging,fb_pixel,fb_pixel,fb_pixel,fb_pixel,fb_pixel,fb_pixel,fb_pixel,fb_pixel,google_analytics,google_analytics,google_analytics,google_analytics,google_analytics,google_analytics,google_analytics,google_analytics
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
0,0,Austria,389.0,0.154242,0.539056,0.0,0.0,0.0,0.0,4.0,389.0,0.169666,0.980219,0.0,0.0,0.0,0.0,14.0,389.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,389.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,389.0,0.005141,0.071611,0.0,0.0,0.0,0.0,1.0,389.0,0.005141,0.071611,0.0,0.0,0.0,0.0,1.0,389.0,0.002571,0.050702,0.0,0.0,0.0,0.0,1.0
1,1,Belgium,444.0,0.423423,0.931511,0.0,0.0,0.0,1.0,7.0,444.0,0.189189,1.085621,0.0,0.0,0.0,0.0,12.0,444.0,0.009009,0.094594,0.0,0.0,0.0,0.0,1.0,444.0,0.013514,0.11559,0.0,0.0,0.0,0.0,1.0,444.0,0.029279,0.168778,0.0,0.0,0.0,0.0,1.0,444.0,0.02027,0.141082,0.0,0.0,0.0,0.0,1.0,444.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Bermuda,2.0,3.0,1.414214,2.0,2.5,3.0,3.5,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Cameroon,14.0,0.214286,0.425815,0.0,0.0,0.0,0.0,1.0,14.0,0.285714,1.069045,0.0,0.0,0.0,0.0,4.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7,East Timor,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,10,Ghana,51.0,0.686275,1.378263,0.0,0.0,0.0,1.0,6.0,51.0,0.333333,1.608312,0.0,0.0,0.0,0.0,11.0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,11,Iceland,59.0,1.016949,1.370773,0.0,0.0,1.0,1.0,5.0,59.0,0.40678,1.13135,0.0,0.0,0.0,0.0,5.0,59.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0,0.016949,0.130189,0.0,0.0,0.0,0.0,1.0,59.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0,0.084746,0.280894,0.0,0.0,0.0,0.0,1.0,59.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,15,Latvia,18.0,0.333333,0.685994,0.0,0.0,0.0,0.0,2.0,18.0,0.333333,0.485071,0.0,0.0,0.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.111111,0.323381,0.0,0.0,0.0,0.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,17,Malaysia,29.0,0.896552,1.113066,0.0,0.0,1.0,1.0,4.0,29.0,0.724138,1.810615,0.0,0.0,0.0,0.0,8.0,29.0,0.034483,0.185695,0.0,0.0,0.0,0.0,1.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0,0.034483,0.185695,0.0,0.0,0.0,0.0,1.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,20,Nepal,88.0,1.670455,1.467828,0.0,0.0,1.0,3.0,4.0,88.0,0.545455,1.923294,0.0,0.0,0.0,0.0,8.0,88.0,0.011364,0.1066,0.0,0.0,0.0,0.0,1.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
fin_df.groupby(['source_file', 'Type']).describe().reset_index().reset_index().dropna()

Unnamed: 0_level_0,index,source_file,Type,ddg_join_ads,ddg_join_ads,ddg_join_ads,ddg_join_ads,ddg_join_ads,ddg_join_ads,ddg_join_ads,ddg_join_ads,third_party_cookies,third_party_cookies,third_party_cookies,third_party_cookies,third_party_cookies,third_party_cookies,third_party_cookies,third_party_cookies,canvas_fingerprinting,canvas_fingerprinting,canvas_fingerprinting,canvas_fingerprinting,canvas_fingerprinting,canvas_fingerprinting,canvas_fingerprinting,canvas_fingerprinting,session_recording,session_recording,session_recording,session_recording,session_recording,session_recording,session_recording,session_recording,key_logging,key_logging,key_logging,key_logging,key_logging,key_logging,key_logging,key_logging,fb_pixel,fb_pixel,fb_pixel,fb_pixel,fb_pixel,fb_pixel,fb_pixel,fb_pixel,google_analytics,google_analytics,google_analytics,google_analytics,google_analytics,google_analytics,google_analytics,google_analytics
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
0,0,Austria,Bundesministerium,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Austria,district of Austria,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Austria,federal state of Austria,7.0,0.428571,1.133893,0.0,0.0,0.0,0.0,3.0,7.0,2.428571,3.154739,0.0,0.0,0.0,5.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Austria,municipality of Austria,372.0,0.153226,0.530133,0.0,0.0,0.0,0.0,4.0,372.0,0.13172,0.863369,0.0,0.0,0.0,0.0,14.0,372.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,372.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,372.0,0.005376,0.073225,0.0,0.0,0.0,0.0,1.0,372.0,0.005376,0.073225,0.0,0.0,0.0,0.0,1.0,372.0,0.002688,0.051848,0.0,0.0,0.0,0.0,1.0
4,4,Belgium,Belgian delegations,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6,Belgium,Federal Public Service,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.75,0.5,0.0,0.75,1.0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7,Belgium,Federal Scientific Institute,6.0,1.833333,2.041241,0.0,0.0,1.5,3.75,4.0,6.0,0.666667,1.632993,0.0,0.0,0.0,0.0,4.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.166667,0.408248,0.0,0.0,0.0,0.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.333333,0.516398,0.0,0.0,0.0,0.75,1.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,Belgium,Public Institution of Social Security,5.0,0.2,0.447214,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9,Belgium,Public Interest Organizations,7.0,1.571429,1.812654,0.0,0.5,1.0,2.0,5.0,7.0,2.285714,3.9036,0.0,0.0,0.0,4.0,8.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.142857,0.377964,0.0,0.0,0.0,0.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,10,Belgium,Public Planning Service,2.0,0.5,0.707107,0.0,0.25,0.5,0.75,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Let's do the same for US gov. list

In [16]:
usgov_blacklight_df = pd.DataFrame(process_json_files("../data/us_blacklight_json/"))
usgov_blacklight_df.shape

(7790, 8)

In [17]:
usgov_blacklight_df.head()

Unnamed: 0,filename,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics
0,vermontvilletownshipmi.gov,1,1,0,0,0,0,0
1,snoqualmiewa.gov,5,0,0,0,0,0,0
2,bucklinmo.gov,0,5,0,0,0,0,0
3,bentoncountywa.gov,0,0,0,0,0,0,0
4,ibab.gov,0,0,0,0,0,0,0


In [18]:
usgov_blacklight_df.describe()

Unnamed: 0,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics
count,7790.0,7790.0,7790.0,7790.0,7790.0,7790.0,7790.0
mean,1.70077,1.18896,0.015789,0.015276,0.017587,0.030039,0.006547
std,2.775863,4.455279,0.124668,0.122656,0.131452,0.170704,0.080653
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,1.0,0.0,0.0,0.0,0.0,0.0
max,37.0,86.0,1.0,1.0,1.0,1.0,1.0


### Let's do it for Indian Gov. List

In [19]:
ingov_blacklight_df = pd.DataFrame(process_json_files("../data/in_blacklight_json"))
ingov_blacklight_df.shape

Error processing nild.nic.in.json: list index out of range
Error processing icadr.telangana.gov.in.json: list index out of range
Error processing sd2.tourism.gov.in.json: list index out of range
Error processing oreat.nic.in.json: list index out of range
Error processing uiic.co.in.json: list index out of range
Error processing ttwd.assam.gov.in.json: list index out of range
Error processing nli.gov.in.json: list index out of range
Error processing handlooms.gov.in.json: list index out of range
Error processing iusstf.org.json: list index out of range
Error processing mphc.gov.in.json: list index out of range
Error processing nests.tribal.gov.in.json: list index out of range
Error processing mmscmsguy.assam.gov.in.json: list index out of range
Error processing secl-cil.in.json: list index out of range
Error processing vijayawadapost.gov.in.json: list index out of range
Error processing cybertreasury.gujarat.gov.in.json: list index out of range
Error processing missionshakti.wcd.gov.in.

(2475, 8)

In [20]:
ingov_blacklight_df.describe()

Unnamed: 0,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics
count,2475.0,2475.0,2475.0,2475.0,2475.0,2475.0,2475.0
mean,0.699394,0.459394,0.008485,0.000404,0.009293,0.008081,0.001616
std,2.053483,3.453506,0.09174,0.020101,0.09597,0.089547,0.040177
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,42.0,84.0,1.0,1.0,1.0,1.0,1.0


In [21]:
ccgov_blacklight_df = pd.DataFrame(process_json_files("../data/cc_blacklight_json"))
ccgov_blacklight_df.shape

Error processing data.gov.lt_datasets_?q=&category_id=96&tags=nelaimingas.json: list index out of range
Error processing www.centralbank.gov.ye_home_detail_1256-33402_money%20changers%20laws%20and%20regulations.json: list index out of range
Error processing budget.gouv.dj_particulier-2.json: list index out of range
Error processing solomons.gov.sb_four-4-waste-disposal-3-tonne-trucks-ready-launched-and-ready-for-action.json: list index out of range
Error processing journaldemonaco.gouv.mc_content_search?searchtext=&filter%5b%5d=attr_theme_s:%22concessions%20trentenaires%22&activefacets%5battr_theme_s:th%c3%a8mes%5d=concessions%20trentenaires&sort=score_desc&page_limit=15.json: list index out of range
Error processing www.elevage.gouv.ne_robots.txt.json: list index out of range
Error processing msmt.gov.cz_ministerstvo_prehled-realizovanych-a-ukoncenych-zadavacich-rizeni-v-roce-2007.json: list index out of range
Error processing solomons.gov.sb_construction-of-interlocking-blocks-paveme

(9057, 8)

In [22]:
ccgov_blacklight_df.describe()

Unnamed: 0,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics
count,9057.0,9057.0,9057.0,9057.0,9057.0,9057.0,9057.0
mean,0.853483,0.630783,0.010379,0.020537,0.015568,0.012145,0.004858
std,1.950259,3.892597,0.101352,0.141835,0.123804,0.10954,0.069535
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65.0,166.0,1.0,1.0,1.0,1.0,1.0


In [23]:
cc_samp = pd.read_csv("../data/common_crawl_sample.csv")
cc_samp.head()

Unnamed: 0,urlkey,timestamp,url,mime,mime-detected,status,digest,length,offset,filename,country,pattern,languages,encoding,redirect,truncated
0,"af,gov,moj)/content/files/crpd.pdf",20240714034654,https://moj.gov.af/Content/files/CRPD.pdf,text/html,text/html,404,GY5DQ2BNNLQPCCOAYY3FTL43EN6DI5LO,8518,13208486,crawl-data/CC-MAIN-2024-30/segments/1720763514548.45/crawldiagnostics/CC-MAIN-20240714032952-20240714062952-00501.warc.gz,Afghanistan,*.gov.af,,,,
1,"af,gov,mfa,islamabad)/introductory-meeting-of-acting-ambassador-h-e-sardar-ahmad-shakeeb-with-afghan-students-in-pakistan",20240723080327,https://islamabad.mfa.gov.af/introductory-meeting-of-acting-ambassador-h-e-sardar-ahmad-shakeeb-with-afghan-students-in-pakistan/,text/html,text/html,200,4VOGNJXYP5M74XLAL7FZCLYDKLQQ77XU,19902,266134915,crawl-data/CC-MAIN-2024-30/segments/1720763518029.81/warc/CC-MAIN-20240723072353-20240723102353-00285.warc.gz,Afghanistan,*.gov.af,eng,UTF-8,,
2,"af,gov,mfa)/en/category/breaking-news",20240721095153,https://mfa.gov.af/en/category/breaking-news/,text/html,text/html,301,3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ,736,13362462,crawl-data/CC-MAIN-2024-30/segments/1720763517663.24/crawldiagnostics/CC-MAIN-20240721091006-20240721121006-00649.warc.gz,Afghanistan,*.gov.af,,,https://mfa.gov.af/en/category/breaking-news,
3,"af,gov,mudh)/dr/%d8%a8%d8%b1%d9%86%d8%a7%d9%85%d9%87-%d9%85%d9%84%db%8c-%d9%85%d8%b3%da%a9%d9%86",20240719083155,https://mudh.gov.af/dr/%D8%A8%D8%B1%D9%86%D8%A7%D9%85%D9%87-%D9%85%D9%84%DB%8C-%D9%85%D8%B3%DA%A9%D9%86,text/html,text/html,200,I5ZMS44OMMVMBHOCXUYUZO45BSQCR653,9659,371362517,crawl-data/CC-MAIN-2024-30/segments/1720763514900.59/warc/CC-MAIN-20240719074314-20240719104314-00233.warc.gz,Afghanistan,*.gov.af,"fas,eng,pus",UTF-8,,
4,"af,gov,mail)/en/node/13864",20240724090703,https://mail.gov.af/en/node/13864,text/html,text/html,200,WWGQY3A4YWEINGJCFBJHGCW4ELBGXOTU,8188,322912422,crawl-data/CC-MAIN-2024-30/segments/1720763518198.93/warc/CC-MAIN-20240724075911-20240724105911-00238.warc.gz,Afghanistan,*.gov.af,"eng,fas",UTF-8,,


In [24]:
cc_samp['url']

0                                                                                                 https://moj.gov.af/Content/files/CRPD.pdf
1         https://islamabad.mfa.gov.af/introductory-meeting-of-acting-ambassador-h-e-sardar-ahmad-shakeeb-with-afghan-students-in-pakistan/
2                                                                                             https://mfa.gov.af/en/category/breaking-news/
3                                   https://mudh.gov.af/dr/%D8%A8%D8%B1%D9%86%D8%A7%D9%85%D9%87-%D9%85%D9%84%DB%8C-%D9%85%D8%B3%DA%A9%D9%86
4                                                                                                         https://mail.gov.af/en/node/13864
                                                                        ...                                                                
15678                                                             https://parlzim.gov.zw/download/senate-hansard-08-june-2021-vol-30-no-42/
15679               

In [25]:
ccgov_blacklight_df['filename']

0                                                              moh.gov.ss_organogram.php
1                                              aipchile.dgac.gob.cl_aip_vol2_seccion_aic
2       drss.gov.zw_index.php_typography_manpower-planning-and-institutional-development
3                                                                            wasa.gov.tt
4                                  calsafer.dtsc.ca.gov_cms_candidatechemical_?rid=22199
                                              ...                                       
9052             blrc.go.tz_kak-pol%d1%8czovat%d1%8csja-stakanom-cen-v-torgovle-na-rynke
9053                                  mohs.gs.gov.mn_2023-oni-09-d%d2%afgeer-sarin-medee
9054                                 esavjetovanja.gov.hr_econ_mainscreen?entityid=27787
9055                                               meccnar.gov.gm_taxonomy_term_3?page=1
9056             ovm.gov.ua_index.php_1000-zabir-krovi-na-terytorii-holosiivskoho-raionu
Name: filename, Lengt

In [26]:
def sanitize_url(url):
    """Convert URL to filename format using the exact sanitization function"""
    if pd.isna(url):
        return None
    return url.replace("http://", "").replace("https://", "").replace("/", "_")

In [32]:
cc_samp['sanitized_url'] = cc_samp['url'].apply(sanitize_url)

merged_df = pd.merge(
        cc_samp, 
        ccgov_blacklight_df,
        left_on='sanitized_url',
        right_on='filename',
        how='inner'
    )

merged_df.drop(columns=['sanitized_url', 'clean_filename'], errors='ignore')

Unnamed: 0,urlkey,timestamp,url,mime,mime-detected,status,digest,length,offset,filename_x,country,pattern,languages,encoding,redirect,truncated,filename_y,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics
0,"af,gov,mail)/en/node/13864",20240724090703,https://mail.gov.af/en/node/13864,text/html,text/html,200,WWGQY3A4YWEINGJCFBJHGCW4ELBGXOTU,8188,322912422,crawl-data/CC-MAIN-2024-30/segments/1720763518198.93/warc/CC-MAIN-20240724075911-20240724105911-00238.warc.gz,Afghanistan,*.gov.af,"eng,fas",UTF-8,,,mail.gov.af_en_node_13864,1,0,0,0,0,0,0
1,"af,gov,mail)/index.php/en/node/2244",20240724091539,https://mail.gov.af/index.php/en/node/2244,text/html,text/html,200,4XXNELNXHPXRUBXRKRBFCBLQGHCNJLAH,9461,325979599,crawl-data/CC-MAIN-2024-30/segments/1720763518198.93/warc/CC-MAIN-20240724075911-20240724105911-00169.warc.gz,Afghanistan,*.gov.af,"pus,eng",UTF-8,,,mail.gov.af_index.php_en_node_2244,0,0,0,0,0,0,0
2,"af,gov,asa)/en/pd",20240712150729,https://asa.gov.af/en/pd,text/html,text/html,200,KCNPH4I2YAPNZO5ISDHZEEDN2DVKEKHV,10777,76158298,crawl-data/CC-MAIN-2024-30/segments/1720763514404.71/warc/CC-MAIN-20240712125648-20240712155648-00591.warc.gz,Afghanistan,*.gov.af,eng,UTF-8,,,asa.gov.af_en_pd,0,0,0,0,0,0,0
3,"af,gov,aria)/?page_id=224",20240722022138,http://aria.gov.af/?page_id=224,text/html,text/html,200,LXJ2ALDM436OIWGGEHANU7J432UANUBA,12282,5256263,crawl-data/CC-MAIN-2024-30/segments/1720763517805.92/warc/CC-MAIN-20240722003438-20240722033438-00373.warc.gz,Afghanistan,*.gov.af,"eng,fas",UTF-8,,,aria.gov.af_?page_id=224,0,0,0,0,0,0,0
4,"af,gov,moci)/index.php/ps/node/2199",20240721010153,https://moci.gov.af/index.php/ps/node/2199,text/html,text/html,200,IXEXWQMOVJIK4MEARCAYCDPSVYDXRBWI,10659,368028967,crawl-data/CC-MAIN-2024-30/segments/1720763517544.48/warc/CC-MAIN-20240720235600-20240721025600-00515.warc.gz,Afghanistan,*.gov.af,"pus,fas,eng",UTF-8,,,moci.gov.af_index.php_ps_node_2199,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3884,"zw,gov,drss)/index.php/library/eligible-members-and-application-form/chiredzi-reseach-station",20240715014914,https://drss.gov.zw/index.php/library/eligible-members-and-application-form/chiredzi-reseach-station,text/html,text/html,200,U3X46NFDKM34FJT52MB47HVPZDDBWW5J,6284,190496435,crawl-data/CC-MAIN-2024-30/segments/1720763514655.27/warc/CC-MAIN-20240715010519-20240715040519-00642.warc.gz,Zimbabwe,*.gov.zw,eng,UTF-8,,,drss.gov.zw_index.php_library_eligible-members-and-application-form_chiredzi-reseach-station,0,0,0,0,0,0,0
3885,"zw,gov,zim)/index.php/en/government-documents/category/2-transitional-stabilisation-plan",20240725012414,https://www.zim.gov.zw/index.php/en/government-documents/category/2-transitional-stabilisation-plan,text/html,text/html,200,LHSMKIKTY5QVIVQ2AMFVY34TNT5T5X4Q,9253,913142272,crawl-data/CC-MAIN-2024-30/segments/1720763518532.61/warc/CC-MAIN-20240724232540-20240725022540-00156.warc.gz,Zimbabwe,*.gov.zw,eng,UTF-8,,,www.zim.gov.zw_index.php_en_government-documents_category_2-transitional-stabilisation-plan,0,0,0,0,0,0,0
3886,"zw,gov,drss)/robots.txt",20240715011004,http://drss.gov.zw/robots.txt,text/plain,text/x-robots,200,T37DBG3MLBTFRXRKFZH64477QBZFCLTA,981,78944,crawl-data/CC-MAIN-2024-30/segments/1720763514655.27/robotstxt/CC-MAIN-20240715010519-20240715040519-00714.warc.gz,Zimbabwe,*.gov.zw,,,,,drss.gov.zw_robots.txt,0,0,0,0,0,0,0
3887,"zw,gov,mohcc)/?wpdmpro=zim_sitrep_03-sep-20",20240713060217,https://www.mohcc.gov.zw/?wpdmpro=zim_sitrep_03-sep-20,text/html,text/html,200,3ZZUBNI6XG76YYW2ZB4NIXTJBACLTP3A,27844,720599294,crawl-data/CC-MAIN-2024-30/segments/1720763514490.70/warc/CC-MAIN-20240713051758-20240713081758-00022.warc.gz,Zimbabwe,*.gov.zw,eng,UTF-8,,,www.mohcc.gov.zw_?wpdmpro=zim_sitrep_03-sep-20,0,0,0,0,0,0,0


In [28]:
cols_to_avg = [
        "ddg_join_ads", "third_party_cookies", "canvas_fingerprinting",
        "session_recording", "key_logging", "fb_pixel", "google_analytics"
    ]
country_means = merged_df.groupby("country")[cols_to_avg].mean()
country_counts = merged_df.groupby("country").size().rename("count")
country_means.join(country_counts).reset_index()

Unnamed: 0,country,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics,count
0,Afghanistan,0.829268,0.000000,0.000000,0.0,0.000000,0.000000,0.0,41
1,Albania,0.461538,0.000000,0.000000,0.0,0.153846,0.000000,0.0,13
2,Algeria,1.000000,0.121951,0.000000,0.0,0.000000,0.000000,0.0,41
3,Angola,0.692308,1.076923,0.000000,0.0,0.000000,0.000000,0.0,13
4,Argentina,1.540541,0.729730,0.054054,0.0,0.000000,0.162162,0.0,37
...,...,...,...,...,...,...,...,...,...
153,Venezuela,0.611111,0.222222,0.000000,0.0,0.000000,0.000000,0.0,18
154,Vietnam,1.923077,2.769231,0.076923,0.0,0.000000,0.000000,0.0,26
155,Yemen,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,35
156,Zambia,0.500000,0.000000,0.000000,0.0,0.026316,0.000000,0.0,38
