### Combine Blacklight

In [1]:
import json
import os
import pandas as pd
import zipfile

In [2]:
input_folder = "../data/blacklight_json"

# Initialize a list to store rows for the DataFrame
rows = []

# Iterate over each JSON file in the folder
for filename in os.listdir(input_folder):
    if filename.endswith(".json"):
        file_path = os.path.join(input_folder, filename)
        
        try:
            with open(file_path, 'r') as f:
                data = json.load(f)
            
            domain_name = filename.replace(".json", "")
            cards = data.get("groups", [])[0].get("cards", [])
            
            ddg_join_ads = 0
            third_party_cookies = 0
            canvas_fingerprinting = 0
            session_recording = 0
            key_logging = 0
            fb_pixel = 0
            google_analytics = 0

            for card in cards:
                card_type = card.get("cardType", "")
                big_number = card.get("bigNumber", 0)

                if card_type == "ddg_join_ads":
                    ddg_join_ads = big_number
                elif card_type == "cookies":
                    third_party_cookies = big_number
                elif card_type == "canvas_fingerprinters":
                    canvas_fingerprinting = 1 if card.get("testEventsFound", False) else 0
                elif card_type == "session_recorders":
                    session_recording = 1 if card.get("testEventsFound", False) else 0
                elif card_type == "key_logging":
                    key_logging = 1 if card.get("testEventsFound", False) else 0
                elif card_type == "fb_pixel_events":
                    fb_pixel = 1 if card.get("testEventsFound", False) else 0
                elif card_type == "ga":
                    google_analytics = 1 if card.get("testEventsFound", False) else 0
            
            rows.append({
                "filename": domain_name,
                "ddg_join_ads": ddg_join_ads,
                "third_party_cookies": third_party_cookies,
                "canvas_fingerprinting": canvas_fingerprinting,
                "session_recording": session_recording,
                "key_logging": key_logging,
                "fb_pixel": fb_pixel,
                "google_analytics": google_analytics,
            })
        
        except Exception as e:
            print(f"Error processing {filename}: {e}")

In [3]:
df = pd.DataFrame(rows)
df.head()

Unnamed: 0,filename,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics
0,costarmanager_com,5,10,0,1,0,0,0
1,teasource_com,11,11,0,0,0,1,1
2,1800tequila_com,8,6,0,0,0,0,0
3,mazon_com,1,0,0,0,0,0,0
4,theancestorhunt_com,2,0,0,0,0,0,0


In [4]:
df.shape

(34078, 8)

In [5]:
df.describe()

Unnamed: 0,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics
count,34078.0,34078.0,34078.0,34078.0,34078.0,34078.0,34078.0
mean,6.904953,9.59936,0.061858,0.089295,0.036827,0.209901,0.042667
std,13.156795,24.257182,0.240901,0.285173,0.18834,0.407244,0.202108
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,6.0,7.0,0.0,0.0,0.0,0.0,0.0
max,93.0,256.0,1.0,1.0,1.0,1.0,1.0


In [16]:
websites = pd.read_csv("../data/yg_ind_domain.csv")[["private_domain"]].drop_duplicates()
websites.rename(columns={"private_domain": "domain_name"}, inplace=True)

In [None]:
merged_table = websites.merge(
    df,
    left_on='domain_name',
    right_on='filename', 
    how='left'
)

merged_table.fillna('NA', inplace=True)

In [15]:
merged_table.to_csv("../data/blacklight_domain.csv", index = False)