In [2]:
import json
import os
import pandas as pd
import zipfile

In [4]:
input_folder = "privacy_scraper/blacklight_json"

# Initialize a list to store rows for the DataFrame
rows = []

# Iterate over each JSON file in the folder
for filename in os.listdir(input_folder):
    if filename.endswith(".json"):
        file_path = os.path.join(input_folder, filename)
        
        try:
            with open(file_path, 'r') as f:
                data = json.load(f)
            
            domain_name = filename.replace(".json", "")
            cards = data.get("groups", [])[0].get("cards", [])
            
            ddg_join_ads = 0
            third_party_cookies = 0
            canvas_fingerprinting = 0
            session_recording = 0
            key_logging = 0
            fb_pixel = 0
            google_analytics = 0

            for card in cards:
                card_type = card.get("cardType", "")
                big_number = card.get("bigNumber", 0)

                if card_type == "ddg_join_ads":
                    ddg_join_ads = big_number
                elif card_type == "cookies":
                    third_party_cookies = big_number
                elif card_type == "canvas_fingerprinters":
                    canvas_fingerprinting = 1 if card.get("testEventsFound", False) else 0
                elif card_type == "session_recorders":
                    session_recording = 1 if card.get("testEventsFound", False) else 0
                elif card_type == "key_logging":
                    key_logging = 1 if card.get("testEventsFound", False) else 0
                elif card_type == "fb_pixel_events":
                    fb_pixel = 1 if card.get("testEventsFound", False) else 0
                elif card_type == "ga":
                    google_analytics = 1 if card.get("testEventsFound", False) else 0
            
            rows.append({
                "filename": domain_name,
                "ddg_join_ads": ddg_join_ads,
                "third_party_cookies": third_party_cookies,
                "canvas_fingerprinting": canvas_fingerprinting,
                "session_recording": session_recording,
                "key_logging": key_logging,
                "fb_pixel": fb_pixel,
                "google_analytics": google_analytics,
            })
        
        except Exception as e:
            print(f"Error processing {filename}: {e}")

In [6]:
df = pd.DataFrame(rows)
df.head()

Unnamed: 0,filename,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics
0,msp.gov.ua_,1,0,0,0,0,0,0
1,www.zoda.gov.ua_,1,0,0,0,0,0,0
2,mkip.gov.ua_,4,1,0,0,0,0,0
3,rada.crimea.ua_,1,0,0,0,0,0,0
4,www.rv.gov.ua_,1,0,0,0,0,0,0


In [8]:
df.describe()

Unnamed: 0,ddg_join_ads,third_party_cookies,canvas_fingerprinting,session_recording,key_logging,fb_pixel,google_analytics
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,1.44,0.48,0.04,0.0,0.0,0.04,0.0
std,1.502221,1.388044,0.2,0.0,0.0,0.2,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5.0,5.0,1.0,0.0,0.0,1.0,0.0


In [7]:
df.to_csv("../data/blacklight_domain.csv", index = False)