In [1]:
import os
import json
import pandas as pd

# Build one row of tracker statistics per website from the JSON files in
# `input_folder`, left-join those rows onto the reference domain list, and
# write the combined table to final_website_whotracksme.csv.
input_folder = "./website_trackers"

# Keys copied as-is from each file's "statistics" object into the output row.
statistic_keys = [
    "Trackers Per Page Load",
    "Tracking Requests Per Page Load",
    "Trackers Requests / All Requests",
    "Data Saved",
]

# Reference list of domains: unique values of `private_domain`, renamed so
# the column matches the join key used below.
websites = (
    pd.read_csv("yg_ind_domain.csv")[["private_domain"]]
    .drop_duplicates()
    .rename(columns={"private_domain": "domain_name"})
)

processed_data = []

# sorted() fixes the processing order; os.listdir() returns entries in
# arbitrary order, which would otherwise make the order of the category
# columns (first-seen order) differ between runs.
for file_name in sorted(os.listdir(input_folder)):
    if not file_name.endswith(".json"):
        continue

    # Recover the domain from the file name: drop "_data.json" and turn
    # underscores into dots (assumes names like example_com_data.json --
    # TODO confirm against whatever wrote these files).
    domain_name = file_name.replace("_data.json", "").replace("_", ".")
    file_path = os.path.join(input_folder, file_name)

    try:
        # Explicit UTF-8: without it the decoding depends on the platform's
        # locale and non-ASCII content can fail to load on some systems.
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        if not data:
            print(f"Warning: JSON file {file_name} is empty. Skipping.")
            continue

        statistics = data.get("statistics", {})

        # Count how many tracker entries fall into each category; entries
        # without a "category" key are counted under "Unknown".
        trackers = data.get("trackers", [])
        category_counts = {}
        for tracker in trackers:
            category = tracker.get("category", "Unknown")
            category_counts[category] = category_counts.get(category, 0) + 1

        processed_data.append({
            "domain_name": domain_name,
            **{key: statistics.get(key) for key in statistic_keys},
            **category_counts,
        })
    except json.JSONDecodeError:
        print(f"Error: JSON file {file_name} is invalid or corrupted. Skipping.")
    except Exception as e:
        # Deliberate best-effort: report the problem and move on to the next file.
        print(f"Unexpected error while processing {file_name}: {e}")

processed_df = pd.DataFrame(processed_data)
if "domain_name" not in processed_df.columns:
    # Nothing was processed (no usable JSON files). Give the frame the join
    # key and statistic columns so the merge below yields an all-missing
    # table instead of raising KeyError.
    processed_df = pd.DataFrame(columns=["domain_name", *statistic_keys])

# Left join keeps every reference domain; domains without a JSON file (and
# categories absent for a domain) end up as the string "NA".
final_df = websites.merge(processed_df, on="domain_name", how="left").fillna("NA")

final_df.to_csv("final_website_whotracksme.csv", index=False)

In [2]:
# Convert "Trackers Requests / All Requests" to float: strip the "%" sign
# and map the missing-value markers ("NA", or "nan" produced by astype(str))
# to a real NaN.
# The replacement value is an explicit float NaN rather than None: pandas
# releases before 1.4 ignore an explicit None in Series.replace and
# forward-fill the matched entries instead, which would silently give
# missing rows the previous row's value.
final_df["Trackers Requests / All Requests"] = (
    final_df["Trackers Requests / All Requests"]
    .astype(str)
    .str.replace("%", "", regex=False)
    .replace(["NA", "nan"], float("nan"))
    .astype(float)
)

In [3]:
# Convert "Data Saved" to float: strip the "MB" unit suffix and map the
# missing-value markers ("NA", or "nan" produced by astype(str)) to a real
# NaN. Any value with a different unit will make astype(float) raise.
# The replacement value is an explicit float NaN rather than None: pandas
# releases before 1.4 ignore an explicit None in Series.replace and
# forward-fill the matched entries instead, which would silently give
# missing rows the previous row's value.
final_df["Data Saved"] = (
    final_df["Data Saved"]
    .astype(str)
    .str.replace("MB", "", regex=False)
    .replace(["NA", "nan"], float("nan"))
    .astype(float)
)

In [4]:
def _numeric_unless_domain(column):
    """Return `column` parsed as numbers, leaving the domain_name column untouched.

    Values that cannot be parsed become NaN (errors="coerce").
    """
    if column.name == "domain_name":
        return column
    return pd.to_numeric(column, errors="coerce")


# Applied column by column; the result replaces final_df.
final_df = final_df.apply(_numeric_unless_domain)

In [5]:
# Descriptive statistics for final_df, rounded to one decimal place for
# display as the cell's output.
summary_stats = final_df.describe()
summary_stats.round(1)

Unnamed: 0,Trackers Per Page Load,Tracking Requests Per Page Load,Trackers Requests / All Requests,Data Saved,Advertising,Audio/Video Player,Customer Interaction,Hosting,Consent Management,Site Analytics,Misc,Utilities,Social Media,Adult Advertising
count,4261.0,4261.0,4261.0,4261.0,4174.0,2588.0,2934.0,4082.0,1760.0,3970.0,2274.0,2091.0,2125.0,223.0
mean,7.1,7.7,10.1,11.7,16.7,1.4,2.1,6.2,1.3,5.0,2.2,2.1,1.9,1.9
std,4.5,14.7,8.5,59.7,19.2,0.7,1.3,2.9,0.6,3.5,2.2,1.2,1.0,1.0
min,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,3.8,1.3,3.9,0.4,4.0,1.0,1.0,4.0,1.0,2.0,1.0,1.0,1.0,1.0
50%,5.9,3.6,8.0,0.9,8.0,1.0,2.0,6.0,1.0,4.0,1.0,2.0,2.0,2.0
75%,9.3,8.6,13.8,2.8,23.0,2.0,3.0,8.0,2.0,7.0,2.0,3.0,2.0,2.0
max,31.8,354.0,80.4,1042.8,125.0,5.0,10.0,17.0,5.0,21.0,18.0,7.0,8.0,5.0
