In [3]:
import os
import json
import pandas as pd

# Folder that holds the tracker JSON files
input_folder = "./website_trackers"

# Read the domain list, keep only the domain column, drop repeated rows and
# rename the column so it can serve directly as the join key "domain_name"
websites = (
    pd.read_csv("yg_ind_domain.csv")
    .loc[:, ["private_domain"]]
    .drop_duplicates()
    .rename(columns={"private_domain": "domain_name"})
)

# Accumulator for the per-file records (one dict per processed JSON file)
processed_data = list()

# Turn every "<domain>_data.json" report in the input folder into one flat
# record: the four summary statistics plus the number of trackers per category.
#
# File names are sorted because os.listdir returns entries in arbitrary order;
# sorting makes the record order (and therefore the order in which category
# columns first appear in the resulting DataFrame) reproducible.
report_suffix = "_data.json"
for file_name in sorted(os.listdir(input_folder)):
    if not file_name.endswith(".json"):
        continue
    if not file_name.endswith(report_suffix):
        # The domain is encoded in the part before "_data.json"; without that
        # suffix the name would end up as e.g. "foo.json" and never match a domain.
        print(f"Warning: {file_name} does not follow the '<domain>_data.json' naming. Skipping.")
        continue

    # Strip only the trailing suffix, then map the "_" separators back to dots
    domain_name = file_name[: -len(report_suffix)].replace("_", ".")

    # Load the JSON file
    file_path = os.path.join(input_folder, file_name)
    try:
        # Binary mode lets the json module detect the encoding (UTF-8/16/32,
        # including a UTF-8 BOM) instead of relying on the platform default.
        with open(file_path, "rb") as f:
            data = json.load(f)

        # Check if the JSON file is empty
        if not data:
            print(f"Warning: JSON file {file_name} is empty. Skipping.")
            continue

        # Extract statistics ("or {}" also covers an explicit null value)
        statistics = data.get("statistics") or {}
        trackers_per_page_load = statistics.get("Trackers Per Page Load")
        tracking_requests_per_page_load = statistics.get("Tracking Requests Per Page Load")
        trackers_requests_all_requests = statistics.get("Trackers Requests / All Requests")
        data_saved = statistics.get("Data Saved")

        # Count number of trackers per category; a missing or null category is
        # grouped under "Unknown" so it never becomes a column named None
        trackers = data.get("trackers") or []
        category_counts = {}
        for tracker in trackers:
            category = tracker.get("category") or "Unknown"
            category_counts[category] = category_counts.get(category, 0) + 1

        # Add the processed data (category counts become one key per category)
        processed_data.append({
            "domain_name": domain_name,
            "Trackers Per Page Load": trackers_per_page_load,
            "Tracking Requests Per Page Load": tracking_requests_per_page_load,
            "Trackers Requests / All Requests": trackers_requests_all_requests,
            "Data Saved": data_saved,
            **category_counts
        })
    except json.JSONDecodeError:
        print(f"Error: JSON file {file_name} is invalid or corrupted. Skipping.")
    except Exception as e:
        # Best effort: report the problem and carry on with the next file
        print(f"Unexpected error while processing {file_name}: {e}")

# Create a DataFrame from the processed data (one row per parsed JSON file)
processed_df = pd.DataFrame(processed_data)
if "domain_name" not in processed_df.columns:
    # Nothing was parsed (empty folder or every file skipped): the frame has no
    # columns at all and the merge below would raise KeyError on the join key.
    # Fall back to an empty frame that still carries the expected columns.
    processed_df = pd.DataFrame(
        columns=[
            "domain_name",
            "Trackers Per Page Load",
            "Tracking Requests Per Page Load",
            "Trackers Requests / All Requests",
            "Data Saved",
        ]
    )

# Left join keeps every website, including those without tracker data, and the
# gaps are then filled with the literal string "NA".
# NOTE(review): filling with a string turns any numeric column that had gaps
# into a mixed object column; kept as is because later cells read final_df
# with the "NA" markers in place.
final_df = websites.merge(processed_df, on="domain_name", how="left").fillna("NA")

# Output the final DataFrame
print(final_df)

# Save the final DataFrame to a CSV file (optional)
final_df.to_csv("final_website_data.csv", index=False)

              domain_name Trackers Per Page Load  \
0              10best.com                     NA   
1         1800petmeds.com                     NA   
2              2uf4ta.net                     NA   
3                  3m.com                     NA   
4                 8x8.com                     NA   
...                   ...                    ...   
64069  together2night.com                     NA   
64070      webbilling.com                     NA   
64071       wildmeets.com                     NA   
64072           xnxx2.org                     NA   
64073       xxx-free.info                     NA   

      Tracking Requests Per Page Load Trackers Requests / All Requests  \
0                                  NA                               NA   
1                                  NA                               NA   
2                                  NA                               NA   
3                                  NA                               NA   
4    

In [12]:
# How often each distinct value occurs in the "Site Analytics" column of final_df
site_analytics = final_df["Site Analytics"]
site_analytics.value_counts()

NA      63642
1.0        71
2.0        55
4.0        40
5.0        40
3.0        37
8.0        35
9.0        32
6.0        27
7.0        27
10.0       23
11.0       17
12.0        8
13.0        7
15.0        4
14.0        3
19.0        2
18.0        1
16.0        1
17.0        1
20.0        1
Name: Site Analytics, dtype: int64