In [2]:
import os
import json
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Output folder for JSON files
output_folder = "./website_trackers"
os.makedirs(output_folder, exist_ok=True)

# Base URL for WhoTracksMe
base_url = "https://www.ghostery.com/whotracksme/websites/"

# Function to scrape tracker data and statistics for a given website
def scrape_website_data(website):
    try:
        # Fetch the webpage
        url = f"{base_url}{website}"
        response = requests.get(url)
        response.raise_for_status()

        # Parse the HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # Initialize the result dictionary
        result = {
            "website": website,
            "trackers": [],
            "statistics": {}
        }

        # Extract tracker information
        tracker_section = soup.find("div", class_="ds-wtm-entities cards")
        if tracker_section:
            tracker_entries = tracker_section.find_all("a", href=True)
            for entry in tracker_entries:
                tracker_name = entry.find("div", class_="ds-body-m ds-color-white ds-text-ellipsis")
                tracker_meta = entry.find("div", class_="ds-color-gray-400 ds-uppercase ds-label-xs ds-text-ellipsis")

                if tracker_name and tracker_meta:
                    tracker_name = tracker_name.text.strip()
                    meta_parts = [part.strip() for part in tracker_meta.text.split("•")]
                    percentage = meta_parts[0] if len(meta_parts) > 0 else "N/A"
                    company = meta_parts[1] if len(meta_parts) > 1 else "N/A"
                    category = meta_parts[2] if len(meta_parts) > 2 else "N/A"

                    result["trackers"].append({
                        "tracker": tracker_name,
                        "percentage": percentage,
                        "company": company,
                        "category": category
                    })

        # Extract additional statistics
        stats_section = soup.find_all("div", class_="ds-column ds-wtm-section-card medium")
        if stats_section:
            for stat in stats_section:
                stat_title = stat.find("p", class_="ds-display-2xs ds-color-gray-300")
                stat_value = stat.find("p", class_="ds-display-2xl")

                if stat_title and stat_value:
                    title = stat_title.text.strip()
                    value = stat_value.text.strip()
                    result["statistics"][title] = value

        return result

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {website}: {e}")
        return None

In [3]:
# Load the websites and deduplicate
websites = pd.read_csv("yg_ind_domain.csv")["private_domain"].drop_duplicates()

# List of websites to scrape
total_websites = len(websites)

# Specify whether to overwrite existing JSON files
overwrite = False  # Change to True if you want to overwrite existing files

# Iterate over websites and save JSON files
for idx, website in enumerate(websites, start=1):

    output_file = os.path.join(output_folder, f"{website.replace('.', '_')}_data.json")

    if os.path.exists(output_file) and not overwrite:
        print(f"Skipping {website}... (File already exists)")
        continue

    if idx % 5 == 0:  # Print only after every 5 websites
        print(f"Scraping data for {website}... ({idx}/{total_websites} | {round((idx / total_websites) * 100, 2)}% done)")

    data = scrape_website_data(website)

    if data:
        with open(output_file, "w") as f:
            json.dump(data, f, indent=2)
        print(f"Saved data for {website} to {output_file}.")
    else:
        print(f"No data found for {website}.")

print("Scraping complete!")

Skipping 10best.com... (File already exists)
Skipping 1800petmeds.com... (File already exists)
Skipping 2uf4ta.net... (File already exists)
Skipping 3m.com... (File already exists)
Skipping 8x8.com... (File already exists)
Skipping a79ab95c1589a13f8a4cab612bc71f9f7.com... (File already exists)
Skipping aa.com... (File already exists)
Skipping aaa.com... (File already exists)
Skipping aaxdetect.com... (File already exists)
Skipping abrlh.com... (File already exists)
Skipping adatoolbar.com... (File already exists)
Skipping adlightning.com... (File already exists)
Skipping adobedc.net... (File already exists)
Skipping adready.com... (File already exists)
Skipping adtheorent.com... (File already exists)
Skipping advanis.ca... (File already exists)
Skipping afnyst.org... (File already exists)
Skipping afterpay.com... (File already exists)
Skipping aggle.net... (File already exists)
Skipping alchemer.com... (File already exists)
Skipping alida.com... (File already exists)
Skipping ally.com.

Error fetching data for nàilshop.to: 404 Client Error: Not Found for url: https://www.ghostery.com/whotracksme/websites/n%C3%A0ilshop.to
No data found for nàilshop.to.
Skipping okmagazine.com... (File already exists)
Skipping orlandohealth.com... (File already exists)
Skipping pdbcreativestudio.com... (File already exists)
Skipping pepsipromos.com... (File already exists)
Skipping perfectgift.com... (File already exists)
Skipping pocket-lint.com... (File already exists)
Skipping porn.co... (File already exists)
Skipping porn.com... (File already exists)
Skipping porndoe.com... (File already exists)
Skipping postageapp.net... (File already exists)
Skipping prepactionsteps.com... (File already exists)
Skipping promising-mart.co... (File already exists)
Skipping promising-mart.com... (File already exists)
Skipping pscrpt.io... (File already exists)
Skipping radioactiveglitter.com... (File already exists)
Skipping ranbaxyantitrustlitigation.com... (File already exists)
Skipping ranbaxytppl

Error fetching data for mç.org: 404 Client Error: Not Found for url: https://www.ghostery.com/whotracksme/websites/m%C3%A7.org
No data found for mç.org.
Skipping nami-mc.org... (File already exists)
Skipping operationsports.com... (File already exists)
Skipping orderfrontstreetdeli.com... (File already exists)
Skipping paddle.com... (File already exists)
Skipping racketmn.com... (File already exists)
Skipping realsport101.com... (File already exists)
Skipping renttoown.org... (File already exists)
Skipping risecannabis.com... (File already exists)
Skipping russellstreetreport.com... (File already exists)
Skipping thealphacut.com... (File already exists)
Skipping thefsdeli.com... (File already exists)
Skipping theganjachronicles.com... (File already exists)
Skipping thehotdog.org... (File already exists)
Skipping theloadout.com... (File already exists)
Skipping thesportsgrail.com... (File already exists)
Skipping truetrophies.com... (File already exists)
Skipping vaporfi.com... (File al

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Skipping ourstage.com... (File already exists)
Skipping palomamodelandtalent.com... (File already exists)
Skipping picsets.org... (File already exists)
Skipping platinomax.com... (File already exists)
Skipping popradiotop20.com... (File already exists)
Skipping providentcu.org... (File already exists)
Skipping quicklybobacafe.com... (File already exists)
Skipping quicklybobatea.com... (File already exists)
Skipping quicklyusa.com... (File already exists)
Skipping radio-locator.com... (File already exists)
Skipping radio-online.com... (File already exists)
Skipping radioclatsop.com... (File already exists)
Skipping radioink.com... (File already exists)
Skipping radioinsight.com... (File already exists)
Skipping radiolazer.com... (File already exists)
Skipping radiounica945.com... (File already exists)
Skipping realcountry1035.com... (File already exists)
Skipping reelradio.com... (File already exists)
Skipping reverseimagesearch.com... (File already exists)
Skipping rewindtv.com... (Fil