# categorize.ipynb
Break down our results by website category using the [Cloudflare Domain Intelligence API](https://developers.cloudflare.com/api/operations/domain-intelligence-get-multiple-domain-details). This classification method was used by a [previous IMC paper](https://dl.acm.org/doi/pdf/10.1145/3517745.3561418).

In [3]:
import requests
from dotenv import load_dotenv
import os
import json
import pandas as pd
from pathlib import Path

API_URL = "https://api.cloudflare.com/client/v4/accounts"

# NOTE: Create a .env file in the root directory of the project that defines
# the two variables read below: ACCOUNT_ID and BEARER_TOKEN.
load_dotenv()
ACCOUNT_ID = os.getenv("ACCOUNT_ID")
BEARER_TOKEN = os.getenv("BEARER_TOKEN")

# Report only whether each credential was found. Printing the values themselves
# would store them in the notebook's saved output, where they leak as soon as
# the notebook is shared or committed.
print(f"ACCOUNT_ID: {'loaded' if ACCOUNT_ID else 'MISSING'}")
print(f"BEARER_TOKEN: {'loaded' if BEARER_TOKEN else 'MISSING'}")

ACCOUNT_ID: <redacted>
BEARER_TOKEN: <redacted>


In [4]:
def fetch_domain_data(domains: list[str], timeout: float = 30.0) -> list[dict]:
    """
    Sends a GET request to the Cloudflare Domain Intelligence bulk endpoint for a
    list of domains and returns the 'result' field of the parsed JSON response.

    Args:
        domains (list): List of domain names to fetch data for.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The value of the 'result' key in the response body. The batch loop in
        this notebook treats it as a list of dicts that each carry a 'domain'
        key (assumed from that usage; confirm against the API reference).

    Raises:
        RuntimeError: If the request fails at the network level, the status
            code is not 200, the body is not valid JSON, or the body does not
            report success. Every failure mode is funnelled into RuntimeError
            so that callers catching RuntimeError can retry all of them.
    """
    url = f"{API_URL}/{ACCOUNT_ID}/intel/domain/bulk"

    # Prepare the query parameters; requests repeats the 'domain' key once per list item
    params = {
        'domain': domains
    }

    # Set up the headers with the Bearer token
    headers = {
        'Authorization': f"Bearer {BEARER_TOKEN}",
        'Content-Type': 'application/json'
    }

    # Make the GET request. Connection errors and timeouts are not RuntimeError
    # subclasses, so convert them to keep a single failure type for callers.
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        raise RuntimeError(f"Failed to fetch data: {exc}") from exc

    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch data: {response.status_code}")

    # Parse the body once. requests signals an undecodable body with a
    # ValueError subclass.
    try:
        payload = response.json()
    except ValueError as exc:
        raise RuntimeError(f"Failed to fetch data: {response.status_code}") from exc

    # A 200 response can still report a failure or omit the expected fields
    if not isinstance(payload, dict) or not payload.get('success') or 'result' not in payload:
        raise RuntimeError(f"Failed to fetch data: {response.status_code}")

    return payload['result']

def save_to_json(data, filename='categories.json'):
    """
    Serialise `data` as indented JSON and store it at `filename`.

    The JSON is first written to a sibling '<filename>.tmp' file and then moved
    over the target with os.replace. Opening the target directly in 'w' mode
    truncates it before serialisation starts, so an interrupt or a
    serialisation error would destroy the previous contents; writing to a
    temporary file keeps the old file intact until the new one is complete.

    Args:
        data: Any JSON-serialisable object.
        filename: Destination path (str or path-like).

    Raises:
        TypeError: If `data` contains an object json cannot serialise.
        OSError: If the file cannot be written or replaced.
    """
    tmp_path = f"{os.fspath(filename)}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, indent=4)
        os.replace(tmp_path, filename)
    finally:
        # After a successful replace the temp file is gone; this only cleans up
        # the partial file left behind by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def read_json(filename='categories.json'):
    """
    Load and return the JSON document stored at `filename`.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded Python object (whatever the file's top-level JSON value is).
    """
    with open(filename, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [5]:
# Input list: 'KJ2GW.txt' is read as one domain name per line
# (the path is relative to the notebook's working directory).
file_path = 'KJ2GW.txt'

with open(file_path) as file:
    stripped_lines = (line.strip() for line in file)
    # Skip blank lines so an empty string is never treated as a domain
    domains = [domain for domain in stripped_lines if domain]
print(len(domains))

10000


In [6]:
BATCH_SIZE = 10
MAX_RETRIES = 10
CACHE_FILE = Path('categories.json')

# Load the cache once. On a fresh run the cache file does not exist yet, so
# start from an empty list instead of failing with FileNotFoundError.
all_results = read_json(CACHE_FILE) if CACHE_FILE.exists() else []

# Track cached domains in a set. Re-reading and re-scanning the whole JSON file
# for every batch grows quadratically with the number of results.
cached_domains = {result['domain'] for result in all_results}

for i in range(0, len(domains), BATCH_SIZE):
    batch = domains[i:i+BATCH_SIZE]
    # Use the real batch length so the label is right for a short final batch
    batch_label = f"{i+1}-{i+len(batch)}"

    # A batch counts as cached when its first domain already has a result
    if batch[0] in cached_domains:
        print(f"Skipping batch: {batch_label} as it's already cached")
        continue

    print(f"Requesting batch: {batch_label} out of {len(domains)}")
    for attempt in range(MAX_RETRIES):
        try:
            result = fetch_domain_data(batch)
        except RuntimeError as e:
            print(f"Failed to request batch: {batch_label} on attempt {attempt+1}. {e}")
            continue
        all_results.extend(result)
        cached_domains.update(entry['domain'] for entry in result)
        # Persist after every batch so an interrupted run can resume
        save_to_json(all_results, CACHE_FILE)
        break
    else:
        # Every attempt failed: say so instead of silently moving on
        print(f"Giving up on batch: {batch_label} after {MAX_RETRIES} attempts")

Skipping batch: 1-10 as it's already cached
Skipping batch: 11-20 as it's already cached
Skipping batch: 21-30 as it's already cached
Skipping batch: 31-40 as it's already cached
Skipping batch: 41-50 as it's already cached
Skipping batch: 51-60 as it's already cached
Skipping batch: 61-70 as it's already cached
Skipping batch: 71-80 as it's already cached
Skipping batch: 81-90 as it's already cached
Skipping batch: 91-100 as it's already cached
Skipping batch: 101-110 as it's already cached
Skipping batch: 111-120 as it's already cached
Skipping batch: 121-130 as it's already cached
Skipping batch: 131-140 as it's already cached
Skipping batch: 141-150 as it's already cached
Skipping batch: 151-160 as it's already cached
Skipping batch: 161-170 as it's already cached
Skipping batch: 171-180 as it's already cached
Skipping batch: 181-190 as it's already cached
Skipping batch: 191-200 as it's already cached
Skipping batch: 201-210 as it's already cached
Skipping batch: 211-220 as it's already cached
[... remaining output truncated ...]

In [8]:
# Sanity check: every requested domain should appear in the cached results.
results = [result['domain'] for result in read_json()]
print(len(results))

# Test membership against a set; scanning the list once per domain would be
# quadratic in the number of domains.
result_domains = set(results)
for domain in domains:
    if domain not in result_domains:
        # Only the first missing domain is reported
        print(f"Domain {domain} not found in results")
        break

10000
