# In [None]:
import os
import sqlite3
import json
import re
from urllib.parse import urlparse
from collections import Counter

def extract_domains_from_har(file_path):
    """Collect the ``Domain`` attribute of every cookie set in a HAR file.

    The file is parsed as JSON and every ``set-cookie`` response header under
    ``log.entries[*].response.headers`` is inspected.

    Args:
        file_path: Path to a UTF-8 encoded HAR (JSON) file.

    Returns:
        A list with one entry per cookie that declares a ``Domain`` attribute,
        in file order. Values are returned as written in the cookie (only
        surrounding whitespace is removed); duplicates are kept so that
        callers can count occurrences.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Only keep the file open for as long as it takes to parse it.
    with open(file_path, 'r', encoding='utf-8') as file:
        har_data = json.load(file)

    # Cookie attribute names are case-insensitive and always follow a ';'
    # (the leading name=value pair is never an attribute), so anchor on the
    # separator to avoid matching "...Domain=" inside a cookie name or value.
    domain_attr = re.compile(r';\s*domain\s*=\s*([^;]+)', re.IGNORECASE)
    domains = []

    for entry in har_data.get('log', {}).get('entries', []):
        for header in entry.get('response', {}).get('headers', []):
            if (header.get('name') or '').lower() != 'set-cookie':
                continue

            # A single header value can carry several cookies separated by
            # newlines; examine each cookie on its own so none are skipped
            # and a match never spills over into the next cookie.
            for cookie in (header.get('value') or '').splitlines():
                match = domain_attr.search(cookie)
                if match is None:
                    continue
                domain = match.group(1).strip()
                if domain:
                    domains.append(domain)

    return domains

def process_folder(folder_path):
    """Gather cookie domains from every ``.har`` file directly in a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        The concatenated results of ``extract_domains_from_har`` for each
        file whose name ends with ``.har``, in ``os.listdir`` order.
    """
    har_paths = (
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        if entry.endswith('.har')
    )

    collected = []
    for har_path in har_paths:
        collected += extract_domains_from_har(har_path)
    return collected

folder_paths = ['../archives/news/desktop/', '../archives/news/mobile/']

# Parse all HAR files first so the database connection is not held open
# while the (potentially slow) file processing runs.
all_domains = []
for folder_path in folder_paths:
    all_domains += process_folder(folder_path)

domain_counts = Counter(all_domains)

# Using the connection as a context manager only commits or rolls back the
# transaction; it does not close the connection, so close it explicitly.
db_conn = sqlite3.connect('third_party_analysis.db')
try:
    with db_conn:
        db_cursor = db_conn.cursor()

        # Create the table on first run
        db_cursor.execute('''
            CREATE TABLE IF NOT EXISTS cookie_counts (
                domain TEXT PRIMARY KEY,
                count INTEGER
            )
        ''')

        # Insert or overwrite the count for every domain seen in this run.
        # NOTE(review): rows for domains that no longer appear are left
        # untouched from earlier runs -- confirm that is intended.
        db_cursor.executemany(
            'INSERT OR REPLACE INTO cookie_counts VALUES (?, ?)',
            domain_counts.items(),
        )
finally:
    db_conn.close()


# In [None]:
# Read the stored counts, most frequent domain first. The connection is
# closed in a finally block so it is released even if the query fails
# (for example when the table has not been created yet).
db_conn = sqlite3.connect('third_party_analysis.db')
try:
    db_cursor = db_conn.cursor()
    db_cursor.execute('SELECT domain, count FROM cookie_counts ORDER BY count DESC')
    sorted_records = db_cursor.fetchall()
finally:
    db_conn.close()

# Print the sorted records as a tab-separated table
print("Count\tThird party domain")
print("------------------------------------")
for domain, count in sorted_records:
    print(f"{count}\t{domain}")