In [None]:
import os
import sqlite3
import json
import re
from urllib.parse import urlparse
from collections import Counter

def extractDomains(file_path):
    """Collect the Domain attribute of cookies set by responses in a HAR file.

    Every ``Set-Cookie`` response header in ``log.entries`` is inspected. A
    header value may hold several cookies separated by newlines; each cookie
    contributes at most one domain.

    Args:
        file_path: Path to a UTF-8 encoded HAR (JSON) file.

    Returns:
        A list of domain strings in the order they were encountered.
        Duplicates are kept so that callers can count occurrences. Cookies
        without a Domain attribute contribute nothing.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        harData = json.load(file)

    entries = harData.get('log', {}).get('entries', [])
    domains = []

    for entry in entries:
        responseHeaders = entry.get('response', {}).get('headers', [])

        for header in responseHeaders:
            # Tolerate malformed header objects instead of raising KeyError.
            if (header.get('name') or '').lower() != 'set-cookie':
                continue

            # Several cookies may be folded into one header value, one per
            # line; handle them separately so a match never runs across a
            # line break into the next cookie.
            for cookie in (header.get('value') or '').split('\n'):
                # Attribute names are case-insensitive and always follow a
                # ';', which keeps the pattern from matching text inside the
                # cookie's own name or value.
                match = re.search(
                    r';\s*domain\s*=\s*([^;]+)', cookie, re.IGNORECASE
                )
                if not match:
                    continue

                domain = match.group(1).strip()
                if domain:
                    domains.append(domain)

    return domains

def processFolder(folder_path):
    """Gather cookie domains from every ``.har`` file directly in a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        One flat list with the domains returned by ``extractDomains`` for
        each ``.har`` file, in directory-listing order.
    """
    har_paths = (
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.endswith('.har')
    )

    collected = []
    for har_path in har_paths:
        collected += extractDomains(har_path)
    return collected

# Folders whose .har files are scanned for Set-Cookie domains.
paths = ['../archives/news/desktop/', '../archives/news/mobile/']

# sqlite3's connection context manager only commits or rolls back the
# transaction; it does not close the connection. Close it explicitly so the
# database handle is released even if the scan or the inserts fail.
dbConn = sqlite3.connect('third_party_analysis.db')
try:
    with dbConn:
        dbCursor = dbConn.cursor()

        dbCursor.execute('''
            CREATE TABLE IF NOT EXISTS cookie_counts (
                domain TEXT PRIMARY KEY,
                count INTEGER
            )
        ''')

        allDomains = []
        for folder_path in paths:
            allDomains += processFolder(folder_path)

        # Count how often each cookie domain occurs across all folders.
        domainCounter = Counter(allDomains)

        # INSERT OR REPLACE overwrites the count of a domain stored by an
        # earlier run instead of failing on the primary key.
        dbCursor.executemany(
            'INSERT OR REPLACE INTO cookie_counts VALUES (?, ?)',
            domainCounter.items(),
        )
finally:
    dbConn.close()


In [None]:
# Read the stored cookie counts, most frequent domain first.
dbConn = sqlite3.connect('third_party_analysis.db')
try:
    dbCursor = dbConn.cursor()
    dbCursor.execute('SELECT domain, count FROM cookie_counts ORDER BY count DESC')
    sortedRecords = dbCursor.fetchall()
finally:
    # Close the database connection even if the query fails
    # (e.g. the table does not exist yet).
    dbConn.close()

print("Count\tThird party domain")
print("------------------------------------")
for domain, count in sortedRecords:
    print(f"{count}\t{domain}")