In [1]:
import os
import sqlite3
import json
import re
from urllib.parse import urlparse
from collections import Counter

def extractDomains(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        harData = json.load(file)
        entries = harData.get('log', {}).get('entries', [])
        domains = []

        for entry in entries:
            response = entry.get('response', {})
            responseHeaders = response.get('headers', [])

            # Analyze cookies
            for header in responseHeaders:
                if header['name'].lower() == 'set-cookie':
                    cookies = header['value']

                    # Extract domain from the cookie string using a regular expression
                    match = re.search(r'Domain=([^;]+)', cookies)
                    domain = match.group(1) if match else None
                    

                    if domain:
                        if domain.startswith('.'):
                            domain = domain[1:]
                            
                        domains.append(domain)

        return domains

def processFolder(folder_path):
    all_domains = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.har'):
            path = os.path.join(folder_path, filename)
            domains = extractDomains(path)
            all_domains.extend(domains)

    return all_domains

paths = ['../archives/news/desktop/', '../archives/news/mobile/']

with sqlite3.connect('third_party_analysis.db') as dbConn:
    dbCursor = dbConn.cursor()

    dbCursor.execute('''
        CREATE TABLE IF NOT EXISTS cookie_counts (
            domain TEXT PRIMARY KEY,
            count INTEGER
        )
    ''')
    allDomains = []
    
    for folder_path in paths:
        allDomains += processFolder(folder_path)

    domainCounter = Counter(allDomains)

    for domain, count in domainCounter.items():
        dbCursor.execute('INSERT OR REPLACE INTO cookie_counts VALUES (?, ?)', (domain, count))


In [2]:
dbConn = sqlite3.connect('third_party_analysis.db')
dbCursor = dbConn.cursor()

dbCursor.execute('SELECT domain, count FROM cookie_counts ORDER BY count DESC')
sortedRecords = dbCursor.fetchall()

print("Count\tThird party domain")
print("------------------------------------")
for domain, count in sortedRecords:
    print(f"{count}\t{domain}")

# Close the database connection
dbConn.close()

Count	Third party domain
------------------------------------
307	.adnxs.com
291	.cnn.com
272	.rubiconproject.com
195	.xplosion.de
188	casalemedia.com
165	.taboola.com
164	id5-sync.com
81	.zeit.de
80	cxense.com
76	omnitagjs.com
75	yieldlab.net
71	.amazon-adsystem.com
70	.rfihub.com
64	.yahoo.com
54	.openx.net
52	telegraph.co.uk
47	.seedtag.com
42	ads.stickyadstv.com
40	.demdex.net
40	prod.svc.y6b.de
34	eyeota.net
28	.youtube.com
28	faz.net
27	.contextweb.com
27	the-ozone-project.com
26	.analytics.yahoo.com
26	bidr.io
24	d.adup-tech.com
22	.twitter.com
22	bild.de
21	everesttech.net
19	t.co
19	.theguardian.com
18	.linkedin.com
18	rezync.com
18	.semasio.net
17	.lijit.com
17	zemanta.com
16	.3lift.com
16	.dpm.demdex.net
16	.360yield.com
16	.telegraph.co.uk
15	tagger.opecloud.com
14	.pinterest.com
14	.turn.com
14	.tapad.com
14	.df-srv.de
14	aniview.com
13	.w55c.net
13	.connectad.io
12	.faz.net
12	.creativecdn.com
9	.teads.tv
9	dianomi.com
9	outbrain.com
9	.google.com
9	.google.de
8	undertone