In [3]:
import os
import re
import sqlite3
import json
from urllib.parse import urlparse
from collections import Counter

def extractRequests(entry, host):
    """Return the third-party domain requested by one HAR entry, or None.

    The request URL's hostname is reduced to its last two labels
    (``sub.example.com`` -> ``example.com``), or its last three when it ends
    in ``co.uk`` (``i.guim.co.uk`` -> ``guim.co.uk``). Only the ``co.uk``
    multi-label suffix is special-cased; other such suffixes are reduced to
    two labels.

    Args:
        entry: A HAR entry dict. Read keys: ``request`` (with ``url``) and,
            optionally, ``_resourceType``.
        host: First-party host string used as a substring filter against the
            reduced domain, or None when the first-party host is unknown (in
            which case no request is excluded as first-party).

    Returns:
        The reduced domain string when the request is not first-party and is
        not a stylesheet/font/image resource; otherwise None. None is also
        returned when the entry has no URL or the URL has no hostname.
    """
    url = entry.get('request', {}).get('url')
    if not url:
        return None

    # URLs such as data:, blob: or about: carry no network host; urlparse
    # reports hostname=None for them, so there is no domain to attribute.
    hostname = urlparse(url).hostname
    if not hostname:
        return None

    labels = hostname.split('.')
    # Slicing (instead of negative indexing) keeps short hostnames such as
    # "localhost" or a bare "co.uk" from raising IndexError.
    if labels[-2:] == ['co', 'uk']:
        request_url = '.'.join(labels[-3:])
    else:
        request_url = '.'.join(labels[-2:])

    # First-party requests are dropped. This is a substring test, matching
    # the original behaviour; a None host means "nothing to exclude".
    if host is not None and host in request_url:
        return None

    # Entries without a '_resourceType' key are never filtered by type.
    if entry.get('_resourceType') in ('stylesheet', 'font', 'image'):
        return None

    return request_url

def processFolder(folder_path):
    """Collect the domains reported by extractRequests for every HAR file in a folder.

    Each ``*.har`` file in ``folder_path`` is parsed as JSON and every entry
    under ``log.entries`` is passed to ``extractRequests`` together with the
    host derived from the filename.

    Args:
        folder_path: Directory to scan (non-recursively) for ``.har`` files.

    Returns:
        A list of domain strings, one per entry for which ``extractRequests``
        returned a truthy value. Duplicates are kept so callers can count them.
    """
    # The host is whatever sits between a "m." / "www." prefix and ".har" in
    # the filename, e.g. "www.example.com.har" -> "example.com".
    hostPattern = re.compile(r'(m|www)\.(.*)(\.har)')

    allDomains = []
    for filename in os.listdir(folder_path):
        if not filename.endswith('.har'):
            continue

        # The host depends only on the filename, so derive it once per file
        # rather than once per entry. It is None when the name does not match
        # the pattern and is handed to extractRequests unchanged.
        match = hostPattern.search(filename)
        host = match.group(2) if match else None

        # Parse fully, then release the file handle before processing entries.
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            harData = json.load(file)

        for entry in harData.get('log', {}).get('entries', []):
            domain = extractRequests(entry, host)
            if domain:
                allDomains.append(domain)
    return allDomains

paths = ['../archives/news/desktop/', '../archives/news/mobile/']

# Gather every third-party domain occurrence across all archive folders first,
# so that no database connection is held open during the (slow) HAR scan.
allDomains = []
for folder_path in paths:
    allDomains += processFolder(folder_path)

allDomainsCounter = Counter(allDomains)

# Persist the per-domain request counts. The try/finally guarantees the
# connection is closed even if table creation or an insert raises.
dbConnect = sqlite3.connect('third_party_analysis.db')
try:
    dbCursor = dbConnect.cursor()

    dbCursor.execute('''
    CREATE TABLE IF NOT EXISTS domain_counts (
        domain TEXT PRIMARY KEY,
        count INTEGER
    )
''')

    # INSERT OR REPLACE overwrites the count of a domain that is already in
    # the table; rows for domains not seen in this run are left untouched.
    dbCursor.executemany(
        'INSERT OR REPLACE INTO domain_counts VALUES (?, ?)',
        allDomainsCounter.items(),
    )

    dbConnect.commit()
finally:
    dbConnect.close()

# Reports the number of distinct domains found in this run (not the number of
# rows in the table).
print(f"Total number of unique domains: {len(allDomainsCounter)}")


Total number of unique domains: 332


In [4]:
# Read back the stored per-domain counts, highest count first. All rows are
# fetched into memory so the connection can be closed before printing, and the
# try/finally closes it even if the query fails (e.g. the table is missing).
dbConnect = sqlite3.connect('third_party_analysis.db')
try:
    dbCursor = dbConnect.cursor()
    dbCursor.execute('SELECT domain, count FROM domain_counts ORDER BY count DESC')
    sorted_records = dbCursor.fetchall()
finally:
    dbConnect.close()

# Tab-separated listing: count first, then domain.
print("Count\tThird party domain")
print("------------------------------------")
for domain, count in sorted_records:
    print(f"{count}\t{domain}")


Count	Third party domain
------------------------------------
1110	googlesyndication.com
969	doubleclick.net
599	asadcdn.com
538	adsafeprotected.com
519	google.com
472	adobedtm.com
370	guim.co.uk
326	amazon-adsystem.com
319	adnxs.com
296	rubiconproject.com
254	google-analytics.com
235	outbrain.com
234	googletagservices.com
196	criteo.com
196	idcdn.de
188	taboola.com
188	gstatic.com
158	2mdn.net
153	googletagmanager.com
149	cxense.com
143	ioam.de
140	adform.net
132	cnn.io
131	otto.de
126	presage.io
120	pubmatic.com
117	bounceexchange.com
116	cookielaw.org
113	glomex.com
109	privacy-mgmt.com
107	xplosion.de
107	piano.io
107	permutive.com
106	casalemedia.com
105	yieldlab.net
104	ampproject.org
101	doubleverify.com
101	tinypass.com
95	smartadserver.com
90	k5a.io
84	cloudfront.net
83	dwcdn.net
82	bildstatic.de
81	cleverpush.com
80	openx.net
80	opencmp.net
79	teads.tv
76	googleapis.com
76	cloudflare.com
73	chartbeat.com
72	dianomi.com
72	adspirit.de
68	chartbeat.net
65	moatads.com
65	google.