### Setup
import statements and general analysis setup

In [1]:
import json
from haralyzer import HarParser
from urllib.parse import urlparse

In [2]:
host = "cnn.com"
type = "news"
har_file_path = "../archives/" + type + "/www." + host + ".har"

# Read the content of the HAR file and convert it to a dictionary
with open(har_file_path, "r", encoding="utf-8") as har_file:
    har_data = json.load(har_file)

har_parser = HarParser(har_data)
entries = har_parser.har_data['entries']

print("Number of entries: {}".format(len(entries)))

Number of entries: 773


### Device Data in Requests

In [3]:
all_user_agents = []

for entry in entries:
    request = entry.get('request', {})
    requestHeaders = request.get('headers', [])

    # Analyze user agents
    user_agent_header = next((header for header in requestHeaders if header['name'].lower() == 'user-agent'), None)
    if user_agent_header:
        user_agent_value = user_agent_header['value']
        all_user_agents.append(user_agent_value)

print("Count: " + str(len(all_user_agents)))
print("percentage of reqeuets with user agent: " + str(round((len(all_user_agents) / len(entries) * 100))) + "%\n")
print("All user agents:")
for user_agent in list(set(all_user_agents)):
    print(user_agent)


Count: 773
percentage of reqeuets with user agent: 100%

All user agents:
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36


### Cookies
find all set-cookie values in response headers

In [4]:
for entry in entries:
    response = entry.get('response', {})
    responseHeaders = response.get('headers', [])

    # Analyze cookies
    for header in responseHeaders:
        if header['name'].lower() == 'set-cookie':
            cookies = header['value']
            print(f"Set-Cookie: {cookies}")

Set-Cookie: SIDCC=ABTWhQF6U6-fTB5GXcu92BRqnGFn2zl-oMRd9SyHtq7ZfwmCo-JM2fG5IC5aI_r1uvDKlGfhDA; expires=Tue, 17-Dec-2024 21:59:27 GMT; path=/; domain=.google.com; priority=high
Set-Cookie: __Secure-1PSIDCC=ABTWhQH-ewNmYo0TYxYR9FHNuq_14obNleE3sxoKKn__yule6kyX_lFADgATJwVYDU5i6922; expires=Tue, 17-Dec-2024 21:59:27 GMT; path=/; domain=.google.com; Secure; HttpOnly; priority=high
Set-Cookie: __Secure-3PSIDCC=ABTWhQGhixOiyaE90J0XP616BhQcb2qqYd8gSPHVOwH30FvWQO_zUlak066rkor2TGfpgPiy; expires=Tue, 17-Dec-2024 21:59:27 GMT; path=/; domain=.google.com; Secure; HttpOnly; priority=high; SameSite=none
Set-Cookie: SIDCC=ABTWhQFkaWDh3a-DoKRLjYF1UWz_tnrPaLjKWU_os3id0TMxNGQvBk59dBGWADb4Ya5Tbt6Mbg; expires=Tue, 17-Dec-2024 21:59:27 GMT; path=/; domain=.google.com; priority=high
Set-Cookie: __Secure-1PSIDCC=ABTWhQHzr-Win8szZD0mAmIm7PPPYRv-YQFb4hVDKRemTWfdL-sNXwd4X3rf_VI0uev9m5bf; expires=Tue, 17-Dec-2024 21:59:27 GMT; path=/; domain=.google.com; Secure; HttpOnly; priority=high
Set-Cookie: __Secure-3PSIDCC=

### Third Party Requests
find requests to third party domains of all types except stylesheet, image and font

In [5]:
from collections import Counter

all_request_domains = []

# Annahme: entries ist eine Liste von Einträgen
for entry in entries:
    request = entry.get('request', {})
    response = entry.get('response', {})

    # Analyse von Drittanbieteranfragen
    if 'url' in request:
        request_url = urlparse(request['url']).hostname

        if host not in request_url and entry['_resourceType'] not in ['stylesheet', 'font', 'image']:
            all_request_domains.append(request_url)

# Verwende Counter, um die Anzahl der Vorkommen jeder Domain zu zählen
domain_counter = Counter(all_request_domains)

# Sortiere die Domänen nach der Anzahl der Vorkommen in absteigender Reihenfolge
sorted_domains = domain_counter.most_common()

print(f"Total number of unique domains: {len(sorted_domains)}")

for domain, count in sorted_domains:
    print(f"{domain} ({count})")


Total number of unique domains: 100
production.dataviz.cnn.io (44)
cdn.cookielaw.org (41)
wbd-api.arkoselabs.com (22)
pagead2.googlesyndication.com (18)
tpc.googlesyndication.com (17)
assets.bounceexchange.com (16)
s0.2mdn.net (12)
securepubads.g.doubleclick.net (11)
www.google.com (10)
registry.api.cnn.io (8)
zion.api.cnn.io (8)
pixel.adsafeprotected.com (8)
static.chartbeat.com (6)
www.dianomi.com (6)
eus.rubiconproject.com (6)
static.adsafeprotected.com (6)
cdn.optimizely.com (5)
logx.optimizely.com (5)
prebid.adnxs.com (5)
ib.adnxs.com (5)
cnn.bounceexchange.com (5)
signal-metrics-collector-beta.s-onetag.com (4)
arkose.daex.identityservices.io (4)
bidder.criteo.com (4)
htlb.casalemedia.com (4)
fastlane.rubiconproject.com (4)
hbopenbid.pubmatic.com (4)
aax.amazon-adsystem.com (4)
59bb126f2e6aaa0128ba4f4742f0acf0.safeframe.googlesyndication.com (4)
www.googletagservices.com (4)
ad.doubleclick.net (4)
get.s-onetag.com (3)
sb.scorecardresearch.com (3)
secure.quantserve.com (3)
tag.boun

### Query strings

In [6]:
all_query_strings = []

for entry in entries:
    request = entry.get('request', {})

    # Analyze query strings
    if 'queryString' in request:
        query_string_list = request['queryString']
        if query_string_list:
            # Extract query strings from the list
            query_strings = [param['name'] + '=' + param['value'] for param in query_string_list]
            all_query_strings.extend(query_strings)


# Print or use unique_query_strings as needed
for query_string in list(set(all_query_strings)):
    print(query_string)

xssi=t
c=16x9
google_nid=index
gdprc=CP2-uZgP2-uZgAcABBENAfEwAP_AAAAAAChQH7wJYAFAAWAA0ADMAHwAhABcADIAGgARQAkwBMAE4AKAAUgAtgBhgEGAQgAjoBRgFIAK0AgEBBwEIAIsAR0AnYBSQCxAF1AMCAfoBGoC0YF5AXmAxkBlgDLwGqgN1AfuAAAAsJAJAAWACCAGQAaABMAEIAI4AgAC8wG6hAAgBBwCdgLyAfuOgGgALABAAC6AGQAaABMAFGARYAjoBYgF5gMsAaqA3UcAHACgAI4AgEBBwEIAJ2AwQB-5CAMAAsAFwATABHALRAaqQABgFiAXkAwQlAFAAWAEwARwAowF5kgAgBAACDgMsAfuUAFAAKABcAI4AgABBwCxAF1AXkAwQB-5SAWAAsAEEAMgA0ACYAFIAUYBFgCOgLzAZYA3UtABAEcA.f_gAAAAAAAAA
gdpr=true
sai=AMfl-YQWnl9BaKqH5XtsydiaVkYx4cI5f7qaUVpaLuTE52mF_SjhupFMlrtcTtjCpjGE-KYM7gXiEkgzINWHkLgsTpzlEAl8ZXCJYfg-zTFibebOAumzcSPaih4MpyjLUB4
idt=8701
cp=0
o=1043
v90=international
tv=%7Bc:xbYmtX,pingTime:1,time:23548,type:p,env:%7Bar:self.0%7D,clog:%5B%7Bpiv:0,vs:o,r:r.f,w:970,h:250,t:20308%7D,%7Br:r,t:21117%7D,%7Bpiv:100,vs:i,r:,t:22521%7D%5D,es:0,sc:1,ha:1,fgad:1,fif:1,gmnp:0,for:1,b11:0,cnod:1,gm:1,slTimes:%7Bi:1030,o:22519,n:0,pp:0,pm:0%7D,slEvents:%5B%7Bsl:o,t:20307,wc:-6.0.1038.702,ac:20.2.970.250,am:i,cc:20.

### Contains test mail
check if any request contains aSHA256 or base64 endcoded version of our test mail address or password