### Setup
import statements and general analysis setup

In [1]:
import json
from haralyzer import HarParser
from urllib.parse import urlparse

In [2]:
# Site under analysis and the archive category its HAR capture is stored under
host = "merkur.de"
type = "news/desktop"  # NOTE(review): shadows the built-in `type`; name kept because other cells may read it
path = f"../archives/{type}/www.{host}.har"

# Load the HAR capture from disk and decode its JSON into a dictionary
with open(path, mode="r", encoding="utf-8") as har_file:
    harData = json.load(har_file)

# Wrap the decoded data in a HarParser and pull out the recorded entries
parser = HarParser(harData)
entries = parser.har_data['entries']

entry_count = len(entries)
print("Number of entries: {}".format(entry_count))

Number of entries: 1785


### Device Data in Requests

In [3]:
# Collect the User-Agent request header value of every entry that sends one
userAgents = []

for entry in entries:
    requestHeaders = entry.get('request', {}).get('headers', [])

    # Header names are compared in lower case; only the first matching
    # header of a request is taken into account
    userAgentValue = next(
        (header['value'] for header in requestHeaders if header['name'].lower() == 'user-agent'),
        None,
    )
    if userAgentValue is not None:
        userAgents.append(userAgentValue)

# Guard against a HAR without entries: the share would otherwise divide by zero
percentage = round(len(userAgents) / len(entries) * 100) if entries else 0

print("Count: " + str(len(userAgents)))
print("percentage of reqeuets with user agent: " + str(percentage) + "%\n")
print("All user agents:")
# Sort the distinct values so the listing is stable from run to run
for userAgent in sorted(set(userAgents)):
    print(userAgent)


Count: 1785
percentage of reqeuets with user agent: 100%

All user agents:
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36


### Cookies
find all set-cookie values in response headers

In [4]:
import re

# Matches the Domain attribute of a Set-Cookie value.
# - Anchored on the preceding ';' so text inside the cookie's own name/value
#   pair cannot be mistaken for the attribute.
# - Case-insensitive because cookie attribute names are case-insensitive
#   (RFC 6265), so 'domain=' and 'DOMAIN=' must be found as well.
domain_pattern = re.compile(r';\s*Domain=([^;]+)', re.IGNORECASE)

# Print every Set-Cookie response header that carries an explicit Domain attribute
for entry in entries:
    responseHeaders = entry.get('response', {}).get('headers', [])

    for header in responseHeaders:
        if header['name'].lower() != 'set-cookie':
            continue

        cookies = header['value']
        domain_match = domain_pattern.search(cookies)
        if not domain_match:
            # Cookie without a Domain attribute: nothing to report
            continue

        # Drop surrounding whitespace so the printed domain is clean
        domain = domain_match.group(1).strip()
        if domain:
            print(f"Set-Cookie: {cookies}")
            print(f"Domain: {domain}")


Set-Cookie: pid=BSRABDbsBsRFESJABsJCBSUZBs7FBsRsBSRFWDJABifABfrr; Domain=.xplosion.de; Expires=Tue, 17-Dec-2024 21:43:30 GMT; Path=/; Secure; SameSite=None
Domain: .xplosion.de
Set-Cookie: pid_short=55gHSVL3Bbn0w5+k4OfAR0Ly__rr; Domain=.xplosion.de; Expires=Tue, 17-Dec-2024 21:43:30 GMT; Path=/; Secure; SameSite=None
Domain: .xplosion.de
Set-Cookie: pid_signature=WsIDBDaCHDRFES70wdasEiRCWs+Dws5jWiB0HSUAwCwFwS_kWQB3Bfrr; Domain=.xplosion.de; Expires=Tue, 17-Dec-2024 21:43:30 GMT; Path=/; Secure; SameSite=None
Domain: .xplosion.de
Set-Cookie: ep=ZYC1E_7Ycsno-p_MkxoA; Domain=.xplosion.de; Expires=Tue, 17-Dec-2024 21:43:30 GMT; Path=/; Secure; SameSite=None
Domain: .xplosion.de
Set-Cookie: icu=ChgI1YszEAoYASABKAEw7O6CrAY4AUABSAEKGAi3kzMQChgEIAQoBDC264KsBjgEQARIBAoYCLyTMxAKGAIgAigCMOvsgqwGOAJAAkgCChgIrog9EAoYGyAbKBswsPmCrAY4G0AbSBsKGAjuvU4QChgBIAEoATDZ8YKsBjgBQAFIAQoYCPOZWRAKGAEgASgBMIP6gqwGOAFAAUgBEIP6gqwGGCM.; SameSite=None; Path=/; Max-Age=7776000; Expires=Sun, 17-Mar-2024 21:43:31 GMT; 

### Third Party Requests
find requests to third party domains of all types except stylesheet, image and font

In [5]:
from collections import Counter

# Resource types that are left out of the third-party overview
skippedResourceTypes = {'stylesheet', 'font', 'image'}

allDomains = []

for entry in entries:
    request = entry.get('request', {})
    hostname = urlparse(request.get('url', '')).hostname

    # URLs without a network location (e.g. data: URIs) have no hostname
    if not hostname:
        continue

    # First party is the analysed host itself or one of its subdomains.
    # A plain substring test would also treat unrelated hosts that merely
    # contain the name (e.g. "not" + host) as first party.
    isFirstParty = hostname == host or hostname.endswith('.' + host)

    # '_resourceType' is a non-standard HAR field; entries lacking it are kept
    if not isFirstParty and entry.get('_resourceType') not in skippedResourceTypes:
        allDomains.append(hostname)

# Use Counter to count how often each domain occurs
counter = Counter(allDomains)

# Sort the domains by number of occurrences in descending order
sortedDomains = counter.most_common()

print(f"Total number of unique domains: {len(sortedDomains)}")

for domain, count in sortedDomains:
    print(f"{domain} ({count})")


Total number of unique domains: 125
idcdn.de (62)
tpc.googlesyndication.com (45)
securepubads.g.doubleclick.net (37)
fundingchoicesmessages.google.com (35)
cdn.opencmp.net (32)
pagead2.googlesyndication.com (31)
player.glomex.com (27)
eus.rubiconproject.com (25)
cdntrf.com (21)
datawrapper.dwcdn.net (18)
orbidder.otto.de (18)
prg.smartadserver.com (18)
static.cleverpush.com (17)
cl.k5a.io (14)
fastlane.rubiconproject.com (14)
www.googletagservices.com (14)
ib.adnxs.com (13)
www.google-analytics.com (12)
c.amazon-adsystem.com (12)
ad.doubleclick.net (12)
t.visx.net (11)
token.rubiconproject.com (11)
secure-assets.rubiconproject.com (10)
csync.smartadserver.com (9)
ups.xplosion.de (8)
ad.yieldlab.net (8)
gum.criteo.com (8)
ams3-ib.adnxs.com (8)
www.gstatic.com (7)
sdk-02.moengage.com (7)
s.seedtag.com (7)
accounts.user.id (7)
cdn.jsdelivr.net (6)
bidder.criteo.com (6)
rtb.openx.net (6)
b1h-euc1.zemanta.com (6)
83f5957986b2feb0086a5c1db5962ec4.safeframe.googlesyndication.com (6)
idat.prod

### Query strings

In [6]:
# Gather every query parameter of every request as a "name=value" string
allQueryStrings = []

for entry in entries:
    request = entry.get('request', {})
    # A missing, null or empty queryString list contributes nothing
    queryList = request.get('queryString') or []
    allQueryStrings.extend([param['name'] + '=' + param['value'] for param in queryList])

# Print each distinct pair once; set iteration order is arbitrary
for queryString in set(allQueryStrings):
    print(queryString)

pg_hs=5863
xai=AKAOjssXfTF8CqN6E_bGABwur4SY2eX8JTft1jRMBOHDy_bN161mOMKUF-zAjre3rXHqCpJSPu2Fqnj11R2RlpXEzuQCrQ9M5TnO29vWtCfyZ_DqjCCrYV4YVe09aFRSvK6OysZxBzXTC91A3wtje8-9UPLBQaI
nonce=U6cVqZgGtD_fRSmcxr4LDgXuKlNeqnClph6yUMy6c9A
redirect=https%3A%2F%2Fuip.semasio.net%2Fadition%2F1%2Finfo%3FsType%3Dsync%26sExtCookieId%3D%25%25COOKIE%25%25%26sInitiator%3Dyl
puid=1~lqbg38na
pvsid=3439307552563019
p=70
dt=1702935814029
RefererUrl=https%3A%2F%2Fwww.merkur.de%2Fpolitik%2Fsanktionen-beschlossen-eu-verbietet-einfuhr-von-putins-diamanten-ukraine-krieg-finanzierung-alrosa-zr-92735410.html
chm=1
ptg=
t=2000
vn=eu-central-1
cs=CP2-uZgP2-uZgAVACADEAfEsAP_gAAAAAAYgJAtV7D9cbGlDMXp3YNtkWIUX19ABpsQgABaBE6AByCOAcIwG02EyIAyoBCACABAAoVIBIAAEGAFQAEAAQIgBADHgIgCEgAAKIABAABMRAwAAAAoKAAAAEAAIhEA5IgCAmCqwQErkREgAQoAAAgABAAAAAIABAoMABAEIABAAAgAAgQAAAAAAAMAAAAACARAAgAIAAABgkCQABAACwAKwAcAB4AEEALwA0ACIAEwAKoAb4A9AD9AIQAiYBHAE0AMCAYYBlADngH4AfoBFACNQEiASUAlIBPwC5gF6AMUAbQA4gCRAFDwKPApEBeYDBgGSANZAeOA-kdArAAWABUADgAIIA

### Contains test mail
Check whether any request contains a SHA-256 hashed or Base64-encoded version of our test mail address or password.