### Setup
import statements and general analysis setup

In [None]:
import json
from haralyzer import HarParser
from urllib.parse import urlparse

In [None]:
# Site under analysis and the archive category its HAR capture is stored under.
# NOTE(review): `type` shadows the builtin; the name is kept because other
# cells may read it -- confirm before renaming.
host = "merkur.de"
type = "news/desktop"
har_file_path = f"../archives/{type}/www.{host}.har"

# Load the HAR capture (a JSON document) into a plain dictionary.
with open(har_file_path, encoding="utf-8") as har_file:
    har_data = json.load(har_file)

# Hand the dictionary to haralyzer's parser and pull out the recorded entries.
har_parser = HarParser(har_data)
entries = har_parser.har_data['entries']

print(f"Number of entries: {len(entries)}")

### Device Data in Requests

In [None]:
# Collect the User-Agent header value of every request that sends one.
all_user_agents = []

for entry in entries:
    request = entry.get('request', {})
    requestHeaders = request.get('headers', [])

    # Header names are compared case-insensitively; only the first matching
    # header of each request is used. `.get` keeps a header object without a
    # 'name' key from aborting the whole analysis.
    user_agent_header = next(
        (header for header in requestHeaders if header.get('name', '').lower() == 'user-agent'),
        None,
    )
    if user_agent_header:
        all_user_agents.append(user_agent_header['value'])

# An empty HAR capture would otherwise raise ZeroDivisionError here.
user_agent_percentage = round(len(all_user_agents) / len(entries) * 100) if entries else 0

print("Count: " + str(len(all_user_agents)))
print("percentage of reqeuets with user agent: " + str(user_agent_percentage) + "%\n")
print("All user agents:")
# Sorted so the listing is stable between runs (set iteration order is arbitrary).
for user_agent in sorted(set(all_user_agents)):
    print(user_agent)


### Cookies
find all set-cookie values in response headers

In [None]:
import re

# Matches the Domain attribute of a Set-Cookie header value.
# - Cookie attribute names are case-insensitive, so "domain=" and "DOMAIN="
#   must be recognised as well as "Domain=".
# - The leading ';' anchors the match to an attribute position so that a
#   cookie *name* ending in "domain" (e.g. "cross_domain=1") is not mistaken
#   for the attribute once matching is case-insensitive.
# Compiled once here instead of once per header.
domain_pattern = re.compile(r';\s*Domain=([^;]+)', re.IGNORECASE)

# Print every Set-Cookie response header that carries an explicit Domain
# attribute, together with that domain.
for entry in entries:
    response = entry.get('response', {})
    responseHeaders = response.get('headers', [])

    for header in responseHeaders:
        if header['name'].lower() != 'set-cookie':
            continue

        cookies = header['value']
        domain_match = domain_pattern.search(cookies)
        # Strip surrounding whitespace so "Domain= .example.com" prints cleanly.
        domain = domain_match.group(1).strip() if domain_match else None

        # Cookies without a Domain attribute are not reported.
        if domain:
            print(f"Set-Cookie: {cookies}")
            print(f"Domain: {domain}")


### Third Party Requests
find requests to third party domains of all types except stylesheet, image and font

In [None]:
from collections import Counter

# Resource types that are left out of the third-party analysis.
excluded_resource_types = ('stylesheet', 'font', 'image')

# Hostnames of all requests that go to a third-party domain (one item per request).
all_request_domains = []

# Assumption: entries is a list of HAR entry dictionaries.
for entry in entries:
    request = entry.get('request', {})
    if 'url' not in request:
        continue

    request_url = urlparse(request['url']).hostname
    # hostname is None for URLs without a network location (e.g. "data:" URLs);
    # such requests have no domain to attribute and are skipped.
    if request_url is None:
        continue

    # First party means the analysed host itself or one of its subdomains.
    # A plain substring test would also accept e.g. "merkur.de.tracker.com".
    is_first_party = request_url == host or request_url.endswith('.' + host)

    # '_resourceType' is not present in every HAR export, hence .get().
    if not is_first_party and entry.get('_resourceType') not in excluded_resource_types:
        all_request_domains.append(request_url)

# Count how often each domain occurs.
domain_counter = Counter(all_request_domains)

# Order the domains by number of occurrences, most frequent first.
sorted_domains = domain_counter.most_common()

print(f"Total number of unique domains: {len(sorted_domains)}")

for domain, count in sorted_domains:
    print(f"{domain} ({count})")


### Query strings

In [None]:
# Gather every "name=value" query parameter seen across all requests.
all_query_strings = []

for entry in entries:
    request = entry.get('request', {})

    # Requests without a queryString key, or with an empty list, contribute nothing.
    if 'queryString' not in request or not request['queryString']:
        continue

    all_query_strings.extend(
        param['name'] + '=' + param['value'] for param in request['queryString']
    )


# Print each distinct "name=value" pair once.
for query_string in set(all_query_strings):
    print(query_string)

### Contains test mail
check if any request contains a SHA-256-hashed or Base64-encoded version of our test mail address or password