### Setup
Import statements and general analysis setup.

In [3]:
import json
from haralyzer import HarParser
from urllib.parse import urlparse

In [4]:
# Site under analysis; the HAR archive path is derived from it.
host = "bbc.com"
har_file_path = "../archives/www." + host + ".har"

# Load the HAR archive (a JSON document) into a plain dictionary.
with open(har_file_path, mode="r", encoding="utf-8") as har_file:
    har_data = json.loads(har_file.read())

# Wrap the dictionary in a HarParser and pull out the list of recorded entries.
har_parser = HarParser(har_data)
entries = har_parser.har_data['entries']

entry_count = len(entries)
print("Number of entries: {}".format(entry_count))

Number of entries: 100


### Device Data in Requests

In [5]:
# Collect the User-Agent value of every request that carries one.
all_user_agents = []

for entry in entries:
    request = entry.get('request', {})
    requestHeaders = request.get('headers', [])

    # Header names are compared in lowercase; only the first matching header
    # of each request is taken into account.
    user_agent_header = next(
        (header for header in requestHeaders if header['name'].lower() == 'user-agent'),
        None,
    )
    if user_agent_header:
        all_user_agents.append(user_agent_header['value'])

# An archive without entries would otherwise raise ZeroDivisionError here;
# report 0% in that case instead of crashing the cell.
total_requests = len(entries)
if total_requests:
    user_agent_percentage = round(len(all_user_agents) / total_requests * 100)
else:
    user_agent_percentage = 0

print("Count: " + str(len(all_user_agents)))
print("percentage of reqeuets with user agent: " + str(user_agent_percentage) + "%\n")
print("All user agents:")
# Sort the distinct values so the listing is identical on every run
# (plain set iteration order is arbitrary).
for user_agent in sorted(set(all_user_agents)):
    print(user_agent)


Count: 100
percentage of reqeuets with user agent: 100%

All user agents:
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36


### Cookies
Find all `Set-Cookie` values in the response headers.

In [6]:
# Print the value of every Set-Cookie header found in any response.
for entry in entries:
    for response_header in entry.get('response', {}).get('headers', []):
        # Skip everything that is not a Set-Cookie header (name compared in lowercase).
        if response_header['name'].lower() != 'set-cookie':
            continue
        cookies = response_header['value']
        print(f"Set-Cookie: {cookies}")

Set-Cookie: AWSALB=8hEzRFfYRCZnqciJVvGHP4dcN9jXX/AjZzm4NZiuSwzZ7YOmHnLUydAoWW6QCC6SLpeJDd6+O86YsY8bbe+SSlh2IiLByNZXgCCm8enRyg97Z9fT4qtF7b+VEWw+; Expires=Thu, 21 Dec 2023 18:52:04 GMT; Path=/
Set-Cookie: AWSALBCORS=8hEzRFfYRCZnqciJVvGHP4dcN9jXX/AjZzm4NZiuSwzZ7YOmHnLUydAoWW6QCC6SLpeJDd6+O86YsY8bbe+SSlh2IiLByNZXgCCm8enRyg97Z9fT4qtF7b+VEWw+; Expires=Thu, 21 Dec 2023 18:52:04 GMT; Path=/; SameSite=None; Secure


### Third Party Requests
Find requests to third-party domains of all resource types except stylesheet, image, and font.

In [7]:
# Resource types that are deliberately left out of the third-party report.
EXCLUDED_RESOURCE_TYPES = {'stylesheet', 'font', 'image'}

# One "<resource type> -> <hostname>" label per third-party request.
all_request_domains = []

for entry in entries:
    request = entry.get('request', {})

    # urlparse(...).hostname is None when the URL has no network location
    # (missing URL, data:/blob: URLs, ...); such entries cannot be attributed
    # to a domain and are skipped instead of raising a TypeError.
    request_host = urlparse(request.get('url', '')).hostname
    if not request_host:
        continue

    # First party means the analysed host itself or one of its subdomains.
    # A plain substring test would also treat e.g. "bbc.com.example.net" as
    # first party, so compare on the domain boundary instead.
    if request_host == host or request_host.endswith('.' + host):
        continue

    # '_resourceType' is an underscore-prefixed custom field and may be absent
    # depending on the tool that produced the HAR; fall back to a placeholder.
    resource_type = entry.get('_resourceType') or 'unknown'
    if resource_type in EXCLUDED_RESOURCE_TYPES:
        continue

    all_request_domains.append(resource_type + " -> " + request_host)

# The total counts every matching request; the listing shows each distinct
# type/host combination once, sorted so the output is reproducible.
print(f"Total number of requests: {len(all_request_domains)}")
for domain in sorted(set(all_request_domains)):
    print(domain)

Total number of requests: 70
document -> edigitalsurvey.com
script -> weather.files.bbci.co.uk
script -> scripts.webcontentassessor.com
script -> prebid.the-ozone-project.com
script -> m.files.bbci.co.uk
script -> emp.bbci.co.uk
script -> static.files.bbci.co.uk
other -> static.files.bbci.co.uk
script -> assets.zephr.com
script -> nav.files.bbci.co.uk
preflight -> cdn.privacy-mgmt.com
script -> static.bbci.co.uk
script -> mybbc-analytics.files.bbci.co.uk
script -> cdn.permutive.com
script -> uk-script.dotmetrics.net
fetch -> idcta.api.bbc.co.uk
script -> securepubads.g.doubleclick.net
script -> bbc.gscontxt.net
script -> idcta.api.bbc.co.uk
script -> sb.scorecardresearch.com
script -> cdn.privacy-mgmt.com
xhr -> cdn.privacy-mgmt.com
fetch -> gn-flagpoles.api.bbci.co.uk
script -> cdn.adsafeprotected.com


### Query strings

In [8]:
# Gather every "name=value" pair found in the query string of any request.
all_query_strings = []

for entry in entries:
    query_params = entry.get('request', {}).get('queryString')
    # A missing or empty queryString list contributes nothing.
    if not query_params:
        continue
    all_query_strings += [param['name'] + '=' + param['value'] for param in query_params]

# Print each distinct pair once (set iteration order is arbitrary).
for query_string in set(all_query_strings):
    print(query_string)

x=1440
body=%7B%22accountId%22%3A1786%2C%22campaignEnv%22%3A%22prod%22%2C%22campaigns%22%3A%7B%22ccpa%22%3A%7B%22alwaysDisplayDNS%22%3Afalse%2C%22status%22%3A%22rejectedNone%22%2C%22hasLocalData%22%3A36%2C%22targetingParams%22%3A%7B%7D%7D%2C%22gdpr%22%3A%7B%22consentStatus%22%3A%7B%22hasConsentData%22%3Atrue%2C%22consentedToAll%22%3Atrue%2C%22consentedToAny%22%3Atrue%2C%22rejectedAny%22%3Afalse%7D%2C%22hasLocalData%22%3Atrue%2C%22targetingParams%22%3A%7B%7D%7D%7D%2C%22clientMMSOrigin%22%3A%22https%3A%2F%2Fcdn.privacy-mgmt.com%22%2C%22hasCSP%22%3Atrue%2C%22includeData%22%3A%7B%22localState%22%3A%7B%22type%22%3A%22string%22%7D%2C%22actions%22%3A%7B%22type%22%3A%22RecordString%22%7D%2C%22cookies%22%3A%7B%22type%22%3A%22RecordString%22%7D%7D%2C%22propertyHref%22%3A%22https%3A%2F%2Fwww.bbc.com%2Fweather%22%7D
ptrt=https://www.bbc.com/weather
localState=%7B%22gdpr%22%3A%7B%22mmsCookies%22%3A%5B%22_sp_v1_ss%3D1%3AH4sIAAAAAAAAAItWqo5RKimOUbKKxsrIAzEMamN1YpRSQcy80pwcILsErKC6lgwJpVgAEA5-UnQAAAA%

### Contains test mail
Check whether any request contains a SHA-256 or Base64-encoded version of our test email address or password.