Load all files into a dictionary.

In [9]:
import os
import json
from urllib.parse import urlsplit

#dir_optOut = "../data/000/"
#dir_optIn = "../data/001/"
dir_optOut = "../data/optOut/"
dir_optIn = "../data/optIn/"

def load_dir(dir):
    data = {}
    for file_name in os.listdir(dir):
        if file_name.endswith(".json"):
            file = open(os.path.join(dir, file_name))
            file_data = json.load(file)
            file.close()
            if "data" in file_data:
                url = urlsplit(file_data["initialUrl"]).netloc
                data.update({url: file_data})
    return data

data_optOut = load_dir(dir_optOut)
data_optIn = load_dir(dir_optIn)

print(str(len(data_optIn)) + " files loaded from optIn")
print(str(len(data_optOut)) + " files loaded from optOut")

8039 files loaded from optIn
8009 files loaded from optOut


Filter out files where CMPs did not succeed.

In [10]:
def filter_cmps(data):
    t = 0
    websites = tuple(data.keys())
    for website in websites:
        if "cmps" in data[website]["data"]:
            t += 1
            for cmp in data[website]["data"]["cmps"]:
                if not cmp["succeeded"]:
                    data.pop(website)
                    break

filter_cmps(data_optIn)
filter_cmps(data_optOut)

print(str(len(data_optIn)) + " files remained after filtering in optIn")
print(str(len(data_optOut)) + " files remained after filtering in optOut")

7304 files remained after filtering in optIn
7140 files remained after filtering in optOut


Make sure both dictionaries have the same websites.

In [11]:
optIn_keys = tuple(data_optIn.keys())
optOut_keys = tuple(data_optOut.keys())

def match_dictionaries(data, keys1, keys2):
    websites = tuple(data.keys())
    for website in websites:
        if (not website in keys1) or (not website in keys2):
            data.pop(website)

match_dictionaries(data_optIn, optIn_keys, optOut_keys)
match_dictionaries(data_optOut, optIn_keys, optOut_keys)

print(str(len(data_optIn)) + " files remained after filtering in optIn")
print(str(len(data_optOut)) + " files remained after filtering in optOut")

6636 files remained after filtering in optIn
6636 files remained after filtering in optOut


Show some info about sendBeacon method for each website.
Output format contains:
  - website
  - #times sendBeacon called
  - bits of data send

In [12]:
from operator import itemgetter

def get_sendBeacon_info(data):
    website_sendBeacon_info = {}

    for website in data:
        nr_sendBeacon_calls = len(data[website]["data"]["apis"]["callStats"])
        size = 0
        for savedCall in data[website]["data"]["apis"]["savedCalls"]:
            for argument in savedCall["arguments"]:
                size += len(argument)
        website_sendBeacon_info.update({website: (nr_sendBeacon_calls, size)})
    return website_sendBeacon_info

optIn_sendBeacon_info = get_sendBeacon_info(data_optIn)
optOut_sendBeacon_info = get_sendBeacon_info(data_optOut)

Compare optIn results with optOut results.

In [13]:
def merge_dictionaries(dict_optIn, dict_optOut):
    merged_dict = {}
    for elem in dict_optOut:
        merged_dict.update({elem: {"optOut": dict_optOut[elem], "optIn": (0,0)}})
    for elem in dict_optIn:
        if elem in merged_dict:
            merged_dict[elem].update({"optIn": dict_optIn[elem]})
        else:
            merged_dict.update({elem: {"optOut": (0,0), "optIn": dict_optIn[elem]}})
    return merged_dict
    
merged_dict = merge_dictionaries(optIn_sendBeacon_info, optOut_sendBeacon_info)

List number of websites that send more sendBeacon data when consent mode is optOut.

In [14]:
optIn_bigger = 0
optIn_size = 0
optOut_bigger = 0
optOut_size = 0
same_size = 0

for website in merged_dict:
    optIn = merged_dict[website]["optIn"]
    optOut = merged_dict[website]["optOut"]

    if optIn[1] > optOut[1]:
        optIn_bigger += 1
        optIn_size += optIn[1] - optOut[1]
    elif optIn[1] < optOut[1]:
        optOut_bigger += 1
        optOut_size += optOut[1] - optIn[1]
    else:
        same_size += 1

print(str(optIn_bigger) + " websites send an average of " + str(round(optIn_size/optIn_bigger)) + " bytes more data with consent mode optIn")
print(str(optOut_bigger) + " websites send an average of " + str(round(optOut_size/optOut_bigger)) + " bytes more data with consent mode optOut")
print(str(same_size) + " websites have no difference")


1888 websites send an average of 4665 bytes more data with consent mode optIn
1884 websites send an average of 5308 bytes more data with consent mode optOut
2864 websites have no difference


Compare cookie behaviour based on consent mode, starting with loading all the cookie usage into a dictionary.

In [15]:
optIn_bigger = 0
optIn_size = 0
optOut_bigger = 0
optOut_size = 0
same_nr_cookies = 0

for website in merged_dict:
    nr_cookies_optIn = len(data_optIn[website]["data"]["cookies"])
    nr_cookies_optOut = len(data_optOut[website]["data"]["cookies"])
        
    if nr_cookies_optIn > nr_cookies_optOut:
        optIn_bigger += 1
        optIn_size += nr_cookies_optIn - nr_cookies_optOut
    elif nr_cookies_optIn < nr_cookies_optOut:
        optOut_bigger += 1
        optOut_size += nr_cookies_optOut - nr_cookies_optIn
    else:
        same_nr_cookies += 1

print(str(optIn_bigger) + " websites set an average of " + str(round(optIn_size/optIn_bigger)) + " cookies more with consent mode optIn")
print(str(optOut_bigger) + " websites set an average of " + str(round(optOut_size/optOut_bigger)) + " cookies more with consent mode optOut")
print(str(same_size) + " websites have no difference")

2088 websites set an average of 7 cookies more with consent mode optIn
1967 websites set an average of 7 cookies more with consent mode optOut
2864 websites have no difference
