In [1]:
# !pip3 install pandas
# python -m ipykernel install --user --name=.venv --display-name "Python (.venv) Notes"
import pandas as pd
import json
import copy

# Put pandas' row-display limit back to its default so large frames are truncated;
# later cells lift the limit again with pd.set_option('display.max_rows', None).
pd.reset_option('display.max_rows')

In [2]:
# collect scan data from https://app.cookieyes.com/manage-cookies 
# https://app.cookieyes.com/manage-cookies/scan-history/1347165

In [3]:
# Path to the scan export (JSON) that this notebook analyses
file_path = '2024.08.09._scan.json'

# Open the file and load its contents.
# JSON is UTF-8 by specification, so state the encoding explicitly instead of
# relying on the platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    cookieyes_data = json.load(file)

# Index every scanned cookie by its cookie_id, keeping the category the scan
# assigned to it. If an id occurs in several categories, the last one wins.
all_about_cookies = {
    scanned_cookie["cookie_id"]: {
        "cookie_id": scanned_cookie["cookie_id"],
        "scan_category": category["name"],
    }
    for category in cookieyes_data["categories"]
    for scanned_cookie in category["cookies"].values()
}

In [4]:
# The dict-of-dicts yields one column per cookie; transpose for one row per cookie.
df = pd.DataFrame(all_about_cookies)
df.transpose().sort_values('cookie_id')

Unnamed: 0,cookie_id,scan_category
ANONCHK,ANONCHK,Advertisement
AUTH_SESSION_ID,AUTH_SESSION_ID,Uncategorized
AUTH_SESSION_ID_LEGACY,AUTH_SESSION_ID_LEGACY,Uncategorized
AWSALB,AWSALB,Performance
AWSALBCORS,AWSALBCORS,Necessary
...,...,...
yt-remote-session-app,yt-remote-session-app,Functional
yt-remote-session-name,yt-remote-session-name,Functional
yt.innertube::nextId,yt.innertube::nextId,Advertisement
yt.innertube::requests,yt.innertube::requests,Advertisement


In [5]:
# collect current cookies from CookieYes
# https://app.cookieyes.com/api/v2/websites/735756/categories

file_path = '2024.08.09._cookieyes.json'

# Open the file and load its contents.
# JSON is UTF-8 by specification, so state the encoding explicitly instead of
# relying on the platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    cookieyes_data = json.load(file)

# Flatten the per-category payload into one list of cookies and build a
# lookup from category id to its English display name.
cookieyes_cookies = []
cookieyes_categories = {}
for category in cookieyes_data:
    cookieyes_categories[category["id"]] = category["name"]["en"]
    # Deep-copy so the edits below never touch the loaded JSON structure.
    cookieyes_cookies.extend(copy.deepcopy(cookie) for cookie in category["cookies"])

# Fields dropped from every cookie before it is merged into the overview.
useless_keys = [
    "id", "description", "default_duration", "website_id", "url_pattern_updated",
    "created_at", "updated_at", "data_migrated_at",
    "type", "category_id", "status"
]

for cookie in cookieyes_cookies:
    # Resolve the category name first: "category_id" is removed right below.
    cookie["category"] = cookieyes_categories[cookie["category_id"]]
    for key in useless_keys:
        cookie.pop(key, None)

    # Named cookie_id (not `id`) so the builtin id() is not shadowed in the kernel.
    cookie_id = cookie["cookie_id"]
    # Merge onto the scan entry when one exists; otherwise the cookie is known
    # to CookieYes only and becomes a new entry without "scan_category".
    all_about_cookies[cookie_id] = {**all_about_cookies.get(cookie_id, {}), **cookie}


# Spot-check one merged entry.
all_about_cookies['ANONCHK']

{'cookie_id': 'ANONCHK',
 'scan_category': 'Advertisement',
 'duration': {'en': '10 minutes'},
 'domain': '.c.clarity.ms',
 'script_slug': 'bing',
 'url_pattern': 'bing.com',
 'created_from_scan': 1,
 'category': 'Marketing cookies'}

In [6]:
# The dict-of-dicts yields one column per cookie; transpose for one row per cookie.
df = pd.DataFrame(all_about_cookies)
df.transpose().sort_values('cookie_id')

Unnamed: 0,cookie_id,scan_category,duration,domain,script_slug,url_pattern,created_from_scan,category
1P_JAR,1P_JAR,,{'en': '1 month'},.google.com,,google.com,0,Analytics cookies
ADDED,ADDED,,{'en': 'persistent'},.google.*,,.google.*,0,Analytics cookies
AEC,AEC,,{'en': '6 months'},.google.com,,google.com,0,Analytics cookies
ANONCHK,ANONCHK,Advertisement,{'en': '10 minutes'},.c.clarity.ms,bing,bing.com,1,Marketing cookies
APISID,APISID,,{'en': '2 years'},.google.com,,google.com,0,Analytics cookies
...,...,...,...,...,...,...,...,...
yt-remote-session-app,yt-remote-session-app,Functional,{'en': 'session'},youtube.com,youtube,youtube.com,1,Functional cookies
yt-remote-session-name,yt-remote-session-name,Functional,{'en': 'session'},youtube.com,youtube,youtube.com,1,Functional cookies
yt.innertube::nextId,yt.innertube::nextId,Advertisement,{'en': 'Never Expires'},youtube.com,youtube,youtube.com,1,Marketing cookies
yt.innertube::requests,yt.innertube::requests,Advertisement,{'en': 'Never Expires'},youtube.com,youtube,youtube.com,1,Marketing cookies


In [7]:
# Translate both naming schemes (scan categories and CookieYes categories)
# to common labels so they can be compared; unknown names map to themselves.
categories_mapping = {
    'Advertisement': 'Marketing', # We move all Advertisement cookies to Marketing cookies
    'Performance': 'Marketing', # We move all Performance cookies to Marketing cookies

    'Marketing cookies': 'Marketing',
    'Analytics cookies': 'Analytics',
    'Functional cookies': 'Functional',
    'Necessary cookies': 'Necessary',
}

# Cookies seen by the scan whose CookieYes category differs from the scan category.
# 'Uncategorized' scan results filed under 'Necessary cookies' are not reported.
# item.get('category') is used because an entry that exists only in the scan has
# no 'category' key; such an entry is reported as a mismatch instead of raising KeyError.
moved_cookies = sorted(
    [
        item for item in all_about_cookies.values()
        if item.get('scan_category') is not None
            and not (item['scan_category'] == 'Uncategorized' and item.get('category') == 'Necessary cookies')
            and categories_mapping.get(item['scan_category'], item['scan_category'])
                != categories_mapping.get(item.get('category'), item.get('category'))
    ],
    key=lambda x: x['cookie_id']
)

pd.set_option('display.max_rows', None)
df_moved_cookies = pd.DataFrame(moved_cookies)
# The records are already ordered by cookie_id, so no sort_values() is needed;
# calling it on an empty frame (no columns) would raise KeyError: 'cookie_id'.
df_moved_cookies


Unnamed: 0,cookie_id,scan_category,duration,domain,script_slug,url_pattern,created_from_scan,category
0,AWSALB,Performance,{'en': '7 days'},docs.tripleten.com,aws,,1,Necessary cookies
1,MSPTC,Uncategorized,{'en': '1 year 24 days'},.bing.com,,bing.com,1,Marketing cookies
2,__gtm_campaign_url,Uncategorized,{'en': 'session'},.tripleten.com,,googletagmanager.com,1,Analytics cookies
3,tableau_public_negotiated_locale,Functional,{'en': 'session'},public.tableau.com,tableau,,1,Necessary cookies


In [8]:
# List of suspicious cookies

# 0	AWSALB	Performance	{'en': '7 days'}	docs.tripleten.com	aws	None	1	Necessary cookies
# Ok. We can move any cookie to Necessary

# 1	MSPTC	Uncategorized	{'en': '1 year 24 days'}	.bing.com	None	bing.com	1	Marketing cookies
# Need to check url_pattern - OK (this domain is blocked in the same category)

# 2	__gtm_campaign_url	Uncategorized	{'en': 'session'}	.tripleten.com	None	googletagmanager.com	1	Analytics cookies
# Need to check url_pattern - changed to google-analytics.com|googletagmanager.com/gtag/js

# 3	tableau_public_negotiated_locale	Functional	{'en': 'session'}	public.tableau.com	tableau	None	1	Necessary cookies
# Ok. We can move any cookie to Necessary


In [9]:
# Cookies that have no scan_category (i.e. not present in the scan data)
# and are filed under anything other than 'Necessary cookies'.
added_cookies = sorted(
    (
        item for item in all_about_cookies.values()
        if item.get('scan_category') is None
            and item['category'] != 'Necessary cookies'  # only non-necessary cookies need checking
    ),
    key=lambda x: x['cookie_id']
)

pd.set_option('display.max_rows', None)
df_added_cookies = pd.DataFrame(added_cookies)
# The records are already ordered by cookie_id, so no sort_values() is needed;
# calling it on an empty frame (no columns) would raise KeyError: 'cookie_id'.
df_added_cookies

Unnamed: 0,cookie_id,duration,domain,script_slug,url_pattern,created_from_scan,category
0,1P_JAR,{'en': '1 month'},.google.com,,google.com,0,Analytics cookies
1,ADDED,{'en': 'persistent'},.google.*,,.google.*,0,Analytics cookies
2,AEC,{'en': '6 months'},.google.com,,google.com,0,Analytics cookies
3,APISID,{'en': '2 years'},.google.com,,google.com,0,Analytics cookies
4,DSID,{'en': '2 weeks'},doubleclick.net,,doubleclick.net,0,Marketing cookies
5,DV,{'en': '1 hour'},google.com,,google.com,0,Analytics cookies
6,NID,{'en': '2 years'},google.com,,google.com,0,Analytics cookies
7,SAPISID,{'en': 'Persistent'},google.*,,google.*,0,Analytics cookies
8,SID,{'en': '2 years'},.google.*,,.google.*,0,Analytics cookies
9,SIDCC,{'en': '2 years'},.google.*,,.google.*,0,Analytics cookies


In [72]:
# Collect every URL pattern together with the CookieYes categories that use it
url_pattern_dict = {}

for item in all_about_cookies.values():
    # Entries that come only from the scan have no 'url_pattern' key at all;
    # .get() treats them like cookies whose pattern is explicitly null.
    url_pattern = item.get('url_pattern')
    if url_pattern is None:
        continue

    # Record each CookieYes category once per pattern, in first-seen order
    category = item['category']
    pattern_categories = url_pattern_dict.setdefault(url_pattern, [])
    if category not in pattern_categories:
        pattern_categories.append(category)

url_pattern_dict

{'googletagmanager.com': ['Analytics cookies'],
 'bing.com': ['Marketing cookies', 'Analytics cookies'],
 'tiktok.com|analytics.tiktok.com/i18n/pixel/config.js': ['Marketing cookies'],
 'doubleclick.net': ['Marketing cookies', 'Analytics cookies'],
 'rudderlabs.com': ['Marketing cookies'],
 'youtube.com': ['Marketing cookies', 'Functional cookies'],
 'google-analytics.com|googletagmanager.com/gtag/js': ['Analytics cookies'],
 'facebook.net': ['Analytics cookies'],
 'js.hs-analytics.net': ['Analytics cookies'],
 'hubspot.com': ['Analytics cookies'],
 'tildacdn.com/js/tilda-stat-1.0.min.js': ['Analytics cookies'],
 'woopra.com': ['Analytics cookies'],
 'clarity.ms': ['Analytics cookies'],
 'typeform.com': ['Analytics cookies'],
 'google.com': ['Analytics cookies'],
 'tomi.ai': ['Analytics cookies'],
 'google.*': ['Analytics cookies', 'Marketing cookies'],
 '.google.*': ['Analytics cookies'],
 'reddit.com': ['Marketing cookies']}

In [None]:
# '.google.*': ['Analytics cookies'],
# 'google.*': ['Analytics cookies', 'Marketing cookies'],
# 'google.com': ['Analytics cookies'],

# 'doubleclick.net': ['Marketing cookies', 'Analytics cookies'],

# 'bing.com': ['Marketing cookies', 'Analytics cookies'],

# These cookies are split between categories by CookieYes itself, and I think that should work correctly
# 'youtube.com': ['Marketing cookies', 'Functional cookies'],

# All the rest are fine
# 'clarity.ms': ['Analytics cookies'],
# 'facebook.net': ['Analytics cookies'],
# 'google-analytics.com|googletagmanager.com/gtag/js': ['Analytics cookies'],
# 'googletagmanager.com': ['Analytics cookies'],
# 'hubspot.com': ['Analytics cookies'],
# 'js.hs-analytics.net': ['Analytics cookies'],
# 'reddit.com': ['Marketing cookies']
# 'rudderlabs.com': ['Marketing cookies'],
# 'tiktok.com|analytics.tiktok.com/i18n/pixel/config.js': ['Marketing cookies'],
# 'tildacdn.com/js/tilda-stat-1.0.min.js': ['Analytics cookies'],
# 'tomi.ai': ['Analytics cookies'],
# 'typeform.com': ['Analytics cookies'],
# 'woopra.com': ['Analytics cookies'],