## Imports

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
from pathlib import Path
import datetime


import time
import sys
import os
module_path = os.path.abspath(os.path.join('..', 'utils'))


if module_path not in sys.path:
    print(f"Adding {module_path} to sys.path")
    sys.path.append(module_path)
from plot_utils import show_hist, plot_heatmap_sorted, plot_facets_sorted

# Set up plotting style
plt.style.use('default')
sns.set_palette("husl")

Adding /Users/user/dev/dsa-analysis/analysis/utils to sys.path


## Data

In [5]:
# generated by:
# uv run python scripts/run_pipeline.py --start 2025-01-01 --end 2025-01-07 --ratio 0.04
samples_dir = Path('../../data/2025-01-01_2025-01-07_0.04')
sample_files = list(samples_dir.glob('sample_*.parquet'))
# Load all sample files into a single DataFrame
print(f"Found {len(sample_files)} sample files.")
start = time.time()
dfs = []
for file in sample_files:
    df = pd.read_parquet(file, engine="fastparquet")
    dfs.append(df)
    print(f"Loaded {file.name} with shape {df.shape}")
df = pd.concat(dfs, ignore_index=True)
end = time.time()
print(f"Loaded all data in {end - start:.2f} seconds.")
print(f"Combined DataFrame shape: {df.shape}")
df.shape

Found 7 sample files.
Loaded sample_2025-01-07_2025-01-07.parquet with shape (2099994, 37)
Loaded sample_2025-01-03_2025-01-03.parquet with shape (2199995, 37)
Loaded sample_2025-01-04_2025-01-04.parquet with shape (2029653, 37)
Loaded sample_2025-01-02_2025-01-02.parquet with shape (2299987, 37)
Loaded sample_2025-01-05_2025-01-05.parquet with shape (2300005, 37)
Loaded sample_2025-01-06_2025-01-06.parquet with shape (2200013, 37)
Loaded sample_2025-01-01_2025-01-01.parquet with shape (1800000, 37)
Loaded all data in 30.27 seconds.
Combined DataFrame shape: (14929647, 37)


(14929647, 37)

## Sample statistics

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14929647 entries, 0 to 14929646
Data columns (total 37 columns):
 #   Column                            Dtype   
---  ------                            -----   
 0   uuid                              object  
 1   decision_visibility               object  
 2   decision_visibility_other         object  
 3   end_date_visibility_restriction   object  
 4   decision_monetary                 object  
 5   decision_monetary_other           float64 
 6   end_date_monetary_restriction     object  
 7   decision_provision                object  
 8   end_date_service_restriction      object  
 9   decision_account                  object  
 10  end_date_account_restriction      object  
 11  account_type                      object  
 12  decision_ground                   object  
 13  decision_ground_reference_url     object  
 14  illegal_content_legal_ground      object  
 15  illegal_content_explanation       object  
 16  incompatible_con

In [7]:
df.columns

Index(['uuid', 'decision_visibility', 'decision_visibility_other',
       'end_date_visibility_restriction', 'decision_monetary',
       'decision_monetary_other', 'end_date_monetary_restriction',
       'decision_provision', 'end_date_service_restriction',
       'decision_account', 'end_date_account_restriction', 'account_type',
       'decision_ground', 'decision_ground_reference_url',
       'illegal_content_legal_ground', 'illegal_content_explanation',
       'incompatible_content_ground', 'incompatible_content_explanation',
       'incompatible_content_illegal', 'category', 'category_addition',
       'category_specification', 'category_specification_other',
       'content_type', 'content_type_other', 'content_language',
       'content_date', 'territorial_scope', 'application_date',
       'decision_facts', 'source_type', 'source_identity',
       'automated_detection', 'automated_decision', 'platform_name',
       'platform_uid', 'created_at'],
      dtype='object')

## Analysis

In [None]:

relevant_platforms = ["AliExpress","Amazon Store","AppStore","Booking.com", "Reddit", "Facebook","Google Play","Google Maps","Instagram","LinkedIn","Pinterest","Snapchat","TikTok","Twitter","Wikipedia","YouTube","Zalando", "X"                    
]

print(relevant_platforms)



['AliExpress', 'Amazon Store', 'AppStore', 'Booking.com', 'Reddit', 'Facebook', 'Google Play', 'Google Maps', 'Instagram', 'LinkedIn', 'Pinterest', 'Snapchat', 'TikTok', 'Twitter', 'Wikipedia', 'YouTube', 'Zalando', 'X']


In [9]:
tiktok = df[df["platform_name"]=="TikTok"]
tiktok["category"].value_counts(normalize=True)

category
STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE                           0.457460
STATEMENT_CATEGORY_ILLEGAL_OR_HARMFUL_SPEECH                           0.430189
STATEMENT_CATEGORY_VIOLENCE                                            0.044476
STATEMENT_CATEGORY_NEGATIVE_EFFECTS_ON_CIVIC_DISCOURSE_OR_ELECTIONS    0.028228
STATEMENT_CATEGORY_PROTECTION_OF_MINORS                                0.013980
STATEMENT_CATEGORY_SCAMS_AND_FRAUD                                     0.010694
STATEMENT_CATEGORY_DATA_PROTECTION_AND_PRIVACY_VIOLATIONS              0.006432
STATEMENT_CATEGORY_PORNOGRAPHY_OR_SEXUALIZED_CONTENT                   0.005974
STATEMENT_CATEGORY_SELF_HARM                                           0.001125
STATEMENT_CATEGORY_ANIMAL_WELFARE                                      0.001064
STATEMENT_CATEGORY_INTELLECTUAL_PROPERTY_INFRINGEMENTS                 0.000342
STATEMENT_CATEGORY_NON_CONSENSUAL_BEHAVIOUR                            0.000027
STATEMENT_CATEGORY_RISK_FOR_PUB

In [10]:
list(tiktok["category"].value_counts(normalize=True)[3:].index)

['STATEMENT_CATEGORY_NEGATIVE_EFFECTS_ON_CIVIC_DISCOURSE_OR_ELECTIONS',
 'STATEMENT_CATEGORY_PROTECTION_OF_MINORS',
 'STATEMENT_CATEGORY_SCAMS_AND_FRAUD',
 'STATEMENT_CATEGORY_DATA_PROTECTION_AND_PRIVACY_VIOLATIONS',
 'STATEMENT_CATEGORY_PORNOGRAPHY_OR_SEXUALIZED_CONTENT',
 'STATEMENT_CATEGORY_SELF_HARM',
 'STATEMENT_CATEGORY_ANIMAL_WELFARE',
 'STATEMENT_CATEGORY_INTELLECTUAL_PROPERTY_INFRINGEMENTS',
 'STATEMENT_CATEGORY_NON_CONSENSUAL_BEHAVIOUR',
 'STATEMENT_CATEGORY_RISK_FOR_PUBLIC_SECURITY',
 'STATEMENT_CATEGORY_UNSAFE_AND_ILLEGAL_PRODUCTS']

In [22]:
tiktok["content_type"].value_counts()

content_type
["CONTENT_TYPE_TEXT"]                         386646
["CONTENT_TYPE_VIDEO"]                        326042
["CONTENT_TYPE_IMAGE"]                         45325
["CONTENT_TYPE_OTHER"]                         26044
["CONTENT_TYPE_AUDIO"]                           629
["CONTENT_TYPE_IMAGE","CONTENT_TYPE_TEXT"]        21
["CONTENT_TYPE_TEXT","CONTENT_TYPE_IMAGE"]         2
Name: count, dtype: int64

In [18]:
small_categories_tiktok = tiktok[tiktok["category"].isin(list(tiktok["category"].value_counts(normalize=True)[8:].index))]
# small_categories_tiktok.groupby("category")["source_type"].value_counts().reset_index()
small_categories_tiktok

Unnamed: 0,uuid,decision_visibility,decision_visibility_other,end_date_visibility_restriction,decision_monetary,decision_monetary_other,end_date_monetary_restriction,decision_provision,end_date_service_restriction,decision_account,...,territorial_scope,application_date,decision_facts,source_type,source_identity,automated_detection,automated_decision,platform_name,platform_uid,created_at
86050,e45f9081-feb8-4b2e-adc4-2ed5f3d5634c,"[""DECISION_VISIBILITY_CONTENT_REMOVED""]",,,,,,,,,...,"[""AT"",""BE"",""BG"",""CY"",""CZ"",""DE"",""DK"",""EE"",""ES"",...",2025-01-07 00:00:00,The decision was taken pursuant to own-initiat...,SOURCE_VOLUNTARY,,Yes,AUTOMATED_DECISION_NOT_AUTOMATED,TikTok,7457172299481979680,2025-01-07 13:57:21
90814,54e5deb5-0166-498b-ad2f-01b2b6f20b69,,,,,,,DECISION_PROVISION_PARTIAL_SUSPENSION,2025-01-14 00:00:00,,...,"[""AT"",""BE"",""BG"",""CY"",""CZ"",""DE"",""DK"",""EE"",""ES"",...",2025-01-07 00:00:00,The decision was taken pursuant to own-initiat...,SOURCE_VOLUNTARY,,Yes,AUTOMATED_DECISION_NOT_AUTOMATED,TikTok,7457172322865191712,2025-01-07 13:57:23
90823,502204e1-530f-49ae-85a9-2f0e4f932ec5,"[""DECISION_VISIBILITY_CONTENT_REMOVED""]",,,,,,,,,...,"[""AT"",""BE"",""BG"",""CY"",""CZ"",""DE"",""DK"",""EE"",""ES"",...",2025-01-07 00:00:00,The decision was taken pursuant to own-initiat...,SOURCE_VOLUNTARY,,Yes,AUTOMATED_DECISION_NOT_AUTOMATED,TikTok,7457172329454377760,2025-01-07 13:57:23
95591,d150f13c-d126-45bd-850e-68f3fac18620,"[""DECISION_VISIBILITY_CONTENT_REMOVED""]",,,,,,,,,...,"[""AT"",""BE"",""BG"",""CY"",""CZ"",""DE"",""DK"",""EE"",""ES"",...",2025-01-07 00:00:00,The decision was taken pursuant to own-initiat...,SOURCE_VOLUNTARY,,Yes,AUTOMATED_DECISION_NOT_AUTOMATED,TikTok,7457172210541779745,2025-01-07 13:57:25
95602,2cd8cdf6-6c3b-4ff5-9a79-284162f9d9e9,,,,,,,DECISION_PROVISION_PARTIAL_SUSPENSION,2025-01-07 00:00:00,,...,"[""AT"",""BE"",""BG"",""CY"",""CZ"",""DE"",""DK"",""EE"",""ES"",...",2025-01-07 00:00:00,The decision was taken pursuant to own-initiat...,SOURCE_VOLUNTARY,,Yes,AUTOMATED_DECISION_NOT_AUTOMATED,TikTok,7457172210541796129,2025-01-07 13:57:25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14861183,e33f3b42-75df-4f91-980c-97d86a8732d1,"[""DECISION_VISIBILITY_OTHER""]",Video not eligible for recommendation in the F...,,,,,,,,...,"[""AT"",""BE"",""BG"",""CY"",""CZ"",""DE"",""DK"",""EE"",""ES"",...",2025-01-01 00:00:00,The decision was taken pursuant to own-initiat...,SOURCE_VOLUNTARY,,Yes,AUTOMATED_DECISION_FULLY,TikTok,7455013547894954785,2025-01-01 18:20:22
14866874,4b459ec3-58a4-4cf4-9f03-b28861cb9918,"[""DECISION_VISIBILITY_CONTENT_REMOVED""]",,,,,,,,,...,"[""AT"",""BE"",""BG"",""CY"",""CZ"",""DE"",""DK"",""EE"",""ES"",...",2025-01-01 00:00:00,The decision was taken pursuant to own-initiat...,SOURCE_VOLUNTARY,,Yes,AUTOMATED_DECISION_NOT_AUTOMATED,TikTok,7455013570955221793,2025-01-01 18:20:27
14872859,7e642d30-5f49-4b69-8fce-cde692571147,"[""DECISION_VISIBILITY_CONTENT_REMOVED""]",,,,,,,,,...,"[""AT"",""BE"",""BG"",""CY"",""CZ"",""DE"",""DK"",""EE"",""ES"",...",2025-01-01 00:00:00,The decision was taken pursuant to own-initiat...,SOURCE_VOLUNTARY,,Yes,AUTOMATED_DECISION_NOT_AUTOMATED,TikTok,7455013602764905249,2025-01-01 18:20:32
14887823,3baa7799-cead-4c57-a5bb-97d161fdc007,"[""DECISION_VISIBILITY_CONTENT_REMOVED""]",,,,,,,,,...,"[""AT"",""BE"",""BG"",""CY"",""CZ"",""DE"",""DK"",""EE"",""ES"",...",2025-01-01 00:00:00,The decision was taken pursuant to own-initiat...,SOURCE_VOLUNTARY,,Yes,AUTOMATED_DECISION_NOT_AUTOMATED,TikTok,7455013679499631393,2025-01-01 18:20:41


In [21]:
for platform in relevant_platforms[:]:
    
    tmp = df[df["platform_name"]==platform]
    print(f"{platform}, with {tmp.shape[0]:,} samples:")
    print(tmp["decision_provision"].value_counts()/tmp.shape[0])
    print()

AliExpress, with 94,799 samples:
decision_provision
DECISION_PROVISION_TOTAL_SUSPENSION     0.000148
DECISION_PROVISION_TOTAL_TERMINATION    0.000074
Name: count, dtype: float64

Amazon Store, with 147,849 samples:
decision_provision
DECISION_PROVISION_PARTIAL_SUSPENSION    0.667931
Name: count, dtype: float64

AppStore, with 0 samples:
Series([], Name: count, dtype: float64)

Booking.com, with 503 samples:
Series([], Name: count, dtype: float64)

Reddit, with 266 samples:
Series([], Name: count, dtype: float64)

Facebook, with 389,848 samples:
Series([], Name: count, dtype: float64)

Google Play, with 1,058 samples:
decision_provision
DECISION_PROVISION_PARTIAL_TERMINATION    0.000945
Name: count, dtype: float64

Google Maps, with 55,822 samples:
decision_provision
DECISION_PROVISION_TOTAL_TERMINATION      0.013418
DECISION_PROVISION_PARTIAL_TERMINATION    0.000573
Name: count, dtype: float64

Instagram, with 63,767 samples:
Series([], Name: count, dtype: float64)

LinkedIn, with 154 