In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
from pathlib import Path

# Set up plotting style: start from matplotlib's 'default' style sheet and
# make seaborn's "husl" palette the default colour cycle for later plots.
plt.style.use('default')
sns.set_palette("husl")

In [25]:
# Directory containing the sampled parquet files for this analysis window.
samples_dir = Path('../../data/2025-01-01_2025-01-07_0.04')

# Sort the matches: glob order depends on the filesystem, so without sorting
# the row order of the combined frame can differ between runs and machines.
sample_files = sorted(samples_dir.glob('sample_*.parquet'))
print(f"Found {len(sample_files)} sample files.")

# Fail early with an actionable message; pd.concat on an empty list would
# otherwise raise an opaque "No objects to concatenate" ValueError.
if not sample_files:
    raise FileNotFoundError(
        f"No sample_*.parquet files found in {samples_dir.resolve()}"
    )

# Load every sample file into its own frame, then combine them once at the end.
# The per-file frame gets its own name so it never shadows the combined `df`.
dfs = []
for file in sample_files:
    sample_df = pd.read_parquet(file, engine="fastparquet")
    dfs.append(sample_df)
    print(f"Loaded {file.name} with shape {sample_df.shape}")

# ignore_index=True gives the combined frame a fresh 0..n-1 RangeIndex.
df = pd.concat(dfs, ignore_index=True)

print(f"Combined DataFrame shape: {df.shape}")
df.shape

Found 3 sample files.
Loaded sample_2025-01-03_2025-01-03.parquet with shape (2199995, 37)
Loaded sample_2025-01-02_2025-01-02.parquet with shape (2299987, 37)
Loaded sample_2025-01-01_2025-01-01.parquet with shape (1800000, 37)
Combined DataFrame shape: (6299982, 37)


(6299982, 37)

In [26]:
# Print a structural summary of the combined frame (index range, column names, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6299982 entries, 0 to 6299981
Data columns (total 37 columns):
 #   Column                            Dtype   
---  ------                            -----   
 0   uuid                              object  
 1   decision_visibility               object  
 2   decision_visibility_other         object  
 3   end_date_visibility_restriction   object  
 4   decision_monetary                 object  
 5   decision_monetary_other           float64 
 6   end_date_monetary_restriction     float64 
 7   decision_provision                object  
 8   end_date_service_restriction      object  
 9   decision_account                  object  
 10  end_date_account_restriction      object  
 11  account_type                      object  
 12  decision_ground                   object  
 13  decision_ground_reference_url     object  
 14  illegal_content_legal_ground      object  
 15  illegal_content_explanation       object  
 16  incompatible_conte

In [27]:
# Show the full list of column names for reference when selecting fields below.
df.columns

Index(['uuid', 'decision_visibility', 'decision_visibility_other',
       'end_date_visibility_restriction', 'decision_monetary',
       'decision_monetary_other', 'end_date_monetary_restriction',
       'decision_provision', 'end_date_service_restriction',
       'decision_account', 'end_date_account_restriction', 'account_type',
       'decision_ground', 'decision_ground_reference_url',
       'illegal_content_legal_ground', 'illegal_content_explanation',
       'incompatible_content_ground', 'incompatible_content_explanation',
       'incompatible_content_illegal', 'category', 'category_addition',
       'category_specification', 'category_specification_other',
       'content_type', 'content_type_other', 'content_language',
       'content_date', 'territorial_scope', 'application_date',
       'decision_facts', 'source_type', 'source_identity',
       'automated_detection', 'automated_decision', 'platform_name',
       'platform_uid', 'created_at'],
      dtype='object')

In [29]:
# Per-platform row counts and each platform's share of all counted rows.
# NOTE: nothing is filtered out here -- every platform, including Google Shopping,
# contributes to both the counts and the ratio denominator.
# value_counts() skips missing platform names by default, so the ratio is
# relative to rows that have a platform_name.
platform_counts = (df["platform_name"]
                   .value_counts()
                   .to_frame("count")
                   .assign(ratio=lambda counts: counts["count"] / counts["count"].sum())
                   .sort_values("count", ascending=False)
                   .reset_index())

platform_counts

Unnamed: 0,platform_name,count,ratio
0,Google Shopping,5396556,0.8565986
1,TikTok,401247,0.06369018
2,Facebook,206381,0.03275898
3,Pinterest,106844,0.01695941
4,Amazon Store,83355,0.01323099
5,"SIA ""JOOM""",28604,0.00454033
6,Instagram,22070,0.003503185
7,Vinted UAB,8327,0.00132175
8,BlaBlaCar,7754,0.001230797
9,Google Maps,5530,0.0008777803


In [None]:
# Platforms singled out for closer analysis. This cell only defines the list;
# it is not applied as a filter here.
relevant_platforms = [
    "AliExpress",
    "Amazon Store",
    "Facebook",
    "Google Maps",
    "Google Shopping",
    "Instagram",
    "Pinterest",
    "TikTok",
    "YouTube",
]
# Preview the first 20 platform names in platform_counts (ordered by count)
# -- presumably to check the spellings used in the list above.
platform_counts["platform_name"].head(20).tolist()

['Google Shopping',
 'TikTok',
 'Facebook',
 'Pinterest',
 'Amazon Store',
 'SIA "JOOM"',
 'Instagram',
 'Vinted UAB',
 'BlaBlaCar',
 'Google Maps',
 'YouTube',
 'Wallapop',
 'Temu',
 'Discord Netherlands B.V.',
 'Idealo',
 'Trustpilot',
 'leboncoin',
 'Kleinanzeigen',
 'Badoo',
 'Snapchat']

In [14]:
# Number of rows for each (platform, category) pair, largest groups first.
(
    df[["platform_name", "category"]]
    .groupby(["platform_name", "category"])
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

Unnamed: 0,platform_name,category,count
42,Google Shopping,STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE,3580428
35,Facebook,STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE,129743
144,TikTok,STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE,114897
136,TikTok,STATEMENT_CATEGORY_ILLEGAL_OR_HARMFUL_SPEECH,97846
0,Amazon Store,STATEMENT_CATEGORY_SCOPE_OF_PLATFORM_SERVICE,44042
...,...,...,...
126,Threads,STATEMENT_CATEGORY_DATA_PROTECTION_AND_PRIVACY...,1
117,Snapchat,STATEMENT_CATEGORY_SELF_HARM,1
152,Vinted UAB,STATEMENT_CATEGORY_NON_CONSENSUAL_BEHAVIOUR,1
149,Vinted UAB,STATEMENT_CATEGORY_ANIMAL_WELFARE,1


In [43]:
# Frequency of each account-level decision. value_counts() skips missing values
# by default, so rows without an account decision are not counted here.
df["decision_account"].value_counts()

decision_account
DECISION_ACCOUNT_SUSPENDED     199685
DECISION_ACCOUNT_TERMINATED     20269
Name: count, dtype: int64

In [44]:
# Frequency of each monetary decision (missing values are skipped by default).
df["decision_monetary"].value_counts()

decision_monetary
DECISION_MONETARY_SUSPENSION    454
Name: count, dtype: int64

In [45]:
# Frequency of free-form "other" monetary decisions. An empty result means the
# column holds no non-missing values, since value_counts() drops NaN by default.
df["decision_monetary_other"].value_counts()

Series([], Name: count, dtype: int64)

In [46]:
# Relative frequency of each distinct decision_facts text (share of non-missing rows).
# NOTE(review): the values are long free-text strings, so the rendered output is very
# wide -- consider limiting it (e.g. with .head()) before sharing the notebook.
df["decision_facts"].value_counts(normalize=True)

decision_facts
When reviewing content or accounts to determine whether they are illegal or violate our policies, we take various information into consideration when making a decision, including product data, website quality, merchant information, account information (e.g., past history of policy violations), and other information provided through reporting mechanisms (where applicable) and own-initiative reviews.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [47]:
# Share of each service-provision decision among rows that have one
# (normalize=True returns proportions; missing values are excluded by default).
df["decision_provision"].value_counts(normalize=True)

decision_provision
DECISION_PROVISION_PARTIAL_SUSPENSION     0.710594
DECISION_PROVISION_TOTAL_TERMINATION      0.248923
DECISION_PROVISION_TOTAL_SUSPENSION       0.035027
DECISION_PROVISION_PARTIAL_TERMINATION    0.005455
Name: proportion, dtype: float64

In [32]:
# Share of each visibility decision among rows that have one.
# The values appear to be JSON-style list strings (e.g. '["DECISION_VISIBILITY_..."]'),
# so every distinct combination of decisions is counted as its own category.
df["decision_visibility"].value_counts(normalize=True)

decision_visibility
["DECISION_VISIBILITY_CONTENT_DISABLED"]                                                        9.055704e-01
["DECISION_VISIBILITY_CONTENT_REMOVED"]                                                         5.573856e-02
["DECISION_VISIBILITY_OTHER"]                                                                   3.579024e-02
["DECISION_VISIBILITY_CONTENT_DEMOTED"]                                                         1.390764e-03
["DECISION_VISIBILITY_CONTENT_INTERACTION_RESTRICTED"]                                          8.609608e-04
["DECISION_VISIBILITY_CONTENT_AGE_RESTRICTED"]                                                  5.182599e-04
["DECISION_VISIBILITY_CONTENT_LABELLED"]                                                        8.176723e-05
[]                                                                                              2.164427e-05
["DECISION_VISIBILITY_CONTENT_REMOVED","DECISION_VISIBILITY_CONTENT_AGE_RESTRICTED"]            1.683443e-05