In [1]:
"""Quick diagnostic: trace where the sample size dropped from 615,392 to 73,855."""
import os

import pandas as pd

In [2]:
# Directory holding the exported parquet files. The default is the original
# local path, so behaviour is unchanged when nothing is configured; set the
# MOD_DATA_DIR environment variable to point the notebook at another copy of
# the data without editing this cell.
DATA_DIR = os.environ.get(
    "MOD_DATA_DIR",
    r"c:\Users\nicta\Desktop\API_Paper_ChiPlay\mod_data",
)

# Author-level table (filtered step by step in the cells below).
authors = pd.read_parquet(os.path.join(DATA_DIR, "Authors.parquet"))
# Mod-level table; only its member_id column is used in this notebook.
mods = pd.read_parquet(os.path.join(DATA_DIR, "CleanedModData.parquet"))

In [3]:
# Headline counts for the unfiltered Authors frame.
print("=== AUTHORS TABLE ===")
print(f"Total rows: {len(authors):,}")

# Deleted vs. non-deleted accounts.
deleted_flag = authors['deleted']
print(f"deleted=True: {deleted_flag.sum():,}")
print(f"deleted=False: {(~deleted_flag).sum():,}")

# Missing vs. recent activity timestamps.
last_active = authors['last_active']
print(f"last_active is null: {last_active.isna().sum():,}")
has_active = last_active.notna()
on_or_after_cutoff = last_active >= '2024-01-01'
print(f"last_active >= 2024-01-01: {(has_active & on_or_after_cutoff).sum():,}")

=== AUTHORS TABLE ===
Total rows: 122,859
deleted=True: 2,918
deleted=False: 119,941
last_active is null: 13,013
last_active >= 2024-01-01: 73,970


In [4]:
# Replay the SQL WHERE clause one predicate at a time.
# Predicate 1: keep only rows whose `deleted` flag is False.
not_deleted = ~authors['deleted']
step1 = authors.loc[not_deleted]
print(f"\nStep 1 - deleted=0: {len(step1):,}")


Step 1 - deleted=0: 119,941


In [5]:
# Predicate 2: drop rows that have no last_active timestamp.
last_active_known = step1['last_active'].notna()
step2 = step1.loc[last_active_known]
print(f"Step 2 - last_active NOT NULL: {len(step2):,}")

Step 2 - last_active NOT NULL: 107,278


In [6]:
# Predicate 3: keep rows whose last_active is on or after 2024-01-01.
active_since_cutoff = step2['last_active'] >= '2024-01-01'
step3 = step2.loc[active_since_cutoff]
print(f"Step 3 - last_active >= 2024-01-01: {len(step3):,}")

Step 3 - last_active >= 2024-01-01: 73,855


In [7]:
# Cross-check the filtered authors against the mod-level table:
# which of them have at least one row in CleanedModData?
mod_member_ids = set(mods['member_id'].unique())
has_mods = step3['member_id'].isin(mod_member_ids)

n_with_mods = has_mods.sum()
n_without_mods = (~has_mods).sum()

print(f"\nOf {len(step3):,} filtered authors:")
print(f"  With mods in CleanedModData: {n_with_mods:,}")
print(f"  Without mods in CleanedModData: {n_without_mods:,}")


Of 73,855 filtered authors:
  With mods in CleanedModData: 73,855
  Without mods in CleanedModData: 0


The SQL query uses `LEFT JOIN ... GROUP BY`, so users with no mods would still
appear in its result (with NULLs in the mod fields),
unless a downstream filter removes them.

In [8]:
# Compare the number of distinct member_ids in the two tables.
print("\n=== MODS TABLE ===")
print(f"Total mods: {len(mods):,}")

n_ids_in_mods = mods['member_id'].nunique()
print(f"Unique member_ids in mods: {n_ids_in_mods:,}")

n_ids_in_authors = authors['member_id'].nunique()
print(f"Unique member_ids in Authors: {n_ids_in_authors:,}")


=== MODS TABLE ===
Total mods: 483,939
Unique member_ids in mods: 122,859
Unique member_ids in Authors: 122,859


In [9]:
# The count columns live in the Authors table itself (per the original note,
# API-provided and independent of CleanedModData). For each one, report how
# many filtered authors have a zero vs. a positive value.
print("\n=== mod_count in Authors (filtered, step3) ===")
for count_col in ('mod_count', 'owned_mod_count'):
    counts = step3[count_col]
    print(f"{count_col} == 0: {(counts == 0).sum():,}")
    print(f"{count_col} > 0: {(counts > 0).sum():,}")


=== mod_count in Authors (filtered, step3) ===
mod_count == 0: 12,715
mod_count > 0: 61,140
owned_mod_count == 0: 0
owned_mod_count > 0: 73,855


In [10]:
# Sanity-check the scope of Authors.parquet: does it look like the complete
# user table, or like an export that was already filtered upstream?
print("\n=== Is Authors.parquet already filtered? ===")
print(f"Total Authors rows: {len(authors):,}")
print(f"Any deleted=True? {authors['deleted'].any()}")

# Range of each timestamp column.
for ts_col in ('last_active', 'joined'):
    timestamps = authors[ts_col]
    print(f"Min {ts_col}: {timestamps.min()}")
    print(f"Max {ts_col}: {timestamps.max()}")


=== Is Authors.parquet already filtered? ===
Total Authors rows: 122,859
Any deleted=True? True
Min last_active: 2003-07-16 13:17:11
Max last_active: 2025-02-23 22:46:21
Min joined: 2003-07-15 17:01:46
Max joined: 2025-02-01 13:39:27


In [11]:
# Search for a hidden extra filter: among filtered authors whose Authors-table
# mod_count is positive, how many are missing from CleanedModData?
print("\n=== Checking for additional filters ===")

positive_mod_count = step3['mod_count'] > 0
step3_with_mods = step3.loc[positive_mod_count]
in_cleaned = step3_with_mods['member_id'].isin(mod_member_ids)

n_in_cleaned = in_cleaned.sum()
n_not_in_cleaned = (~in_cleaned).sum()

print(f"Authors with mod_count>0: {len(step3_with_mods):,}")
print(f"  Of those, in CleanedModData: {n_in_cleaned:,}")
print(f"  Of those, NOT in CleanedModData: {n_not_in_cleaned:,}")


=== Checking for additional filters ===
Authors with mod_count>0: 61,140
  Of those, in CleanedModData: 61,140
  Of those, NOT in CleanedModData: 0


In [None]:
# Working hypothesis: the 615,392 figure was computed on the SQL server against
# the full Authors table (1,028,417 rows per the original notes), whereas the
# parquet export holds only 122,859 rows and yields 73,855 after the same
# filters -- so the parquet file is probably already a subset.
conclusion_lines = [
    "\n=== CONCLUSION ===",
    f"The parquet Authors.parquet has {len(authors):,} rows",
    "The paper says N_raw = 1,028,417",
    "So Authors.parquet is likely ALREADY a filtered export,",
    "not the full table from the SQL server.",
    "",
    f"After SQL filters (deleted=0, last_active >= 2024-01-01): {len(step3):,}",
    f"After LEFT JOIN with CleanedModData (users WITH mods): {has_mods.sum():,}",
    "",
    "The 615,392 likely came from the full SQL server Authors table",
    "with the same WHERE clause applied to all 1,028,417 users.",
    "The 73,855 represents users who ALSO have mods in CleanedModData.",
]
# One print of the newline-joined lines emits the same text as printing each
# line separately.
print("\n".join(conclusion_lines))