# Read all HYROX results CSV files, creating one file per division.

In [13]:
import os
import re
from collections import defaultdict

root_folder = r"Datasets\Hyrox"

raw_divisions = set()
division_seasons = defaultdict(set)   # NEW: track seasons per division

suffix_words = {
    "overall", "saturday", "sunday", "friday",
    "thursday", "wednesday"
}

# -----------------------------
# STEP 1: Extract divisions + Season
# -----------------------------
for subdir, _, files in os.walk(root_folder):

    # Extract season name from folder path
    season = os.path.basename(subdir)

    for file in files:
        if not file.lower().endswith(".csv"):
            continue

        name = file.replace(".csv", "")

        if name.lower() in ["doubles", "singles", "hyrox_pro"]:
            continue

        parts = name.split("_")

        # Find first HYROX
        hyrox_index = None
        for i, p in enumerate(parts):
            if p.lower() == "hyrox":
                hyrox_index = i
                break

        if hyrox_index is None:
            continue

        division_parts = parts[hyrox_index:]

        # Remove trailing weekday/overall
        while division_parts and division_parts[-1].lower() in suffix_words:
            division_parts = division_parts[:-1]

        division = "_".join(division_parts)
        raw_divisions.add(division)

        # Store season info
        division_seasons[division].add(season)


# -----------------------------
# STEP 2: Clean divisions
# -----------------------------
clean_divisions = set()
cleaned_seasons_map = defaultdict(set)

for d in raw_divisions:
    clean_d = d.upper()
    clean_d = clean_d.replace("-", "_")
    clean_d = re.sub(r"_+$", "", clean_d)
    clean_d = re.sub(r"_+", "_", clean_d)

    clean_divisions.add(clean_d)

    # map cleaned version to seasons
    for season in division_seasons[d]:
        cleaned_seasons_map[clean_d].add(season)


# -----------------------------
# STEP 3: Detect categories
# -----------------------------
categories = defaultdict(list)

for d in clean_divisions:

    if "DOUBLES" in d:
        categories["DOUBLES"].append(d)

    elif "RELAY" in d or "TEAM" in d:
        categories["RELAY / TEAM"].append(d)

    elif "GORUCK" in d:
        categories["GORUCK"].append(d)

    elif "ADAPTIVE" in d:
        categories["ADAPTIVE"].append(d)

    elif "YOUNGSTARS" in d:
        categories["YOUTH"].append(d)

    elif "ELITE" in d:
        categories["ELITE"].append(d)

    elif "PRO" in d:
        categories["PRO (SINGLES)"].append(d)

    elif d == "HYROX":
        categories["OPEN (SINGLES)"].append(d)

    else:
        categories["OTHER / SPECIAL"].append(d)


# -----------------------------
# STEP 4: Print results
# -----------------------------
print("\n===== DIVISION DETECTION SUMMARY =====\n")

for cat in sorted(categories.keys()):
    print(f"\n{cat} ({len(categories[cat])})")
    print("-" * 40)
    for div in sorted(categories[cat]):
        seasons = ", ".join(sorted(cleaned_seasons_map[div]))
        print(f"{div}  | Seasons: {seasons}")

print("\nTotal cleaned divisions:", len(clean_divisions))


===== DIVISION DETECTION SUMMARY =====


ADAPTIVE (1)
----------------------------------------
HYROX_ADAPTIVE  | Seasons: Processed dataset, Season_7, Season_8

DOUBLES (4)
----------------------------------------
HYROX_DOUBLES  | Seasons: Processed dataset, Season_1, Season_2, Season_3, Season_4, Season_5, Season_6, Season_7, Season_8
HYROX_ELITE_15_DOUBLES  | Seasons: Processed dataset, Season_7, Season_8
HYROX_GORUCK_DOUBLES  | Seasons: Processed dataset, Season_5, Season_6
HYROX_PRO_DOUBLES  | Seasons: Processed dataset, Season_6, Season_7, Season_8

ELITE (2)
----------------------------------------
HYROX_ELITE  | Seasons: Processed dataset, Season_3, Season_4, Season_5, Season_6, Season_7
HYROX_ELITE_15  | Seasons: Processed dataset, Season_7, Season_8

GORUCK (1)
----------------------------------------
HYROX_GORUCK  | Seasons: Processed dataset, Season_5, Season_6

OPEN (SINGLES) (1)
----------------------------------------
HYROX  | Seasons: Processed dataset, Season_1, Season

In [14]:
import os
import pandas as pd
import re
from collections import defaultdict

root_folder = r"Datasets\Hyrox"
output_folder = os.path.join(root_folder, "Processed dataset")

os.makedirs(output_folder, exist_ok=True)

division_files = defaultdict(list)
division_datasets = {}
empty_files_detected = []

suffix_words = {
    "overall", "saturday", "sunday", "friday",
    "thursday", "wednesday"
}

# ---------------------------------
# STEP 1: Group files by division
# ---------------------------------
for subdir, _, files in os.walk(root_folder):

    # ✅ Extract Season from folder name
    season = os.path.basename(subdir)

    for file in files:

        if not file.lower().endswith(".csv"):
            continue

        if "processed dataset" in subdir.lower():
            continue

        filename_without_ext = file.replace(".csv", "")

        if filename_without_ext.lower() in ["doubles", "singles", "hyrox_pro"]:
            continue

        parts = filename_without_ext.split("_")

        hyrox_index = None
        for i, p in enumerate(parts):
            if p.lower() == "hyrox":
                hyrox_index = i
                break

        if hyrox_index is None:
            continue

        division_parts = parts[hyrox_index:]

        while division_parts and division_parts[-1].lower() in suffix_words:
            division_parts = division_parts[:-1]

        division = "_".join(division_parts)

        division = division.upper()
        division = division.replace("-", "_")
        division = re.sub(r"_+$", "", division)
        division = re.sub(r"_+", "_", division)

        file_path = os.path.join(subdir, file)

        # ✅ Store season together with file info
        division_files[division].append((file_path, filename_without_ext, season))


# ---------------------------------
# STEP 2: Create dataset per division
# ---------------------------------
for division, files in division_files.items():

    dfs = []

    for file_path, filename_without_ext, season in files:
        try:
            df = pd.read_csv(file_path)

            if df.empty:
                empty_files_detected.append(filename_without_ext)
                continue

            df["source_file"] = filename_without_ext

            # ✅ ADD SEASON COLUMN HERE
            df["Season"] = season

            dfs.append(df)

        except:
            continue

    if dfs:
        combined_df = pd.concat(dfs, ignore_index=True)

        output_path = os.path.join(output_folder, f"{division}.csv")
        combined_df.to_csv(output_path, index=False)

        division_datasets[division] = combined_df
        globals()[division] = combined_df


# ---------------------------------
# FINAL SUMMARY
# ---------------------------------
print("\n============================")
print("EMPTY FILES SUMMARY")
print("============================")
print(f"Total empty files: {len(empty_files_detected)}\n")

if empty_files_detected:
    for file in sorted(empty_files_detected):
        print(f"- {file}")
else:
    print("No empty files detected.")

print("\n============================")
print("DATASETS CREATED")
print("============================")

if division_datasets:
    for name in sorted(division_datasets.keys()):
        print(f"{name} → {len(division_datasets[name])} rows")
else:
    print("No datasets created.")


EMPTY FILES SUMMARY
Total empty files: 197

- 2019_Hannover_HYROX_TEAM-CHALLENGE
- 2021_Amsterdam_Hyrox_Team_Relay
- 2021_Austin_HYROX_ELITE
- 2021_Berlin_Hyrox_Team_Relay
- 2021_Birmingham_Hyrox_Team_Relay
- 2021_Dallas_Hyrox_Team_Relay
- 2021_Hamburg_Hyrox_Team_Relay
- 2021_Madrid_HYROX_ELITE
- 2021_New_York_Hyrox_Team_Relay
- 2021_Orlando_HYROX_ELITE
- 2021_Stuttgart_Hyrox_Team_Relay
- 2022_Basel_HYROX_TEAM_RELAY
- 2022_Birmingham_HYROX_TEAM_RELAY
- 2022_Bremen_Hyrox_Team_Relay
- 2022_Chicago_Hyrox_Team_Relay
- 2022_Dallas_Hyrox_Team_Relay
- 2022_Essen_HYROX_TEAM_RELAY
- 2022_Essen_Hyrox_Team_Relay
- 2022_Frankfurt_Hyrox_Team_Relay
- 2022_Karlsruhe_Hyrox_Team_Relay
- 2022_Las_Vegas_HYROX_ELITE
- 2022_Leipzig_HYROX_TEAM_RELAY
- 2022_London_HYROX_TEAM_RELAY
- 2022_London_Hyrox_Team_Relay
- 2022_Los_Angeles_HYROX_TEAM_RELAY
- 2022_Los_Angeles_Hyrox_Team_Relay
- 2022_Maastricht_Hyrox_Team_Relay
- 2022_Manchester_Hyrox_Team_Relay
- 2022_München_Hyrox_Team_Relay
- 2022_New_York_HYROX_TEA

In [23]:
HYROX_YOUNGSTARS_8_9_YRS.tail()

Unnamed: 0,Race,Division,Gender,Rank Overall,Rank Age Group,Name,Nation,Age Group,Total Time,source_file,Season,Day,is_overall
372,2026 Amsterdam - Youngstars,HYROX YOUNGSTARS 8-9 YRS,Women,166,166,"Garcia Marin, Noa",ESP,08-09,00:19:16,2026_Amsterdam_-_Youngstars_HYROX_YOUNGSTARS_8-9_YRS,Season_8,Overall,False
373,2026 Amsterdam - Youngstars,HYROX YOUNGSTARS 8-9 YRS,Women,167,167,"Hiemstra, Yinthe",NED,08-09,00:19:25,2026_Amsterdam_-_Youngstars_HYROX_YOUNGSTARS_8-9_YRS,Season_8,Overall,False
374,2026 Amsterdam - Youngstars,HYROX YOUNGSTARS 8-9 YRS,Women,168,168,"Reijven, Victoria",NED,08-09,00:21:26,2026_Amsterdam_-_Youngstars_HYROX_YOUNGSTARS_8-9_YRS,Season_8,Overall,False
375,2026 Amsterdam - Youngstars,HYROX YOUNGSTARS 8-9 YRS,Women,169,169,"Claver, Beau",NED,08-09,00:21:47,2026_Amsterdam_-_Youngstars_HYROX_YOUNGSTARS_8-9_YRS,Season_8,Overall,False
376,2026 Amsterdam - Youngstars,HYROX YOUNGSTARS 8-9 YRS,Women,170,170,"Geurts, Loues",NED,08-09,00:23:13,2026_Amsterdam_-_Youngstars_HYROX_YOUNGSTARS_8-9_YRS,Season_8,Overall,False


In [24]:
HYROX_DOUBLES.tail()

Unnamed: 0,Race,Division,Gender,Rank Overall,Rank Age Group,Name,Nation,Age Group,Total Time,source_file,Season,Day,is_overall
213475,2026 Osaka,HYROX DOUBLES,Women,72,9,"지선 윤, 하라 나",,35-39,01:23:15,2026_Osaka_HYROX_DOUBLES_-_Overall,Season_8,Overall,True
213476,2026 Osaka,HYROX DOUBLES,Women,70,28,"지현 임, 효연 김",,25-29,01:22:55,2026_Osaka_HYROX_DOUBLES_-_Overall,Season_8,Overall,True
213477,2026 Osaka,HYROX DOUBLES,Women,38,14,"지혜 김, 소미 김",,25-29,01:16:43,2026_Osaka_HYROX_DOUBLES_-_Overall,Season_8,Overall,True
213478,2026 Osaka,HYROX DOUBLES,Women,22,10,"지혜 한, 수빈 한",,30-34,01:14:56,2026_Osaka_HYROX_DOUBLES_-_Overall,Season_8,Overall,True
213479,2026 Osaka,HYROX DOUBLES,Women,30,12,"지희 명, 지수 김",,25-29,01:15:22,2026_Osaka_HYROX_DOUBLES_-_Overall,Season_8,Overall,True


# CHECK 1: Global data quality summary

In [15]:
total_rows_all_divisions = 0
total_duplicate_rows_global = 0
files_with_duplicates = set()

for division, df in division_datasets.items():

    total_rows_all_divisions += len(df)

    duplicate_mask = df.duplicated(keep=False)

    if duplicate_mask.any():

        duplicates_df = df.loc[duplicate_mask]
        total_duplicate_rows_global += len(duplicates_df)

        dup_files = duplicates_df["source_file"].unique()
        files_with_duplicates.update(dup_files)


# -------------------------
# FILE COUNTS
# -------------------------

all_files_processed = set()

for division, df in division_datasets.items():
    all_files_processed.update(df["source_file"].unique())

all_files_processed.update(empty_files_detected)

total_files = len(all_files_processed)

empty_files_count = len(empty_files_detected)
duplicate_files_count = len(files_with_duplicates)

files_with_issues = set(empty_files_detected).union(files_with_duplicates)
total_files_with_issues = len(files_with_issues)

# -------------------------
# PRINT SUMMARY
# -------------------------

print("\n============================")
print("GLOBAL DATA QUALITY SUMMARY (RAW)")
print("============================")

print(f"Total rows across all divisions: {total_rows_all_divisions}")
print(f"Total duplicate rows detected: {total_duplicate_rows_global}")

if total_rows_all_divisions > 0:
    print(f"Duplicate rows percentage: {(total_duplicate_rows_global / total_rows_all_divisions) * 100:.2f}%")

print("\n----------------------------")
print("FILE-LEVEL QUALITY")
print("----------------------------")

print(f"Total files processed: {total_files}")

print(f"Empty files: {empty_files_count}")
print(f"Files with duplicate rows: {duplicate_files_count}")
print(f"Total files with issues (empty OR duplicates): {total_files_with_issues}")

if total_files > 0:
    print(f"Percentage of files with issues: {(total_files_with_issues / total_files) * 100:.2f}%")


GLOBAL DATA QUALITY SUMMARY (RAW)
Total rows across all divisions: 792093
Total duplicate rows detected: 22255
Duplicate rows percentage: 2.81%

----------------------------
FILE-LEVEL QUALITY
----------------------------
Total files processed: 1366
Empty files: 197
Files with duplicate rows: 224
Total files with issues (empty OR duplicates): 418
Percentage of files with issues: 30.60%


#### Notes:  
* Cologne has all rows duplicated in the doubles (https://results.hyrox.com/season-7/?page=1&event=HDP_COLOGNE25_OVERALL&num_results=100&pid=list&pidp=ranking_nav&ranking=time_finish_netto&search%5Bsex%5D=M&search%5Bage_class%5D=%25&search%5Bnation%5D=%25)  
* Same in other datasets.

In [16]:
for division, df in division_datasets.items():
    df.drop_duplicates(inplace=True)

# CHECK 2: Races where scraping only ran the first page

In [17]:
# List of datasets to check
datasets_to_check = [
    "HYROX",
    "HYROX_DOUBLES",
    "HYROX_PRO",
    "HYROX_PRO_DOUBLES",
    "HYROX_ADAPTIVE",
    "HYROX_ELITE_15_DOUBLES",
    "HYROX_GORUCK",
    "HYROX_GORUCK_DOUBLES",
    "HYROX_YOUNGSTARS_8_9_YRS"
]

for ds_name in datasets_to_check:
    df = globals().get(ds_name)
    if df is None:
        print(f"{ds_name} → dataset not found in memory")
        continue

    # Only check if required columns exist
    required_cols = ["Race", "Division", "Gender"]
    if not all(col in df.columns for col in required_cols):
        print(f"{ds_name} → missing columns for grouping")
        continue

    counts = df.groupby(required_cols).size()
    counts_100 = counts[counts == 100]

    print(f"\n{ds_name} → combinations with exactly 100 rows:")
    if len(counts_100) == 0:
        print("  None")
    else:
        for idx, count in counts_100.items():
            print(f"  {idx} → {count} rows")


HYROX → combinations with exactly 100 rows:
  ('2022 Karlsruhe', 'HYROX', 'Women') → 100 rows
  ('2023 Stuttgart', 'HYROX', 'Women') → 100 rows

HYROX_DOUBLES → combinations with exactly 100 rows:
  ('2024 Cape Town', 'HYROX DOUBLES', 'Women') → 100 rows
  ('2026 Auckland', 'HYROX DOUBLES - Saturday', 'Mixed') → 100 rows

HYROX_PRO → combinations with exactly 100 rows:
  ('2023 Manchester', 'HYROX PRO', 'Men') → 100 rows
  ('2025 Manchester', 'HYROX PRO', 'Women') → 100 rows

HYROX_PRO_DOUBLES → combinations with exactly 100 rows:
  ('2025 Toulouse', 'HYROX PRO DOUBLES', 'Men') → 100 rows

HYROX_ADAPTIVE → combinations with exactly 100 rows:
  None

HYROX_ELITE_15_DOUBLES → combinations with exactly 100 rows:
  None

HYROX_GORUCK → combinations with exactly 100 rows:
  None

HYROX_GORUCK_DOUBLES → combinations with exactly 100 rows:
  None

HYROX_YOUNGSTARS_8_9_YRS → combinations with exactly 100 rows:
  None


### It's okay. They do have exactly 100 rows.

# CHECK 5: Missing genders in races. 

In [18]:
# ---------------------------------
# SPLIT 'Division' INTO 'Division' AND 'Day'
# ---------------------------------

for division_name, df in division_datasets.items():

    if 'Division' in df.columns:
        # Split by ' - ', max 1 split
        split_cols = df['Division'].str.split(' - ', n=1, expand=True)

        # First part is actual Division
        df['Division'] = split_cols[0]

        # Second part is Day, default to 'Overall' if missing
        if split_cols.shape[1] > 1:
            df['Day'] = split_cols[1].fillna('Overall')
        else:
            df['Day'] = 'Overall'

        # Update the dataset in memory
        division_datasets[division_name] = df
        globals()[division_name] = df

print("Division column split into 'Division' and 'Day' for all datasets. Missing day values set to 'Overall'.")

Division column split into 'Division' and 'Day' for all datasets. Missing day values set to 'Overall'.


In [19]:
HYROX.tail()

Unnamed: 0,Race,Division,Gender,Rank Overall,Rank Age Group,Name,Nation,Age Group,Total Time,source_file,Season,Day
430713,2026 Vienna,HYROX,Women,970,251,"Putri, Thalita",INA,25-29,02:40:49,2026_Vienna_HYROX,Season_8,Overall
430714,2026 Vienna,HYROX,Women,971,252,"Simmons, Jaye",USA,25-29,02:45:49,2026_Vienna_HYROX,Season_8,Overall
430715,2026 Vienna,HYROX,Women,972,218,"Topuzi, Esmeralda",ALB,30-34,02:48:52,2026_Vienna_HYROX,Season_8,Overall
430716,2026 Vienna,HYROX,Women,973,152,"Reinstadler, Lisa",ITA,16-24,02:54:42,2026_Vienna_HYROX,Season_8,Overall
430717,2026 Vienna,HYROX,Women,974,253,"Kohler, Ophelia",AUT,25-29,03:50:23,2026_Vienna_HYROX,Season_8,Overall


In [20]:
# ---------------------------------
# CHECK MISSING GENDERS FOR ALL DATASETS
# ---------------------------------

# Define expected genders per type of division
expected_genders_map = {
    'HYROX_DOUBLES': ['Men', 'Women', 'Mixed'],
    'HYROX_PRO': ['Men', 'Women'],
    'HYROX': ['Men', 'Women'],
    'HYROX_GORUCK': ['Men', 'Women', 'Mixed'],
    'HYROX_GORUCK_DOUBLES': ['Men', 'Women', 'Mixed'],
    'HYROX_PRO_DOUBLES': ['Men', 'Women', 'Mixed'],
    'HYROX_ADAPTIVE': ['Men', 'Women'],
    'HYROX_ELITE_15_DOUBLES': ['Men', 'Women', 'Mixed'],
    'HYROX_YOUNGSTARS_8_9_YRS': ['Men', 'Women']
}

def check_missing_genders(df, expected_genders):
    missing_info = {}
    for (race, division), group in df.groupby(['Race', 'Division']):
        present_genders = set(group['Gender'].dropna().unique())
        missing = set(expected_genders) - present_genders
        if missing:
            missing_info[(race, division)] = missing
    return missing_info

# Loop over all datasets
all_missing_genders = {}

for division_name, df in division_datasets.items():
    expected_genders = expected_genders_map.get(division_name, ['Men', 'Women'])
    missing_info = check_missing_genders(df, expected_genders)
    if missing_info:
        all_missing_genders[division_name] = missing_info

# Print summary
print("\n==============================")
print("MISSING GENDERS SUMMARY PER DIVISION")
print("==============================\n")

for division_name, missing_info in all_missing_genders.items():
    print(f"Division: {division_name}")
    for (race, div), genders in missing_info.items():
        print(f"  Race: {race}, Division: {div} → Missing: {list(genders)}")
    print()


MISSING GENDERS SUMMARY PER DIVISION

Division: HYROX
  Race: 2024 Doha All Women's Race, Division: HYROX → Missing: ['Men']
  Race: 2025 Chicago, Division: HYROX → Missing: ['Women']
  Race: 2025 Gent, Division: HYROX → Missing: ['Women']

Division: HYROX_DOUBLES
  Race: 2024 Doha All Women's Race, Division: HYROX DOUBLES → Missing: ['Men', 'Mixed']
  Race: 2024 Hong Kong, Division: HYROX DOUBLES → Missing: ['Mixed']
  Race: 2024 Melbourne, Division: HYROX DOUBLES → Missing: ['Mixed']
  Race: 2025 Cape Town, Division: HYROX DOUBLES → Missing: ['Mixed']
  Race: 2025 Melbourne, Division: HYROX DOUBLES → Missing: ['Mixed']
  Race: 2025 Seoul, Division: HYROX DOUBLES → Missing: ['Mixed']
  Race: 2025 Vienna, Division: HYROX DOUBLES → Missing: ['Mixed']

Division: HYROX_GORUCK
  Race: 2022 Los Angeles, Division: HYROX GORUCK → Missing: ['Mixed']
  Race: 2022 New York, Division: HYROX GORUCK → Missing: ['Mixed']
  Race: 2023 Chicago, Division: HYROX GORUCK → Missing: ['Mixed']
  Race: 2023

In [21]:
# Expected divisions for every race
expected_divisions = {'HYROX', 'HYROX_PRO', 'HYROX_DOUBLES'}

# Collect all races across the relevant division datasets
all_races = set()
for division_name, df in division_datasets.items():
    all_races.update(df['Race'].unique())

# Check which divisions each race has
missing_divisions_per_race = {}

for race in sorted(all_races):
    divisions_present = set()
    for division_name, df in division_datasets.items():
        if race in df['Race'].values:
            divisions_present.add(division_name)
    
    missing = expected_divisions - divisions_present
    if missing:
        missing_divisions_per_race[race] = missing

# Print summary
if missing_divisions_per_race:
    print("Some races are missing expected divisions:")
    for race, missing in missing_divisions_per_race.items():
        print(f"  {race}: missing {', '.join(sorted(missing))}")
else:
    print("All races have HYROX, HYROX_PRO, and HYROX_DOUBLES divisions.")

Some races are missing expected divisions:
  2022 Las Vegas: missing HYROX
  2023 Chicago - North American Open Championship: missing HYROX_PRO
  2023 Frankfurt: missing HYROX, HYROX_DOUBLES
  2023 Hamburg: missing HYROX_DOUBLES
  2023 Köln: missing HYROX_DOUBLES
  2023 Maastricht European Championships: missing HYROX_PRO
  2023 Malaga: missing HYROX_DOUBLES
  2024 Amsterdam: missing HYROX_DOUBLES
  2024 Birmingham: missing HYROX_DOUBLES
  2024 Doha: missing HYROX, HYROX_DOUBLES
  2024 Doha All Women's Race: missing HYROX_PRO
  2024 Dublin: missing HYROX_DOUBLES
  2024 Glasgow: missing HYROX_DOUBLES
  2024 Hong Kong: missing HYROX_PRO
  2024 Maastricht: missing HYROX_DOUBLES
  2024 Stockholm: missing HYROX_DOUBLES
  2024 Toronto: missing HYROX_DOUBLES
  2024 Vienna - European Championship: missing HYROX_DOUBLES, HYROX_PRO
  2024 Washington - North American Championships: missing HYROX_PRO
  2025 Atlanta: missing HYROX_DOUBLES
  2025 Bangkok: missing HYROX, HYROX_DOUBLES
  2025 Belgium:

# CHECK 4: Duplicated races - overall + Saturday or other day

In [22]:
import pandas as pd

# Columns to check for duplicates (ignoring Division)
subset_cols = ['Race', 'Gender', 'Name', 'Nation', 'Age Group', 'Total Time']

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_colwidth', None)

# Days to identify non-overall files
day_suffixes = ['SATURDAY', 'SUNDAY', 'FRIDAY', 'THURSDAY', 'WEDNESDAY']

# Process each division dataset
for division, df in division_datasets.items():
    
    print(f"\n==== {division} ====")
    
    # Step 1: Mark rows as Overall or Day
    df['is_overall'] = df['source_file'].str.upper().str.contains('OVERALL')
    
    # Step 2: Identify duplicates based on subset_cols
    duplicate_mask = df.duplicated(subset=subset_cols, keep=False)
    duplicates_df = df.loc[duplicate_mask].copy()
    
    if duplicates_df.empty:
        print("No duplicates found.")
        continue
    
    # Step 3: Handle day vs overall
    # For each duplicate group, keep the overall row if it exists
    def keep_overall(group):
        overall_rows = group[group['is_overall']]
        if not overall_rows.empty:
            return overall_rows
        return group
    
    cleaned_duplicates = duplicates_df.groupby(subset_cols, dropna=False, group_keys=False).apply(keep_overall)
    
    # Step 4: Show some summary
    total_duplicates = len(duplicates_df)
    total_after_cleaning = len(cleaned_duplicates)
    removed_rows = total_duplicates - total_after_cleaning
    
    print(f"Total duplicate rows detected: {total_duplicates}")
    print(f"Rows kept after resolving day vs overall: {total_after_cleaning}")
    print(f"Rows removed: {removed_rows}")
    
    # Example of duplicates: show Rank Overall if present
    if 'Rank Overall' in df.columns:
        print("Example Rank Overall values for duplicates:")
        print(cleaned_duplicates['Rank Overall'].dropna().unique()[:5])
    
    # Step 5: Drop duplicates from the main dataset, keeping the cleaned version
    # First remove all original duplicates
    df.drop_duplicates(subset=subset_cols, keep=False, inplace=True)
    # Then append the cleaned duplicates (keeping Overall if available)
    df = pd.concat([df, cleaned_duplicates], ignore_index=True)
    
    # Step 6: Update dataset in memory
    division_datasets[division] = df
    globals()[division] = df
    
    print(f"Dataset {division} cleaned. Total rows now: {len(df)}")


==== HYROX ====


  cleaned_duplicates = duplicates_df.groupby(subset_cols, dropna=False, group_keys=False).apply(keep_overall)


Total duplicate rows detected: 202640
Rows kept after resolving day vs overall: 103060
Rows removed: 99580
Example Rank Overall values for duplicates:
[434 435 393 394 224]
Dataset HYROX cleaned. Total rows now: 325115

==== HYROX_DOUBLES ====


  cleaned_duplicates = duplicates_df.groupby(subset_cols, dropna=False, group_keys=False).apply(keep_overall)


Total duplicate rows detected: 50286
Rows kept after resolving day vs overall: 27266
Rows removed: 23020
Example Rank Overall values for duplicates:
[25 26 42 43 18]
Dataset HYROX_DOUBLES cleaned. Total rows now: 213480

==== HYROX_PRO ====


  cleaned_duplicates = duplicates_df.groupby(subset_cols, dropna=False, group_keys=False).apply(keep_overall)


Total duplicate rows detected: 37244
Rows kept after resolving day vs overall: 18636
Rows removed: 18608
Example Rank Overall values for duplicates:
[35 36 27 28 47]
Dataset HYROX_PRO cleaned. Total rows now: 65803

==== HYROX_GORUCK ====
No duplicates found.

==== HYROX_GORUCK_DOUBLES ====
No duplicates found.

==== HYROX_PRO_DOUBLES ====
Total duplicate rows detected: 13224
Rows kept after resolving day vs overall: 6617
Rows removed: 6607
Example Rank Overall values for duplicates:
[ 63  29 181 210  54]
Dataset HYROX_PRO_DOUBLES cleaned. Total rows now: 26656

==== HYROX_ADAPTIVE ====
Total duplicate rows detected: 104
Rows kept after resolving day vs overall: 55
Rows removed: 49
Example Rank Overall values for duplicates:
[6 4 3 2 1]
Dataset HYROX_ADAPTIVE cleaned. Total rows now: 236

==== HYROX_ELITE_15_DOUBLES ====
No duplicates found.

==== HYROX_YOUNGSTARS_10_11_YRS ====
No duplicates found.

==== HYROX_YOUNGSTARS_12_13_YRS ====
No duplicates found.

==== HYROX_YOUNGSTARS_14_15

  cleaned_duplicates = duplicates_df.groupby(subset_cols, dropna=False, group_keys=False).apply(keep_overall)
  cleaned_duplicates = duplicates_df.groupby(subset_cols, dropna=False, group_keys=False).apply(keep_overall)
