# Read all HYROX results CSV files, combining DOUBLES and SINGLES.

In [1]:
import os
import re
from collections import defaultdict

root_folder = r"Datasets\Hyrox"

raw_divisions = set()

suffix_words = {
    "overall", "saturday", "sunday", "friday",
    "thursday", "wednesday"
}

# -----------------------------
# STEP 1: Extract divisions
# -----------------------------
for subdir, _, files in os.walk(root_folder):
    for file in files:
        if not file.lower().endswith(".csv"):
            continue

        name = file.replace(".csv", "")

        if name.lower() in ["doubles", "singles", "hyrox_pro"]:
            continue

        parts = name.split("_")

        # Find first HYROX
        hyrox_index = None
        for i, p in enumerate(parts):
            if p.lower() == "hyrox":
                hyrox_index = i
                break

        if hyrox_index is None:
            continue

        division_parts = parts[hyrox_index:]

        # Remove trailing weekday/overall
        while division_parts and division_parts[-1].lower() in suffix_words:
            division_parts = division_parts[:-1]

        division = "_".join(division_parts)
        raw_divisions.add(division)


# -----------------------------
# STEP 2: Clean divisions
# -----------------------------
clean_divisions = set()

for d in raw_divisions:
    d = d.upper()
    d = d.replace("-", "_")
    d = re.sub(r"_+$", "", d)
    d = re.sub(r"_+", "_", d)
    clean_divisions.add(d)


# -----------------------------
# STEP 3: Detect categories
# -----------------------------
categories = defaultdict(list)

for d in clean_divisions:

    if "DOUBLES" in d:
        categories["DOUBLES"].append(d)

    elif "RELAY" in d or "TEAM" in d:
        categories["RELAY / TEAM"].append(d)

    elif "GORUCK" in d:
        categories["GORUCK"].append(d)

    elif "ADAPTIVE" in d:
        categories["ADAPTIVE"].append(d)

    elif "YOUNGSTARS" in d:
        categories["YOUTH"].append(d)

    elif "ELITE" in d:
        categories["ELITE"].append(d)

    elif "PRO" in d:
        categories["PRO (SINGLES)"].append(d)

    elif d == "HYROX":
        categories["OPEN (SINGLES)"].append(d)

    else:
        categories["OTHER / SPECIAL"].append(d)


# -----------------------------
# STEP 4: Print results
# -----------------------------
print("\n===== DIVISION DETECTION SUMMARY =====\n")

for cat in sorted(categories.keys()):
    print(f"\n{cat} ({len(categories[cat])})")
    print("-" * 40)
    for div in sorted(categories[cat]):
        print(div)

print("\nTotal cleaned divisions:", len(clean_divisions))


===== DIVISION DETECTION SUMMARY =====


ADAPTIVE (1)
----------------------------------------
HYROX_ADAPTIVE

DOUBLES (4)
----------------------------------------
HYROX_DOUBLES
HYROX_ELITE_15_DOUBLES
HYROX_GORUCK_DOUBLES
HYROX_PRO_DOUBLES

ELITE (2)
----------------------------------------
HYROX_ELITE
HYROX_ELITE_15

GORUCK (1)
----------------------------------------
HYROX_GORUCK

OPEN (SINGLES) (1)
----------------------------------------
HYROX

OTHER / SPECIAL (1)
----------------------------------------
HYROX_MIXED_ASIEN_CHAMPIONSHIP_INVITATIONAL

PRO (SINGLES) (1)
----------------------------------------
HYROX_PRO

RELAY / TEAM (4)
----------------------------------------
HYROX_CORPORATE_RELAY
HYROX_SECONDARY_SCHOOL_RELAY
HYROX_TEAM_CHALLENGE
HYROX_TEAM_RELAY

YOUTH (1)
----------------------------------------
HYROX_YOUNGSTARS_8_9_YRS

Total cleaned divisions: 16


In [2]:
import os
import pandas as pd
import re
from collections import defaultdict

root_folder = r"Datasets\Hyrox"
output_folder = os.path.join(root_folder, "Processed dataset")

os.makedirs(output_folder, exist_ok=True)

division_files = defaultdict(list)
division_datasets = {}
empty_files_detected = []

suffix_words = {
    "overall", "saturday", "sunday", "friday",
    "thursday", "wednesday"
}

# ---------------------------------
# STEP 1: Group files by division
# ---------------------------------
for subdir, _, files in os.walk(root_folder):
    for file in files:

        if not file.lower().endswith(".csv"):
            continue

        if "processed dataset" in subdir.lower():
            continue

        filename_without_ext = file.replace(".csv", "")

        if filename_without_ext.lower() in ["doubles", "singles", "hyrox_pro"]:
            continue

        parts = filename_without_ext.split("_")

        hyrox_index = None
        for i, p in enumerate(parts):
            if p.lower() == "hyrox":
                hyrox_index = i
                break

        if hyrox_index is None:
            continue

        division_parts = parts[hyrox_index:]

        while division_parts and division_parts[-1].lower() in suffix_words:
            division_parts = division_parts[:-1]

        division = "_".join(division_parts)

        division = division.upper()
        division = division.replace("-", "_")
        division = re.sub(r"_+$", "", division)
        division = re.sub(r"_+", "_", division)

        file_path = os.path.join(subdir, file)
        division_files[division].append((file_path, filename_without_ext))


# ---------------------------------
# STEP 2: Create dataset per division
# ---------------------------------
for division, files in division_files.items():

    dfs = []

    for file_path, filename_without_ext in files:
        try:
            df = pd.read_csv(file_path)

            if df.empty:
                empty_files_detected.append(filename_without_ext)
                continue

            df["source_file"] = filename_without_ext
            dfs.append(df)

        except:
            continue

    if dfs:
        combined_df = pd.concat(dfs, ignore_index=True)

        output_path = os.path.join(output_folder, f"{division}.csv")
        combined_df.to_csv(output_path, index=False)

        division_datasets[division] = combined_df
        globals()[division] = combined_df


# ---------------------------------
# FINAL SUMMARY
# ---------------------------------
print("\n============================")
print("EMPTY FILES SUMMARY")
print("============================")
print(f"Total empty files: {len(empty_files_detected)}\n")

if empty_files_detected:
    for file in sorted(empty_files_detected):
        print(f"- {file}")
else:
    print("No empty files detected.")

print("\n============================")
print("DATASETS CREATED")
print("============================")

if division_datasets:
    for name in sorted(division_datasets.keys()):
        print(f"{name} → {len(division_datasets[name])} rows")
else:
    print("No datasets created.")


EMPTY FILES SUMMARY
Total empty files: 148

- 2019_Hannover_HYROX_TEAM-CHALLENGE
- 2021_Austin_HYROX_ELITE
- 2021_New_York_Hyrox_Team_Relay
- 2021_Orlando_HYROX_ELITE
- 2022_Basel_HYROX_TEAM_RELAY
- 2022_Birmingham_HYROX_TEAM_RELAY
- 2022_Bremen_Hyrox_Team_Relay
- 2022_Essen_HYROX_TEAM_RELAY
- 2022_Essen_Hyrox_Team_Relay
- 2022_Las_Vegas_HYROX_ELITE
- 2022_Leipzig_HYROX_TEAM_RELAY
- 2022_London_HYROX_TEAM_RELAY
- 2022_Los_Angeles_HYROX_TEAM_RELAY
- 2022_Maastricht_Hyrox_Team_Relay
- 2022_Manchester_Hyrox_Team_Relay
- 2022_New_York_HYROX_TEAM_RELAY
- 2022_Wien_Hyrox_Team_Relay
- 2023_Amsterdam_HYROX_TEAM_RELAY
- 2023_Barcelona_HYROX_TEAM_RELAY
- 2023_Chicago_HYROX_ELITE
- 2023_Chicago_HYROX_TEAM_RELAY
- 2023_Dallas_HYROX_GORUCK
- 2023_Dallas_HYROX_GORUCK_DOUBLES
- 2023_Dallas_HYROX_TEAM_RELAY
- 2023_Dallas_HYROX_TEAM_RELAY
- 2023_Dublin_HYROX_TEAM_RELAY
- 2023_Glasgow_HYROX_TEAM_RELAY
- 2023_Hamburg_HYROX_TEAM_RELAY
- 2023_Hong_Kong_HYROX_TEAM_RELAY
- 2023_Köln_HYROX_TEAM_RELAY
- 2023_

In [3]:
HYROX_DOUBLES.tail()

Unnamed: 0,Race,Division,Gender,Rank Overall,Rank Age Group,Name,Nation,Age Group,Total Time,source_file
181099,2026 Auckland,HYROX DOUBLES - Thursday,Women,476,114,"Clementine Smart, Samantha Collings",,30-34,02:26:31,2026_Auckland_HYROX_DOUBLES_-_Thursday
181100,2026 Auckland,HYROX DOUBLES - Thursday,Women,477,115,"Andrea Munokoatini, Jasmine Young",,30-34,02:26:34,2026_Auckland_HYROX_DOUBLES_-_Thursday
181101,2026 Auckland,HYROX DOUBLES - Thursday,Women,478,105,"Holley Hamill, Janine Hamill",,35-39,02:29:25,2026_Auckland_HYROX_DOUBLES_-_Thursday
181102,2026 Auckland,HYROX DOUBLES - Thursday,Women,479,26,"Connie Valivaka, Santana Webster",,45-49,02:36:10,2026_Auckland_HYROX_DOUBLES_-_Thursday
181103,2026 Auckland,HYROX DOUBLES - Thursday,Women,480,27,"Charlene Rewiri-Ulufonua, Penni Gray",,45-49,03:02:12,2026_Auckland_HYROX_DOUBLES_-_Thursday


In [4]:
# ---------------------------------
# GLOBAL DATA QUALITY SUMMARY (RAW DATA)
# ---------------------------------

total_rows_all_divisions = 0
total_duplicate_rows_global = 0
files_with_duplicates = set()

for division, df in division_datasets.items():

    total_rows_all_divisions += len(df)

    duplicate_mask = df.duplicated(keep=False)

    if duplicate_mask.any():

        duplicates_df = df.loc[duplicate_mask]
        total_duplicate_rows_global += len(duplicates_df)

        dup_files = duplicates_df["source_file"].unique()
        files_with_duplicates.update(dup_files)


# -------------------------
# FILE COUNTS
# -------------------------

all_files_processed = set()

for division, df in division_datasets.items():
    all_files_processed.update(df["source_file"].unique())

all_files_processed.update(empty_files_detected)

total_files = len(all_files_processed)

empty_files_count = len(empty_files_detected)
duplicate_files_count = len(files_with_duplicates)

files_with_issues = set(empty_files_detected).union(files_with_duplicates)
total_files_with_issues = len(files_with_issues)

# -------------------------
# PRINT SUMMARY
# -------------------------

print("\n============================")
print("GLOBAL DATA QUALITY SUMMARY (RAW)")
print("============================")

print(f"Total rows across all divisions: {total_rows_all_divisions}")
print(f"Total duplicate rows detected: {total_duplicate_rows_global}")

if total_rows_all_divisions > 0:
    print(f"Duplicate rows percentage: {(total_duplicate_rows_global / total_rows_all_divisions) * 100:.2f}%")

print("\n----------------------------")
print("FILE-LEVEL QUALITY")
print("----------------------------")

print(f"Total files processed: {total_files}")

print(f"Empty files: {empty_files_count}")
print(f"Files with duplicate rows: {duplicate_files_count}")
print(f"Total files with issues (empty OR duplicates): {total_files_with_issues}")

if total_files > 0:
    print(f"Percentage of files with issues: {(total_files_with_issues / total_files) * 100:.2f}%")


GLOBAL DATA QUALITY SUMMARY (RAW)
Total rows across all divisions: 572286
Total duplicate rows detected: 12291
Duplicate rows percentage: 2.15%

----------------------------
FILE-LEVEL QUALITY
----------------------------
Total files processed: 1088
Empty files: 148
Files with duplicate rows: 211
Total files with issues (empty OR duplicates): 356
Percentage of files with issues: 32.72%


# CHECK 1: Duplicated rows.

In [5]:
for division, df in division_datasets.items():
    df.drop_duplicates(inplace=True)

#### Notes:  
* Cologne has all rows duplicated in the doubles (https://results.hyrox.com/season-7/?page=1&event=HDP_COLOGNE25_OVERALL&num_results=100&pid=list&pidp=ranking_nav&ranking=time_finish_netto&search%5Bsex%5D=M&search%5Bage_class%5D=%25&search%5Bnation%5D=%25)  
* Same in other datasets.

In [5]:
# ---------------------------------
# GLOBAL DATA HEALTH SUMMARY
# ---------------------------------

total_rows_all_divisions = 0
total_duplicate_rows = 0
files_with_duplicates = set()

for division, df in division_datasets.items():

    # Count total rows AFTER cleaning
    total_rows_all_divisions += len(df)

    # Detect duplicates again on original logic (before drop)
    duplicate_mask = df.duplicated(keep=False)

    if duplicate_mask.any():
        duplicates_df = df.loc[duplicate_mask]
        total_duplicate_rows += len(duplicates_df)

        dup_files = duplicates_df["source_file"].unique()
        files_with_duplicates.update(dup_files)

# Total unique files processed
all_files_processed = set()

for division, df in division_datasets.items():
    all_files_processed.update(df["source_file"].unique())

# Add empty files to total file count
all_files_processed.update(empty_files_detected)

total_files = len(all_files_processed)

# Files with issues
files_with_issues = set(empty_files_detected).union(files_with_duplicates)

n_files_with_issues = len(files_with_issues)

# Percentages
if total_files > 0:
    pct_files_with_issues = (n_files_with_issues / total_files) * 100
else:
    pct_files_with_issues = 0


print("\n============================")
print("GLOBAL DATA QUALITY SUMMARY")
print("============================")

print(f"Total rows across all divisions: {total_rows_all_divisions}")
print(f"Total duplicate rows detected: {total_duplicate_rows}")

if total_rows_all_divisions > 0:
    print(f"Duplicate rows percentage: {(total_duplicate_rows / total_rows_all_divisions) * 100:.2f}%")

print("\nTotal files processed:", total_files)
print("Files with issues (empty or duplicates):", n_files_with_issues)
print(f"Percentage of files with issues: {pct_files_with_issues:.2f}%")


GLOBAL DATA QUALITY SUMMARY
Total rows across all divisions: 566108
Total duplicate rows detected: 0
Duplicate rows percentage: 0.00%

Total files processed: 1088
Files with issues (empty or duplicates): 145
Percentage of files with issues: 13.33%


# CHECK 2: Races where scraping only ran the first page

In [6]:
# Check row counts per combination of Race, Division, and Gender in DOUBLES
doubles_counts_gender = HYROX.groupby(['Race', 'Division', 'Gender']).size()
doubles_100 = doubles_counts_gender[doubles_counts_gender == 100]
print("DOUBLES dataset counts (exactly 100 rows for Race+Division+Gender):")
print(doubles_100)

# Check row counts per combination of Race, Division, and Gender in HYROX_pro
pro_counts_gender = HYROX_PRO.groupby(['Race', 'Division', 'Gender']).size()
pro_100 = pro_counts_gender[pro_counts_gender == 100]
print("\nHYROX_pro dataset counts (exactly 100 rows for Race+Division+Gender):")
print(pro_100)

DOUBLES dataset counts (exactly 100 rows for Race+Division+Gender):
Race            Division  Gender
2023 Stuttgart  HYROX     Women     100
dtype: int64

HYROX_pro dataset counts (exactly 100 rows for Race+Division+Gender):
Race             Division   Gender
2023 Manchester  HYROX PRO  Men       100
2025 Manchester  HYROX PRO  Women     100
dtype: int64


In [None]:
import os

root_folder = r"Datasets\Hyrox"

# Function to find and delete CSVs containing a specific combination
def delete_csvs_with_combinations(df, combinations):
    for subdir, _, files in os.walk(root_folder):
        for file in files:
            if file.lower().endswith(".csv") and file not in ["DOUBLES.csv", "HYROX_pro.csv"]:
                file_path = os.path.join(subdir, file)
                temp_df = pd.read_csv(file_path)
                
                # Check if any of the "exactly 100 rows" combinations are fully contained in this CSV
                for comb in combinations.index:
                    race, division, gender = comb
                    matching_rows = temp_df[
                        (temp_df['Race'] == race) &
                        (temp_df['Division'] == division) &
                        (temp_df['Gender'] == gender)
                    ]
                    if len(matching_rows) == 100:
                        print(f"Deleting {file_path} (matches {comb})")
                        os.remove(file_path)
                        break  # move to next file once deleted

# Delete for DOUBLES
if not doubles_100.empty:
    delete_csvs_with_combinations(DOUBLES_df, doubles_100)

# Delete for HYROX_pro
if not pro_100.empty:
    delete_csvs_with_combinations(HYROX_pro_df, pro_100)

# CHECK 3: Empty datasets

In [None]:
import os
import pandas as pd

root_folder = r"Datasets\Hyrox"

# Delete CSVs with only headers
for subdir, _, files in os.walk(root_folder):
    for file in files:
        if file.lower().endswith(".csv") and file not in ["DOUBLES.csv", "HYROX_pro.csv"]:
            file_path = os.path.join(subdir, file)
            temp_df = pd.read_csv(file_path)
            if temp_df.shape[0] == 0:  # only header
                os.remove(file_path)
                print(f"Deleted empty file: {file_path}")

print("Empty CSV files removed.")

# CHECK 4: Duplicated races - overall + Saturday or other day

In [None]:
# Columns to check for duplicates (ignoring Division)
subset_cols = ['Race', 'Gender', 'Name', 'Nation', 'Age Group', 'Total Time']

# Set display options for wide output
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_colwidth', None)

# Show duplicate rows in DOUBLES based on subset
duplicates_doubles_rows = DOUBLES_df[DOUBLES_df.duplicated(subset=subset_cols, keep=False)]
print(f"DOUBLES dataset duplicate rows based on {subset_cols} ({len(duplicates_doubles_rows)} total):")
print(duplicates_doubles_rows)

# Show duplicate rows in HYROX_pro based on subset
duplicates_pro_rows = HYROX_pro_df[HYROX_pro_df.duplicated(subset=subset_cols, keep=False)]
print(f"HYROX_pro dataset duplicate rows based on {subset_cols} ({len(duplicates_pro_rows)} total):")
print(duplicates_pro_rows)

# CHECK 5: Missing genders in races. 

In [None]:
# Function to check missing genders per Race+Division
def check_missing_genders(df, expected_genders):
    missing_info = {}
    for (race, division), group in df.groupby(['Race', 'Division']):
        present_genders = set(group['Gender'].unique())
        missing = set(expected_genders) - present_genders
        if missing:
            missing_info[(race, division)] = missing
    return missing_info

# DOUBLES dataset (expected genders: Men, Women, Mixed)
doubles_missing_genders = check_missing_genders(DOUBLES_df, ['Men', 'Women', 'Mixed'])
print("DOUBLES missing genders per Race+Division:")
print(doubles_missing_genders)

# HYROX_pro dataset (expected genders: Men, Women)
pro_missing_genders = check_missing_genders(HYROX_pro_df, ['Men', 'Women'])
print("\nHYROX_pro missing genders per Race+Division:")
print(pro_missing_genders)