# Read all HYROX results CSV files and merge them into two datasets: DOUBLES and HYROX_pro (singles).

In [1]:
import os
import pandas as pd

# Folder holding the scraped per-race CSV files. Built with os.path.join so the
# separator is correct on every operating system, not only on Windows.
root_folder = os.path.join("Datasets", "Hyrox")

# Lower-cased names of the merged files this cell writes; they are skipped when
# the folder is scanned again so a re-run does not ingest its own output.
merged_file_names = ["doubles.csv", "hyrox_pro.csv"]

doubles_dfs = []
pro_dfs = []

# Walk through all CSV files in sorted order so the merged row order is the
# same on every run (the file system itself gives no ordering guarantee).
for subdir, dirs, files in os.walk(root_folder):
    dirs.sort()  # in-place edit steers the order in which os.walk descends
    for file in sorted(files):
        file_name = file.lower()
        if not file_name.endswith(".csv") or file_name in merged_file_names:
            continue

        df = pd.read_csv(os.path.join(subdir, file))

        # Files whose name contains "hyrox_pro_doubles" go to DOUBLES,
        # every other CSV goes to HYROX_pro.
        if "hyrox_pro_doubles" in file_name:
            doubles_dfs.append(df)
        else:
            pro_dfs.append(df)


def combine_frames(frames, label):
    """Concatenate ``frames`` into one DataFrame and report the outcome.

    Parameters
    ----------
    frames : list of pandas.DataFrame
        Frames read from the individual CSV files (may be empty).
    label : str
        Dataset name used in the printed status message.

    Returns
    -------
    pandas.DataFrame
        All frames stacked with a fresh 0..n-1 index, or an empty DataFrame
        when ``frames`` is empty.
    """
    if frames:
        combined = pd.concat(frames, ignore_index=True)
        print(f"{label} dataset ready with {len(combined)} rows.")
    else:
        combined = pd.DataFrame()  # empty dataframe
        print(f"No {label} files found, empty dataset created.")
    return combined


# Keep datasets in memory
DOUBLES_df = combine_frames(doubles_dfs, "DOUBLES")
HYROX_pro_df = combine_frames(pro_dfs, "HYROX_pro")

# Save datasets. A dataset is only written when source files were found for it:
# otherwise there is nothing meaningful to save, and to_csv would raise if
# root_folder does not exist.
if doubles_dfs:
    DOUBLES_df.to_csv(os.path.join(root_folder, "DOUBLES.csv"), index=False)
if pro_dfs:
    HYROX_pro_df.to_csv(os.path.join(root_folder, "HYROX_pro.csv"), index=False)

DOUBLES dataset ready with 20213 rows.
HYROX_pro dataset ready with 36986 rows.


# CHECK 1: Duplicated rows.

In [2]:
# Flag every row that exactly repeats an earlier row, then count the flags
doubles_repeat_mask = DOUBLES_df.duplicated()
pro_repeat_mask = HYROX_pro_df.duplicated()

duplicates_doubles = doubles_repeat_mask.sum()
duplicates_pro = pro_repeat_mask.sum()

print(f"DOUBLES dataset: {duplicates_doubles} duplicate rows.")
print(f"HYROX_pro dataset: {duplicates_pro} duplicate rows.")

DOUBLES dataset: 656 duplicate rows.
HYROX_pro dataset: 225 duplicate rows.


In [13]:
# Widen pandas' text output so a full row stays on one line
display_settings = {
    'display.max_columns': None,   # show all columns
    'display.width': 2000,         # wide output so it doesn't wrap
    'display.max_colwidth': None,  # show full column contents
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

# DOUBLES: every row that has at least one exact copy (all copies are kept)
doubles_repeated = DOUBLES_df.duplicated(keep=False)
duplicates_doubles_rows = DOUBLES_df.loc[doubles_repeated]
print(f"DOUBLES dataset duplicate rows ({len(duplicates_doubles_rows)} total):")
print(duplicates_doubles_rows)

# HYROX_pro: every row that has at least one exact copy (all copies are kept)
pro_repeated = HYROX_pro_df.duplicated(keep=False)
duplicates_pro_rows = HYROX_pro_df.loc[pro_repeated]
print(f"HYROX_pro dataset duplicate rows ({len(duplicates_pro_rows)} total):")
print(duplicates_pro_rows)

DOUBLES dataset duplicate rows (0 total):
Empty DataFrame
Columns: [Race, Division, Gender, Rank Overall, Rank Age Group, Name, Nation, Age Group, Total Time]
Index: []
HYROX_pro dataset duplicate rows (0 total):
Empty DataFrame
Columns: [Race, Division, Gender, Rank Overall, Rank Age Group, Name, Nation, Age Group, Total Time]
Index: []


#### Notes:  
* Cologne has all rows duplicated in the doubles (https://results.hyrox.com/season-7/?page=1&event=HDP_COLOGNE25_OVERALL&num_results=100&pid=list&pidp=ranking_nav&ranking=time_finish_netto&search%5Bsex%5D=M&search%5Bage_class%5D=%25&search%5Bnation%5D=%25)  
* Same in other datasets.

In [4]:
# DOUBLES: keep the first occurrence of each repeated row, then renumber the index
doubles_is_repeat = DOUBLES_df.duplicated(keep='first')
DOUBLES_df = DOUBLES_df.loc[~doubles_is_repeat].reset_index(drop=True)
print(f"DOUBLES dataset cleaned. New row count: {len(DOUBLES_df)}")

# HYROX_pro: keep the first occurrence of each repeated row, then renumber the index
pro_is_repeat = HYROX_pro_df.duplicated(keep='first')
HYROX_pro_df = HYROX_pro_df.loc[~pro_is_repeat].reset_index(drop=True)
print(f"HYROX_pro dataset cleaned. New row count: {len(HYROX_pro_df)}")

DOUBLES dataset cleaned. New row count: 19557
HYROX_pro dataset cleaned. New row count: 36761


# CHECK 2: Races where the scraper only fetched the first page (exactly 100 rows per Race + Division + Gender)

In [5]:
# Columns that together identify one results list
group_keys = ['Race', 'Division', 'Gender']

# DOUBLES: row count of every Race/Division/Gender group, keeping only groups of exactly 100
doubles_counts_gender = DOUBLES_df.groupby(group_keys).size()
doubles_100 = doubles_counts_gender.loc[doubles_counts_gender.eq(100)]
print("DOUBLES dataset counts (exactly 100 rows for Race+Division+Gender):")
print(doubles_100)

# HYROX_pro: row count of every Race/Division/Gender group, keeping only groups of exactly 100
pro_counts_gender = HYROX_pro_df.groupby(group_keys).size()
pro_100 = pro_counts_gender.loc[pro_counts_gender.eq(100)]
print("\nHYROX_pro dataset counts (exactly 100 rows for Race+Division+Gender):")
print(pro_100)

DOUBLES dataset counts (exactly 100 rows for Race+Division+Gender):
Series([], dtype: int64)

HYROX_pro dataset counts (exactly 100 rows for Race+Division+Gender):
Series([], dtype: int64)


In [6]:
import os

# Folder holding the scraped CSV files; os.path.join picks the right separator
# for the current operating system instead of hard-coding a backslash.
root_folder = os.path.join("Datasets", "Hyrox")

# Function to find and delete CSVs containing a specific combination
def delete_csvs_with_combinations(df, combinations, root=None):
    """Delete source CSVs that hold exactly 100 rows for a flagged combination.

    Every ``.csv`` file below the search folder (except the merged
    ``DOUBLES.csv`` / ``HYROX_pro.csv`` outputs) is read, and the file is
    removed from disk as soon as one of the given combinations matches
    exactly 100 of its rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Not used by the function; kept so existing calls keep working.
    combinations : pandas.Series
        Object whose ``.index`` yields ``(race, division, gender)`` tuples.
    root : str, optional
        Folder to scan. Defaults to the notebook-level ``root_folder``.

    Returns
    -------
    list of str
        Paths of the files that were deleted, in the order they were removed.
    """
    search_root = root_folder if root is None else root
    # Compared lower-cased, matching how the loading cell skips its own outputs
    merged_file_names = ("doubles.csv", "hyrox_pro.csv")
    key_columns = ('Race', 'Division', 'Gender')
    deleted_paths = []

    for subdir, _, files in os.walk(search_root):
        for file in files:
            file_name = file.lower()
            if not file_name.endswith(".csv") or file_name in merged_file_names:
                continue

            file_path = os.path.join(subdir, file)
            try:
                temp_df = pd.read_csv(file_path)
            except pd.errors.EmptyDataError:
                # Zero-byte file: nothing to compare, so it cannot match
                continue

            # A CSV without the key columns cannot contain any combination
            if any(column not in temp_df.columns for column in key_columns):
                continue

            # Check if any of the "exactly 100 rows" combinations are fully contained in this CSV
            for comb in combinations.index:
                race, division, gender = comb
                matching_rows = temp_df[
                    (temp_df['Race'] == race) &
                    (temp_df['Division'] == division) &
                    (temp_df['Gender'] == gender)
                ]
                if len(matching_rows) == 100:
                    print(f"Deleting {file_path} (matches {comb})")
                    os.remove(file_path)
                    deleted_paths.append(file_path)
                    break  # move to next file once deleted

    return deleted_paths

# Run the deletion for DOUBLES first, then HYROX_pro, skipping a dataset
# when it has no "exactly 100 rows" combinations.
for dataset, flagged_combinations in ((DOUBLES_df, doubles_100), (HYROX_pro_df, pro_100)):
    if len(flagged_combinations) > 0:
        delete_csvs_with_combinations(dataset, flagged_combinations)

# CHECK 3: Empty datasets

In [7]:
import os
import pandas as pd

# Folder holding the scraped CSV files; os.path.join picks the right separator
# for the current operating system instead of hard-coding a backslash.
root_folder = os.path.join("Datasets", "Hyrox")


def csv_has_no_rows(file_path):
    """Return True when the CSV at ``file_path`` contains no data rows.

    Both a header-only file and a completely empty file count as having no
    rows. Only the first data row is requested, because one row is enough to
    know the file is not empty.
    """
    try:
        return pd.read_csv(file_path, nrows=1).shape[0] == 0
    except pd.errors.EmptyDataError:
        # pandas raises this when the file has no header to parse at all
        return True


# Delete CSVs with only headers (or with no content at all)
for subdir, _, files in os.walk(root_folder):
    for file in files:
        file_name = file.lower()
        # Merged outputs are skipped case-insensitively, like in the loading cell
        if file_name.endswith(".csv") and file_name not in ["doubles.csv", "hyrox_pro.csv"]:
            file_path = os.path.join(subdir, file)
            if csv_has_no_rows(file_path):
                os.remove(file_path)
                print(f"Deleted empty file: {file_path}")

print("Empty CSV files removed.")

Empty CSV files removed.


# CHECK 4: Duplicated races - overall + Saturday or other day

In [14]:
# Two rows count as the same result when all of these columns agree (Division is ignored)
subset_cols = ['Race', 'Gender', 'Name', 'Nation', 'Age Group', 'Total Time']

# Widen pandas' text output so a full row stays on one line
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 2000),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

# DOUBLES: rows sharing their subset values with at least one other row (all copies kept)
doubles_subset_mask = DOUBLES_df.duplicated(subset=subset_cols, keep=False)
duplicates_doubles_rows = DOUBLES_df.loc[doubles_subset_mask]
print(f"DOUBLES dataset duplicate rows based on {subset_cols} ({len(duplicates_doubles_rows)} total):")
print(duplicates_doubles_rows)

# HYROX_pro: rows sharing their subset values with at least one other row (all copies kept)
pro_subset_mask = HYROX_pro_df.duplicated(subset=subset_cols, keep=False)
duplicates_pro_rows = HYROX_pro_df.loc[pro_subset_mask]
print(f"HYROX_pro dataset duplicate rows based on {subset_cols} ({len(duplicates_pro_rows)} total):")
print(duplicates_pro_rows)

DOUBLES dataset duplicate rows based on ['Race', 'Gender', 'Name', 'Nation', 'Age Group', 'Total Time'] (10 total):
              Race                     Division Gender  Rank Overall Rank Age Group                                                                                  Name  Nation Age Group Total Time
5513  2025 Cologne  HYROX PRO DOUBLES - Overall    Men           187             95                          Karsten Haase, Karsten Haase, Florian Fitzek, Florian Fitzek     NaN     30-34   01:35:35
5514  2025 Cologne  HYROX PRO DOUBLES - Overall    Men           188             95                          Karsten Haase, Karsten Haase, Florian Fitzek, Florian Fitzek     NaN     30-34   01:35:35
5515  2025 Cologne  HYROX PRO DOUBLES - Overall    Men           188             25  Matthew Birchenough, Matthew Birchenough, Filipe Mota Da Silva, Filipe Mota Da Silva     NaN     40-44   01:39:57
5516  2025 Cologne  HYROX PRO DOUBLES - Overall    Men           189             25  Mat

# CHECK 5: Missing genders in races. 

In [9]:
# Function to check missing genders per Race+Division
def check_missing_genders(df, expected_genders):
    """Report which expected genders are absent from each Race + Division group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Race', 'Division' and 'Gender' columns.
    expected_genders : iterable
        Gender labels every group is expected to contain.

    Returns
    -------
    dict
        Maps ``(race, division)`` to the set of expected genders not found in
        that group; groups with nothing missing are left out.
    """
    required = set(expected_genders)
    # Expected-but-absent genders for every group, including groups with none missing
    gaps_by_group = {
        group_key: required - set(group_rows['Gender'].unique())
        for group_key, group_rows in df.groupby(['Race', 'Division'])
    }
    # Keep only the groups where something is actually missing
    return {group_key: absent for group_key, absent in gaps_by_group.items() if absent}

# DOUBLES dataset (expected genders: Men, Women, Mixed)
doubles_expected_genders = ['Men', 'Women', 'Mixed']
doubles_missing_genders = check_missing_genders(DOUBLES_df, doubles_expected_genders)
print("DOUBLES missing genders per Race+Division:", doubles_missing_genders, sep="\n")

# HYROX_pro dataset (expected genders: Men, Women)
pro_expected_genders = ['Men', 'Women']
pro_missing_genders = check_missing_genders(HYROX_pro_df, pro_expected_genders)
print("\nHYROX_pro missing genders per Race+Division:", pro_missing_genders, sep="\n")

DOUBLES missing genders per Race+Division:
{('2024 Amsterdam', 'HYROX PRO DOUBLES - Overall'): {'Mixed'}, ('2024 Anaheim', 'HYROX PRO DOUBLES'): {'Mixed'}, ('2024 Beijing', 'HYROX PRO DOUBLES'): {'Mixed'}, ('2024 Birmingham', 'HYROX PRO DOUBLES - Overall'): {'Mixed'}, ('2024 Brisbane', 'HYROX PRO DOUBLES'): {'Mixed'}, ('2024 Cape Town', 'HYROX PRO DOUBLES'): {'Mixed'}, ('2024 Chicago Navy Pier', 'HYROX PRO DOUBLES'): {'Mixed'}, ('2024 Ciudad de Mexico', 'HYROX PRO DOUBLES'): {'Mixed'}, ('2024 Dallas', 'HYROX PRO DOUBLES'): {'Mixed'}, ('2024 Dublin', 'HYROX PRO DOUBLES - Overall'): {'Mixed'}, ('2024 Frankfurt', 'HYROX PRO DOUBLES'): {'Mixed'}, ('2024 Hamburg', 'HYROX PRO DOUBLES'): {'Mixed'}, ('2024 Incheon', 'HYROX PRO DOUBLES'): {'Mixed'}, ('2024 London', 'HYROX PRO DOUBLES - Overall'): {'Mixed'}, ('2024 Madrid', 'HYROX PRO DOUBLES'): {'Mixed'}, ('2024 Manchester', 'HYROX PRO DOUBLES'): {'Mixed'}, ('2024 Marseille', 'HYROX PRO DOUBLES'): {'Mixed'}, ('2024 Melbourne', 'HYROX PRO DOUBLE