In [1]:
import os
import pandas as pd

# Build the path with os.path.join so it resolves on every OS; the previous
# raw string with a backslash only named a nested folder on Windows.
root_folder = os.path.join("Datasets", "Hyrox")

# Lower-cased names of the merged files this cell writes into root_folder.
# They are skipped on input so a re-run does not feed them back into the merge.
MERGED_FILENAMES = ("doubles.csv", "hyrox_pro.csv")


def load_hyrox_csvs(folder):
    """Read every source CSV under ``folder`` and split the frames by dataset.

    Parameters
    ----------
    folder : str
        Directory that is walked recursively.

    Returns
    -------
    tuple[list[pd.DataFrame], list[pd.DataFrame]]
        ``(doubles_frames, pro_frames)``. A file goes to the doubles list when
        its lower-cased name contains ``"hyrox_pro_doubles"``; every other CSV
        (whatever its name) goes to the pro list.

    Notes
    -----
    Directories and files are visited in sorted order so the row order of the
    merged datasets is the same on every machine and every run.
    """
    doubles_frames, pro_frames = [], []
    for subdir, dirs, files in os.walk(folder):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            lowered = file.lower()
            if not lowered.endswith(".csv") or lowered in MERGED_FILENAMES:
                continue

            frame = pd.read_csv(os.path.join(subdir, file))
            if "hyrox_pro_doubles" in lowered:
                doubles_frames.append(frame)
            else:
                pro_frames.append(frame)
    return doubles_frames, pro_frames


def combine_frames(frames, label):
    """Concatenate ``frames`` into one DataFrame and report its size.

    Returns an empty DataFrame (and says so) when ``frames`` is empty, so the
    caller always gets a DataFrame back.
    """
    if frames:
        combined = pd.concat(frames, ignore_index=True)
        print(f"{label} dataset ready with {len(combined)} rows.")
    else:
        combined = pd.DataFrame()  # empty dataframe
        print(f"No {label} files found, empty dataset created.")
    return combined


doubles_dfs, pro_dfs = load_hyrox_csvs(root_folder)

# Keep datasets in memory
DOUBLES_df = combine_frames(doubles_dfs, "DOUBLES")
HYROX_pro_df = combine_frames(pro_dfs, "HYROX_pro")

# Save datasets. A frame without columns would be written as a file that
# pd.read_csv cannot parse, so nothing is written in that case.
for merged_df, merged_name in ((DOUBLES_df, "DOUBLES.csv"), (HYROX_pro_df, "HYROX_pro.csv")):
    if len(merged_df.columns) > 0:
        merged_df.to_csv(os.path.join(root_folder, merged_name), index=False)

DOUBLES dataset ready with 20235 rows.
HYROX_pro dataset ready with 32884 rows.


In [2]:
# Count the rows that are exact repeats of an earlier row in each dataset
duplicates_doubles = DOUBLES_df.duplicated().sum()
duplicates_pro = HYROX_pro_df.duplicated().sum()

for dataset_label, duplicate_total in (
    ("DOUBLES", duplicates_doubles),
    ("HYROX_pro", duplicates_pro),
):
    print(f"{dataset_label} dataset: {duplicate_total} duplicate rows.")

DOUBLES dataset: 656 duplicate rows.
HYROX_pro dataset: 225 duplicate rows.


In [3]:
# Widen pandas' display settings so each row prints on a single line
for option_name, option_value in (
    ('display.max_columns', None),   # show all columns
    ('display.width', 2000),         # wide output so it doesn't wrap
    ('display.max_colwidth', None),  # show full column contents
):
    pd.set_option(option_name, option_value)

# keep=False marks every member of a duplicate group, not only the repeats,
# so each original row is listed next to its copies.
doubles_dup_mask = DOUBLES_df.duplicated(keep=False)
duplicates_doubles_rows = DOUBLES_df[doubles_dup_mask]
print(f"DOUBLES dataset duplicate rows ({len(duplicates_doubles_rows)} total):")
print(duplicates_doubles_rows)

# Same listing for HYROX_pro
pro_dup_mask = HYROX_pro_df.duplicated(keep=False)
duplicates_pro_rows = HYROX_pro_df[pro_dup_mask]
print(f"HYROX_pro dataset duplicate rows ({len(duplicates_pro_rows)} total):")
print(duplicates_pro_rows)

DOUBLES dataset duplicate rows (1312 total):
                  Race                     Division Gender Rank Overall Rank Age Group                                                                  Name Nation Age Group Total Time
5623      2025 Cologne  HYROX PRO DOUBLES - Overall    Men            1              1  Tobias Lautwein, Tobias Lautwein, Christof Brenner, Christof Brenner    NaN     30-34   00:52:07
5624      2025 Cologne  HYROX PRO DOUBLES - Overall    Men            1              1  Tobias Lautwein, Tobias Lautwein, Christof Brenner, Christof Brenner    NaN     30-34   00:52:07
5625      2025 Cologne  HYROX PRO DOUBLES - Overall    Men            2              1                  Kieron White, Kieron White, Kevin Woods, Kevin Woods    NaN      MU29   00:53:47
5626      2025 Cologne  HYROX PRO DOUBLES - Overall    Men            2              1                  Kieron White, Kieron White, Kevin Woods, Kevin Woods    NaN      MU29   00:53:47
5627      2025 Cologne  HYROX 

#### Notes:  
* In the doubles data, every Cologne row appears twice (source: https://results.hyrox.com/season-7/?page=1&event=HDP_COLOGNE25_OVERALL&num_results=100&pid=list&pidp=ranking_nav&ranking=time_finish_netto&search%5Bsex%5D=M&search%5Bage_class%5D=%25&search%5Bnation%5D=%25).
* The same kind of duplication also occurs in other datasets.

In [4]:
# Drop exact repeats from DOUBLES; the first occurrence of each row is kept
# (the drop_duplicates default) and the index is renumbered from 0.
DOUBLES_df = DOUBLES_df.drop_duplicates(ignore_index=True)
doubles_remaining = len(DOUBLES_df)
print(f"DOUBLES dataset cleaned. New row count: {doubles_remaining}")

# Same clean-up for HYROX_pro
HYROX_pro_df = HYROX_pro_df.drop_duplicates(ignore_index=True)
pro_remaining = len(HYROX_pro_df)
print(f"HYROX_pro dataset cleaned. New row count: {pro_remaining}")

DOUBLES dataset cleaned. New row count: 19579
HYROX_pro dataset cleaned. New row count: 32659


In [5]:
# Group sizes this cell looks for in each (Race, Division) combination
flagged_sizes = [200, 300]
group_keys = ['Race', 'Division']

# DOUBLES: size of every Race/Division group, then only the flagged sizes
doubles_counts = DOUBLES_df.groupby(group_keys).size()
doubles_200_300 = doubles_counts.loc[doubles_counts.isin(flagged_sizes)]
print("DOUBLES dataset counts (exactly 200 or 300 rows):")
print(doubles_200_300)

# HYROX_pro: same check
pro_counts = HYROX_pro_df.groupby(group_keys).size()
pro_200_300 = pro_counts.loc[pro_counts.isin(flagged_sizes)]
print("\nHYROX_pro dataset counts (exactly 200 or 300 rows):")
print(pro_200_300)

DOUBLES dataset counts (exactly 200 or 300 rows):
Series([], dtype: int64)

HYROX_pro dataset counts (exactly 200 or 300 rows):
Series([], dtype: int64)


In [6]:
import os

# Build the path with os.path.join so it resolves on every OS; the previous
# raw string with a backslash only named a nested folder on Windows.
root_folder = os.path.join("Datasets", "Hyrox")


def delete_exact_rows(target_df, dataset_name, root=None):
    """Delete source CSVs in which a (Race, Division) group has exactly 200 or 300 rows.

    NOTE(review): exactly 200/300 rows presumably indicates a truncated
    download -- confirm the rationale before relying on it.

    Parameters
    ----------
    target_df : pd.DataFrame
        Not used by the function; kept so existing calls keep working.
    dataset_name : str
        Case-insensitive substring a file name must contain to be inspected.
        Because this is a substring match, "HYROX_PRO" also matches files
        named "HYROX_PRO_DOUBLES...".
    root : str, optional
        Folder to walk recursively. Defaults to the module-level
        ``root_folder``.

    Returns
    -------
    None
        Matching files are removed from disk with ``os.remove`` (permanent)
        and one line is printed per deleted or skipped file.
    """
    folder = root_folder if root is None else root
    # Merged outputs saved next to the sources; "hyrox_pro.csv" contains the
    # substring "hyrox_pro", so without this guard it could be deleted too.
    merged_outputs = ("doubles.csv", "hyrox_pro.csv")
    wanted = dataset_name.lower()

    for subdir, _, files in os.walk(folder):
        for file in files:
            lowered = file.lower()
            if not lowered.endswith(".csv") or wanted not in lowered:
                continue
            if lowered in merged_outputs:
                continue

            file_path = os.path.join(subdir, file)
            try:
                df = pd.read_csv(file_path)
            except pd.errors.EmptyDataError:
                # A file with no header/data cannot be judged; leave it alone.
                print(f"Skipped empty file: {file_path}")
                continue

            if not {"Race", "Division"}.issubset(df.columns):
                # Without both columns the groupby below would raise KeyError.
                print(f"Skipped file without Race/Division columns: {file_path}")
                continue

            # Count rows per Race + Division
            counts = df.groupby(['Race', 'Division']).size()
            if counts.isin([200, 300]).any():
                os.remove(file_path)
                print(f"Deleted {dataset_name} file with exact 200/300 rows: {file_path}")

# Run the clean-up for the DOUBLES files first, then for the HYROX_pro files
for dataset_frame, file_tag in (
    (DOUBLES_df, "HYROX_PRO_DOUBLES"),
    (HYROX_pro_df, "HYROX_PRO"),
):
    delete_exact_rows(dataset_frame, file_tag)