## Scraping & Verifying HYROX Results Across Seasons

**Competitions:** HYROX Seasons 1–8 (Official Results Portal)  
**Purpose:** Scrape all available race results and verify that every race, division, and gender has been successfully collected  
**Methods:** Selenium automation, controlled multi-threading, dynamic pagination handling, structured CSV export, division detection, empty-file tracking, and race-level completeness checks  
**Author:** [Victoria Friss de Kereki](https://www.linkedin.com/in/victoria-friss-de-kereki/)  

---

**Notebook first written:** `23/02/2026`  
**Last updated:** `27/02/2026`  

> This notebook builds a **robust scraping and verification pipeline** for HYROX competition results.
> 
> The workflow:
> 
> - 🌐 Scrapes race results directly from the official HYROX results platform  
> - 🗂 Organises outputs by **Season, Race, and Division**  
> - 🔄 Handles dynamic page loading and pagination safely  
> - 📁 Saves structured CSV files for each race-division combination  
> - ⚠️ Tracks empty divisions and failed scrapes  
> - 🏁 Verifies that every race includes all the available divisions
> 
> The objective of this notebook is to ensure **complete and reliable data extraction**, creating a solid foundation for downstream cleaning, validation, and analytical modelling in subsequent notebooks.

------------------

### Scrape many races at a time, for every available season and division (including the empty ones).

In [None]:
import time
import random
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# ==========================
# CONFIG
# ==========================

# Results-portal URL for each season (1 through 8), keyed by the season's
# folder name on disk.
BASE_URLS = dict(
    (f"Season_{number}", f"https://results.hyrox.com/season-{number}/")
    for number in range(1, 9)
)

# Output root, relative to the working directory (Windows-style separator).
SAVE_ROOT = "Datasets\\Hyrox"

# Size of the thread pool used to scrape races concurrently; adjust as needed.
MAX_THREADS = 20

def human_pause(a=2, b=5):
    """Block the calling thread for a random delay between ``a`` and ``b`` seconds.

    The delay is drawn with ``random.uniform(a, b)``.
    """
    delay_seconds = random.uniform(a, b)
    time.sleep(delay_seconds)

# ==========================
# SELENIUM DRIVER
# ==========================

def create_driver():
    """Create and return a headless Chrome WebDriver with a desktop user agent."""
    chrome_arguments = (
        "--start-maximized",
        "--headless=new",
        # Fixed desktop Chrome user-agent string sent with every request.
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36",
    )

    chrome_options = webdriver.ChromeOptions()
    for argument in chrome_arguments:
        chrome_options.add_argument(argument)

    return webdriver.Chrome(options=chrome_options)

# ==========================
# SCRAPE PAGE ROWS
# ==========================

def _parse_result_row(row):
    """Extract the result fields from one results-list row element.

    The current portal layout (one CSS class per field) is tried first; if any
    required element is missing, the row's visible text is split on newlines
    and read positionally (older 2018 layout).

    Returns:
        A tuple ``(rank, age_rank, name, nation, age_group, total_time)``, or
        ``None`` when the fallback finds fewer than five text parts.
    """
    try:
        # Modern layout
        rank = row.find_element(By.CSS_SELECTOR, ".place-primary").text
        age_rank = row.find_element(By.CSS_SELECTOR, ".place-secondary").text
        name = row.find_element(By.CSS_SELECTOR, "h4.type-fullname").text
        try:
            nation = row.find_element(By.CSS_SELECTOR, ".nation__abbr").text
        except NoSuchElementException:
            # Nation is optional; keep the row and leave the field blank.
            nation = ""
        age_group = (
            row.find_element(By.CSS_SELECTOR, ".type-age_class")
            .text.replace("Age Group", "")
            .strip()
        )
        total_time = (
            row.find_element(By.CSS_SELECTOR, ".type-time")
            .text.replace("Total", "")
            .strip()
        )
    except NoSuchElementException:
        # 2018 fallback layout (Youngstars)
        parts = [p.strip() for p in row.text.split("\n") if p.strip()]
        if len(parts) < 5:
            return None
        rank = parts[0]
        age_rank = ""
        name = parts[2]
        # "\u2013" is the en dash; it is removed from the age-group text.
        age_group = parts[3].replace("\u2013", "").strip()
        # Only the last 8 characters are kept as the time.
        total_time = parts[4][-8:]
        nation = ""

    return rank, age_rank, name, nation, age_group, total_time


def scrape_pages(driver, race_name, division, gender_label, race_results):
    """Scrape the result list shown in ``driver``, following pagination.

    Each parsed row is appended to ``race_results`` (mutated in place) as
    ``[race_name, division, gender_label, rank, age_rank, name, nation,
    age_group, total_time]``.

    Args:
        driver: Selenium WebDriver already showing a submitted results list.
        race_name: Race label, used in log lines and stored with each row.
        division: Division label, used in log lines and stored with each row.
        gender_label: Gender label, used in log lines and stored with each row.
        race_results: List that receives one list per scraped row.

    Returns:
        ``True`` if at least one row was appended, otherwise ``False``. When a
        later page times out or turns out empty, rows collected from earlier
        pages still count, so the return value matches what was appended.
    """
    page_number = 1
    scraped_any = False

    while True:
        # Wait until rows appear or the "no results" text shows.
        try:
            WebDriverWait(driver, 25).until(
                lambda d: "There are currently no results available" in d.page_source
                or len(d.find_elements(By.CSS_SELECTOR, "li.list-group-item.row")) >= 1
            )
        except TimeoutException:
            print(f"[{race_name}] Timeout waiting for {division} - {gender_label}")
            return scraped_any

        # Explicit "no results" message?
        try:
            no_result_elem = driver.find_element(
                By.XPATH,
                "//*[contains(text(),'There are currently no results available')]",
            )
            if no_result_elem.is_displayed():
                print(f"[{race_name}] No results for {division} - {gender_label}")
                return scraped_any
        except NoSuchElementException:
            pass

        # Data rows: the header row shares the CSS classes, so filter it out.
        rows = [
            r
            for r in driver.find_elements(By.CSS_SELECTOR, "li.list-group-item.row")
            if "list-group-header" not in (r.get_attribute("class") or "")
        ]

        if not rows:
            print(f"[{race_name}] No rows found for {division} - {gender_label}")
            return scraped_any

        for row in rows:
            parsed = _parse_result_row(row)
            if parsed is None:
                continue
            race_results.append([race_name, division, gender_label, *parsed])
            scraped_any = True

        print(f"[{race_name}] Page {page_number} scraped for {division} - {gender_label}")
        page_number += 1

        # Pagination: a missing ">" link marks the last page.
        # NOTE(review): only a timed pause separates the click from the next
        # read; if the next page loads slower than that, the old rows could be
        # read again. Confirm against the live site.
        try:
            next_btn = driver.find_element(By.XPATH, "//a[text()='>']")
            driver.execute_script("arguments[0].click();", next_btn)
            human_pause(2, 5)
        except NoSuchElementException:
            break

    return scraped_any

# ==========================
# SCRAPE A RACE
# ==========================
def scrape_race(season, base_url, race_name, race_value):
    """Scrape every division of one race and write one CSV per division.

    A dedicated Chrome driver is created for the race and is always shut down,
    including when an unexpected Selenium error aborts the race. Divisions
    whose CSV already exists are skipped; a CSV is written for every other
    division even when no rows were collected.

    Args:
        season: Season key; used as the sub-folder name under ``SAVE_ROOT``.
        base_url: URL of the season's results page.
        race_name: Visible text of the race option (used for file names/logs).
        race_value: ``value`` attribute of the race option to select.

    Returns:
        A status string: either "Skipped (all divisions exist)" or
        "Finished scraping", prefixed with the race name.
    """
    # Imported here so the block does not depend on the file-level imports.
    # WebDriverException is the base class of Selenium's own errors.
    from selenium.common.exceptions import WebDriverException

    def _safe(label):
        """Turn a dropdown label into a file-name-friendly token."""
        return label.replace(" ", "_").replace("/", "-")

    season_folder = os.path.join(SAVE_ROOT, season)
    os.makedirs(season_folder, exist_ok=True)
    safe_name = _safe(race_name)

    def _csv_path(div_name):
        """Path of the CSV that stores one division of this race."""
        return os.path.join(season_folder, f"{safe_name}_{_safe(div_name)}.csv")

    driver = create_driver()
    try:
        print(f"\n[{race_name}] Starting scraping")

        # Get all divisions
        driver.get(base_url)
        human_pause(2, 5)
        Select(driver.find_element(By.ID, "default-lists-event_main_group")).select_by_value(race_value)
        human_pause(1, 3)
        try:
            select_division = Select(driver.find_element(By.ID, "default-lists-event"))
            divisions = [(o.text, o.get_attribute("value")) for o in select_division.options]
        except NoSuchElementException:
            # Sometimes the dropdown is missing (rare); fall back to "All".
            divisions = [("All", "")]

        # Skip the whole race if every division CSV already exists.
        if all(os.path.exists(_csv_path(div_name)) for div_name, _ in divisions):
            print(f"[{race_name}] All division CSVs exist. Skipping race.")
            return f"[{race_name}] Skipped (all divisions exist)"

        for div_name, div_value in divisions:
            div_safe = _safe(div_name)
            file_path = _csv_path(div_name)
            if os.path.exists(file_path):
                print(f"[{race_name}] {div_name} already exists. Skipping division.")
                continue

            print(f"[{race_name}] Scraping division: {div_name}")

            # Get genders offered for this division (everything except "All").
            try:
                Select(driver.find_element(By.ID, "default-lists-event_main_group")).select_by_value(race_value)
                human_pause(1, 2)
                Select(driver.find_element(By.ID, "default-lists-event")).select_by_value(div_value)
                human_pause(1, 2)
                gender_dropdown = Select(driver.find_element(By.ID, "default-lists-sex"))
                genders = [
                    (o.get_attribute("value"), o.text.strip())
                    for o in gender_dropdown.options
                    if o.text.strip() and "All" not in o.text
                ]
                if not genders:
                    genders = [("", "All")]
            except NoSuchElementException:
                genders = [("", "All")]

            division_results = []
            division_has_data = False

            for gender_code, gender_label in genders:
                print(f"[{race_name}] Scraping gender: {gender_label} in {div_name}")

                # Reload page to reset all dropdowns
                driver.get(base_url)
                human_pause(2, 3)
                Select(driver.find_element(By.ID, "default-lists-event_main_group")).select_by_value(race_value)
                human_pause(1, 2)
                try:
                    Select(driver.find_element(By.ID, "default-lists-event")).select_by_value(div_value)
                    human_pause(1, 2)
                except NoSuchElementException:
                    print(f"[{race_name}] Division dropdown not found for {div_name}, continuing")

                # Select gender (an empty code means "use the page default").
                if gender_code:
                    try:
                        Select(driver.find_element(By.ID, "default-lists-sex")).select_by_value(gender_code)
                        human_pause(1, 2)
                    except NoSuchElementException:
                        print(f"[{race_name}] Gender dropdown not found for {div_name}, using default gender")

                # Ensure Workout = Total. Best effort: the control may be
                # absent, so Selenium errors are ignored on purpose.
                try:
                    Select(driver.find_element(By.ID, "default-lists-ranking")).select_by_visible_text("Total")
                except WebDriverException:
                    pass

                # Ensure 100 results per page (best effort, same reasoning).
                try:
                    Select(driver.find_element(By.ID, "default-num_results")).select_by_value("100")
                except WebDriverException:
                    pass
                human_pause(1, 2)

                # Click SHOW RESULTS
                try:
                    submit_btn = driver.find_element(By.ID, "default-submit")
                    driver.execute_script("arguments[0].click();", submit_btn)
                    print(f"[{race_name}] Clicked SHOW RESULTS for {div_name} - {gender_label}")
                except NoSuchElementException:
                    print(f"[{race_name}] Submit button not found for {div_name} - {gender_label}")
                    continue

                # Wait for a table header cell (Workout, Time or Total).
                try:
                    WebDriverWait(driver, 25).until(
                        lambda d: len(d.find_elements(By.XPATH,
                            "//li[contains(@class,'list-group-header')]//div[contains(text(),'Workout')]"
                        )) > 0
                        or len(d.find_elements(By.XPATH,
                            "//li[contains(@class,'list-group-header')]//div[contains(text(),'Time')]"
                        )) > 0
                        or len(d.find_elements(By.XPATH,
                            "//li[contains(@class,'list-group-header')]//div[contains(text(),'Total')]"
                        )) > 0
                    )
                except TimeoutException:
                    print(f"[{race_name}] Results did not load for {div_name} - {gender_label}")
                    # Keep the page for inspection; the label is sanitised so
                    # it cannot introduce a path separator into the file name.
                    debug_file = os.path.join(
                        season_folder,
                        f"debug_{safe_name}_{div_safe}_{_safe(gender_label)}.html",
                    )
                    with open(debug_file, "w", encoding="utf-8") as f:
                        f.write(driver.page_source)
                    continue

                # Scrape the results
                has_data = scrape_pages(driver, race_name, div_name, gender_label, division_results)
                if has_data:
                    division_has_data = True

            # Save CSV even if empty, so the division counts as processed.
            df = pd.DataFrame(
                division_results,
                columns=["Race", "Division", "Gender",
                         "Rank Overall", "Rank Age Group",
                         "Name", "Nation", "Age Group", "Total Time"]
            )
            df.to_csv(file_path, index=False)
            if division_has_data:
                print(f"[{race_name}] Saved {len(df)} rows for {div_name}")
            else:
                print(f"[{race_name}] Division {div_name} had no data. Empty CSV saved.")

        return f"[{race_name}] Finished scraping"
    finally:
        # Always release the browser process, also on unexpected errors.
        driver.quit()
    
# ==========================
# MAIN EXECUTION
# ==========================

# Build one task per (season, race), then scrape the races concurrently.
all_tasks = []

for season, base_url in BASE_URLS.items():
    driver_main = create_driver()
    try:
        driver_main.get(base_url)
        human_pause(2, 4)
        try:
            WebDriverWait(driver_main, 20).until(
                EC.presence_of_element_located((By.ID, "default-lists-event_main_group"))
            )
        except TimeoutException:
            print(f"[{season}] Race dropdown did not load. Skipping season.")
            continue

        race_dropdown = Select(driver_main.find_element(By.ID, "default-lists-event_main_group"))
        # Read the option list once instead of re-reading it for every index.
        races = [(opt.text, opt.get_attribute("value")) for opt in race_dropdown.options]
    finally:
        # Runs on success, on the timeout `continue`, and on errors alike.
        driver_main.quit()

    for race_name, race_value in races:
        all_tasks.append((season, base_url, race_name, race_value))

with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    futures = [executor.submit(scrape_race, *task) for task in all_tasks]
    for task, f in zip(all_tasks, futures):
        try:
            print(f.result())
        except Exception as exc:
            # Report the failed race and keep reporting the remaining ones.
            print(f"[{task[2]}] FAILED: {exc!r}")

print("\nALL DONE.")

### Check that all existing seasons/races/divisions have been downloaded.

In [None]:
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# ==========================
# CONFIG
# ==========================

# Results-portal URL for each season (1 through 8), keyed by the season's
# folder name on disk.
BASE_URLS = dict(
    (f"Season_{number}", f"https://results.hyrox.com/season-{number}/")
    for number in range(1, 9)
)

# Folder that holds the scraped CSV files, relative to the working directory.
SAVE_ROOT = "Datasets\\Hyrox"

# ==========================
# DRIVER
# ==========================

def create_driver():
    """Return a headless Chrome WebDriver configured with a desktop user agent."""
    chrome_options = webdriver.ChromeOptions()

    for argument in (
        "--headless=new",
        "--start-maximized",
        # Fixed desktop Chrome user-agent string sent with every request.
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36",
    ):
        chrome_options.add_argument(argument)

    browser = webdriver.Chrome(options=chrome_options)
    return browser

# ==========================
# VERIFICATION
# ==========================

def verify_all_downloads():
    """Compare the CSV files on disk with the races/divisions listed online.

    For every season in ``BASE_URLS`` the season page is opened, each race is
    selected in turn, and its division options are read. A race is reported as
    missing when no local file starts with its sanitised name; a division is
    reported as missing when ``<race>_<division>.csv`` is absent from the
    season folder. Results are printed; nothing is returned.

    The browser is always closed, including when a Selenium call raises.
    """
    # Imported here because this cell's imports do not include it.
    from selenium.common.exceptions import NoSuchElementException

    def _safe(label):
        """Sanitise a dropdown label the same way the scraper names files."""
        return label.replace(" ", "_").replace("/", "-")

    print("\n==============================")
    print("VERIFYING HYROX DATASET")
    print("==============================")

    for season, base_url in BASE_URLS.items():

        print(f"\n========== CHECKING {season} ==========")

        season_folder = os.path.join(SAVE_ROOT, season)

        if not os.path.exists(season_folder):
            print(f"❌ Season folder missing: {season}")
            continue

        local_files = set(os.listdir(season_folder))

        missing_races = []
        missing_divisions = []

        driver = create_driver()
        try:
            driver.get(base_url)

            try:
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.ID, "default-lists-event_main_group"))
                )
            except TimeoutException:
                print(f"❌ Could not load season page: {season}")
                continue

            race_dropdown = Select(driver.find_element(By.ID, "default-lists-event_main_group"))
            races = [(opt.text.strip(), opt.get_attribute("value"))
                     for opt in race_dropdown.options]

            for race_name, race_value in races:

                safe_race = _safe(race_name)

                print(f"Checking race: {race_name}")

                # Reload so the division dropdown reflects only this race.
                driver.get(base_url)
                time.sleep(2)

                Select(driver.find_element(By.ID, "default-lists-event_main_group"))\
                    .select_by_value(race_value)
                time.sleep(2)

                try:
                    division_dropdown = Select(driver.find_element(By.ID, "default-lists-event"))
                    divisions = [opt.text.strip() for opt in division_dropdown.options]
                except NoSuchElementException:
                    # The scraper stores such races under a single "All"
                    # division, so expect that file instead of crashing.
                    divisions = ["All"]

                if not any(f.startswith(safe_race + "_") for f in local_files):
                    missing_races.append(race_name)

                race_missing_divs = [
                    div for div in divisions
                    if f"{safe_race}_{_safe(div)}.csv" not in local_files
                ]

                if race_missing_divs:
                    missing_divisions.append((race_name, race_missing_divs))
        finally:
            # Runs on success, on the timeout `continue`, and on errors alike.
            driver.quit()

        # REPORT
        if not missing_races:
            print("✅ No missing races.")
        else:
            print("\n❌ Missing races:")
            for r in missing_races:
                print(f"   - {r}")

        if not missing_divisions:
            print("✅ All divisions present.")
        else:
            print("\n⚠ Missing divisions:")
            for race, divs in missing_divisions:
                print(f"\n  {race}")
                for d in divs:
                    print(f"     - {d}")

        print(f"\n========== DONE {season} ==========")

    print("\n==============================")
    print("VERIFICATION COMPLETE")
    print("==============================")

# ==========================
# RUN
# ==========================

# Check every season in BASE_URLS; the report is printed, nothing is returned.
verify_all_downloads()

In [None]:
# ---------------------------------
# STEP 3: Validate Empty Files Coverage
# ---------------------------------

# This cell reads `division_files`, which is built by the "Group files by
# division" cell; run that cell first. The imports are repeated here so the
# cell does not depend on another cell's imports.
import re
from collections import defaultdict

import pandas as pd

print("\n============================")
print("EMPTY FILE VALIDATION")
print("============================")

# Trailing day/"overall" marker that distinguishes files of the same race+division.
_DAY_SUFFIX_RE = re.compile(r"_(OVERALL|SATURDAY|SUNDAY|FRIDAY|THURSDAY|WEDNESDAY)$")


def _base_name(file_stem):
    """Upper-case a file stem and drop its trailing day/overall marker."""
    return _DAY_SUFFIX_RE.sub("", file_stem.upper())


# Map base race+division -> files with data
coverage_map = defaultdict(list)
_empty_found_in_scan = []

for _files in division_files.values():

    for file_path, filename_without_ext, _season in _files:

        try:
            df = pd.read_csv(file_path)
        except Exception as exc:
            # Unreadable files cannot provide coverage; report and move on.
            print(f"WARNING: could not read {file_path}: {exc}")
            continue

        if df.empty:
            _empty_found_in_scan.append(filename_without_ext)
            continue

        coverage_map[_base_name(filename_without_ext)].append(filename_without_ext)

# `empty_files_detected` is not assigned in any cell visible in this notebook.
# Use it when an earlier cell defined it; otherwise fall back to the empty
# files found in the scan above instead of failing with a NameError.
try:
    empty_files_detected
except NameError:
    empty_files_detected = _empty_found_in_scan

problems_found = False

for empty_file in empty_files_detected:

    # .get() avoids creating new defaultdict entries for uncovered names.
    alternatives = coverage_map.get(_base_name(empty_file))

    if alternatives:
        print(f"✅ OK: {empty_file} is empty but covered by:")
        for alt in alternatives:
            print(f"   ↳ {alt}")
    else:
        print(f"❌ MISSING DATA: {empty_file} has no alternative dataset with data")
        problems_found = True

if not empty_files_detected:
    print("No empty files to validate.")

elif not problems_found:
    print("\nAll empty files are safely covered by alternative datasets.")

else:
    print("\n⚠ Some races/divisions are completely missing data.")

In [None]:
import os
import pandas as pd

# Walk the dataset folder and delete every CSV that contains no data rows.
root_folder = r"Datasets\Hyrox"

deleted_files = []   # paths removed because the CSV had no rows
failed_files = []    # (path, error message) for files that could not be handled

print("\n============================")
print("DELETING EMPTY CSV FILES")
print("============================")

for subdir, _, files in os.walk(root_folder):

    # Skip processed dataset folder
    if "processed dataset" in subdir.lower():
        continue

    for file in (name for name in files if name.lower().endswith(".csv")):

        file_path = os.path.join(subdir, file)

        try:
            if pd.read_csv(file_path).empty:
                os.remove(file_path)
                deleted_files.append(file_path)
                print(f"🗑 Deleted: {file_path}")

        except Exception as e:
            # Reading or deleting failed; record it for the summary below.
            failed_files.append((file_path, str(e)))

# ---------------------------------
# SUMMARY
# ---------------------------------
print("\n============================")
print("DELETION SUMMARY")
print("============================")

print(f"Total empty files deleted: {len(deleted_files)}")

if failed_files:
    print("\nFiles that could not be processed:")
    for path, error in failed_files:
        print(f"- {path} ({error})")

In [None]:
import os
import pandas as pd
import re
from collections import defaultdict

# Input root and the folder that receives the combined per-division CSVs.
root_folder = "Datasets\\Hyrox"
output_folder = os.path.join(root_folder, "Processed dataset")
os.makedirs(output_folder, exist_ok=True)

# division name -> list of (file path, file name without extension, season)
division_files = defaultdict(list)
# division name -> combined DataFrame
division_datasets = dict()

# Trailing file-name tokens that are not part of the division name.
suffix_words = set("overall saturday sunday friday thursday wednesday".split())

# ---------------------------------
# STEP 1: Group files by division
# ---------------------------------
def _division_from_stem(file_stem, ignored_suffixes):
    """Derive the normalised division name from a CSV file name (no extension).

    The division is everything from the LAST "hyrox" token onwards (the last
    occurrence is used so a "hyrox" earlier in the race name is ignored), with
    trailing tokens listed in ``ignored_suffixes`` removed. The result is
    upper-cased, "-" becomes "_", and repeated/trailing underscores are
    collapsed. Returns ``None`` when the name has no "hyrox" token.
    """
    parts = file_stem.split("_")
    hyrox_indices = [i for i, p in enumerate(parts) if p.lower() == "hyrox"]
    if not hyrox_indices:
        return None

    division_parts = parts[hyrox_indices[-1]:]

    # Remove unwanted suffixes (day names / "overall")
    while division_parts and division_parts[-1].lower() in ignored_suffixes:
        division_parts.pop()

    # Standardise formatting
    division = "_".join(division_parts).upper().replace("-", "_")
    division = re.sub(r"_+$", "", division)
    return re.sub(r"_+", "_", division)


for subdir, _, files in os.walk(root_folder):

    # Skip processed dataset folder
    if "processed dataset" in subdir.lower():
        continue

    season = os.path.basename(subdir)

    for file in files:

        if not file.lower().endswith(".csv"):
            continue

        # Strip exactly the trailing extension. This also handles ".CSV" and
        # names that contain ".csv" somewhere other than the end.
        filename_without_ext = file[:-len(".csv")]

        # Skip generic names if they exist
        if filename_without_ext.lower() in ("doubles", "singles", "hyrox_pro"):
            continue

        division = _division_from_stem(filename_without_ext, suffix_words)
        if division is None:
            continue

        file_path = os.path.join(subdir, file)

        division_files[division].append((file_path, filename_without_ext, season))


# ---------------------------------
# STEP 2: Create dataset per division
# ---------------------------------
# Combine all non-empty files of each division into one DataFrame and CSV.
for division, files in division_files.items():

    dfs = []

    for file_path, filename_without_ext, season in files:
        try:
            df = pd.read_csv(file_path)
        except Exception as exc:
            # Report the file instead of dropping it without a trace.
            print(f"WARNING: skipping unreadable file {file_path}: {exc}")
            continue

        if df.empty:
            continue   # Just skip empty files silently

        # Record where each row came from.
        df["source_file"] = filename_without_ext
        df["Season"] = season

        dfs.append(df)

    if not dfs:
        continue

    combined_df = pd.concat(dfs, ignore_index=True)

    output_path = os.path.join(output_folder, f"{division}.csv")
    combined_df.to_csv(output_path, index=False)

    division_datasets[division] = combined_df
    # Also expose the DataFrame as a module-level variable named after the
    # division. `division_datasets` is the reliable way to look it up, since
    # a division name is not guaranteed to be a valid identifier.
    globals()[division] = combined_df


# ---------------------------------
# FINAL SUMMARY
# ---------------------------------
# Print one line per combined dataset with its row count.
print("\n============================")
print("DATASETS CREATED")
print("============================")

if not division_datasets:
    print("No datasets created.")
else:
    for name in sorted(division_datasets):
        print(f"{name} → {len(division_datasets[name])} rows")