In [None]:
import time
import random
import pandas as pd
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# ==========================
# CONFIG
# ==========================

# Season label -> landing page of that season's results site.
# The label is also used as the name of the per-season output folder.
BASE_URLS = {
    "Season_7": "https://results.hyrox.com/season-7/?pid=start",
    "Season_8": "https://results.hyrox.com/season-8/",
}

# Root folder for the generated CSV files. Built with os.path.join so the
# separator matches the host OS (a literal backslash would create a single
# directory named "Datasets\Hyrox" on non-Windows systems).
SAVE_ROOT = os.path.join("Datasets", "Hyrox")

# Division names to scrape; compared verbatim against the visible text of the
# options in the site's division dropdown.
DIVISION_OPTIONS = [
    "HYROX PRO",
    "HYROX PRO - Overall",
    "HYROX PRO DOUBLES",
    "HYROX PRO DOUBLES - Overall",
]

def human_pause(a=3, b=7):
    """Block for a random duration so requests are not evenly spaced.

    Args:
        a: One bound of the delay, in seconds.
        b: The other bound of the delay, in seconds.

    The delay is drawn with ``random.uniform(a, b)``.
    """
    delay_seconds = random.uniform(a, b)
    time.sleep(delay_seconds)

# ==========================
# DRIVER
# ==========================

# Chrome launch options: open the window maximized and send a fixed desktop
# Chrome user-agent string instead of the driver's default one.
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Single browser session shared by the whole script.
driver = webdriver.Chrome(options=options)

# Shared explicit wait: gives up with TimeoutException after 25 seconds.
wait = WebDriverWait(driver, timeout=25)

# ==========================
# SCRAPE PAGES FUNCTION
# ==========================

def _parse_result_row(row, is_doubles):
    """Read one result ``<li>`` row and return its fields.

    Args:
        row: Selenium element for a single result row.
        is_doubles: When true, the "name" is the " & "-joined text of the
            relay-member links and the nation is left empty.

    Returns:
        A list ``[rank, age_rank, name, nation, age_group, total_time]``.

    Raises:
        NoSuchElementException: If a required cell is missing from the row.
    """
    rank = row.find_element(By.CSS_SELECTOR, ".place-primary").text
    age_rank = row.find_element(By.CSS_SELECTOR, ".place-secondary").text
    # The cell text includes its own label ("Total" / "Age Group"); strip it.
    total_time = row.find_element(By.CSS_SELECTOR, ".type-time").text.replace("Total", "").strip()
    age_group = row.find_element(By.CSS_SELECTOR, ".type-age_class").text.replace("Age Group", "").strip()

    if is_doubles:
        members = row.find_elements(By.CSS_SELECTOR, ".type-relay_member a")
        name = " & ".join([m.text for m in members])
        nation = ""
    else:
        name = row.find_element(By.CSS_SELECTOR, "h4.type-fullname").text
        nation = row.find_element(By.CSS_SELECTOR, ".nation__abbr").text

    return [rank, age_rank, name, nation, age_group, total_time]


def scrape_pages(race_name, division, gender, race_results):
    """Scrape all pages of the result list currently open in the browser.

    Uses the module-level ``driver`` and ``wait``. The result list for the
    wanted race/division/gender must already have been submitted; this
    function reads the rows of each page and follows the ">" pagination link
    until it is no longer present.

    Args:
        race_name: Race label stored in the first column of every row.
        division: Division label stored in every row. If it contains
            "DOUBLES" (case-insensitive) rows are parsed as teams.
        gender: Gender label stored in every row; its first character selects
            the ``div[data-sex='…']`` results container.
        race_results: List that is extended in place with one 9-item list per
            row: race, division, gender, overall rank, age-group rank,
            name(s), nation, age group, total time.

    Returns:
        None. Returns early (after printing a message) if the results
        container or its rows cannot be found.
    """
    # Local import so this function does not depend on the file's import block
    # being edited; selenium is already a dependency of this script.
    from selenium.common.exceptions import StaleElementReferenceException

    container_selector = f"div[data-sex='{gender[0]}']"
    is_doubles = "DOUBLES" in division.upper()
    page_number = 1

    while True:
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, container_selector)))
        except TimeoutException:
            print("      No results container found")
            return

        human_pause(2, 5)

        # Look the container up again after the pause rather than reusing the
        # element returned by the wait.
        try:
            container = driver.find_element(By.CSS_SELECTOR, container_selector)
        except NoSuchElementException:
            print("      No container for this gender")
            return

        rows = container.find_elements(By.CSS_SELECTOR, "li.list-group-item.row")
        if not rows:
            print("      No rows found in container")
            return

        skipped_rows = 0
        for row in rows:
            # Header rows share the row selector but carry no result data.
            if "list-group-header" in (row.get_attribute("class") or ""):
                continue
            try:
                fields = _parse_result_row(row, is_doubles)
            except (NoSuchElementException, StaleElementReferenceException):
                # Best effort: a malformed or re-rendered row is skipped, but
                # only for these expected lookup failures - anything else
                # (including Ctrl+C) propagates instead of being swallowed.
                skipped_rows += 1
                continue
            race_results.append([race_name, division, gender, *fields])

        print(f"      Page {page_number} scraped")
        if skipped_rows:
            print(f"      Skipped {skipped_rows} unparseable row(s) on page {page_number}")
        human_pause(3, 6)

        try:
            next_button = driver.find_element(By.XPATH, "//a[text()='>']")
        except NoSuchElementException:
            print("      Last page reached")
            break

        # Click through JavaScript rather than WebElement.click().
        driver.execute_script("arguments[0].click();", next_button)
        page_number += 1
        human_pause(5, 10)

# ==========================
# MAIN LOOP
# ==========================

# Column order matches the 9-item rows appended by scrape_pages.
columns = ["Race", "Division", "Gender", "Rank Overall", "Rank Age Group",
           "Name", "Nation", "Age Group", "Total Time"]

# (value of the option in the gender dropdown, label stored in the CSV)
genders = [("M", "Men"), ("W", "Women")]

# For every season -> race -> division, scrape both genders and write one CSV
# per division. Divisions whose CSV already exists are skipped, so the script
# can be re-run to resume after an interruption.
try:
    for season, base_url in BASE_URLS.items():
        print(f"\n==============================\nSTARTING {season}\n==============================\n")
        season_folder = os.path.join(SAVE_ROOT, season)
        os.makedirs(season_folder, exist_ok=True)

        driver.get(base_url)
        human_pause(5, 10)

        try:
            wait.until(EC.presence_of_element_located((By.ID, "default-lists-event_main_group")))
        except TimeoutException:
            print(f"No races found for {season}")
            continue

        # Snapshot (text, value) pairs now: the page is reloaded for every
        # race below, so the option elements themselves would not stay usable.
        race_dropdown = Select(driver.find_element(By.ID, "default-lists-event_main_group"))
        races = [(opt.text, opt.get_attribute("value")) for opt in race_dropdown.options]
        print(f"Found {len(races)} races in {season}")

        for race_name, race_value in races:
            safe_name = race_name.replace(" ", "_").replace("/", "-")
            print(f"\nProcessing race: {race_name}")

            # ==========================
            # Check available divisions
            # ==========================
            driver.get(base_url)
            human_pause(5, 9)
            Select(driver.find_element(By.ID, "default-lists-event_main_group")).select_by_value(race_value)
            human_pause(3, 6)
            select_division = Select(driver.find_element(By.ID, "default-lists-event"))
            available_divisions = [o.text for o in select_division.options]
            valid_divisions = [d for d in DIVISION_OPTIONS if d in available_divisions]

            if not valid_divisions:
                print("   No valid HYROX PRO divisions found for this race")
                continue

            for div in valid_divisions:
                div_safe_name = div.replace(" ", "_").replace("/", "-")
                file_path_div = os.path.join(season_folder, f"{safe_name}_{div_safe_name}.csv")

                if os.path.exists(file_path_div):
                    print(f"   Division CSV already exists: {file_path_div}, skipping division")
                    continue  # skip this division entirely

                print(f"   Division: {div}")
                division_results = []

                for gender_code, gender_name in genders:
                    print(f"      Gender: {gender_name}")

                    # Go back to base page and re-select race + division
                    driver.get(base_url)
                    human_pause(4, 8)
                    Select(driver.find_element(By.ID, "default-lists-event_main_group")).select_by_value(race_value)
                    human_pause(2, 5)
                    Select(driver.find_element(By.ID, "default-lists-event")).select_by_visible_text(div)
                    human_pause(2, 5)

                    # Select gender and number of results
                    Select(driver.find_element(By.ID, "default-lists-sex")).select_by_value(gender_code)
                    Select(driver.find_element(By.ID, "default-num_results")).select_by_value("100")
                    human_pause(2, 5)

                    # Submit form
                    driver.find_element(By.ID, "default-submit").click()
                    human_pause(5, 10)

                    # Scrape results (rows are appended to division_results)
                    scrape_pages(race_name, div, gender_name, division_results)
                    human_pause(2, 5)

                if not division_results:
                    # Do not write an empty CSV: the "already exists" check
                    # above would then treat this division as done and never
                    # retry it on a later run.
                    print(f"   No rows scraped for {div}; not writing {file_path_div}")
                    continue

                # Save CSV after both genders are scraped
                df_div = pd.DataFrame(division_results, columns=columns)
                df_div.to_csv(file_path_div, index=False)
                print(f"   Saved {len(df_div)} rows to {file_path_div}")
                human_pause(3, 6)

    print("\nALL DONE.")
finally:
    # Always close the browser, even if scraping aborts with an exception,
    # so no Chrome/chromedriver process is left running.
    driver.quit()


STARTING Season_7

Found 73 races in Season_7

Processing race: 2025 Shanghai
   Division CSV already exists: Datasets\Hyrox\Season_7\2025_Shanghai_HYROX_PRO.csv, skipping division
   Division CSV already exists: Datasets\Hyrox\Season_7\2025_Shanghai_HYROX_PRO_DOUBLES.csv, skipping division

Processing race: 2025 Atlanta
   Division CSV already exists: Datasets\Hyrox\Season_7\2025_Atlanta_HYROX_PRO_-_Overall.csv, skipping division
   Division CSV already exists: Datasets\Hyrox\Season_7\2025_Atlanta_HYROX_PRO_DOUBLES_-_Overall.csv, skipping division

Processing race: 2025 Valencia
   Division CSV already exists: Datasets\Hyrox\Season_7\2025_Valencia_HYROX_PRO_-_Overall.csv, skipping division
   Division CSV already exists: Datasets\Hyrox\Season_7\2025_Valencia_HYROX_PRO_DOUBLES_-_Overall.csv, skipping division

Processing race: 2025 Maastricht
   Division CSV already exists: Datasets\Hyrox\Season_7\2025_Maastricht_HYROX_PRO.csv, skipping division
   Division CSV already exists: Dataset