In [None]:
import time
import random
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# ==========================
# CONFIG
# ==========================

# Results landing page for each season, keyed by the folder name used on disk.
BASE_URLS = {
    "Season_1": "https://results.hyrox.com/season-1/",
    "Season_2": "https://results.hyrox.com/season-2/",
    "Season_3": "https://results.hyrox.com/season-3/",
    "Season_4": "https://results.hyrox.com/season-4/",
    "Season_5": "https://results.hyrox.com/season-5/",
}

# Root output directory. Built with os.path.join so the separator matches the
# host OS; a literal backslash would become part of a single directory name on
# POSIX systems instead of producing Datasets/Hyrox.
SAVE_ROOT = os.path.join("Datasets", "Hyrox")
MAX_THREADS = 20  # number of concurrent races

def human_pause(a=3, b=7):
    """Block the calling thread for a random delay between ``a`` and ``b`` seconds.

    The delay is drawn with ``random.uniform(a, b)``.
    """
    delay_seconds = random.uniform(a, b)
    time.sleep(delay_seconds)

# ==========================
# SELENIUM DRIVER
# ==========================

def create_driver():
    """Return a new Chrome WebDriver configured for headless scraping.

    The browser is started with the maximized-window flag, the "new" headless
    mode, and a fixed desktop Chrome user-agent string.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    chrome_flags = (
        "--start-maximized",
        "--headless=new",  # headless for parallel threads
        "user-agent=" + user_agent,
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

# ==========================
# SCRAPE FUNCTION
# ==========================

def scrape_pages(driver, race_name, division, gender_label, race_results):
    """Scrape every result page for one race/division/gender combination.

    The driver must already be showing the first page of the submitted result
    list. Each parsed row is appended to ``race_results`` in place as
    ``[race, division, gender, rank, age rank, name, nation, age group, total
    time]``; for divisions whose name contains "DOUBLES" the name column holds
    the member names joined by " & " and the nation column is left empty.

    Args:
        driver: Selenium WebDriver positioned on the result list.
        race_name: Race label, stored in each row and used in log output.
        division: Division label; also decides singles vs. doubles parsing.
        gender_label: Gender label stored in each row.
        race_results: List that receives the scraped rows (mutated in place).

    Returns:
        True if at least one row was appended on any page and pagination ran
        to the end. False if a page timed out, showed the "no results"
        message, or contained no data rows -- note that rows from earlier
        pages stay in ``race_results`` in that case.
    """
    is_doubles = "DOUBLES" in division.upper()
    page_number = 1
    # Tracked across ALL pages: a last page whose rows all fail to parse must
    # not erase the fact that earlier pages produced data.
    scraped_any = False

    while True:
        try:
            # Ready when either the "no results" banner is present or there is
            # more than one list row (the header row alone does not count).
            WebDriverWait(driver, 25).until(lambda d:
                "There are currently no results available" in d.page_source
                or len(d.find_elements(By.CSS_SELECTOR, "li.list-group-item.row")) > 1
            )
        except TimeoutException:
            print(f"[{race_name}] Timeout waiting for results in {division} - {gender_label}")
            return False

        # Check no-results message
        try:
            no_result_elem = driver.find_element(By.XPATH,
                "//*[contains(text(),'There are currently no results available')]"
            )
            if no_result_elem.is_displayed():
                print(f"[{race_name}] No results for {division} - {gender_label}")
                return False
        except NoSuchElementException:
            pass

        rows = driver.find_elements(By.CSS_SELECTOR, "li.list-group-item.row")
        # get_attribute may return None when the attribute is absent; treat
        # that as an empty class list instead of raising on the `in` test.
        rows = [
            r for r in rows
            if "list-group-header" not in (r.get_attribute("class") or "")
        ]

        if not rows:
            print(f"[{race_name}] No rows found for {division} - {gender_label}")
            return False

        skipped_rows = 0
        for row in rows:
            try:
                rank = row.find_element(By.CSS_SELECTOR, ".place-primary").text
                age_rank = row.find_element(By.CSS_SELECTOR, ".place-secondary").text
                total_time = row.find_element(By.CSS_SELECTOR, ".type-time").text.replace("Total", "").strip()
                age_group = row.find_element(By.CSS_SELECTOR, ".type-age_class").text.replace("Age Group", "").strip()

                if is_doubles:
                    members = row.find_elements(By.CSS_SELECTOR, ".type-relay_member a")
                    member_names = " & ".join(m.text for m in members)
                    race_results.append([
                        race_name, division, gender_label,
                        rank, age_rank, member_names, "",
                        age_group, total_time
                    ])
                else:
                    name = row.find_element(By.CSS_SELECTOR, "h4.type-fullname").text
                    nation = row.find_element(By.CSS_SELECTOR, ".nation__abbr").text
                    race_results.append([
                        race_name, division, gender_label,
                        rank, age_rank, name, nation,
                        age_group, total_time
                    ])
                scraped_any = True
            except Exception:
                # Best effort: a malformed or stale row is skipped, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                skipped_rows += 1
                continue

        if skipped_rows:
            print(f"[{race_name}] Skipped {skipped_rows} unparsable row(s) on page {page_number} for {division} - {gender_label}")

        print(f"[{race_name}] Page {page_number} scraped for {division} - {gender_label}")
        page_number += 1

        # Go to next page; the absence of a ">" link marks the last page.
        try:
            next_button = driver.find_element(By.XPATH, "//a[text()='>']")
            # JavaScript click avoids failures when the link is off-screen.
            driver.execute_script("arguments[0].click();", next_button)
            human_pause(3, 6)
        except NoSuchElementException:
            break

    return scraped_any

# ==========================
# RACE SCRAPER (THREAD TARGET)
# ==========================

def scrape_race(season, base_url, race_name, race_value):
    """Scrape the HYROX PRO and HYROX PRO DOUBLES divisions of one race.

    Intended as a thread-pool target: it owns its own WebDriver, which is
    always shut down on exit, including when an exception propagates. One CSV
    per division is written to ``SAVE_ROOT/<season>/<race>_<division>.csv``;
    divisions whose CSV already exists are skipped.

    Args:
        season: Season key, used as the output sub-folder name.
        base_url: Results landing page of that season.
        race_name: Visible race label, used for file names and log output.
        race_value: ``value`` attribute of the race in the event dropdown.

    Returns:
        A "finished" status string for the race.

    Raises:
        Any Selenium or I/O exception raised while scraping is propagated to
        the caller after the driver has been closed.
    """
    season_folder = os.path.join(SAVE_ROOT, season)
    os.makedirs(season_folder, exist_ok=True)
    safe_name = race_name.replace(" ", "_").replace("/", "-")

    driver = create_driver()
    # try/finally guarantees the browser process is released even if a lookup
    # or navigation step raises; otherwise every failing thread leaks a Chrome.
    try:
        print(f"\n[{race_name}] Starting race scraping")

        driver.get(base_url)
        human_pause(4, 6)
        Select(driver.find_element(By.ID, "default-lists-event_main_group")).select_by_value(race_value)
        human_pause(2, 4)
        select_division = Select(driver.find_element(By.ID, "default-lists-event"))
        available = [o.text for o in select_division.options]

        def open_division(div):
            """Reload the season page and select this race and ``div``."""
            driver.get(base_url)
            human_pause(3, 6)
            Select(driver.find_element(By.ID, "default-lists-event_main_group")).select_by_value(race_value)
            human_pause(2, 4)
            Select(driver.find_element(By.ID, "default-lists-event")).select_by_visible_text(div)
            human_pause(2, 4)

        def build_priority(base_name):
            """Order matching divisions: exact name, "- Overall", then the rest."""
            priority = []
            if base_name in available:
                priority.append(base_name)
            overall = f"{base_name} - Overall"
            if overall in available:
                priority.append(overall)
            other = [d for d in available if d.startswith(f"{base_name} -") and d not in priority]
            priority.extend(other)
            return priority

        def process_group(base_name):
            """Scrape and save every available division of the ``base_name`` family."""
            priority_list = build_priority(base_name)
            if not priority_list:
                print(f"[{race_name}] No {base_name} divisions available")
                return

            for div in priority_list:
                div_safe = div.replace(" ", "_").replace("/", "-")
                file_path = os.path.join(season_folder, f"{safe_name}_{div_safe}.csv")
                if os.path.exists(file_path):
                    print(f"[{race_name}] {div} already scraped")
                    continue

                print(f"[{race_name}] Trying {div}")
                is_doubles = "DOUBLES" in div.upper()
                genders = [("M","Male"),("W","Women")] if not is_doubles else [("M","Male"),("W","Female"),("X","Mixed")]

                division_results = []
                division_has_data = False

                open_division(div)

                # The gender dropdown may be missing for some divisions.
                try:
                    gender_dropdown = Select(driver.find_element(By.ID, "default-lists-sex"))
                    available_gender_values = [o.get_attribute("value") for o in gender_dropdown.options]
                except NoSuchElementException:
                    available_gender_values = []

                valid_genders = [(code, label) for code, label in genders if code in available_gender_values]
                if not valid_genders:
                    print(f"[{race_name}] {div} has no gender categories")
                    continue

                for idx, (gender_code, gender_label) in enumerate(valid_genders):
                    print(f"[{race_name}] Scraping gender: {gender_label} in {div}")
                    # Start from a fresh page for each gender so the form
                    # state left by the previous search cannot interfere.
                    open_division(div)
                    Select(driver.find_element(By.ID, "default-lists-sex")).select_by_value(gender_code)
                    Select(driver.find_element(By.ID, "default-num_results")).select_by_value("100")
                    human_pause(2, 4)
                    driver.find_element(By.ID, "default-submit").click()
                    human_pause(5, 10)

                    has_data = scrape_pages(driver, race_name, div, gender_label, division_results)
                    # An empty first gender is taken to mean the whole
                    # division is empty, so the remaining genders are skipped.
                    if idx == 0 and not has_data:
                        print(f"[{race_name}] {div} had NO DATA")
                        division_has_data = False
                        break
                    if has_data:
                        division_has_data = True

                if division_has_data:
                    df = pd.DataFrame(
                        division_results,
                        columns=["Race","Division","Gender",
                                 "Rank Overall","Rank Age Group",
                                 "Name","Nation","Age Group","Total Time"]
                    )
                    df.to_csv(file_path, index=False)
                    print(f"[{race_name}] Saved {len(df)} rows for {div}")

        process_group("HYROX PRO")
        process_group("HYROX PRO DOUBLES")
    finally:
        driver.quit()

    return f"[{race_name}] Finished scraping"

# ==========================
# MAIN LOOP
# ==========================

# Collect one (season, base_url, race_name, race_value) task per race listed
# in each season's event dropdown.
all_tasks = []
for season, base_url in BASE_URLS.items():
    driver_main = create_driver()
    # finally also runs on `continue`, so the discovery browser is always
    # closed -- including when the dropdown never appears.
    try:
        driver_main.get(base_url)
        human_pause(4, 6)
        try:
            WebDriverWait(driver_main, 25).until(
                EC.presence_of_element_located((By.ID, "default-lists-event_main_group"))
            )
        except TimeoutException:
            print(f"[{season}] Timeout waiting for race dropdown, skipping season")
            continue
        race_dropdown = Select(driver_main.find_element(By.ID, "default-lists-event_main_group"))
        # Read the option list once instead of re-querying it per index.
        races = [
            (option.text, option.get_attribute("value"))
            for option in race_dropdown.options
        ]
    finally:
        driver_main.quit()
    all_tasks.extend(
        (season, base_url, race_name, race_value)
        for race_name, race_value in races
    )

# Threaded execution
with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    futures = {executor.submit(scrape_race, *task): task for task in all_tasks}
    for future, task in futures.items():
        # Report a failed race and keep going rather than letting one
        # exception abort the reporting of every remaining race.
        try:
            print(future.result())
        except Exception as exc:
            print(f"[{task[2]}] FAILED: {exc!r}")

print("\nALL DONE.")


[2019 Oberhausen] Starting race scraping

[2019 Karlsruhe] Starting race scraping

[2020 Dallas] Starting race scraping

[2018 Essen] Starting race scraping

[2019 Wien] Starting race scraping

[2018 Wien] Starting race scraping
[2020 Karlsruhe] Starting race scraping

[2018 Stuttgart] Starting race scraping


[2019 Nürnberg] Starting race scraping

[World Championships] Starting race scraping

[2020 Hannover] Starting race scraping

[2019 Frankfurt] Starting race scraping

[2019 Hannover] Starting race scraping

[2019 New York] Starting race scraping

[2018 Leipzig] Starting race scraping

[2020 Chicago] Starting race scraping

[2020 Elite 12] Starting race scraping

[2019 Hamburg] Starting race scraping

[2019 Essen] Starting race scraping

[2018 Hamburg] Starting race scraping
[2020 Dallas] Trying HYROX PRO
[2019 Karlsruhe] Trying HYROX PRO
[2019 Oberhausen] Trying HYROX PRO
[2019 Frankfurt] Trying HYROX PRO
[2019 New York] Trying HYROX PRO
[2020 Elite 12] No HYROX PRO divisions av