# Scrape up to 50 races concurrently (see MAX_THREADS), covering every available division of every configured season; a CSV is saved even for divisions with no results.

In [None]:
import time
import random
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# ==========================
# CONFIG
# ==========================

# Maps a season label ("Season_<n>") to that season's results landing page.
# To scrape seasons 1 through 8 instead of only season 8, use this variant:
#BASE_URLS = {f"Season_{i}": f"https://results.hyrox.com/season-{i}/" for i in range(1, 9)}
BASE_URLS = {f"Season_{i}": f"https://results.hyrox.com/season-{i}/" for i in range(8, 9)}
# Root output folder; one sub-folder per season is created beneath it.
# NOTE(review): the backslash separator looks Windows-specific — confirm before running elsewhere.
SAVE_ROOT = r"Datasets\Hyrox"
# Upper bound on worker threads. Each race task starts its own Chrome driver,
# so this is also the maximum number of simultaneous browsers.
MAX_THREADS = 50  # adjust for your system

def human_pause(a=2, b=5):
    """Block for a random duration drawn uniformly between ``a`` and ``b`` seconds."""
    delay_seconds = random.uniform(a, b)
    time.sleep(delay_seconds)

# ==========================
# SELENIUM DRIVER
# ==========================

def create_driver():
    """Return a new headless Chrome WebDriver with a fixed desktop user-agent."""
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    chrome_flags = (
        "--start-maximized",
        "--headless=new",
        f"user-agent={user_agent}",
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

# ==========================
# SCRAPE PAGES FUNCTION
# ==========================

def scrape_pages(driver, race_name, division, gender_label, race_results):
    """Scrape every result page reachable from the driver's current page.

    Starting from the result list already loaded in ``driver``, parse each
    result row, append it to ``race_results`` and follow the ``>`` pagination
    link until it is no longer present.

    Args:
        driver: Selenium WebDriver positioned on a submitted result list.
        race_name: Race label, stored in each row and used in log messages.
        division: Division label; if it contains "DOUBLES" (case-insensitive)
            rows are parsed as teams (joined member names, empty nation).
        gender_label: Gender label, stored in each row and used in log messages.
        race_results: List that is extended in place with one 9-item list per
            row: race, division, gender, overall rank, age-group rank, name,
            nation, age group, total time.

    Returns:
        True if at least one row was appended on ANY page, otherwise False.
        Stopping early on a later page (timeout, "no results" banner, no rows)
        still reports True when earlier pages produced rows.
    """
    is_doubles = "DOUBLES" in division.upper()
    page_number = 1
    # Accumulated across all pages so the return value does not depend on
    # whether the last page happened to yield rows.
    scraped_any = False

    while True:
        # Wait until either the "no results" banner or at least one data row
        # (more than just the header item) is present.
        try:
            WebDriverWait(driver, 25).until(lambda d:
                "There are currently no results available" in d.page_source
                or len(d.find_elements(By.CSS_SELECTOR, "li.list-group-item.row")) > 1
            )
        except TimeoutException:
            print(f"[{race_name}] Timeout waiting for {division} - {gender_label}")
            return scraped_any

        # No results?
        try:
            no_result_elem = driver.find_element(By.XPATH,
                "//*[contains(text(),'There are currently no results available')]"
            )
            if no_result_elem.is_displayed():
                print(f"[{race_name}] No results for {division} - {gender_label}")
                return scraped_any
        except NoSuchElementException:
            pass

        rows = driver.find_elements(By.CSS_SELECTOR, "li.list-group-item.row")
        rows = [r for r in rows if "list-group-header" not in r.get_attribute("class")]

        if not rows:
            print(f"[{race_name}] No rows found for {division} - {gender_label}")
            return scraped_any

        skipped = 0
        for row in rows:
            try:
                rank = row.find_element(By.CSS_SELECTOR, ".place-primary").text
                age_rank = row.find_element(By.CSS_SELECTOR, ".place-secondary").text
                total_time = row.find_element(By.CSS_SELECTOR, ".type-time").text.replace("Total", "").strip()
                age_group = row.find_element(By.CSS_SELECTOR, ".type-age_class").text.replace("Age Group", "").strip()

                if is_doubles:
                    members = row.find_elements(By.CSS_SELECTOR, ".type-relay_member a")
                    name = " & ".join(m.text for m in members)
                    nation = ""
                else:
                    name = row.find_element(By.CSS_SELECTOR, "h4.type-fullname").text
                    nation = row.find_element(By.CSS_SELECTOR, ".nation__abbr").text
            except Exception:
                # Best effort: a row missing an expected element is skipped,
                # but KeyboardInterrupt/SystemExit are no longer swallowed.
                skipped += 1
                continue

            race_results.append([
                race_name, division, gender_label,
                rank, age_rank, name, nation,
                age_group, total_time
            ])
            scraped_any = True

        print(f"[{race_name}] Page {page_number} scraped for {division} - {gender_label}")
        if skipped:
            print(f"[{race_name}] Page {page_number}: skipped {skipped} unparsable row(s)")
        page_number += 1

        # Follow the "next page" link; its absence marks the last page.
        try:
            next_button = driver.find_element(By.XPATH, "//a[text()='>']")
        except NoSuchElementException:
            break
        driver.execute_script("arguments[0].click();", next_button)
        human_pause(2, 5)

    return scraped_any

# ==========================
# SCRAPE RACE FUNCTION
# ==========================

def _to_file_stem(label):
    """Return ``label`` with spaces turned into ``_`` and ``/`` into ``-``."""
    return label.replace(" ", "_").replace("/", "-")


def scrape_race(season, base_url, race_name, race_value):
    """Scrape all divisions and genders of one race and write one CSV per division.

    A dedicated Chrome driver is created for the race and is always closed,
    including when scraping raises. If a CSV already exists for every division
    the race is skipped; otherwise every division is scraped and its CSV is
    (re)written, even when the division has no data.

    Args:
        season: Season label; used as the sub-folder name under ``SAVE_ROOT``.
        base_url: Results landing page of the season.
        race_name: Visible race name; used in log messages and file names.
        race_value: Option value of the race in the
            ``default-lists-event_main_group`` dropdown.

    Returns:
        A status string: either "... Skipped (all divisions exist)" or
        "... Finished scraping".
    """
    season_folder = os.path.join(SAVE_ROOT, season)
    os.makedirs(season_folder, exist_ok=True)
    safe_name = _to_file_stem(race_name)

    def csv_path(div):
        # One CSV per (race, division) inside the season folder.
        return os.path.join(season_folder, f"{safe_name}_{_to_file_stem(div)}.csv")

    driver = create_driver()
    # try/finally guarantees the browser is closed even if Selenium raises;
    # otherwise every failed race would leave a Chrome process behind.
    try:
        print(f"\n[{race_name}] Starting scraping")

        driver.get(base_url)
        human_pause(2, 5)
        Select(driver.find_element(By.ID, "default-lists-event_main_group")).select_by_value(race_value)
        human_pause(1, 3)

        # Get all divisions
        select_division = Select(driver.find_element(By.ID, "default-lists-event"))
        divisions = [o.text for o in select_division.options]

        # Check if all division CSVs exist; if yes, skip the race entirely
        if all(os.path.exists(csv_path(div)) for div in divisions):
            print(f"[{race_name}] All division CSVs exist. Skipping race.")
            return f"[{race_name}] Skipped (all divisions exist)"

        for div in divisions:
            file_path = csv_path(div)
            if os.path.exists(file_path):
                print(f"[{race_name}] {div} already exists, will check for data")

            print(f"[{race_name}] Scraping division: {div}")

            # Reload and select race + division so the gender dropdown
            # reflects this division's options.
            driver.get(base_url)
            human_pause(2, 4)
            Select(driver.find_element(By.ID, "default-lists-event_main_group")).select_by_value(race_value)
            human_pause(1, 2)
            Select(driver.find_element(By.ID, "default-lists-event")).select_by_visible_text(div)
            human_pause(1, 2)

            try:
                gender_dropdown = Select(driver.find_element(By.ID, "default-lists-sex"))
                genders = [(o.get_attribute("value"), o.text) for o in gender_dropdown.options]
            except NoSuchElementException:
                # No gender filter on this page: scrape once without selecting one.
                genders = [("", "All")]

            division_results = []
            division_has_data = False

            for gender_code, gender_label in genders:
                print(f"[{race_name}] Scraping gender: {gender_label} in {div}")

                # Start each gender from a fresh form so pagination state
                # from the previous query cannot carry over.
                driver.get(base_url)
                human_pause(2, 4)
                Select(driver.find_element(By.ID, "default-lists-event_main_group")).select_by_value(race_value)
                human_pause(1, 2)
                Select(driver.find_element(By.ID, "default-lists-event")).select_by_visible_text(div)
                human_pause(1, 2)
                if gender_code:
                    Select(driver.find_element(By.ID, "default-lists-sex")).select_by_value(gender_code)
                Select(driver.find_element(By.ID, "default-num_results")).select_by_value("100")
                human_pause(1, 2)
                driver.find_element(By.ID, "default-submit").click()
                human_pause(2, 4)

                if scrape_pages(driver, race_name, div, gender_label, division_results):
                    division_has_data = True

            # Save CSV even if empty (no data)
            df = pd.DataFrame(
                division_results,
                columns=["Race", "Division", "Gender",
                         "Rank Overall", "Rank Age Group",
                         "Name", "Nation", "Age Group", "Total Time"]
            )
            df.to_csv(file_path, index=False)
            if division_has_data:
                print(f"[{race_name}] Saved {len(df)} rows for {div}")
            else:
                print(f"[{race_name}] Division {div} had no data. Empty CSV saved.")
    finally:
        driver.quit()

    return f"[{race_name}] Finished scraping"

# ==========================
# MAIN EXECUTION
# ==========================

# Phase 1: for each season, open the landing page once and collect the
# (name, value) pair of every option in the race dropdown.
all_tasks = []

for season, base_url in BASE_URLS.items():
    driver_main = create_driver()
    # try/finally closes the discovery browser on every path, including the
    # timeout `continue` and any unexpected Selenium error.
    try:
        driver_main.get(base_url)
        human_pause(2, 4)
        try:
            WebDriverWait(driver_main, 20).until(
                EC.presence_of_element_located((By.ID, "default-lists-event_main_group"))
            )
        except TimeoutException:
            print(f"[{season}] Race dropdown did not load; skipping season.")
            continue

        race_dropdown = Select(driver_main.find_element(By.ID, "default-lists-event_main_group"))
        # Read the option list once instead of re-querying it per index.
        races = [(option.text, option.get_attribute("value")) for option in race_dropdown.options]
    finally:
        driver_main.quit()

    all_tasks.extend((season, base_url, race_name, race_value) for race_name, race_value in races)

# Phase 2: scrape the races concurrently, one task (and one browser) per race.
with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    futures = [executor.submit(scrape_race, *task) for task in all_tasks]
    for task, f in zip(all_tasks, futures):
        # A failure in one race must not hide the outcome of the others.
        try:
            print(f.result())
        except Exception as exc:
            print(f"[{task[2]}] Failed: {exc!r}")

print("\nALL DONE.")


[2026 Phoenix] Starting scraping

[2026 Turin] Starting scraping

[2026 Bilbao] Starting scraping

[2026 Istanbul] Starting scraping

[2025 Vancouver] Starting scraping
[2026 Vienna] Starting scraping

[2026 Guadalajara] Starting scraping

[2026 Nice] Starting scraping

[2026 Las Vegas] Starting scraping

[2026 Amsterdam - Youngstars] Starting scraping

[2026 Washington DC] Starting scraping

[2025 London Excel] Starting scraping

[2025 Shenzhen] Starting scraping

[2025 Stockholm] Starting scraping

[2026 Fortaleza] Starting scraping

[2026 Osaka] Starting scraping

[2026 Katowice] Starting scraping

[2025 Melbourne] Starting scraping


[2025 Utrecht] Starting scraping

[2026 Amsterdam] Starting scraping

[2026 Taipei] Starting scraping

[2025 Anaheim] Starting scraping

[2025 Verona] Starting scraping

[2026 St. Gallen] Starting scraping

[2025 Frankfurt] Starting scraping

[2025 Boston] Starting scraping

[2025 Madrid] Starting scraping

[2026 Auckland] Starting scraping

[2026 Man