In [None]:
import time
import random
import pandas as pd
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# ==========================
# CONFIG
# ==========================

# Results site entry page for each season. The keys double as the names of
# the per-season sub-folders created under SAVE_ROOT by the main loop.
BASE_URLS = {
    "Season_7": "https://results.hyrox.com/season-7/",
    "Season_8": "https://results.hyrox.com/season-8/"
}

# Root output folder, relative to the current working directory.
# NOTE(review): the backslash is a Windows path separator; on other
# platforms this would name a single directory containing a backslash.
SAVE_ROOT = r"Datasets\Hyrox"

def human_pause(a=3, b=7):
    """Block for a random duration drawn uniformly between *a* and *b* seconds.

    Used between browser actions so requests are not fired at a fixed,
    machine-like cadence.
    """
    delay_seconds = random.uniform(a, b)
    time.sleep(delay_seconds)

# ==========================
# DRIVER
# ==========================

# Chrome is started maximised and with a fixed desktop user-agent string
# instead of the browser's default one.
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Module-level browser session and explicit-wait helper (25 s timeout).
# Both are used as globals by scrape_pages() and by the main loop below.
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 25)

# ==========================
# SCRAPE FUNCTION
# ==========================

def _parse_result_row(row, is_doubles):
    """Extract the per-athlete fields from one result-list row element.

    Returns ``[rank, age_rank, name, nation, age_group, total_time]``.
    For doubles divisions ``name`` is the team members joined with " & "
    and ``nation`` is an empty string. Any Selenium lookup error is left
    to propagate so the caller can decide to skip the row.
    """
    rank = row.find_element(By.CSS_SELECTOR, ".place-primary").text
    age_rank = row.find_element(By.CSS_SELECTOR, ".place-secondary").text
    # The cell text includes its column label; strip it to keep only the value.
    total_time = (
        row.find_element(By.CSS_SELECTOR, ".type-time")
        .text.replace("Total", "")
        .strip()
    )
    age_group = (
        row.find_element(By.CSS_SELECTOR, ".type-age_class")
        .text.replace("Age Group", "")
        .strip()
    )

    if is_doubles:
        members = row.find_elements(By.CSS_SELECTOR, ".type-relay_member a")
        name = " & ".join(m.text for m in members)
        nation = ""
    else:
        name = row.find_element(By.CSS_SELECTOR, "h4.type-fullname").text
        nation = row.find_element(By.CSS_SELECTOR, ".nation__abbr").text

    return [rank, age_rank, name, nation, age_group, total_time]


def scrape_pages(race_name, division, gender_label, race_results):
    """Scrape the result rows currently shown in the browser.

    Relies on the module-level ``driver`` and ``wait`` objects; the caller
    is expected to have already submitted the results form.

    NOTE(review): despite the name, only the rows of the page that is
    currently loaded are read; this function does not follow pagination.

    Args:
        race_name: Race label copied into every output row.
        division: Division label; a name containing "DOUBLES"
            (case-insensitive) switches to team parsing.
        gender_label: Gender label copied into every output row.
        race_results: List that parsed rows are appended to in place. Each
            row is ``[race, division, gender, rank, age_rank, name, nation,
            age_group, total_time]``.

    Returns:
        True if at least one row was appended, otherwise False (timeout,
        visible "no results" message, or no parsable rows).
    """
    # Function-scope import keeps this block self-contained.
    # WebDriverException is the base class of Selenium's element errors
    # (missing element, stale element, ...).
    from selenium.common.exceptions import WebDriverException

    no_results_text = "There are currently no results available"
    row_selector = "li.list-group-item.row"
    is_doubles = "DOUBLES" in division.upper()

    try:
        # Wait for either the no-results message or a populated list
        # (more than one row, i.e. something besides the header row).
        wait.until(
            lambda d: no_results_text in d.page_source
            or len(d.find_elements(By.CSS_SELECTOR, row_selector)) > 1
        )
    except TimeoutException:
        return False

    # The message can be present in the page but hidden, so only a
    # *visible* no-results element counts.
    try:
        no_result_elem = driver.find_element(
            By.XPATH, f"//*[contains(text(),'{no_results_text}')]"
        )
        if no_result_elem.is_displayed():
            return False
    except NoSuchElementException:
        pass

    rows = [
        r
        for r in driver.find_elements(By.CSS_SELECTOR, row_selector)
        if "list-group-header" not in r.get_attribute("class")
    ]
    if not rows:
        return False

    scraped = 0
    skipped = 0
    for row in rows:
        try:
            fields = _parse_result_row(row, is_doubles)
        except WebDriverException:
            # Best effort: a row missing an expected cell is skipped, but
            # Ctrl+C and programming errors are no longer swallowed.
            skipped += 1
            continue
        race_results.append([race_name, division, gender_label, *fields])
        scraped += 1

    if skipped:
        print(f"      Skipped {skipped} unparseable row(s)")

    return scraped > 0
    

# ==========================
# MAIN LOOP
# ==========================

# Gender filters as (dropdown value, label written to the CSV).
_SINGLES_GENDERS = [("M", "Men"), ("W", "Women")]
_DOUBLES_GENDERS = [("M", "Male"), ("W", "Female"), ("X", "Mixed")]

# Division groups scraped for every race, in this order.
_DIVISION_GROUPS = ("HYROX PRO", "HYROX PRO DOUBLES")

_RESULT_COLUMNS = [
    "Race", "Division", "Gender",
    "Rank Overall", "Rank Age Group",
    "Name", "Nation", "Age Group", "Total Time",
]


def _safe_filename_part(text):
    """Return *text* with spaces and slashes replaced for use in a file name."""
    return text.replace(" ", "_").replace("/", "-")


def _build_priority(base_name, available):
    """Order the divisions of one group as they should be tried.

    The exact ``base_name`` comes first, then ``"<base_name> - Overall"``,
    then every other ``"<base_name> - ..."`` entry in dropdown order. Names
    not present in *available* are left out.
    """
    priority = [
        name
        for name in (base_name, f"{base_name} - Overall")
        if name in available
    ]
    others = [
        d for d in available
        if d.startswith(f"{base_name} -") and d not in priority
    ]
    return priority + others


def _scrape_division(div, file_path, base_url, race_name, race_value):
    """Scrape all offered genders of one division and write them to *file_path*.

    Nothing is written when the division offers none of the expected gender
    filters or when the first gender yields no data.
    """
    print(f"   Trying {div}")
    genders = _DOUBLES_GENDERS if "DOUBLES" in div.upper() else _SINGLES_GENDERS

    # Load the division once just to see which gender filters it offers.
    driver.get(base_url)
    human_pause(4, 8)
    Select(driver.find_element(By.ID, "default-lists-event_main_group")).select_by_value(race_value)
    human_pause(2, 4)
    Select(driver.find_element(By.ID, "default-lists-event")).select_by_visible_text(div)
    human_pause(2, 4)

    try:
        gender_dropdown = Select(driver.find_element(By.ID, "default-lists-sex"))
        offered_values = [o.get_attribute("value") for o in gender_dropdown.options]
    except NoSuchElementException:
        offered_values = []

    valid_genders = [(code, label) for code, label in genders if code in offered_values]
    if not valid_genders:
        print(f"   {div} has no gender categories")
        return

    division_results = []
    division_has_data = False

    for idx, (gender_code, gender_label) in enumerate(valid_genders):
        print(f"      Gender: {gender_label}")

        # Full reset of the filter form for this division + gender.
        driver.get(base_url)
        human_pause(4, 8)
        Select(driver.find_element(By.ID, "default-lists-event_main_group")).select_by_value(race_value)
        human_pause(2, 5)
        Select(driver.find_element(By.ID, "default-lists-event")).select_by_visible_text(div)
        human_pause(2, 5)
        Select(driver.find_element(By.ID, "default-lists-sex")).select_by_value(gender_code)
        Select(driver.find_element(By.ID, "default-num_results")).select_by_value("100")
        human_pause(2, 5)
        driver.find_element(By.ID, "default-submit").click()
        human_pause(6, 10)

        has_data = scrape_pages(race_name, div, gender_label, division_results)

        # An empty first gender is treated as "division not published";
        # the remaining genders are not tried.
        if idx == 0 and not has_data:
            print(f"   {div} had NO DATA")
            break

        division_has_data = division_has_data or has_data

    if division_has_data:
        df = pd.DataFrame(division_results, columns=_RESULT_COLUMNS)
        df.to_csv(file_path, index=False)
        print(f"   Saved {len(df)} rows")


def _process_group(base_name, available, base_url, race_name, race_value,
                   safe_name, season_folder):
    """Scrape every not-yet-saved division of one group for a single race."""
    for div in _build_priority(base_name, available):
        div_safe = _safe_filename_part(div)
        file_path = os.path.join(season_folder, f"{safe_name}_{div_safe}.csv")

        # Skip only if this exact division CSV already exists.
        if os.path.exists(file_path):
            print(f"   {div} already scraped")
            continue

        _scrape_division(div, file_path, base_url, race_name, race_value)


# try/finally guarantees the browser is closed even when a page fails to
# load or the run is interrupted part-way through.
try:
    for season, base_url in BASE_URLS.items():

        print(f"\n===== STARTING {season} =====")

        season_folder = os.path.join(SAVE_ROOT, season)
        os.makedirs(season_folder, exist_ok=True)

        driver.get(base_url)
        human_pause(5, 8)

        try:
            wait.until(EC.presence_of_element_located((By.ID, "default-lists-event_main_group")))
        except TimeoutException:
            print(f"Race dropdown did not load for {season}; skipping season")
            continue

        race_dropdown = Select(driver.find_element(By.ID, "default-lists-event_main_group"))
        # Copy text/value into plain tuples so the loop below does not depend
        # on the live option elements after later page loads.
        races = [(opt.text, opt.get_attribute("value")) for opt in race_dropdown.options]

        for race_name, race_value in races:

            safe_name = _safe_filename_part(race_name)

            # Match only the "<race>_<group>..." names this script writes, so a
            # race whose name is a prefix of another race's name is not
            # mistaken for it.
            csv_prefixes = tuple(
                f"{safe_name}_{_safe_filename_part(group)}"
                for group in _DIVISION_GROUPS
            )
            existing_csvs = [
                f for f in os.listdir(season_folder)
                if f.startswith(csv_prefixes) and f.endswith(".csv")
            ]
            # NOTE(review): a race counts as done once two division CSVs
            # exist; a race with more than two divisions that was interrupted
            # after saving two will be skipped on the next run.
            if len(existing_csvs) >= 2:
                print(f"Skipping {race_name}")
                continue

            print(f"\nProcessing race: {race_name}")

            driver.get(base_url)
            human_pause(4, 6)

            Select(driver.find_element(By.ID, "default-lists-event_main_group")).select_by_value(race_value)
            human_pause(2, 4)

            select_division = Select(driver.find_element(By.ID, "default-lists-event"))
            available = [o.text for o in select_division.options]

            for group in _DIVISION_GROUPS:
                _process_group(
                    group, available, base_url, race_name, race_value,
                    safe_name, season_folder,
                )

    print("\nALL DONE.")
finally:
    driver.quit()


===== STARTING Season_7 =====
Skipping 2025 Shanghai
Skipping 2025 Atlanta
Skipping 2025 Valencia
Skipping 2025 Maastricht
Skipping 2025 Mumbai

Processing race: 2025 World Championships
   Trying HYROX PRO - Overall
      Gender: Men
   HYROX PRO - Overall had NO DATA
   Trying HYROX PRO - Friday
      Gender: Men
   HYROX PRO - Friday had NO DATA
   Trying HYROX PRO - Saturday
      Gender: Men
      Gender: Women
