In [1]:
from pathlib import Path
from typing import Union

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
def initialize_driver() -> webdriver.Chrome:
    """
    Builds a Chrome WebDriver configured for unattended scraping.

    The browser is started with the "--headless" and
    "--disable-search-engine-choice-screen" command-line switches.

    Returns:
        - webdriver.Chrome: A freshly started Chrome WebDriver. The caller is
          responsible for calling `quit()` on it.
    """
    launch_flags = ("--headless", "--disable-search-engine-choice-screen")

    options = Options()
    for flag in launch_flags:
        options.add_argument(flag)

    return webdriver.Chrome(options=options)

def accept_cookies(driver : webdriver.Chrome) -> None:
    """
    Accepts cookies on a webpage using the provided WebDriver instance.

    Waits up to 10 seconds for the "allow all" button of the cookie dialog to
    become clickable, then clicks it.

    Parameters:
        - driver (webdriver.Chrome): The WebDriver instance used to interact with the webpage.

    Raises:
        - selenium.common.exceptions.TimeoutException: If the button does not
          become clickable within 10 seconds.
    """
    allow_all_locator = (By.XPATH, '//*[@id="CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"]')

    # Wait for "clickable" (visible and enabled) rather than mere presence in the DOM:
    # an element can be present before it can actually receive a click.
    cookie_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable(allow_all_locator)
    )
    cookie_button.click()

---
# World records web scraping

In [3]:
def create_csv(content : str, csv_name : str) -> None:
    """
    Extracts athletic records from HTML and saves them to a CSV file.

    Parameters:
        - content (str): HTML content containing the records table.
        - csv_name (str): Output CSV file name. Missing parent directories are created.

    Raises:
        - ValueError: If the records table (or its <tbody>) cannot be found in the HTML.
    """
    columns = ['DISCIPLINE', 'PERF', 'COMPETITOR', 'DOB', 'COUNTRY', 'VENUE', 'DATE']

    # The markup is prettified and then parsed a second time before extraction.
    # NOTE(review): presumably this normalises whitespace around nested tags;
    # confirm before simplifying to a single parse.
    soup = BeautifulSoup(content, 'html.parser').prettify()
    soup = BeautifulSoup(soup, 'html.parser')

    # Locate the records table and fail with a clear message when it is absent,
    # instead of an AttributeError on None further down.
    table = soup.find(attrs={'class':'Table_table__2zsdR RecordsTable_table__3X8lL'})
    table_body = table.find('tbody') if table is not None else None
    if table_body is None:
        raise ValueError("The world records table could not be found in the provided HTML.")

    records = []
    for row in table_body.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            # Rows without any <td> carry no record data.
            continue

        # Drop the annotation markers that can accompany a performance value.
        perf = cells[2].text.strip()
        for marker in ("*", "Mx", "Wo", "h"):
            perf = perf.replace(marker, "")

        # Only cells 0, 2 and 4-8 are read; cells 1 and 3 are ignored.
        records.append({
            'DISCIPLINE': cells[0].text.strip(),
            'PERF': perf.strip(),
            'COMPETITOR': cells[4].text.strip(),
            'DOB': cells[5].text.strip(),
            'COUNTRY': cells[6].text.strip(),
            'VENUE': cells[7].text.strip().replace("(i)", ""),
            'DATE': cells[8].text.strip(),
        })

    # Make sure the destination folder exists; to_csv does not create it.
    output_path = Path(csv_name)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    pd.DataFrame(records, columns=columns).to_csv(output_path, index=False)

In [4]:
def scrape_world_records() -> None:
    """
    Scrapes the World Athletics world-records page and writes two CSV files.

    The page as first loaded is saved to "data/women_world_records.csv"; the
    second category button is then clicked and the page is saved to
    "data/men_world_records.csv".
    NOTE(review): this assumes the women's table is the default view — confirm.

    Any exception raised while scraping is caught and printed, and the browser
    is always closed.
    """
    table_body_selector = ".Table_table__2zsdR.RecordsTable_table__3X8lL tbody"

    def current_table_html(active_driver):
        # Snapshot of the records table body, used to detect when it has been re-rendered.
        return active_driver.find_element(By.CSS_SELECTOR, table_body_selector).get_attribute("innerHTML")

    driver = initialize_driver()

    try:
        driver.get("https://worldathletics.org/records/by-category/world-records")
        accept_cookies(driver)

        create_csv(driver.page_source, "data/women_world_records.csv")
        previous_table_html = current_table_html(driver)

        # Wait for "clickable" rather than mere presence so the click cannot hit
        # an element that is not yet interactable.
        men_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="__next"]/div[3]/div/div/div[2]/ul/li[2]/button'))
        )
        men_button.click()

        # Reading page_source straight after the click may still return the
        # previous table, so wait until the table body differs from the snapshot.
        # NOTE(review): whether the page re-renders asynchronously is not visible
        # from this code; the wait is best-effort and falls through on timeout.
        try:
            WebDriverWait(driver, 10, ignored_exceptions=(StaleElementReferenceException,)).until(
                lambda active_driver: current_table_html(active_driver) != previous_table_html
            )
        except TimeoutException:
            print("Warning: the records table did not change after selecting the second category.")

        create_csv(driver.page_source, "data/men_world_records.csv")

    except Exception as e:
        print(f"An error occurred: {e}")

    finally:
        driver.quit()

# Run the world-records scrape; it targets data/women_world_records.csv and data/men_world_records.csv.
scrape_world_records()

---
# Top 100 performances

In [5]:
def create_csv(content: str, year: int) -> pd.DataFrame:
    """
    Extracts athletic TOP100 performances from HTML into a DataFrame.

    Despite its name, this function does not write a CSV file; it only returns
    the parsed table.

    NOTE: this definition reuses the name of the world-records `create_csv`
    defined earlier in this notebook and replaces it once this cell has run.

    Parameters:
        - content (str): HTML content.
        - year (int): Year of the TOP100 performances (stored in the YEAR column).

    Returns:
        - pd.DataFrame: One row per performance with the columns YEAR, RANK, MARK,
          WIND, COMPETITOR, DOB, COUNTRY, POS, VENUE, DATE and SCORE. Empty RANK,
          WIND or SCORE cells become missing values.

    Raises:
        - ValueError: If no element with class "records-table" is found, or if a
          non-empty RANK, WIND or SCORE cell is not numeric.
    """

    def parse_number(text, cast):
        # An empty cell has no value; anything else must parse with `cast`.
        return cast(text) if text != "" else None

    # The markup is prettified and then parsed a second time before extraction.
    # NOTE(review): presumably this normalises whitespace around nested tags;
    # confirm before simplifying to a single parse.
    soup = BeautifulSoup(content, 'html.parser').prettify()
    soup = BeautifulSoup(soup, 'html.parser')
    table = soup.find(attrs={'class':'records-table'})

    if table is None:
        raise ValueError("The data for the specified year is not available, please try another year.")

    data = {
        'YEAR': [],
        'RANK': [],
        'MARK': [],
        'WIND': [],
        'COMPETITOR': [],
        'DOB': [],
        'COUNTRY': [],
        'POS': [],
        'VENUE': [],
        'DATE': [],
        'SCORE': [],
    }

    # The first <tr> is skipped (treated as the header row).
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if not cells:
            # Rows without any <td> carry no performance data.
            continue

        data['YEAR'].append(year)
        data['RANK'].append(parse_number(cells[0].get_text(strip=True), int))
        data['MARK'].append(cells[1].get_text(strip=True))
        data['WIND'].append(parse_number(cells[2].get_text(strip=True), float))
        data['COMPETITOR'].append(cells[3].get_text(strip=True))
        data['DOB'].append(cells[4].text.strip())
        data['COUNTRY'].append(cells[5].text.strip())
        data['POS'].append(cells[6].text.strip())
        # Cell 7 is not used.
        data['VENUE'].append(cells[8].text.strip())
        data['DATE'].append(cells[9].text.strip())
        data['SCORE'].append(parse_number(cells[10].text.strip(), int))

    return pd.DataFrame(data)

def scrape_year(year: int,
                sex: str = "men",
                event: str = "marathon",
                num_page: int = 1,
                bestResultsOnly: bool = False,
                maxResultsByCountry: Union[str, int] = "all") -> pd.DataFrame:
    """
    Scrape athletics performance data from the World Athletics website for a given year, sex, and event.

    Parameters:
        - year (int) : The year for which to scrape the data.

        - sex (str, optional) : The sex category for the event. Can be "men" or "women". Default is "men".

        - event (str, optional) : The athletics event to scrape. Default is "marathon". Allowed events are: "50-metres", "100-metres",
        "200-metres", "400-metres", "800-metres", "1500-metres", "3000-metres", "5000-metres", "10000-metres", "5-kilometres", "10-kilometres",
        "half-marathon", "marathon"

        - num_page (int, optional) : The number of pages to scrape. Default is 1.

        - bestResultsOnly (bool, optional) : Whether to scrape only the best results. Default is False.

        - maxResultsByCountry (Union[str, int], optional) : The maximum number of results by country. Can be "all" or an integer between 1 and 5.
        Default is "all".

    Returns:
        - pd.DataFrame : A DataFrame containing the scraped data of all requested pages
          (an empty DataFrame when no page is requested).

    Raises:
        - ValueError : If `event` or `maxResultsByCountry` is invalid. Errors raised while
          parsing a page (e.g. the results table is missing) also propagate to the caller.

    Example:
    --------
    >>> df = scrape_year(2023, sex="women", event="100-metres", num_page=2, bestResultsOnly=True, maxResultsByCountry=3)
    >>> print(df.head())
    """

    ########## Validate input
    # Maps each allowed event to the category segment used in the toplist URL.
    category = {
    "50-metres": "sprints", "100-metres": "sprints", "200-metres": "sprints", "400-metres": "sprints", "800-metres": "middlelong",
    "1500-metres": "middlelong", "3000-metres": "middlelong", "5000-metres": "middlelong", "10000-metres": "middlelong",
    "5-kilometres": "road-running", "10-kilometres": "road-running", "half-marathon": "road-running", "marathon": "road-running",
    }

    if event not in category:
        allowed_events = ", ".join(category.keys())
        raise ValueError(f"Invalid event: {event}. Allowed events are: {allowed_events}")

    if not (maxResultsByCountry == "all" or (isinstance(maxResultsByCountry, int) and 1 <= maxResultsByCountry <= 5)):
        raise ValueError("maxResultsByCountry must be 'all' or an integer between 1 and 5")
    ##########

    # The query string carries the flag as lowercase "true"/"false".
    best_results_flag = "true" if bestResultsOnly else "false"

    driver = initialize_driver()
    page_frames = []

    # try/finally guarantees the browser is closed even when a page fails to
    # load or parse; otherwise every failed call would leave a Chrome process behind.
    try:
        for page in range(1, num_page + 1):
            url = (
                f"https://worldathletics.org/records/toplists/{category[event]}/{event}/all/{sex}/senior/{year}"
                f"?regionType=world&page={page}&bestResultsOnly={best_results_flag}"
                f"&maxResultsByCountry={maxResultsByCountry}&ageCategory=senior"
            )
            driver.get(url)
            if page == 1:
                # The cookie dialog is only dismissed once per browser session.
                accept_cookies(driver)
            page_frames.append(create_csv(driver.page_source, year))
    finally:
        driver.quit()

    if not page_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(page_frames, ignore_index=True)

# Example usage
# df = scrape_year(2019, sex = "men", event = "50-metres")

In [None]:
# Bulk scrape: first toplist page for every (discipline, sex, year) combination
# from 2001 to 2024, written to a single CSV file.
sex_list = ["men", "women"]
disciplines_list = ["50-metres", "100-metres", "200-metres", "400-metres", "800-metres", "1500-metres", "3000-metres",
                    "5000-metres", "10000-metres", "10-kilometres", "half-marathon", "marathon"]

output_columns = ['YEAR', 'SEX', 'DISCIPLINE', 'RANK', 'MARK', 'WIND', 'COMPETITOR', 'DOB', 'COUNTRY', 'POS', 'VENUE', 'DATE', 'SCORE']
leading_columns = ['YEAR', 'SEX', 'DISCIPLINE']

# Create the output folder before scraping starts, so a missing directory
# cannot discard the results at the very end of a long run.
output_path = Path("data/top100_all_2001_2024.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

yearly_frames = []

for discipline in disciplines_list:
    for sex in sex_list:
        for year in range(2001, 2025):
            print(f"Scraping {sex}, {discipline}, {year}")
            try:
                yearly_data = scrape_year(year, sex, discipline)
            except ValueError as error:
                # scrape_year raises ValueError when a toplist cannot be parsed
                # (e.g. no data for that year); skip it instead of aborting the run.
                print(f"Skipping {sex}, {discipline}, {year}: {error}")
                continue

            if yearly_data.empty:
                continue

            yearly_data = yearly_data.assign(YEAR=year, SEX=sex, DISCIPLINE=discipline)
            ordered_columns = leading_columns + [col for col in yearly_data.columns if col not in leading_columns]
            yearly_frames.append(yearly_data[ordered_columns])

# Concatenate once instead of growing a DataFrame inside the loop.
if yearly_frames:
    all_data = pd.concat(yearly_frames, ignore_index=True)
else:
    all_data = pd.DataFrame(columns=output_columns)

all_data.to_csv(output_path, index=False)