In [1]:
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
def initialize_driver() -> webdriver.Chrome:
    """
    Builds a Chrome WebDriver configured for unattended scraping.

    The browser is started with two command-line switches:
    "--headless" and "--disable-search-engine-choice-screen".

    Returns:
        - webdriver.Chrome: A freshly started Chrome WebDriver; the caller is
          responsible for calling quit() on it.
    """
    launch_flags = ("--headless", "--disable-search-engine-choice-screen")

    options = Options()
    for flag in launch_flags:
        options.add_argument(flag)

    return webdriver.Chrome(options=options)

def accept_cookies(driver : webdriver.Chrome) -> None:
    """
    Accepts cookies on a webpage using the provided WebDriver instance.

    Waits up to 10 seconds for the "allow all" button of the cookie dialog to
    become clickable, then clicks it.

    Parameters:
        - driver (webdriver.Chrome): The WebDriver instance used to interact with the webpage.

    Raises:
        - selenium.common.exceptions.TimeoutException: If the button does not
          become clickable within 10 seconds.
    """
    # Wait for "clickable" (visible and enabled) rather than mere presence in
    # the DOM: clicking an element that exists but is not yet displayed raises
    # ElementNotInteractableException.
    cookie_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"]'))
    )
    cookie_button.click()

def create_csv(content : str, csv_name : str) -> None:
    """
    Extracts athletic records from HTML and saves to a CSV.

    The records table is located by its exact class attribute. For every table
    row with at least nine <td> cells, cells 0, 2, 4, 5, 6, 7 and 8 are written
    to the columns DISCIPLINE, PERF, COMPETITOR, DOB, COUNTRY, VENUE and DATE.
    The markers "*", "Mx", "Wo" and "h" are removed from PERF and "(i)" is
    removed from VENUE.

    Parameters:
        - content (str): HTML content.
        - csv_name (str): Output CSV file name. Missing parent directories are created.

    Raises:
        - ValueError: If the records table (or its <tbody>) is not present in the HTML.
    """
    columns = ['DISCIPLINE', 'PERF', 'COMPETITOR', 'DOB', 'COUNTRY', 'VENUE', 'DATE']
    perf_markers = ("*", "Mx", "Wo", "h")
    min_cells = 9  # the highest cell index read below is 8

    # Parse the page content with BeautifulSoup. The prettify() round-trip puts
    # every tag and text node on its own line, so text from nested tags ends up
    # whitespace-separated instead of run together; it is kept so the extracted
    # text is unchanged.
    soup = BeautifulSoup(content, 'html.parser').prettify()
    soup = BeautifulSoup(soup, 'html.parser')

    # Find the table containing the records
    table = soup.find(attrs={'class':'Table_table__2zsdR RecordsTable_table__3X8lL'})
    table_body = table.find('tbody') if table is not None else None
    if table_body is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError("Records table not found in the supplied HTML content")

    records = []
    for row in table_body.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < min_cells:
            # Rows without the full set of data cells cannot be mapped to the
            # columns; skip them rather than abort the export with an IndexError.
            continue

        perf = cells[2].text
        for marker in perf_markers:
            perf = perf.replace(marker, "")

        records.append({
            'DISCIPLINE': cells[0].text.strip(),
            'PERF': perf.strip(),
            'COMPETITOR': cells[4].text.strip(),
            'DOB': cells[5].text.strip(),
            'COUNTRY': cells[6].text.strip(),
            # Strip after removing "(i)" so the whitespace that preceded the
            # marker is not left dangling at the end of the venue.
            'VENUE': cells[7].text.replace("(i)", "").strip(),
            'DATE': cells[8].text.strip(),
        })

    # Passing columns= keeps the header (and its order) even when no row matched.
    records_df = pd.DataFrame(records, columns=columns)

    # Make sure the output directory exists so to_csv does not fail on a fresh checkout.
    Path(csv_name).parent.mkdir(parents=True, exist_ok=True)
    records_df.to_csv(csv_name, index=False)

In [3]:
def scrape_world_records():
    """
    Scrapes the World Athletics world-records page and writes two CSV files.

    Opens the page in a headless Chrome instance, accepts the cookie dialog,
    saves the initially displayed table to "data/women_world_records.csv", then
    clicks the second category button and saves the resulting table to
    "data/men_world_records.csv".

    Any exception raised while scraping is caught and printed rather than
    propagated; the browser is always closed afterwards.
    """
    # First row of the records table; matched on the same class fragment that
    # create_csv uses to find the table.
    row_locator = (By.CSS_SELECTOR, "[class*='RecordsTable_table'] tbody tr")
    men_button_locator = (By.XPATH, '//*[@id="__next"]/div[3]/div/div/div[2]/ul/li[2]/button')

    driver = initialize_driver()

    try:
        # The table is re-rendered by the page, so an element found a moment
        # ago may be detached by the time it is read; retry instead of failing.
        wait = WebDriverWait(driver, 10, ignored_exceptions=(StaleElementReferenceException,))

        driver.get("https://worldathletics.org/records/by-category/world-records")
        accept_cookies(driver)

        # Wait until the table has rendered (first row has text) before taking
        # the snapshot, and remember that text to detect the tab switch below.
        # NOTE(review): assumes the women's table is the one shown by default,
        # as the output file name implies - confirm.
        women_first_row = wait.until(lambda d: d.find_element(*row_locator).text)
        create_csv(driver.page_source, "data/women_world_records.csv")

        # Click only once the button is actually clickable, not merely present.
        wait.until(EC.element_to_be_clickable(men_button_locator)).click()

        def men_table_loaded(d):
            # Reading page_source right after the click can still return the
            # previous table; wait until the first row has changed.
            # NOTE(review): assumes the first row differs between the two tabs - confirm.
            text = d.find_element(*row_locator).text
            return bool(text) and text != women_first_row

        wait.until(men_table_loaded)
        create_csv(driver.page_source, "data/men_world_records.csv")

    except Exception as e:
        # Deliberate best-effort: report the failure without raising.
        print(f"An error occurred: {e}")

    finally:
        driver.quit()

# Run the scrape: starts a Chrome session, loads the records page and writes
# data/women_world_records.csv and data/men_world_records.csv.
scrape_world_records()