## Scrape Athletics Data From World Athletics

The script below scrapes athletics data from [worldathletics.org](https://worldathletics.org/records/all-time-toplists/sprints/100-metres/outdoor/men/senior?regionType=world&timing=electronic&windReading=regular&page=1&bestResultsOnly=false&firstDay=1900-01-01&lastDay=2021-09-20) for the men's 100 metres sprint across all years and writes it to a CSV file for subsequent analysis.

#### Import libraries

In [None]:
import time
import requests
import polars as pl
from datetime import date
from bs4 import BeautifulSoup

#### Specify URL with parameters

In [None]:
# URL template for the all-time top list; the two `{}` placeholders are filled
# via str.format with (1) the pagination number and (2) the `lastDay` value
base_url = (
    "https://worldathletics.org/records/all-time-toplists/sprints/100-metres/outdoor/men/senior"
    "?regionType=world&timing=electronic&windReading=regular"
    "&page={}&bestResultsOnly=false&firstDay=1900-01-01&lastDay={}"
)

#### Iterate over all pages and write to csv

In [None]:
# Walk the paginated results table page by page, collect one dict per data row,
# then write everything to a single csv file.
all_records = [] # accumulated rows from every page

page = 1 # starting page
today_str = date.today().isoformat() # end date range, formatted YYYY-MM-DD
max_pages = 500 # hard page limit so the loop always terminates
request_timeout = 30 # seconds; without a timeout requests.get can block indefinitely

# output field name -> index of the <td> cell it is read from
# (cell 7 is intentionally not exported, matching the original extraction)
column_index = {
    "Rank": 0,
    "Mark": 1,
    "WIND": 2,
    "Competitor": 3,
    "DOB": 4,
    "Nat": 5,
    "Pos": 6,
    "Venue": 8,
    "Date": 9,
    "Results Score": 10,
}
min_cells = max(column_index.values()) + 1 # a row needs at least this many cells

# checking the limit *before* fetching avoids downloading a page that is then discarded
while page <= max_pages:

    url = base_url.format(page, today_str) # pass page number and today's date as params into url
    response = requests.get(url, timeout=request_timeout)

    # do not parse an error page as if it were data; stop and keep what we have
    if not response.ok:
        print(f'Stopping: page {page} returned HTTP status {response.status_code}')
        break

    soup = BeautifulSoup(response.text, 'html.parser')

    # loop through rows and extract data
    page_records = []
    for row in soup.select('tr'):
        columns = row.find_all('td')
        # skip rows without <td> cells and rows too short to index safely
        # (indexing these directly would raise IndexError)
        if len(columns) < min_cells:
            continue
        page_records.append(
            {name: columns[idx].text.strip() for name, idx in column_index.items()}
        )

    # break loop if no more data: a page that yields no data rows (even if it
    # still contains non-data <tr> elements) marks the end of the results
    if not page_records:
        break

    print(f'Retrieving data for page {page}')
    all_records.extend(page_records)

    page += 1
    time.sleep(1) # pause between requests to avoid hammering the server

df = pl.DataFrame(all_records)
df.write_csv("100m_sprint_records.csv")

print("Operation complete!")