In [127]:
import requests
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession
# Function to get the HTML content of a page
def get_html(url, timeout=30):
    """Download a page and return its raw body.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.
            ``requests.get`` has no timeout by default, so without this an
            unresponsive server would block the notebook indefinitely.

    Returns:
        The response body (``response.content``).

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as data.
    response.raise_for_status()
    return response.content
# Site root. The scraping functions prepend it to the relative paths they read
# from page attributes in order to build absolute URLs.
BASE_URL = "https://www.letour.fr"

In [128]:
def get_years_urls(base_url):
    """Map every edition offered by the year selector to its block URL.

    Args:
        base_url: URL of the page that contains the ``<select name="year">``
            element. Despite its name this is not the site root; the prefix
            used for the returned URLs is the module-level ``BASE_URL``.

    Returns:
        dict mapping each option's ``data-tabs-target`` attribute to
        ``BASE_URL`` + the option's ``value`` attribute. Options whose
        ``value`` or ``data-tabs-target`` is missing or empty are skipped.

    Raises:
        ValueError: If the page has no ``<select name="year">`` element.
    """
    # Get the HTML content
    html = get_html(base_url)
    soup = BeautifulSoup(html, 'html.parser')

    # Find the year selection element
    year_select = soup.find('select', {'name': 'year'})
    if year_select is None:
        # soup.find returns None when nothing matches; name the real problem
        # instead of failing later with an AttributeError on None.
        raise ValueError(f"No <select name='year'> element found at {base_url}")

    # Extract all year options
    years_urls = {}
    for option in year_select.find_all('option'):
        # .get() avoids a KeyError on options that lack one of the attributes
        # (for instance a placeholder entry).
        target = option.get('data-tabs-target')
        path = option.get('value')
        if target and path:
            years_urls[target] = BASE_URL + path

    return years_urls

# URL of the Tour de France history page
# NOTE: `url` is a notebook-level name; the year loop further down reuses it
# as its loop variable, so it does not keep this value afterwards.
url = "https://www.letour.fr/en/history"
# Mapping of year selector entry -> absolute URL of that year's history block
years_urls = get_years_urls(url)
print(years_urls)

{'2023': 'https://www.letour.fr/en/block/history/11823/4e4949d130f21278f7d8bce32a071d67', '2022': 'https://www.letour.fr/en/block/history/11822/ca646e1b4b99d7ee973c457c87392395', '2021': 'https://www.letour.fr/en/block/history/11821/a3fe7d350ef546e3c27ae83a84fa3a0b', '2020': 'https://www.letour.fr/en/block/history/11820/17fa8e795e69e9f326ea26cf9912e571', '2019': 'https://www.letour.fr/en/block/history/11819/96c0eb3fa403ebf222f28b6e45115c56', '2018': 'https://www.letour.fr/en/block/history/11818/f34c3404d95a697dcf77d4cd8e8278fa', '2017': 'https://www.letour.fr/en/block/history/10810/7035a5dc53631209d3581f64a433b10d', '2016': 'https://www.letour.fr/en/block/history/10809/7be3a459d846b4672915c576cb7ed6b9', '2015': 'https://www.letour.fr/en/block/history/10808/3f166bedb535ee9bff2fb7ead0a7812c', '2014': 'https://www.letour.fr/en/block/history/10807/d538a7fbdfc0d657fbf064561840fbb1', '2013': 'https://www.letour.fr/en/block/history/10806/196e36cfe7aff4b5fc2d1055f7dcf864', '2012': 'https://www

In [131]:
async def fetch_yearly_tdf_data(year_url):
    """Render one year's history block and collect the URLs of its nested tabs.

    The page is rendered with ``arender`` so that its JavaScript runs before
    the HTML is parsed.

    Args:
        year_url: Absolute URL of the year's history block.

    Returns:
        dict mapping the text of every
        ``<button class="tabs__item btn js-tabs-nested">`` to ``BASE_URL`` +
        its ``data-tabs-ajax`` attribute. Buttons without that attribute are
        skipped. Returns ``None`` when fetching or rendering fails; the error
        is printed rather than raised, so a caller looping over many years
        keeps going.
    """
    session = None
    try:
        # Create an asynchronous HTML Session
        session = AsyncHTMLSession()

        # Get the HTML content
        response = await session.get(year_url)

        # Render the page (this will execute the JavaScript)
        await response.html.arender(timeout=20)

        # Parse the rendered HTML with BeautifulSoup
        soup = BeautifulSoup(response.html.html, 'html.parser')

        # Find the buttons with class 'tabs__item btn js-tabs-nested'
        buttons = soup.find_all('button', class_='tabs__item btn js-tabs-nested')

        # Extract the button details
        tab_urls = {}
        for button in buttons:
            ajax_path = button.get('data-tabs-ajax')
            if not ajax_path:
                # Concatenating BASE_URL with None would raise TypeError and
                # discard every tab of this year; skip only the odd button.
                continue
            tab_urls[button.get_text(strip=True)] = BASE_URL + ajax_path

        return tab_urls

    except Exception as e:
        # Deliberate best effort: report and signal failure with None.
        print(f"An error occurred: {e}")
        return None

    finally:
        # Each call creates its own session; release it (and any browser it
        # started for rendering) so repeated calls do not pile up resources.
        if session is not None:
            try:
                await session.close()
            except Exception as close_error:
                print(f"An error occurred while closing the session: {close_error}")

# Collect, for every year, the mapping of nested tab name -> tab URL.
# The pages are fetched one after another (each iteration awaits its result).
# Top-level `await` needs an environment that supports it, such as a
# Jupyter/IPython cell.
# A year whose fetch failed is stored as None, because fetch_yearly_tdf_data
# prints the error instead of raising.
# NOTE: the loop rebinds the notebook-level names `year` and `url`.
selections_urls = {}
for year, url in years_urls.items():
    selections_urls[year] = await fetch_yearly_tdf_data(url)


In [136]:
# Function to get stages and ranking types for a given year
def get_year_data(year_url):
    """Scrape the stage names and the final general ranking of one edition.

    The edition's headline statistics (the ``statsInfos__label`` /
    ``statsInfos__number`` span pairs) are also read, but they are only
    printed as a flat ``[label, value, label, value, ...]`` list; they are
    not part of the return value.

    Args:
        year_url: Absolute URL of the year's history block.

    Returns:
        A ``(stages, rankings)`` tuple:
        - ``stages``: text of every ``<option>`` of the
          ``<select name="stage">`` that has a non-empty ``value``.
        - ``rankings``: one dict per data row of the ``rankingTable``, with
          the keys 'Rank', 'Rider', 'Team', 'Times', 'Gap', 'Bonus' and
          'Penalty', filled by column position. Rows without any ``<td>``
          cell are skipped; cells missing from a short row become ''.

    Raises:
        ValueError: If the page has no stage selector or no ranking table.
    """
    year_html = get_html(year_url)
    year_soup = BeautifulSoup(year_html, 'html.parser')

    # Get stage names
    stage_select = year_soup.find('select', {'name': 'stage'})
    if stage_select is None:
        raise ValueError(f"No <select name='stage'> element found at {year_url}")
    # .get() tolerates options that carry no 'value' attribute at all.
    stages = [option.text for option in stage_select.find_all('option') if option.get('value')]

    # Get final global ranking (table with class 'rankingTable' and columns
    # 'Rank', 'Rider', 'Team', 'Times', 'Gap', 'B', 'P')
    ranking_table = year_soup.find('table', {'class': 'rankingTable'})
    if ranking_table is None:
        raise ValueError(f"No table with class 'rankingTable' found at {year_url}")

    fields = ('Rank', 'Rider', 'Team', 'Times', 'Gap', 'Bonus', 'Penalty')
    rankings = []
    # The first <tr> is treated as the header row and skipped.
    for row in ranking_table.find_all('tr')[1:]:
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if not cells:
            # Row without data cells (e.g. made of <th> only): nothing to record.
            continue
        # Pad short rows so every ranking dict has the same keys.
        cells += [''] * (len(fields) - len(cells))
        rankings.append(dict(zip(fields, cells)))

    # Get the headline statistics of the Tour (spans of class
    # 'statsInfos__label' paired with spans of class 'statsInfos__number')
    labels = year_soup.find_all('span', {'class': 'statsInfos__label'})
    values = year_soup.find_all('span', {'class': 'statsInfos__number'})
    stats = [span.text for pair in zip(labels, values) for span in pair]
    print(stats)

    return stages, rankings

# Example: get data for 2022
# NOTE: `stages` and `rankings` become notebook-level names here.
stages, rankings = get_year_data(years_urls["2022"])
print("Stages:", stages)
print("Rankings:", rankings)


['Number of stages', '22', 'Distance (km)', '3 328']
Stages: ['Stage 1 : Copenhague > Copenhague', 'Stage 2 : Roskilde > Nyborg', 'Stage 3 : Vejle > Sønderborg', 'Stage 1 : Transfert > Transfert', 'Stage 4 : Dunkerque > Calais', 'Stage 5 : Lille Métropole > Arenberg Porte du Hainaut', 'Stage 6 : Binche > Longwy', 'Stage 7 : Tomblaine > La Super Planche des Belles Filles', 'Stage 8 : Dole > Lausanne', 'Stage 9 : Aigle > Châtel Les Portes du Soleil', 'Stage 10 : Morzine Les Portes du Soleil > Megève', 'Stage 11 : Albertville > Col du Granon Serre Chevalier', "Stage 12 : Briançon > Alpe d'Huez", "Stage 13 : Le Bourg d'Oisans > Saint-Étienne", 'Stage 14 : Saint-Étienne > Mende', 'Stage 15 : Rodez > Carcassonne', 'Stage 16 : Carcassonne > Foix', 'Stage 17 : Saint-Gaudens > Peyragudes', 'Stage 18 : Lourdes > Hautacam', 'Stage 19 : Castelnau-Magnoac > Cahors', 'Stage 20 : Lacapelle-Marival > Rocamadour', 'Stage 21 : Paris La Défense Arena > Paris Champs-Élysées']
Rankings: [{'Rank': '1', 'Rid

In [139]:
def get_teams_data(team_url):
    """Scrape the start list of one edition, grouped by team.

    Args:
        team_url: Absolute URL of the page listing teams and riders.

    Returns:
        dict mapping each team name to a list of
        ``{'bib': ..., 'name': ...}`` dicts, in page order. The team name is
        the text of the link inside each ``<h3 class="list__heading">``, or
        the heading's own text when it holds no link. A rider entry without a
        ``runner__link`` anchor is skipped; a missing bib becomes ''. If the
        same team name appears more than once, the riders are accumulated
        under that single key. A page without team headings yields ``{}``.
    """
    team_html = get_html(team_url)
    team_soup = BeautifulSoup(team_html, 'html.parser')

    # Extract the team and riders information
    res = {}
    for heading in team_soup.find_all('h3', class_='list__heading'):
        link = heading.find('a')
        # find() returns None when the heading has no <a>; fall back to the
        # heading text instead of crashing on None.get_text.
        name_source = link if link is not None else heading
        team_name = name_source.get_text(strip=True)
        # setdefault keeps riders already collected if a team name repeats.
        roster = res.setdefault(team_name, [])

        # Find the corresponding list box for the team
        # NOTE(review): find_next searches the rest of the document, so a
        # heading without its own box would pick up the next team's box;
        # verify against the page structure.
        list_box = heading.find_next('div', class_='list__box')
        if list_box is None:
            continue

        # Extract rider information from the list box
        for rider in list_box.find_all('li', class_='list__box__item'):
            name_tag = rider.find('a', class_='runner__link')
            if name_tag is None:
                # Not a rider entry we can name; skip it.
                continue
            bib_tag = rider.find('span', class_='bib')
            roster.append({
                'bib': bib_tag.get_text(strip=True) if bib_tag is not None else '',
                'name': name_tag.get_text(strip=True),
            })

    return res

def _ranking_table_rows(data_url):
    """Fetch ``data_url`` and return the cell texts of its 'rankingTable'.

    Returns:
        One list per table row after the first (header) row, holding the
        stripped text of each ``<td>`` in that row. Rows without any ``<td>``
        are left out.

    Raises:
        ValueError: If the page has no ``<table class="rankingTable">``.
    """
    data_html = get_html(data_url)
    data_soup = BeautifulSoup(data_html, 'html.parser')

    table = data_soup.find('table', class_='rankingTable')
    if table is None:
        # Name the real problem instead of failing with an AttributeError on None.
        raise ValueError(f"No table with class 'rankingTable' found at {data_url}")

    rows = []
    for row in table.find_all('tr')[1:]:
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if cells:
            rows.append(cells)
    return rows


def _cell(cells, index):
    """Return ``cells[index]``, or '' when the row has too few cells."""
    return cells[index] if index < len(cells) else ''


def get_stages_data(data_url):
    """Scrape the stage list of one edition.

    The table columns are Stages, Highlights, Date, Start, Finish; the
    Highlights column (index 1) is intentionally not extracted.

    Args:
        data_url: Absolute URL of the "Stages" tab.

    Returns:
        A list with one dict per table row, keyed 'Stage', 'Date', 'Start'
        and 'Finish'. Values are read by column position; a cell missing from
        a short row becomes ''.

    Raises:
        ValueError: If the page has no ``<table class="rankingTable">``.
    """
    return [
        {
            'Stage': _cell(cells, 0),
            'Date': _cell(cells, 2),
            'Start': _cell(cells, 3),
            'Finish': _cell(cells, 4),
        }
        for cells in _ranking_table_rows(data_url)
    ]


def get_jersey_wearers_data(data_url):
    """Scrape, stage by stage, who wore each distinctive jersey.

    Args:
        data_url: Absolute URL of the "Jersey wearers" tab.

    Returns:
        A list with one dict per table row, keyed 'Stage', 'Yellow Jersey',
        'Green Jersey', 'Polka Dot Jersey' and 'White Jersey'. Values are read
        by column position (0 to 4); a cell missing from a short row
        becomes ''.

    Raises:
        ValueError: If the page has no ``<table class="rankingTable">``.
    """
    return [
        {
            'Stage': _cell(cells, 0),
            'Yellow Jersey': _cell(cells, 1),
            'Green Jersey': _cell(cells, 2),
            'Polka Dot Jersey': _cell(cells, 3),
            'White Jersey': _cell(cells, 4),
        }
        for cells in _ranking_table_rows(data_url)
    ]


def get_stage_winners_data(data_url):
    """Scrape the winner of every stage of one edition.

    Args:
        data_url: Absolute URL of the "Stages winners" tab.

    Returns:
        A list with one dict per table row, keyed 'Stage', 'Parcours' and
        'Winner of stage' (columns 0, 2 and 3). Only the first line of the
        winner cell is kept. A cell missing from a short row becomes ''.

    Raises:
        ValueError: If the page has no ``<table class="rankingTable">``.
    """
    return [
        {
            'Stage': _cell(cells, 0),
            'Parcours': _cell(cells, 2),
            # The winner cell can span several lines; keep only the first one.
            'Winner of stage': _cell(cells, 3).split('\n')[0],
        }
        for cells in _ranking_table_rows(data_url)
    ]

# Example: scrape the four nested tabs of a single edition.
# The keys used on selections_urls[year] are the button texts collected by
# fetch_yearly_tdf_data.
# NOTE: `year` and `stages` rebind notebook-level names assigned by earlier cells.
year = "2002"
teams = get_teams_data(selections_urls[year]["Starters"])
print("Teams:", teams)
stages = get_stages_data(selections_urls[year]["Stages"])
print("Stages:", stages)
jersey_wearers = get_jersey_wearers_data(selections_urls[year]["Jersey wearers"])
print("Jersey Wearers:", jersey_wearers)
stage_winners = get_stage_winners_data(selections_urls[year]["Stages winners"])
print("Stage Winners:", stage_winners)

Teams: {'US POSTAL SERVICE': [{'bib': '1', 'name': 'LANCE ARMSTRONG'}, {'bib': '2', 'name': 'VJATCESLAV EKIMOV'}, {'bib': '3', 'name': 'ROBERTO HERAS'}, {'bib': '4', 'name': 'GEORGE HINCAPIE'}, {'bib': '5', 'name': 'BENOIT JOACHIM'}, {'bib': '6', 'name': 'FLOYD LANDIS'}, {'bib': '7', 'name': 'PAVEL PADRNOS'}, {'bib': '8', 'name': 'VICTOR HUGO PENA'}, {'bib': '9', 'name': 'JOSÉ LUIS RUBIERA'}], 'TEAM TELEKOM': [{'bib': '11', 'name': 'ERIK ZABEL'}, {'bib': '12', 'name': 'ROLF ALDAG'}, {'bib': '13', 'name': 'UDO BÖLTS'}, {'bib': '14', 'name': 'GIAN MATTEO FAGNINI'}, {'bib': '15', 'name': 'GIUSEPPE GUERINI'}, {'bib': '16', 'name': 'DANILO HONDO'}, {'bib': '17', 'name': 'BOBBY JULICH'}, {'bib': '18', 'name': 'KEVIN LIVINGSTON'}, {'bib': '19', 'name': 'STEFFEN WESEMANN'}], 'ONCE - EROSKI': [{'bib': '21', 'name': 'JOSEBA BELOKI'}, {'bib': '22', 'name': 'JOSÉ AZEVEDO'}, {'bib': '23', 'name': 'ALVARO GONZALEZ GALDEANO'}, {'bib': '24', 'name': 'IGOR GONZALEZ GALDEANO'}, {'bib': '25', 'name': 'JÖ