In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
import json
from datetime import datetime
import pickle
import time

### Initial step: build dictionary of `{year:mainurl}`

In [None]:
# Download the Tour de France history landing page and collect its <button>
# elements; the following cell reads the year label and the 'data-tabs-ajax'
# attribute from them.
r = requests.get('https://www.letour.fr/en/history', timeout=10)
r.raise_for_status()  # fail loudly instead of parsing an HTTP error page
soup = BeautifulSoup(r.text, 'html.parser')
# NOTE(review): a class filter ("dateTabs__link js-tabs") was sketched here but
# is not applied, so this returns every <button> on the page.
results = soup.find_all('button')

In [None]:
# Map each year (int) to a dict holding the relative URL of that year's
# history block: {year: {'mainurl': ...}}.
year_links_dict = {}

for item in results:
    label = item.text.strip()
    ajax_url = item.get('data-tabs-ajax')
    # `results` contains every <button> on the page, so skip any button whose
    # label is not a plain number or that carries no AJAX URL instead of
    # crashing on int() / a missing attribute.
    if not label.isdecimal() or ajax_url is None:
        continue
    year_links_dict[int(label)] = {'mainurl': ajax_url}

print(len(year_links_dict))
year_links_dict#[2018]

In [None]:
# List every collected year next to its main history URL.
for year in year_links_dict:
    print(year, year_links_dict[year]['mainurl'])

#### Adding links to `Starters`, `Stages`, `Jersey wearers`, `Stage winners` and `Ranking` 

In [None]:
base_url = 'https://www.letour.fr'

# For every year, fetch its history block and store the URL behind each nested
# tab button under a key derived from the button label (lower-cased, spaces
# replaced by underscores, suffixed with '_url').
for year, links in year_links_dict.items():
    print('Main URL: ', year, links['mainurl'])
    r = requests.get(base_url + links['mainurl'], timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')

    for button in soup.find_all('button', class_="js-tabs-nested"):
        url_key = button.text.lower().replace(' ', '_') + '_url'
        links[url_key] = button['data-tabs-ajax']

Save to `pickle`

In [None]:
def save_pickle(obj, filename):
    """Pickle `obj` to ``data/<filename>.pkl``.

    Args:
        obj: Any picklable object.
        filename: Base name of the target file, without directory or extension.
    """
    # Create the output directory on first use so a fresh checkout does not
    # fail with FileNotFoundError.
    os.makedirs('data', exist_ok=True)
    with open('data/' + filename + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the year -> links mapping to data/year_links_dict.pkl via the shared
# helper instead of repeating its open/dump logic inline.
save_pickle(year_links_dict, 'year_links_dict')

Open from `pickle`

In [None]:
def load_pickle(name):
    """Return the object stored in ``data/<name>.pkl``.

    Args:
        name: Base name of the pickle file, without directory or extension.
    """
    pickle_path = 'data/' + name + '.pkl'
    with open(pickle_path, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

In [None]:
# Reload the year -> links mapping from data/year_links_dict.pkl and display it.
year_links_dict = load_pickle('year_links_dict')
year_links_dict

#### Add `num_of_stages` to dict

Function to scrape the total number of stages for a given year.

In [None]:
base_url = 'https://www.letour.fr'

def scrape_number_of_stages(year, links=None):
    """Scrape the number of stages for `year` from its history page.

    Args:
        year: Edition year; must be a key of the links mapping.
        links: Optional mapping ``{year: {'mainurl': <relative url>}}``.
            Defaults to the notebook-level ``year_links_dict``.

    Returns:
        int: The text of the first ``span.statsInfos__number`` on the page,
        converted to int (presumably the stage count -- verify against the
        site if the page layout changes).

    Raises:
        ValueError: If the page contains no ``statsInfos__number`` span, or
            its text is not an integer.
    """
    if links is None:
        # This used to read `year_links_num_dict`, which is only created in a
        # later cell, so calling the function before that cell raised
        # NameError. `year_links_dict` holds the same 'mainurl' values.
        links = year_links_dict
    print(f'Scraping number of stages for {year}')
    r = requests.get(base_url + links[year]['mainurl'], timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    stat_spans = soup.find_all('span', class_='statsInfos__number')
    if not stat_spans:
        raise ValueError(f'No statsInfos__number span found on the history page for {year}')
    return int(stat_spans[0].text)

In [None]:
# Relative history-block URLs for a few editions, kept for reference.
# NOTE(review): `mainurl` is assigned here but never passed to the call below,
# which looks the URL up by year instead; the active URL is labelled 1904
# while the call uses 1956 -- confirm which was intended.
# mainurl = '/en/block/history/11818/f34c3404d95a697dcf77d4cd8e8278fa' # 2018 - 21 stages
# mainurl = '/en/block/history/10804/89b36b01ffe439e016ec1d59c57b63d1' # 2011 - 21 stages
mainurl = '/en/block/history/10708/0b76b8f809ad5d8bcf3579df597644d8'  # 1904 - 6 stages

# Smoke test: scrape and print the stage count for a single year.
print(scrape_number_of_stages(1956))

In [None]:
# Copy one level deep: a plain dict.copy() would share the inner per-year
# dicts, so adding 'num_of_stages' below would silently modify
# year_links_dict as well.
year_links_num_dict = {year: dict(links) for year, links in year_links_dict.items()}

for year in year_links_num_dict:
    year_links_num_dict[year]['num_of_stages'] = scrape_number_of_stages(year)

In [None]:
# Persist the mapping (now including 'num_of_stages') to data/year_links_num_dict.pkl.
save_pickle(year_links_num_dict, 'year_links_num_dict')

### Load `year_links_num_dict`

In [None]:
# Reload the mapping from data/year_links_num_dict.pkl and display it; the
# scraping functions below read their URLs and stage counts from it.
year_links_num_dict = load_pickle('year_links_num_dict')
year_links_num_dict

## Define functions to scrape data from different tables

#### 1. Starters

In [None]:
base_url = 'https://www.letour.fr'

def scrape_starters(year):
    """Scrape the starters table for `year` and save it as a CSV.

    Every <tr> of the page at ``year_links_num_dict[year]['starters_url']`` is
    read as text. A row whose text begins with an ASCII letter is treated as a
    team name; any other row is treated as a rider and split on newline+space
    into rider number and rider name. Each rider is then labelled with the
    most recent team name above it.

    Args:
        year: Edition year; must be a key of ``year_links_num_dict``.

    Writes:
        ``data/<year>/<year>_starters.csv`` with columns ``team``,
        ``rider_num`` and ``rider_name``.
    """
    print(f'Scraping starters for {year}')
    r = requests.get(base_url + year_links_num_dict[year]['starters_url'], timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')

    starters_list = []
    for row in soup.find_all('tr'):
        row_text = row.text.strip()

        if re.search(r'^[a-zA-Z]', row_text):  # row is a team name
            starters_list.append([row_text, None, None])
            continue

        # Row is a rider: "<number>\n <name>".
        fields = [x.strip() for x in row_text.split('\n ')]
        if len(fields) < 2:
            # Empty or malformed row (no number/name separator); indexing
            # fields[1] would raise IndexError, so skip it instead.
            continue
        starters_list.append([None, fields[0], fields[1]])

    starters_df = pd.DataFrame(starters_list, columns=['team','rider_num','rider_name'])
    # Series.ffill() replaces the deprecated fillna(method='ffill').
    starters_df['team'] = starters_df['team'].ffill()
    # Drop the team-name rows now that every rider row carries its team.
    starters_df = starters_df[starters_df['rider_num'].notna()].reset_index(drop=True)

    filepath = 'data/' + str(year) + '/' + str(year) + '_starters' + '.csv'
    # Make the function usable on its own, not only from the driver loop that
    # pre-creates the per-year directory.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    starters_df.to_csv(filepath, index=False)
    print('Saved ' + filepath)

#### 2. Stages

In [None]:
base_url = 'https://www.letour.fr'

def scrape_stages(year):
    """Scrape the stages table for `year` and save it as a CSV.

    Every <tr> of the page at ``year_links_num_dict[year]['stages_url']`` is
    read as text and split on newlines into cells. The first row is discarded
    (presumably the table's own header row) and replaced by fixed column names.

    Args:
        year: Edition year; must be a key of ``year_links_num_dict``.

    Writes:
        ``data/<year>/<year>_stages.csv`` with columns ``stage_num``,
        ``date_start``, ``start_city`` and ``finish_city``.
    """
    print(f'Scraping stages for {year}')
    r = requests.get(base_url + year_links_num_dict[year]['stages_url'], timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')

    stages_list = [row.text.strip().split('\n') for row in soup.find_all('tr')]
    # Slicing (rather than pop(0)) drops the first row without raising
    # IndexError when the page contains no table rows at all.
    stages_list = stages_list[1:]

    header = ['stage_num','date_start','start_city','finish_city']
    stages_df = pd.DataFrame(stages_list, columns=header)

    filepath = 'data/' + str(year) + '/' + str(year) + '_stages' + '.csv'
    # Make the function usable on its own, not only from the driver loop that
    # pre-creates the per-year directory.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    stages_df.to_csv(filepath, index=False)
    print('Saved ' + filepath)

#### 3. Jersey wearers

In [None]:
base_url = 'https://www.letour.fr'

def scrape_jersey_wearers(year):
    """Scrape the jersey wearers table for `year` and save it as a CSV.

    Every <tr> of the page at ``year_links_num_dict[year]['jersey_wearers_url']``
    is read as text; runs of two or more whitespace characters are turned into
    commas and the row is split on commas. The first row supplies the column
    names (lower-cased, spaces replaced by underscores, first column renamed
    to ``stage_num``). At most five columns are kept.

    Args:
        year: Edition year; must be a key of ``year_links_num_dict``.

    Writes:
        ``data/<year>/<year>_jersey_wearers.csv``. Nothing is written when the
        page contains no table rows.
    """
    print(f'Scraping jersey wearers for {year}')
    r = requests.get(base_url + year_links_num_dict[year]['jersey_wearers_url'], timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    num_of_cols = 5  # maximum number of columns kept in the output

    jersey_list = []
    for row in soup.find_all('tr'):
        new_row = row.text.strip('\n')
        # A trailing separator is deliberately left in place here; the width
        # reconciliation below removes the surplus column it can create.
        new_row = re.sub(r'\s\s+', ',', new_row)
        jersey_list.append(new_row.split(','))

    if not jersey_list:
        # pop(0) on an empty list would raise IndexError.
        print(f'No jersey wearers rows found for {year}; nothing saved')
        return

    # The header row's cells are separated by single newlines, which the
    # two-or-more-whitespace substitution above leaves intact, so they all sit
    # in the first comma-separated field.
    header = jersey_list.pop(0)
    header = [x.lower().replace(' ', '_') for x in header[0].split('\n')]
    header[0] = 'stage_num'
    jersey_df = pd.DataFrame(jersey_list)

    # Trim header and frame to one common width. Trimming them independently
    # (header to the frame width, then frame to num_of_cols) could leave the
    # two at different lengths and make the column assignment raise ValueError.
    width = min(len(header), len(jersey_df.columns), num_of_cols)
    jersey_df = jersey_df.iloc[:, :width]
    jersey_df.columns = header[:width]

    filepath = 'data/' + str(year) + '/' + str(year) + '_jersey_wearers' + '.csv'
    # Make the function usable on its own, not only from the driver loop that
    # pre-creates the per-year directory.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    jersey_df.to_csv(filepath, index=False)
    print('Saved ' + filepath)

#### 4. Stage winners

In [None]:
base_url = 'https://www.letour.fr'

def scrape_stage_winners(year):
    """Scrape the stage winners table for `year` and save it as a CSV.

    Every <tr> of the page at ``year_links_num_dict[year]['stages_winners_url']``
    is read as text and converted to comma-separated fields: runs of three or
    more whitespace characters and any remaining newlines become commas, and a
    trailing comma is removed. The first row is discarded (presumably the
    table's own header row) and replaced by fixed column names.

    Args:
        year: Edition year; must be a key of ``year_links_num_dict``.

    Writes:
        ``data/<year>/<year>_stage_winners.csv`` with columns ``stage_num``,
        ``parcours``, ``winner`` and ``team``.
    """
    print(f'Scraping stage winners for {year}')
    r = requests.get(base_url + year_links_num_dict[year]['stages_winners_url'], timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')

    stage_winners_list = []
    for row in soup.find_all('tr'):
        new_row = row.text.strip('\n')
        # Three (not two) whitespace characters, so a team name in parentheses
        # is not split off from the text before it.
        new_row = re.sub(r'\s\s\s+', ',', new_row)
        new_row = re.sub(r',$', '', new_row)
        new_row = re.sub(r'\n', ',', new_row)
        stage_winners_list.append(new_row.split(','))

    # Slicing (rather than pop(0)) drops the first row without raising
    # IndexError when the page contains no table rows at all.
    stage_winners_list = stage_winners_list[1:]

    header = ['stage_num','parcours','winner','team']
    stage_winners_df = pd.DataFrame(stage_winners_list, columns=header)

    filepath = 'data/' + str(year) + '/' + str(year) + '_stage_winners' + '.csv'
    # Make the function usable on its own, not only from the driver loop that
    # pre-creates the per-year directory.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    stage_winners_df.to_csv(filepath, index=False)
    print('Saved ' + filepath)

#### 5. Rankings per stage (i.e. times, gaps, points, etc.)
**TODO:** find out what the `b` and `p` columns in the ranking tables represent.

Function to loop through all ranking codes for a given year, and scrape/export data as CSVs

In [None]:
# Ranking categories to scrape: descriptive label -> code sent as the `type`
# query parameter of the ranking URL. Commented-out entries are disabled and
# are not scraped.
ranking_cats = {'indiv_general':'itg',
                'indiv_stage':'ite',
                'points_general':'ipg',
                #'points_stage':'ipe',
                'climber_general':'img',
                #'climber_stage':'ime',
                'youth_general':'ijg',
                #'combative_general':'icg',
                'team_stage':'ete',
                'team_general':'etg'
               }

In [None]:
base_url = 'https://www.letour.fr'

def scrape_all_rankings(year, timeout=None):
    """Scrape every ranking in ``ranking_cats`` for all stages of `year`.

    For each ranking code, the ranking page of every stage
    (1 .. ``year_links_num_dict[year]['num_of_stages']``) is fetched, and each
    table-body row whose number of cells matches the expected column layout
    for that code is kept. All stages of one code are written to a single CSV.

    Args:
        year: Edition year; must be a key of ``year_links_num_dict``.
        timeout: Passed to ``requests.get``. The default ``None`` waits
            indefinitely, as before.

    Writes:
        ``data/<year>/<year>_rankings_<code>.csv`` per ranking code, with a
        leading ``stage_num`` column followed by that code's columns.
    """
    # Expected column layout of the ranking table for each ranking code. Rows
    # with a different number of cells are ignored.
    # NOTE(review): the meaning of the 'b' and 'p' columns is not documented
    # in this notebook.
    headers_by_code = {
        'itg': ['rank','rider','rider_no','team','times','gap','b','p'],
        'ite': ['rank','rider','rider_no','team','times','gap','b','p'],
        'ipg': ['rank','rider','rider_no','team','points','b','p'],
        'img': ['rank','rider','rider_no','team','points'],
        'ijg': ['rank','rider','rider_no','team','times','gap'],
        'ete': ['rank','team','times','gap'],
        'etg': ['rank','team','times','gap'],
    }

    print(f'Scraping rankings for all codes for {year}')

    # Make the function usable on its own, not only from the driver loop that
    # pre-creates the per-year directory.
    os.makedirs('data/' + str(year), exist_ok=True)

    for label, code in ranking_cats.items(): # loop through ranking codes

        print(label, code)

        header = headers_by_code.get(code)
        if header is None:
            # Previously execution fell through here with a stale (or
            # undefined) header; skip codes without a known layout.
            print('Not a ranking code I am interested in!')
            continue
        num_columns = len(header)

        # Collect plain rows for every stage and build the DataFrame once,
        # instead of concatenating a growing frame inside the loop.
        ranking_rows = []

        for stage_num in range(1, year_links_num_dict[year]['num_of_stages'] + 1): # loop through stages
            print('\nStage number:', stage_num)
            full_url = (base_url + year_links_num_dict[year]['ranking_url']
                        + f"?stage={stage_num}"
                        + f"&type={code}")
            r = requests.get(full_url, timeout=timeout)
            soup = BeautifulSoup(r.text, 'html.parser')
            print(full_url)

            if soup.tbody is None:
                # No table body on the page; soup.tbody.find_all would raise
                # AttributeError.
                print('No ranking table found; skipping this stage')
                continue

            for item in soup.tbody.find_all('tr'):
                cells = item.find_all('td')
                if len(cells) == num_columns: # keep only rows matching the header layout
                    ranking_rows.append([stage_num] + [cell.text.strip() for cell in cells])

        rankings_df = pd.DataFrame(ranking_rows, columns=['stage_num'] + header)

        filepath = 'data/' + str(year) + '/' + str(year) + '_rankings_' + code + '.csv'
        rankings_df.to_csv(filepath, index=False)
        print('Saved ' + filepath)
        time.sleep(5)  # pause between ranking codes

## Time to Scrape! Loop through all years...

Finished years: 

`[
1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,
2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,
2010,2011,2012,2013,2014,2015,2016,2017,2018
]`

In [None]:
# Number of calendar years from 1966 through 2018 inclusive (53); the cells
# below use this value as their slice offset.
2018-1966+1

In [None]:
# Preview the years remaining to scrape: the dict's keys in reverse insertion
# order, skipping the first 53.
list(reversed(list(year_links_num_dict.keys())))[53:]

In [None]:
base_url = 'https://www.letour.fr'

# Scrape from 1965 backwards. Years are selected by value rather than by a
# positional offset into the dict's keys, so the selection does not depend on
# key order or on how many later years the dict happens to contain.
first_year_to_scrape = 1965
years_to_scrape = sorted(
    (year for year in year_links_num_dict if year <= first_year_to_scrape),
    reverse=True,
)

# Scrapers in the order they are run for every year.
scrapers = [
    scrape_all_rankings,    # 1. scrape rankings data
    scrape_starters,        # 2. scrape starters data
    scrape_stages,          # 3. scrape stages data
    scrape_jersey_wearers,  # 4. scrape jersey wearers data
    scrape_stage_winners,   # 5. scrape stage winners data
]

for year in years_to_scrape:
    directory = 'data/' + str(year)
    os.makedirs(directory, exist_ok=True)

    for scrape in scrapers:
        scrape(year)
        time.sleep(10)  # pause between tables