In [3]:
import json
import re
from io import StringIO
from urllib.request import Request, urlopen

import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display_html

In [4]:
class Scraper:
    """Scrape Tour de France data from procyclingstats.com into DataFrames.

    Typical use is ``scrape()`` (stage result tables) followed by
    ``finalize_df()`` (clean rider names, scrape rider profiles and stage
    types, write three CSV files).

    Attributes:
        name: Prefix used for the CSV file names.
        years: Editions to visit.
        stages: Stage numbers tried for every edition.
        df_stages: List of per-stage frames while scraping; a single
            DataFrame once ``concat_df()`` has run.
        hdr: HTTP headers sent with every request.
        dictionariy: ``str.translate`` table mapping accented characters to
            ASCII replacements (used to build rider URLs).
        df_riders: Rider profile DataFrame (empty list until scraped).
        df_stage_types: Stage type DataFrame (empty list until scraped).
    """

    BASE_URL = "https://www.procyclingstats.com"
    # CSS class of the banner the site renders on a page that does not exist.
    NOT_FOUND_CLASS = "confirmation red notfound"
    RIDER_COLUMNS = ["First Name", "Last Names", "Nationality",
                     "Dates of Birth", "Height", "Weight", "One day races",
                     "GC", "Time trial", "Sprint", "Climber"]
    _UPPERCASE = re.compile(r'[A-Z]')

    def __init__(self):
        """Set the scraping ranges and create empty result containers."""
        self.name = "Tour de France"
        self.years = range(1903, 2021)
        self.stages = range(1, 25)
        self.df_stages = []
        self.hdr = {'User-Agent': 'Mozilla/5.0'}
        # str.maketrans turns the one-character keys into the code-point
        # keys that str.translate expects.
        self.dictionariy = str.maketrans({
            'ñ': 'n', 'ü': 'u', 'é': 'e',
            'í': 'i', 'Á': 'A', 'î': 'i',
            'ū': 'u', 'ä': 'a', 'è': 'e',
            'ø': 'o', 'æ': 'ae', 'š': 's',
            'ā': 'a', 'á': 'a', 'ó': 'o',
            'č': 'c', 'ç': 'c', 'ö': 'o',
            'Ó': 'O', 'Í': 'I', 'ż': 'z',
            'ï': 'i', 'ô': 'o', 'ý': 'y',
            'ć': 'c', 'Š': 'S', 'ò': 'o',
            'ë': 'e', 'ß': 'ss', 'ú': 'u',
            'ș': 's', 'ž': 'z', 'ł': 'l',
            'Đ': 'D', 'ś': 's', 'ě': 'e',
            'ř': 'r', 'É': 'E', 'ń': 'n',
            'ņ': 'n', 'Ł': 'L', 'Å': 'A',
            'ã': 'a',
        })
        self.riders = []
        self.df_riders = []
        self.df_stage_types = []

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _fetch_soup(self, url):
        """Download *url* with ``self.hdr`` and return it parsed."""
        request = Request(url, headers=self.hdr)
        # The context manager closes the HTTP response once it is parsed.
        with urlopen(request) as page:
            # No parser is named, so BeautifulSoup keeps choosing one itself,
            # exactly as before.
            return BeautifulSoup(page)

    def _is_missing(self, soup):
        """Return True when *soup* contains the site's "not found" banner."""
        return bool(soup.find_all("div", {"class": self.NOT_FOUND_CLASS}))

    @staticmethod
    def _require_frame(frame):
        """Raise ValueError unless *frame* is a non-empty DataFrame."""
        if not isinstance(frame, pd.DataFrame) or frame.empty:
            raise ValueError("Dataframe empty.")

    @staticmethod
    def _text_between(text, start, end, default="NA"):
        """Return the part of *text* after *start* and before *end*.

        Returns *default* when *start* does not occur; when *end* does not
        occur, everything after *start* is returned.
        """
        _, found, tail = text.partition(start)
        if not found:
            return default
        return tail.partition(end)[0]

    @classmethod
    def _points_after(cls, text, label, default="NA"):
        """Return the text following *label* up to the next capital letter."""
        _, found, tail = text.partition(label)
        if not found:
            return default
        return cls._UPPERCASE.split(tail, maxsplit=1)[0]

    @staticmethod
    def _strip_team(rider, team):
        """Remove the literal *team* text from a scraped *rider* cell.

        Non-string riders become ``"NA"``. A blank or non-string team leaves
        the rider untouched, so names keep their spaces when no team is known.
        """
        if not isinstance(rider, str):
            return "NA"
        if not isinstance(team, str) or not team.strip():
            return rider
        # Literal replacement: team names may contain regex metacharacters.
        return rider.replace(team, '')

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def scrape(self):
        """Collect every stage result table and concatenate them.

        For each year the edition page is checked first; stages are then
        requested in order until a stage page is reported missing. Each URL
        is printed as it is requested. Results end up in ``self.df_stages``
        with added ``Year`` and ``Stage`` columns.

        Raises:
            ValueError: If no result table was collected at all.
        """
        # A previous complete run leaves a DataFrame here; start a new list
        # so the method can be called again.
        if not isinstance(self.df_stages, list):
            self.df_stages = []

        race_url = self.BASE_URL + "/race/tour-de-france/"
        for year in self.years:
            site = race_url + str(year)
            if self._is_missing(self._fetch_soup(site)):
                print(site + " not found")
                continue

            for stage in self.stages:
                target_site = site + "/stage-" + str(stage)
                print(target_site)
                soup = self._fetch_soup(target_site)

                # NOTE(review): the first missing stage ends the year, so a
                # year without a "stage-1" page contributes no rows at all.
                if self._is_missing(soup):
                    print(target_site + " not found")
                    break

                for table in soup.find_all("table", {"class": "basic results"}):
                    # StringIO: pandas deprecates passing literal HTML.
                    df = pd.concat(pd.read_html(StringIO(str(table))))
                    df['Year'] = year
                    df['Stage'] = stage
                    self.df_stages.append(df)

        # finalize_df() is intentionally not called here; run it separately.
        self.concat_df()

    def finalize_df(self):
        """Run the post-processing steps and write the three CSV files."""
        self.clean_riders()
        self.scrape_riders()
        self.stages_save_csv()
        self.riders_save_csv()
        self.scrape_stage_types()
        self.stage_types_save_csv()

    def scrape_stage_types(self):
        """Scrape the profile type of every stage into ``self.df_stage_types``.

        The resulting frame has the columns ``year`` (str), ``type`` and
        ``stage`` (1-based position on the page).
        """
        years = []
        stage_numbers = []
        stage_types = []

        for year in self.years:
            site = (self.BASE_URL + "/race/tour-de-france/" + str(year)
                    + "/gc/stages")
            soup = self._fetch_soup(site)

            if self._is_missing(soup):
                print(site + " not found")
                continue

            # The first "w2" div is skipped (presumably a header row - TODO
            # confirm against the page markup).
            cells = soup.find_all("div", {"class": "w2"})[1:]
            # The type is the class token that follows "icon profile ".
            types = [str(cell).split("icon profile ")[1].split('"')[0]
                     for cell in cells]

            stage_types.extend(types)
            years.extend([str(year)] * len(types))
            stage_numbers.extend(range(1, len(types) + 1))

        self.df_stage_types = pd.DataFrame(
            {"year": years, "type": stage_types, "stage": stage_numbers})

    def scrape_riders(self, start=0):
        """Scrape one profile per unique rider into ``self.df_riders``.

        Args:
            start: Index into the unique rider list at which to begin.

        The first two whitespace-separated tokens of a rider cell are read as
        last name and first name. Riders with fewer than two tokens, riders
        whose page is missing and riders without an info container are
        skipped. Fields that cannot be located are stored as ``"NA"``.
        """
        riders = list(self.df_stages.Rider.unique())

        records = []
        seen = set()

        for raw in riders[start:]:
            parts = str(raw).split()

            # check for NAs
            if len(parts) < 2:
                continue

            first_name = parts[1].translate(self.dictionariy)
            second_name = parts[0].translate(self.dictionariy)

            # De-duplicate on the full URL slug; comparing first names only
            # would drop every rider sharing a first name with an earlier one.
            slug = first_name + "-" + second_name
            if slug in seen:
                continue
            seen.add(slug)

            print(first_name, second_name)
            soup = self._fetch_soup(self.BASE_URL + "/rider/" + slug)

            if self._is_missing(soup):
                continue

            info = soup.find("div", {"class": "rdr-info-cont"})
            if info is None:
                continue
            text = info.get_text()

            # Each specialty score is read from the text that follows the
            # preceding label, up to the next capital letter.
            records.append({
                "First Name": parts[1],
                "Last Names": parts[0],
                "Nationality": self._text_between(text, "Nationality:", "Weight"),
                "Dates of Birth": self._text_between(text, "Date of birth:", "("),
                "Height": self._text_between(text, "Height: ", "m"),
                "Weight": self._text_between(text, "Weight: ", "kg"),
                "One day races": self._points_after(text, "Points per specialty"),
                "GC": self._points_after(text, "One day races"),
                "Time trial": self._points_after(text, "GC"),
                "Sprint": self._points_after(text, "Time trial"),
                "Climber": self._points_after(text, "Sprint"),
            })

        self.df_riders = pd.DataFrame(records, columns=self.RIDER_COLUMNS)

    def concat_df(self):
        """Merge the collected per-stage frames into one DataFrame.

        Calling it again after the merge is a no-op.

        Raises:
            ValueError: If there is nothing to merge.
        """
        if isinstance(self.df_stages, pd.DataFrame):
            self._require_frame(self.df_stages)
            return
        if not self.df_stages:
            raise ValueError("Dataframe empty.")
        self.df_stages = pd.concat(self.df_stages, sort=False)

    def clean_riders(self):
        """Fill missing cells with ``' '`` and strip team names from riders.

        Raises:
            ValueError: If the stage data is missing or empty.
        """
        self._require_frame(self.df_stages)
        self.df_stages = self.df_stages.fillna(' ')
        self.df_stages['Rider'] = [
            self._strip_team(rider, team)
            for rider, team in zip(self.df_stages['Rider'],
                                   self.df_stages['Team'])
        ]

    def stages_save_csv(self):
        """Write the stage results to ``<name>_stages.csv``.

        Raises:
            ValueError: If the stage data is missing or empty.
        """
        self._require_frame(self.df_stages)
        self.df_stages.to_csv(self.name + "_stages.csv")

    def riders_save_csv(self):
        """Write the rider profiles to ``<name>_riders.csv``.

        Raises:
            ValueError: If the rider data is missing or empty.
        """
        self._require_frame(self.df_riders)
        self.df_riders.to_csv(self.name + "_riders.csv")

    def stage_types_save_csv(self):
        """Write the stage types to ``<name>_stage_types.csv``.

        Raises:
            ValueError: If the stage type data is missing or empty.
        """
        self._require_frame(self.df_stage_types)
        self.df_stage_types.to_csv(self.name + "_stage_types.csv")


In [None]:
# Build the scraper and download every stage result table.
# scrape() prints each URL as it is requested, which produces the log below.
tour = Scraper()
tour.scrape()

https://www.procyclingstats.com/race/tour-de-france/1903/stage-1
https://www.procyclingstats.com/race/tour-de-france/1903/stage-2
https://www.procyclingstats.com/race/tour-de-france/1903/stage-3
https://www.procyclingstats.com/race/tour-de-france/1903/stage-4
https://www.procyclingstats.com/race/tour-de-france/1903/stage-5
https://www.procyclingstats.com/race/tour-de-france/1903/stage-6
https://www.procyclingstats.com/race/tour-de-france/1903/stage-7
https://www.procyclingstats.com/race/tour-de-france/1903/stage-7 not found
https://www.procyclingstats.com/race/tour-de-france/1904/stage-1
https://www.procyclingstats.com/race/tour-de-france/1904/stage-2
https://www.procyclingstats.com/race/tour-de-france/1904/stage-3
https://www.procyclingstats.com/race/tour-de-france/1904/stage-4
https://www.procyclingstats.com/race/tour-de-france/1904/stage-5
https://www.procyclingstats.com/race/tour-de-france/1904/stage-6
https://www.procyclingstats.com/race/tour-de-france/1904/stage-7
https://www.pro

https://www.procyclingstats.com/race/tour-de-france/1912/stage-1
https://www.procyclingstats.com/race/tour-de-france/1912/stage-2
https://www.procyclingstats.com/race/tour-de-france/1912/stage-3
https://www.procyclingstats.com/race/tour-de-france/1912/stage-4
https://www.procyclingstats.com/race/tour-de-france/1912/stage-5
https://www.procyclingstats.com/race/tour-de-france/1912/stage-6
https://www.procyclingstats.com/race/tour-de-france/1912/stage-7
https://www.procyclingstats.com/race/tour-de-france/1912/stage-8
https://www.procyclingstats.com/race/tour-de-france/1912/stage-9
https://www.procyclingstats.com/race/tour-de-france/1912/stage-10
https://www.procyclingstats.com/race/tour-de-france/1912/stage-11
https://www.procyclingstats.com/race/tour-de-france/1912/stage-12
https://www.procyclingstats.com/race/tour-de-france/1912/stage-13
https://www.procyclingstats.com/race/tour-de-france/1912/stage-14
https://www.procyclingstats.com/race/tour-de-france/1912/stage-15
https://www.procycl

https://www.procyclingstats.com/race/tour-de-france/1923/stage-2
https://www.procyclingstats.com/race/tour-de-france/1923/stage-3
https://www.procyclingstats.com/race/tour-de-france/1923/stage-4
https://www.procyclingstats.com/race/tour-de-france/1923/stage-5
https://www.procyclingstats.com/race/tour-de-france/1923/stage-6
https://www.procyclingstats.com/race/tour-de-france/1923/stage-7
https://www.procyclingstats.com/race/tour-de-france/1923/stage-8
https://www.procyclingstats.com/race/tour-de-france/1923/stage-9
https://www.procyclingstats.com/race/tour-de-france/1923/stage-10
https://www.procyclingstats.com/race/tour-de-france/1923/stage-11
https://www.procyclingstats.com/race/tour-de-france/1923/stage-12
https://www.procyclingstats.com/race/tour-de-france/1923/stage-13
https://www.procyclingstats.com/race/tour-de-france/1923/stage-14
https://www.procyclingstats.com/race/tour-de-france/1923/stage-15
https://www.procyclingstats.com/race/tour-de-france/1923/stage-16
https://www.procyc

https://www.procyclingstats.com/race/tour-de-france/1929/stage-6
https://www.procyclingstats.com/race/tour-de-france/1929/stage-7
https://www.procyclingstats.com/race/tour-de-france/1929/stage-8
https://www.procyclingstats.com/race/tour-de-france/1929/stage-9
https://www.procyclingstats.com/race/tour-de-france/1929/stage-10
https://www.procyclingstats.com/race/tour-de-france/1929/stage-11
https://www.procyclingstats.com/race/tour-de-france/1929/stage-12
https://www.procyclingstats.com/race/tour-de-france/1929/stage-13
https://www.procyclingstats.com/race/tour-de-france/1929/stage-14
https://www.procyclingstats.com/race/tour-de-france/1929/stage-15
https://www.procyclingstats.com/race/tour-de-france/1929/stage-16
https://www.procyclingstats.com/race/tour-de-france/1929/stage-17
https://www.procyclingstats.com/race/tour-de-france/1929/stage-18
https://www.procyclingstats.com/race/tour-de-france/1929/stage-19
https://www.procyclingstats.com/race/tour-de-france/1929/stage-20
https://www.pr

https://www.procyclingstats.com/race/tour-de-france/1934/stage-12
https://www.procyclingstats.com/race/tour-de-france/1934/stage-13
https://www.procyclingstats.com/race/tour-de-france/1934/stage-14
https://www.procyclingstats.com/race/tour-de-france/1934/stage-15
https://www.procyclingstats.com/race/tour-de-france/1934/stage-16
https://www.procyclingstats.com/race/tour-de-france/1934/stage-17
https://www.procyclingstats.com/race/tour-de-france/1934/stage-18
https://www.procyclingstats.com/race/tour-de-france/1934/stage-19
https://www.procyclingstats.com/race/tour-de-france/1934/stage-20
https://www.procyclingstats.com/race/tour-de-france/1934/stage-21
https://www.procyclingstats.com/race/tour-de-france/1934/stage-21 not found
https://www.procyclingstats.com/race/tour-de-france/1935/stage-1
https://www.procyclingstats.com/race/tour-de-france/1935/stage-2
https://www.procyclingstats.com/race/tour-de-france/1935/stage-3
https://www.procyclingstats.com/race/tour-de-france/1935/stage-4
http

https://www.procyclingstats.com/race/tour-de-france/1950/stage-4
https://www.procyclingstats.com/race/tour-de-france/1950/stage-5
https://www.procyclingstats.com/race/tour-de-france/1950/stage-6
https://www.procyclingstats.com/race/tour-de-france/1950/stage-7
https://www.procyclingstats.com/race/tour-de-france/1950/stage-8
https://www.procyclingstats.com/race/tour-de-france/1950/stage-9
https://www.procyclingstats.com/race/tour-de-france/1950/stage-10
https://www.procyclingstats.com/race/tour-de-france/1950/stage-11
https://www.procyclingstats.com/race/tour-de-france/1950/stage-12
https://www.procyclingstats.com/race/tour-de-france/1950/stage-13
https://www.procyclingstats.com/race/tour-de-france/1950/stage-14
https://www.procyclingstats.com/race/tour-de-france/1950/stage-15
https://www.procyclingstats.com/race/tour-de-france/1950/stage-16
https://www.procyclingstats.com/race/tour-de-france/1950/stage-17
https://www.procyclingstats.com/race/tour-de-france/1950/stage-18
https://www.proc

https://www.procyclingstats.com/race/tour-de-france/1958/stage-15
https://www.procyclingstats.com/race/tour-de-france/1958/stage-16
https://www.procyclingstats.com/race/tour-de-france/1958/stage-17
https://www.procyclingstats.com/race/tour-de-france/1958/stage-18
https://www.procyclingstats.com/race/tour-de-france/1958/stage-19
https://www.procyclingstats.com/race/tour-de-france/1958/stage-20
https://www.procyclingstats.com/race/tour-de-france/1958/stage-21
https://www.procyclingstats.com/race/tour-de-france/1958/stage-22
https://www.procyclingstats.com/race/tour-de-france/1958/stage-23
https://www.procyclingstats.com/race/tour-de-france/1958/stage-24
https://www.procyclingstats.com/race/tour-de-france/1959/stage-1
https://www.procyclingstats.com/race/tour-de-france/1959/stage-2
https://www.procyclingstats.com/race/tour-de-france/1959/stage-3
https://www.procyclingstats.com/race/tour-de-france/1959/stage-4
https://www.procyclingstats.com/race/tour-de-france/1959/stage-5
https://www.pro

https://www.procyclingstats.com/race/tour-de-france/1979/stage-23
https://www.procyclingstats.com/race/tour-de-france/1979/stage-24
https://www.procyclingstats.com/race/tour-de-france/1980/stage-1
https://www.procyclingstats.com/race/tour-de-france/1980/stage-1 not found
https://www.procyclingstats.com/race/tour-de-france/1981/stage-1
https://www.procyclingstats.com/race/tour-de-france/1981/stage-1 not found
https://www.procyclingstats.com/race/tour-de-france/1982/stage-1
https://www.procyclingstats.com/race/tour-de-france/1982/stage-2
https://www.procyclingstats.com/race/tour-de-france/1982/stage-3
https://www.procyclingstats.com/race/tour-de-france/1982/stage-4
https://www.procyclingstats.com/race/tour-de-france/1982/stage-5
https://www.procyclingstats.com/race/tour-de-france/1982/stage-6
https://www.procyclingstats.com/race/tour-de-france/1982/stage-7
https://www.procyclingstats.com/race/tour-de-france/1982/stage-8
https://www.procyclingstats.com/race/tour-de-france/1982/stage-9
htt

https://www.procyclingstats.com/race/tour-de-france/1987/stage-16
https://www.procyclingstats.com/race/tour-de-france/1987/stage-17
https://www.procyclingstats.com/race/tour-de-france/1987/stage-18
https://www.procyclingstats.com/race/tour-de-france/1987/stage-19
https://www.procyclingstats.com/race/tour-de-france/1987/stage-20
https://www.procyclingstats.com/race/tour-de-france/1987/stage-21
https://www.procyclingstats.com/race/tour-de-france/1987/stage-22
https://www.procyclingstats.com/race/tour-de-france/1987/stage-23
https://www.procyclingstats.com/race/tour-de-france/1987/stage-24
https://www.procyclingstats.com/race/tour-de-france/1988/stage-1
https://www.procyclingstats.com/race/tour-de-france/1988/stage-2
https://www.procyclingstats.com/race/tour-de-france/1988/stage-3
https://www.procyclingstats.com/race/tour-de-france/1988/stage-4
https://www.procyclingstats.com/race/tour-de-france/1988/stage-5
https://www.procyclingstats.com/race/tour-de-france/1988/stage-6
https://www.proc

https://www.procyclingstats.com/race/tour-de-france/1992/stage-22 not found
https://www.procyclingstats.com/race/tour-de-france/1993/stage-1
https://www.procyclingstats.com/race/tour-de-france/1993/stage-2
https://www.procyclingstats.com/race/tour-de-france/1993/stage-3
https://www.procyclingstats.com/race/tour-de-france/1993/stage-4
https://www.procyclingstats.com/race/tour-de-france/1993/stage-5
https://www.procyclingstats.com/race/tour-de-france/1993/stage-6
https://www.procyclingstats.com/race/tour-de-france/1993/stage-7
https://www.procyclingstats.com/race/tour-de-france/1993/stage-8
https://www.procyclingstats.com/race/tour-de-france/1993/stage-9
https://www.procyclingstats.com/race/tour-de-france/1993/stage-10
https://www.procyclingstats.com/race/tour-de-france/1993/stage-11
https://www.procyclingstats.com/race/tour-de-france/1993/stage-12
https://www.procyclingstats.com/race/tour-de-france/1993/stage-13
https://www.procyclingstats.com/race/tour-de-france/1993/stage-14
https://w

https://www.procyclingstats.com/race/tour-de-france/1998/stage-12
https://www.procyclingstats.com/race/tour-de-france/1998/stage-13
https://www.procyclingstats.com/race/tour-de-france/1998/stage-14
https://www.procyclingstats.com/race/tour-de-france/1998/stage-15
https://www.procyclingstats.com/race/tour-de-france/1998/stage-16
https://www.procyclingstats.com/race/tour-de-france/1998/stage-17
https://www.procyclingstats.com/race/tour-de-france/1998/stage-17 not found
https://www.procyclingstats.com/race/tour-de-france/1999/stage-1
https://www.procyclingstats.com/race/tour-de-france/1999/stage-2
https://www.procyclingstats.com/race/tour-de-france/1999/stage-3
https://www.procyclingstats.com/race/tour-de-france/1999/stage-4
https://www.procyclingstats.com/race/tour-de-france/1999/stage-5
https://www.procyclingstats.com/race/tour-de-france/1999/stage-6
https://www.procyclingstats.com/race/tour-de-france/1999/stage-7
https://www.procyclingstats.com/race/tour-de-france/1999/stage-8
https://

https://www.procyclingstats.com/race/tour-de-france/2004/stage-7
https://www.procyclingstats.com/race/tour-de-france/2004/stage-8
https://www.procyclingstats.com/race/tour-de-france/2004/stage-9
https://www.procyclingstats.com/race/tour-de-france/2004/stage-10
https://www.procyclingstats.com/race/tour-de-france/2004/stage-11
https://www.procyclingstats.com/race/tour-de-france/2004/stage-12
https://www.procyclingstats.com/race/tour-de-france/2004/stage-13
https://www.procyclingstats.com/race/tour-de-france/2004/stage-14
https://www.procyclingstats.com/race/tour-de-france/2004/stage-15
https://www.procyclingstats.com/race/tour-de-france/2004/stage-16
https://www.procyclingstats.com/race/tour-de-france/2004/stage-17
https://www.procyclingstats.com/race/tour-de-france/2004/stage-18
https://www.procyclingstats.com/race/tour-de-france/2004/stage-19
https://www.procyclingstats.com/race/tour-de-france/2004/stage-20
https://www.procyclingstats.com/race/tour-de-france/2004/stage-21
https://www.p

https://www.procyclingstats.com/race/tour-de-france/2009/stage-20
https://www.procyclingstats.com/race/tour-de-france/2009/stage-21
https://www.procyclingstats.com/race/tour-de-france/2009/stage-22
https://www.procyclingstats.com/race/tour-de-france/2009/stage-22 not found
https://www.procyclingstats.com/race/tour-de-france/2010/stage-1
https://www.procyclingstats.com/race/tour-de-france/2010/stage-2
https://www.procyclingstats.com/race/tour-de-france/2010/stage-3
https://www.procyclingstats.com/race/tour-de-france/2010/stage-4
https://www.procyclingstats.com/race/tour-de-france/2010/stage-5
https://www.procyclingstats.com/race/tour-de-france/2010/stage-6
https://www.procyclingstats.com/race/tour-de-france/2010/stage-7
https://www.procyclingstats.com/race/tour-de-france/2010/stage-8
https://www.procyclingstats.com/race/tour-de-france/2010/stage-9
https://www.procyclingstats.com/race/tour-de-france/2010/stage-10
https://www.procyclingstats.com/race/tour-de-france/2010/stage-11
https://w