In [31]:
# import statements
import requests
import re
import json
import time
import pdb
from bs4 import BeautifulSoup as BS

In [32]:
# scrape wikipedia page
def scrape(url):
    # need header to scrape from wikipedia
    headers = {
        'User-Agent': 'SchoolProjectBot/1.0 (r1043412@student.thomasmore.be) Python-Requests/2.31.0'
    }
    page = requests.get(url, headers=headers)
    
    return BS(page.text, "html")

In [33]:
# take information from BeautifulSoup and clean it
def extract_clean_data(soup):
    # extract the table from the page html
    table = soup.find('table', class_ = 'wikitable')

    # use list comprehension instead of for loop
    courses = [
        # take all td's and put them in a list 'cells', take the second and third elements from that list
        (lambda cells: (cells[1].get_text(strip=True), cells[2].get_text(strip=True)))(row.find_all("td"))

        # take all rows exept title and results row
        for row in table.select('tr')[1:-1]

        # only read rows that have more than 2 elements
        if len(row.find_all('td')) > 2
    ]

    # remove rest days and remove courses without 'to' as these are very short outliers within a city
    courses = [course for course in courses if (course[1] and re.search('to[A-Z ]', course[0]))]

    # debug
    print(courses)

    # clean data so we can later perform analisys on it
    courses = [
        (lambda course:
        (course[0][:re.search('to[A-Z ]', course[0]).start()], # start of course
        course[0][re.search('to[A-Z ]', course[0]).end()-1:], # end of course
        course[1][:re.search(r'\D', course[1]).start()]) # distance of course
        )(course)
        for course in courses
    ]

    # transfer list of cleaned data to a dictionary so it can be written to json
    data_dict = [
        {'key': i, 'start': course[0], 'end': course[1], 'distance': course[2]}
        
        # use enumerate for the counter
        for i, course in enumerate(courses, start=1)
    ]

    return data_dict


In [34]:
# write data to json
def write_json(dict, course_year, course_type):
    # open the json file and update it with the python dictionary
    with open(f'data/{course_type}.json', 'r+') as file:
        # load the entire json file into memory
        file_json = json.load(file)

        # update the json with the python dictionary
        file_json[course_type].update({course_year: dict})

        # put cursor at beginning of file and overwrite it with the dictionary in memory
        file.seek(0)
        json.dump(file_json, file, indent=4)

In [35]:
# combine functions into one functions we can use to scrape and clean the data
def scrape_and_clean(grand_tour, start_year):
    # set year to start year initially, after which it'll be updated
    year = start_year
    
    # select last part of link and json file based on which grand tour
    match grand_tour:
        case 'Tour de France':
            page_url = '_Tour_de_France'
            json_file = 'tour_de_france'
        case 'Giro d\'Italia':
            page_url = '_Giro_d\'Italia'
            json_file = 'giro_ditalia'
        case 'Vuelta a España':
            page_url = '_Vuelta_a_España'
            json_file = 'vuelta_a_espagna'

    # scrape a page every 5 years
    while year < 2025:
        # assemble the url and scrape the page
        assembled_url = 'https://en.wikipedia.org/wiki/' + str(year) + page_url
        page_html = scrape(assembled_url)

        # debug
        print(assembled_url)

        # test if the table we'll pull is correct
        if not (page_html.find('table', class_='wikitable') == None):
            # debug
            # print(page_html.find('table', class_='wikitable'))
            # print(page_html.find('table', class_='wikitable').find('th', string='Course'))
            # print(page_html.find('table', class_='wikitable').find('th', string='Distance'))

            if not (page_html.find('table', class_='wikitable').find('th', string=lambda t: t and 'Course' in t) == None 
                    and page_html.find('table', class_='wikitable').find('th', string=lambda t: t and 'Distance' in t) == None):
                # when the table is correct, carry on with the main logic
                course_dictionary = extract_clean_data(page_html)
                write_json(course_dictionary, year, json_file)
                
                # add 5 to year to move on to the next wiki page
                year += 5
            else:
                # print a message of scrape wasn't sucessful
                print(f'Correct collums not found for {grand_tour} in year {year}.')

                # when the table isn't correct , we'll only add 1 year
                year += 1
        else:
            # print a message of scrape wasn't sucessful
            print(f'Page not found for {grand_tour} in year {year}.')

            # when the table isn't correct , we'll only add 1 year
            year += 1


        
        # short wait time between each scrape
        time.sleep(3)

In [None]:
# finally we'll execute the main loop
scrape_and_clean('Tour de France', 1905)
scrape_and_clean('Giro d\'Italia', 1909) # change 'to' to '-' for 2016 TODO:
scrape_and_clean('Vuelta a España', 1935)

https://en.wikipedia.org/wiki/1909_Giro_d'Italia
[('MilantoBologna', '397\xa0km (247\xa0mi)'), ('BolognatoChieti', '375.8\xa0km (233.5\xa0mi)'), ('ChietitoNaples', '242.8\xa0km (150.9\xa0mi)'), ('NaplestoRome', '228.1\xa0km (141.7\xa0mi)'), ('RometoFlorence', '346.5\xa0km (215.3\xa0mi)'), ('FlorencetoGenoa', '294.1\xa0km (182.7\xa0mi)'), ('GenoatoTurin', '354.9\xa0km (220.5\xa0mi)'), ('TurintoMilan', '206\xa0km (128\xa0mi)')]
https://en.wikipedia.org/wiki/1914_Giro_d'Italia
[('MilantoCuneo', '420\xa0km (261\xa0mi)'), ('CuneotoLucca', '340.5\xa0km (212\xa0mi)'), ('LuccatoRome', '430\xa0km (267\xa0mi)'), ('RometoAvellino', '365.4\xa0km (227\xa0mi)'), ('AvellinotoBari', '328.7\xa0km (204\xa0mi)'), ("BaritoL'Aquila", '428\xa0km (266\xa0mi)'), ("L'AquilatoLugo", '429.1\xa0km (267\xa0mi)'), ('LugotoMilan', '420.3\xa0km (261\xa0mi)')]
https://en.wikipedia.org/wiki/1919_Giro_d'Italia
[('MilantoTrento', '302.8\xa0km (188\xa0mi)'), ('TrentotoTrieste', '334.3\xa0km (208\xa0mi)'), ('TriestetoFerra