# Scraping & Cleaning for Question 1

## Import Statements & Functions
The import statements and functions we'll call in the main loop

In [49]:
# import statements
import requests
import re
import json
import time
from bs4 import BeautifulSoup as BS

In [50]:
# scrape wikipedia page
def scrape(url, timeout=30):
    """Download a page and return it parsed by BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout a stalled
        connection would block the scraping loop indefinitely.

    Returns
    -------
    BeautifulSoup
        The parsed response body. The HTTP status is not checked, so an
        error page is parsed and returned like any other page.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    # need header to scrape from wikipedia
    headers = {
        'User-Agent': 'SchoolProjectBot/1.0 (r1043412@student.thomasmore.be) Python-Requests/2.31.0'
    }
    page = requests.get(url, headers=headers, timeout=timeout)

    # short wait after every request so consecutive calls are spaced out
    time.sleep(3)

    return BS(page.text, "html")

In [51]:
# take information from BeautifulSoup and clean it
def extract_clean_data(soup):
    """Extract start, end and distance of every stage from a parsed page.

    The first table with class ``wikitable`` is read. Its first and last
    rows are ignored (title row and results row). From each remaining row
    with more than two ``td`` cells, the second cell is taken as the course
    text and the third as the distance text.

    A row is kept only when
    * the course text matches ``to[A-Z ]`` (rest days, transfer days and
      short in-city courses have no 'to' and are dropped), and
    * the distance text starts with at least one digit.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page; only ``find``, ``select``, ``find_all`` and
        ``get_text`` are used on it and its descendants.

    Returns
    -------
    list of dict
        One ``{'key', 'start', 'end', 'distance'}`` dict per kept row.
        ``key`` counts the kept rows from 1; ``distance`` is the leading
        run of digits of the distance text as an ``int`` (anything after
        the first non-digit, including decimals, is discarded).
        An empty list is returned when the page has no ``wikitable``.
    """
    # extract the table from the page html
    table = soup.find('table', class_ = 'wikitable')
    if table is None:
        return []

    stages = []

    # take all rows except title and results row
    for row in table.select('tr')[1:-1]:
        cells = row.find_all('td')

        # only read rows that have more than 2 elements
        if len(cells) <= 2:
            continue

        course_text = cells[1].get_text(strip=True)
        distance_text = cells[2].get_text(strip=True)

        # 'to' followed by a capital or a space separates start from end
        separator = re.search('to[A-Z ]', course_text)

        # leading digits only; also works when nothing follows the number
        leading_digits = re.match(r'\d+', distance_text)

        if separator is None or leading_digits is None:
            continue

        stages.append({
            'key': len(stages) + 1,
            'start': course_text[:separator.start()],
            # end() - 1 keeps the character that followed 'to' (start of the end town)
            'end': course_text[separator.end() - 1:],
            'distance': int(leading_digits.group()),  # distance must be int to use NumPy on it
        })

    return stages


In [52]:
# write data to json
def write_json(dict, course_year, course_type):
    """Store one year's data in ``data/<course_type>.json``.

    The file must already exist and contain a JSON object with a
    ``course_type`` key that maps to an object of years.

    Parameters
    ----------
    dict : list or dict
        JSON-serialisable data for the year (name kept for compatibility,
        although it shadows the builtin inside this function).
    course_year : int or str
        Year under which the data is stored. It is converted to ``str``
        because JSON object keys are always strings: after a reload the
        existing key is ``"1906"``, and updating with the int ``1906``
        would add a second entry that serialises to the same key.
    course_type : str
        File stem and top-level key inside the file.

    Raises
    ------
    FileNotFoundError
        If the JSON file does not exist.
    KeyError
        If the file has no ``course_type`` key.
    """
    # open the json file and update it with the python dictionary
    with open(f'data/{course_type}.json', 'r+') as file:
        # load the entire json file into memory
        file_json = json.load(file)

        # add or replace the entry of this year
        file_json[course_type][str(course_year)] = dict

        # put cursor at beginning of file and overwrite it with the dictionary in memory
        file.seek(0)
        json.dump(file_json, file, indent=4)

        # drop leftover bytes of the old content when the new document is shorter
        file.truncate()

## Main Loop
Combine the helper functions above into a single function, so we can easily feed each grand tour into it.

In [53]:
# combine the functions into one function we can use to scrape and clean the data
def scrape_and_clean(grand_tour, start_year):
    # set year to start year initially, after which it'll be updated
    year = start_year
    
    # select last part of link and json file based on which grand tour
    match grand_tour:
        case 'Tour de France':
            page_url = '_Tour_de_France'
            json_file = 'tour_de_france'
        case 'Giro d\'Italia':
            page_url = '_Giro_d\'Italia'
            json_file = 'giro_ditalia'
        case 'Vuelta a España':
            page_url = '_Vuelta_a_España'
            json_file = 'vuelta_a_espagna'

    # scrape a page every 5 years
    while year < 2025:
        # assemble the url and scrape the page
        assembled_url = 'https://en.wikipedia.org/wiki/' + str(year) + page_url
        page_html = scrape(assembled_url)

        # skip the year 2016 for the Giro as it's layout is slightly different, and writing extra code for a singular year isn't worth it
        if not ((year == 2016 and grand_tour == 'Giro d\'Italia') or (year == 2022 and grand_tour == 'Tour de France')):
            # test if the table we'll pull is correct -- the page needs to have a table with the class wikitable
            if not (page_html.find('table', class_='wikitable') == None):
                # check if the wikitable has the table headers of the collumns we want -- check if text exists and if it's correct
                if not (page_html.find('table', class_='wikitable').find('th', string=lambda text: text and 'Course' in text) == None 
                        and page_html.find('table', class_='wikitable').find('th', string=lambda text: text and 'Distance' in text) == None):
                    # when the table is correct, carry on with the main logic
                    course_dictionary = extract_clean_data(page_html)
                    write_json(course_dictionary, year, json_file)
                    
                    # confirm the scrape of that year was succesful
                    print(f'Succesfully scraped and cleaned data of year {year} for {grand_tour}.')

                    # add 5 to year to move on to the next wiki page
                    year += 5
                else:
                    # print a message of scrape wasn't sucessful
                    print(f'Correct collums not found for {grand_tour} in year {year}.')

                    # when the table isn't correct , we'll only add 1 year
                    year += 1
            else:
                # print a message of scrape wasn't sucessful
                print(f'Page not found for {grand_tour} in year {year}.')

                # when the table isn't correct , we'll only add 1 year
                year += 1
        else:
            print('Skipped tour')
            year += 1

## Execution of the main loop

In [54]:
# finally we'll execute the main loop
# each grand tour together with the first year to try
grand_tour_list = [
    ('Tour de France', 1905),
    ('Giro d\'Italia', 1909),
    ('Vuelta a España', 1935)
]

# plain loop: the calls are made for their side effects only, so there is
# no list of results to build or display
for tour, start_year in grand_tour_list:
    scrape_and_clean(tour, start_year)

Correct collums not found for Tour de France in year 1905.
Succesfully scraped and cleaned data of year 1906 for Tour de France.
Succesfully scraped and cleaned data of year 1911 for Tour de France.
Page not found for Tour de France in year 1916.
Page not found for Tour de France in year 1917.
Page not found for Tour de France in year 1918.
Succesfully scraped and cleaned data of year 1919 for Tour de France.
Succesfully scraped and cleaned data of year 1924 for Tour de France.
Succesfully scraped and cleaned data of year 1929 for Tour de France.
Succesfully scraped and cleaned data of year 1934 for Tour de France.
Succesfully scraped and cleaned data of year 1939 for Tour de France.
Page not found for Tour de France in year 1944.
Page not found for Tour de France in year 1945.
Page not found for Tour de France in year 1946.
Succesfully scraped and cleaned data of year 1947 for Tour de France.
Succesfully scraped and cleaned data of year 1952 for Tour de France.
Succesfully scraped and

[None, None, None]