## Import Needed Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import time, os
from selenium.webdriver.common.keys import Keys
from functools import reduce, partial
#libraries for data and modeling
import numpy as np
import pandas as pd

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Filesystem location of the chromedriver executable (machine-specific; adjust as needed).
driver = "/Applications/chromedriver"
# Publish the same path in the process environment under the key below.
os.environ.update({"webdriver.chrome.driver": driver})

In [3]:
#function to randomize sleep times to help spoof activity
def human_imposter(n = 60, low = 15):
    """Pause for a random duration so the scraping looks less automated.

    The pause length is drawn uniformly between ``low`` and ``n`` seconds and
    rounded to 3 decimal places before sleeping.

    Parameters
    ----------
    n : float, default 60
        One bound of the pause, in seconds (the upper bound in normal use).
    low : float, default 15
        The other bound of the pause, in seconds.

    Notes
    -----
    The two bounds are ordered before sampling because ``np.random.uniform``
    is documented as undefined when ``high < low``.  When both bounds are
    equal (e.g. ``human_imposter(15)`` with the default ``low``) the pause is
    always exactly that many seconds, i.e. it is not random.
    """
    lower, upper = sorted((low, n))
    time.sleep(round(np.random.uniform(low=lower, high=upper), 3))

In [4]:
# Launch Chrome through chromedriver.  The executable path is read back from the
# environment entry set in the setup cell instead of from the `driver` name:
# this cell rebinds `driver` to the WebDriver instance, so re-running it would
# otherwise pass a WebDriver object where a path string is expected.
driver = webdriver.Chrome(os.environ["webdriver.chrome.driver"])
driver.get('https://www.formula1.com/en/results.html/2021/races/1064/bahrain/race-result.html')
# The fixed 60 s pause gives the user time to dismiss the cookie banner by hand.
print('You have 1 minute to manually pass the cookies popup')
time.sleep(60)

WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home


## Web scraping

In [23]:
# Parse a snapshot of the page currently loaded in the browser.
soup = BeautifulSoup(driver.page_source, features='html.parser')

### Extract Records of Each Menu

In [24]:
# Labels of the result-type menu links, with runs of whitespace replaced by "_".
str_records = [
    "_".join(link.text.split())
    for link in soup.find_all('a', attrs={'data-name': 'resultType'})
]
str_records

['Race_result',
 'Fastest_laps',
 'Pit_stop_summary',
 'Starting_grid',
 'Qualifying',
 'Practice_3',
 'Practice_2',
 'Practice_1']

In [25]:
# Text of the element following each meeting link; the first link is skipped.
str_locations = [
    link.find_next().text
    for link in soup.find_all('a', attrs={'data-name': 'meetingKey'})[1:]
]
str_locations

['Bahrain',
 'Emilia Romagna',
 'Portugal',
 'Spain',
 'Monaco',
 'Azerbaijan',
 'France',
 'Styria',
 'Austria',
 'Great Britain',
 'Hungary',
 'Belgium',
 'Netherlands',
 'Italy',
 'Russia',
 'Turkey',
 'United States ',
 'Mexico',
 'Brazil',
 'Qatar',
 'Saudi Arabia',
 'Abu Dhabi']

In [26]:
# Text of the element following each year link; only the first 12 are displayed.
str_years = [
    link.find_next().text
    for link in soup.find_all('a', attrs={'data-name': 'year'})
]
str_years[:12]

['2021',
 '2020',
 '2019',
 '2018',
 '2017',
 '2016',
 '2015',
 '2014',
 '2013',
 '2012',
 '2011',
 '2010']

In [27]:
# Year menu links as live Selenium elements.  find_elements(By.XPATH, ...) is
# used because the find_elements_by_* helpers were removed in Selenium 4.3.
years = driver.find_elements(By.XPATH, '//a[@data-name="year"]')


In [28]:
# Location menu links as live Selenium elements.  find_elements(By.XPATH, ...) is
# used because the find_elements_by_* helpers were removed in Selenium 4.3.
locations = driver.find_elements(By.XPATH, '//a[@data-name="meetingKey"]')

In [29]:
# Result-type menu links as live Selenium elements.  find_elements(By.XPATH, ...)
# is used because the find_elements_by_* helpers were removed in Selenium 4.3.
records = driver.find_elements(By.XPATH, '//a[@data-name="resultType"]')


### Function to Get Data from the Table

In [30]:
def get_table_data(record):
    """Scrape the first ``<table>`` of the page currently loaded in ``driver``.

    Parameters
    ----------
    record : str
        Prefix joined with "_" onto every column name except ``'Driver'``.

    Returns
    -------
    pandas.DataFrame
        One row per table row after the header row, with cell text
        whitespace-normalised.
    """
    page = BeautifulSoup(driver.page_source, 'html.parser')
    table_rows = list(page.find('table').find_all('tr'))

    # Column names come from the first row.  Its first and last cells are
    # dropped (presumably empty padding columns -- confirm against the page).
    column_names = []
    for cell in table_rows[0].find_all('th')[1:-1]:
        label = '_'.join(cell.text.strip().split())
        # 'Driver' stays unprefixed; callers merge tables on that column.
        column_names.append(label if label == 'Driver' else record + '_' + label)

    scraped = []
    for table_row in table_rows[1:]:
        # Same trimming of the first and last cell as for the header row.
        values = (' '.join(cell.text.strip().split())
                  for cell in table_row.find_all('td')[1:-1])
        scraped.append(dict(zip(column_names, values)))

    return pd.DataFrame(scraped)

In [31]:
# Function to scrape the data behind every entry of the result-type menu
def get_records_df(location):
    """Collect all result tables for the race currently open in the browser.

    Clicks each entry of the result-type menu in turn, scrapes the table shown
    with ``get_table_data`` and outer-merges the tables on ``'Driver'``.

    Parameters
    ----------
    location : str
        Value stored in the ``'location'`` column of the result.

    Returns
    -------
    pandas.DataFrame or None
        The merged tables, or ``None`` when the page's result-type menu does
        not match the module-level ``str_records`` list.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    page_records = ["_".join(i.text.split()) for i in soup.find_all('a', {'data-name': 'resultType'})]
    if page_records != str_records:
        return None
    my_dict = {}
    for i, record in enumerate(str_records):
        # The links are looked up again on every pass.  find_elements(By.XPATH, ...)
        # replaces find_elements_by_xpath, which was removed in Selenium 4.3.
        records = driver.find_elements(By.XPATH, '//a[@data-name="resultType"]')
        records[i].click()
        human_imposter(15)  # need time for loading tables
        my_dict[record] = get_table_data(record)

    # Fold the per-record tables into one frame, joining on the driver name.
    my_reduce = partial(pd.merge, on='Driver', how='outer')
    temp = reduce(my_reduce, my_dict.values())
    temp['location'] = location
    return temp

In [32]:
# Function to scrape the data behind every entry of the location menu
def get_loc_df(year):
    """Collect the results of every race listed for the season currently shown.

    Clicks each entry of the location menu (except the first one) and gathers
    the tables returned by ``get_records_df``.

    Parameters
    ----------
    year : str
        Value stored in the ``'year'`` column of the result.

    Returns
    -------
    pandas.DataFrame
        All races stacked row-wise.  Races for which ``get_records_df``
        returns ``None`` are skipped; if every race is skipped the frame has
        no rows instead of ``pd.concat`` raising on an empty input.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # strip() removes stray whitespace around labels (e.g. 'United States ').
    str_locations = [
        i.find_next().text.strip()
        for i in soup.find_all('a', {'data-name': 'meetingKey'})[1:]
    ]
    # A list rather than a dict keyed by label, so that two races sharing a
    # label cannot overwrite each other.
    frames = []
    for i, location in enumerate(str_locations):
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
        # was removed in Selenium 4.3.
        locations = driver.find_elements(By.XPATH, '//a[@data-name="meetingKey"]')
        driver.execute_script("window.scrollTo(0, 0);")  # can't click unless you scroll up all the way
        time.sleep(3)  # allow time for scroll
        locations[i + 1].click()  # need +1 to avoid All category
        human_imposter(15)
        temp_df = get_records_df(location)
        if temp_df is None:
            continue  # iterate past races without standard records
        frames.append(temp_df)
    temp = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    temp['year'] = year
    return temp

## Merge Each Four Years Together

- We did not merge all the data at once, to avoid issues caused by the large size of the data.

In [33]:
# Scrape the seasons at positions 0-3 of the year menu and stack them into `df`.
my_dict = {}
for i, year in enumerate(str_years[0:4]):
    # The links are looked up again on every pass.  find_elements(By.XPATH, ...)
    # replaces find_elements_by_xpath, which was removed in Selenium 4.3.
    years = driver.find_elements(By.XPATH, '//a[@data-name="year"]')
    driver.execute_script("window.scrollTo(0, 0);")  # can't click unless you scroll up all the way
    time.sleep(3)  # allow time for scroll
    years[i].click()
    human_imposter(15)
    my_dict[year] = get_loc_df(year)
df = pd.concat(my_dict.values(), ignore_index=True)
df

Unnamed: 0,Race_result_Pos,Race_result_No,Driver,Race_result_Car,Race_result_Laps,Race_result_Time/Retired,Race_result_PTS,Fastest_laps_Pos,Fastest_laps_No,Fastest_laps_Car,...,Practice_2_Gap,Practice_2_Laps,Practice_1_Pos,Practice_1_No,Practice_1_Car,Practice_1_Time,Practice_1_Gap,Practice_1_Laps,location,year
0,1,44,Lewis Hamilton HAM,Mercedes,56,1:32:03.897,25,4,44,Mercedes,...,+0.235s,24,4,44,Mercedes,1:31.921,+0.527s,15,Bahrain,2021
1,1,44,Lewis Hamilton HAM,Mercedes,56,1:32:03.897,25,4,44,Mercedes,...,+0.235s,24,4,44,Mercedes,1:31.921,+0.527s,15,Bahrain,2021
2,2,33,Max Verstappen VER,Red Bull Racing Honda,56,+0.745s,18,2,33,Red Bull Racing Honda,...,,23,1,33,Red Bull Racing Honda,1:31.394,,12,Bahrain,2021
3,2,33,Max Verstappen VER,Red Bull Racing Honda,56,+0.745s,18,2,33,Red Bull Racing Honda,...,,23,1,33,Red Bull Racing Honda,1:31.394,,12,Bahrain,2021
4,3,77,Valtteri Bottas BOT,Mercedes,56,+37.383s,16,1,77,Mercedes,...,+0.371s,23,2,77,Mercedes,1:31.692,+0.298s,17,Bahrain,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2239,NC,9,Marcus Ericsson ERI,Sauber Ferrari,24,DNF,0,19,9,Sauber Ferrari,...,+2.266s,36,17,9,Sauber Ferrari,1:41.928,+3.437s,22,Abu Dhabi,2018
2240,NC,7,Kimi Räikkönen RAI,Ferrari,6,DNF,0,18,7,Ferrari,...,+0.225s,40,7,7,Ferrari,1:40.417,+1.926s,24,Abu Dhabi,2018
2241,NC,27,Nico Hulkenberg HUL,Renault,0,DNF,0,,,,...,+0.994s,36,12,27,Renault,1:41.023,+2.532s,24,Abu Dhabi,2018
2242,,,Antonio Giovinazzi GIO,,,,,,,,...,,,16,36,Sauber Ferrari,1:41.662,+3.171s,24,Abu Dhabi,2018


In [48]:
# Scrape the seasons at positions 4-7 of the year menu and stack them into `df2`.
my_dict = {}
# start=4 keeps `i` aligned with the position of the link in the full year menu.
for i, year in enumerate(str_years[4:8], start=4):
    # The links are looked up again on every pass.  find_elements(By.XPATH, ...)
    # replaces find_elements_by_xpath, which was removed in Selenium 4.3.
    years = driver.find_elements(By.XPATH, '//a[@data-name="year"]')
    driver.execute_script("window.scrollTo(0, 0);")  # can't click unless you scroll up all the way
    time.sleep(3)  # allow time for scroll
    years[i].click()
    human_imposter(15)
    my_dict[year] = get_loc_df(year)
df2 = pd.concat(my_dict.values(), ignore_index=True)
df2

Unnamed: 0,Race_result_Pos,Race_result_No,Driver,Race_result_Car,Race_result_Laps,Race_result_Time/Retired,Race_result_PTS,Fastest_laps_Pos,Fastest_laps_No,Fastest_laps_Car,...,Practice_1_Pos,Practice_1_No,Practice_1_Car,Practice_1_Time,Practice_1_Gap,Practice_1_Laps,location,year,Pit_stop_summary_Pos,Pit_stop_summary_Avg_Speed
0,1,5,Sebastian Vettel VET,Ferrari,57,1:24:11.672,25,3,5,Ferrari,...,6,5,Ferrari,1:25.464,+1.244s,10,Australia,2017,,
1,2,44,Lewis Hamilton HAM,Mercedes,57,+9.975s,18,6,44,Mercedes,...,1,44,Mercedes,1:24.220,,22,Australia,2017,,
2,3,77,Valtteri Bottas BOT,Mercedes,57,+11.250s,15,2,77,Mercedes,...,2,77,Mercedes,1:24.803,+0.583s,25,Australia,2017,,
3,4,7,Kimi Räikkönen RAI,Ferrari,57,+22.393s,12,1,7,Ferrari,...,5,7,Ferrari,1:25.372,+1.152s,16,Australia,2017,,
4,5,33,Max Verstappen VER,Red Bull Racing TAG Heuer,57,+28.827s,10,5,33,Red Bull Racing TAG Heuer,...,4,33,Red Bull Racing TAG Heuer,1:25.246,+1.026s,19,Australia,2017,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3284,NC,10,Kamui Kobayashi KOB,Caterham Renault,42,DNF,0,13,10,Caterham Renault,...,18,10,Caterham Renault,1:47.971,+4.495s,24,Abu Dhabi,2014,,
3285,NC,13,Pastor Maldonado MAL,Lotus Renault,26,DNF,0,20,13,Lotus Renault,...,15,13,Lotus Renault,1:46.711,+3.235s,31,Abu Dhabi,2014,,
3286,NC,26,Daniil Kvyat KVY,STR Renault,14,DNF,0,19,26,STR Renault,...,7,26,STR Renault,1:45.835,+2.359s,32,Abu Dhabi,2014,,
3287,,,Esteban Ocon OCO,,,,,,,,...,16,31,Lotus Renault,1:47.066,+3.590s,29,Abu Dhabi,2014,,


In [42]:
# Scrape the seasons at positions 8-11 of the year menu and stack them into `df3`.
my_dict = {}
# start=8 keeps `i` aligned with the position of the link in the full year menu.
for i, year in enumerate(str_years[8:12], start=8):
    # The links are looked up again on every pass.  find_elements(By.XPATH, ...)
    # replaces find_elements_by_xpath, which was removed in Selenium 4.3.
    years = driver.find_elements(By.XPATH, '//a[@data-name="year"]')
    driver.execute_script("window.scrollTo(0, 0);")  # can't click unless you scroll up all the way
    time.sleep(3)  # allow time for scroll
    years[i].click()
    human_imposter(15)
    my_dict[year] = get_loc_df(year)
df3 = pd.concat(my_dict.values(), ignore_index=True)
df3

Unnamed: 0,Race_result_Pos,Race_result_No,Driver,Race_result_Car,Race_result_Laps,Race_result_Time/Retired,Race_result_PTS,Fastest_laps_Pos,Fastest_laps_No,Fastest_laps_Car,...,Practice_2_Gap,Practice_2_Laps,Practice_1_Pos,Practice_1_No,Practice_1_Car,Practice_1_Time,Practice_1_Gap,Practice_1_Laps,location,year
0,1,7,Kimi Räikkönen RAI,Lotus Renault,58,1:30:03.225,25,1,7,Lotus Renault,...,+0.453s,38,6,7,Lotus Renault,1:27.877,+0.666s,17,Australia,2013
1,1,7,Kimi Räikkönen RAI,Lotus Renault,58,1:30:03.225,25,1,7,Lotus Renault,...,+0.453s,38,6,7,Lotus Renault,1:27.877,+0.666s,17,Australia,2013
2,2,3,Fernando Alonso ALO,Ferrari,58,+12.451s,18,3,3,Ferrari,...,+0.840s,35,3,3,Ferrari,1:27.547,+0.336s,16,Australia,2013
3,2,3,Fernando Alonso ALO,Ferrari,58,+12.451s,18,3,3,Ferrari,...,+0.840s,35,3,3,Ferrari,1:27.547,+0.336s,16,Australia,2013
4,2,3,Fernando Alonso ALO,Ferrari,58,+12.451s,18,3,3,Ferrari,...,+0.840s,35,3,3,Ferrari,1:27.547,+0.336s,16,Australia,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3875,21,18,Jarno Trulli TRU,Lotus Cosworth,51,+4 laps,0,18,18,Lotus Cosworth,...,+4.724s,35,20,18,Lotus Cosworth,1:48.472,+5.712s,17,Abu Dhabi,2010
3876,NC,24,Timo Glock GLO,Virgin Cosworth,43,DNF,0,22,24,Virgin Cosworth,...,+4.371s,31,19,24,Virgin Cosworth,1:48.450,+5.690s,19,Abu Dhabi,2010
3877,NC,3,Michael Schumacher MSC,Mercedes,0,DNF,0,,,,...,+1.358s,29,7,3,Mercedes,1:44.199,+1.439s,19,Abu Dhabi,2010
3878,NC,15,Vitantonio Liuzzi LIU,Force India Mercedes,0,DNF,0,,,,...,+1.315s,31,15,15,Force India Mercedes,1:45.585,+2.825s,14,Abu Dhabi,2010


## Save all dataframes in csv files for later use

In [45]:
# Write the first batch (`df`) to disk; the row index is written as the first column.
df.to_csv(path_or_buf='2018_2021.csv')

In [46]:
# Write the third batch (`df3`) to disk; the row index is written as the first column.
df3.to_csv(path_or_buf='2010_2013.csv')

In [49]:
# Write the second batch (`df2`) to disk; the row index is written as the first column.
df2.to_csv(path_or_buf='2014_2017.csv')

In [50]:
# quit() ends the whole WebDriver session (all windows and the chromedriver
# process); close() would only close the current window.
driver.quit()