In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
import requests

In [2]:
# Function to calculate points for each driver prior to the race

def points_calc(df, team, points):
    df['lookup1'] = df.season.astype(str) + df[team] + df['round'].astype(str)
    df['lookup2'] = df.season.astype(str) + df[team] + (df['round']-1).astype(str)
    new_df = df.merge(df[['lookup1', points]], how = 'left', left_on='lookup2',right_on='lookup1')
    new_df.drop(['lookup1_x', 'lookup2', 'lookup1_y'], axis = 1, inplace = True)
    new_df.rename(columns = {points+'_x': points+'_after_race', points+'_y': points}, inplace = True)
    new_df[points].fillna(0, inplace = True)
    return new_df

## Races DF

Races DF:
I used 2 sources for the data: the Ergast F1 data repository and the official Formula 1 website. I used 2 data sources for greater accuracy.
THis dataframe contains information about all the championships and races from 1950 to 2022, including their location and link to wikipedia page.

In [3]:
races = {'season': [],
        'round': [],
        'circuit_id': [],
        'lat': [],
        'long': [],
        'country': [],
        'date': [],
        'url': []}
for year in list(range(1950,2023)):
    
    url = 'https://ergast.com/api/f1/{}.json'
    r = requests.get(url.format(year))
    json = r.json()

    for item in json['MRData']['RaceTable']['Races']:
        try:
            races['season'].append(int(item['season']))
        except:
            races['season'].append(None)

        try:
            races['round'].append(int(item['round']))
        except:
            races['round'].append(None)

        try:
            races['circuit_id'].append(item['Circuit']['circuitId'])
        except:
            races['circuit_id'].append(None)

        try:
            races['lat'].append(float(item['Circuit']['Location']['lat']))
        except:
            races['lat'].append(None)

        try:
            races['long'].append(float(item['Circuit']['Location']['long']))
        except:
            races['long'].append(None)

        try:
            races['country'].append(item['Circuit']['Location']['country'])
        except:
            races['country'].append(None)

        try:
            races['date'].append(item['date'])
        except:
            races['date'].append(None)

        try:
            races['url'].append(item['url'])
        except:
            races['url'].append(None)
        
races = pd.DataFrame(races)
print(races.shape)

(1079, 8)


In [4]:
races.head()

Unnamed: 0,season,round,circuit_id,lat,long,country,date,url
0,1950,1,silverstone,52.0786,-1.01694,UK,1950-05-13,http://en.wikipedia.org/wiki/1950_British_Gran...
1,1950,2,monaco,43.7347,7.42056,Monaco,1950-05-21,http://en.wikipedia.org/wiki/1950_Monaco_Grand...
2,1950,3,indianapolis,39.795,-86.2347,USA,1950-05-30,http://en.wikipedia.org/wiki/1950_Indianapolis...
3,1950,4,bremgarten,46.9589,7.40194,Switzerland,1950-06-04,http://en.wikipedia.org/wiki/1950_Swiss_Grand_...
4,1950,5,spa,50.4372,5.97139,Belgium,1950-06-18,http://en.wikipedia.org/wiki/1950_Belgian_Gran...


In [5]:
races.tail()

Unnamed: 0,season,round,circuit_id,lat,long,country,date,url
1074,2022,18,suzuka,34.8431,136.541,Japan,2022-10-09,http://en.wikipedia.org/wiki/2022_Japanese_Gra...
1075,2022,19,americas,30.1328,-97.6411,USA,2022-10-23,http://en.wikipedia.org/wiki/2022_United_State...
1076,2022,20,rodriguez,19.4042,-99.0907,Mexico,2022-10-30,http://en.wikipedia.org/wiki/2022_Mexican_Gran...
1077,2022,21,interlagos,-23.7036,-46.6997,Brazil,2022-11-13,http://en.wikipedia.org/wiki/2022_Brazilian_Gr...
1078,2022,22,yas_marina,24.4672,54.6031,UAE,2022-11-20,http://en.wikipedia.org/wiki/2022_Abu_Dhabi_Gr...


In [6]:
races.to_csv('races.csv', index = False)

## Rounds
Counting number of rounds in each season

In [7]:
race = pd.read_csv('races.csv')

In [16]:
rounds = []
for year in np.array(race.season.unique()):
    rounds.append([year,list(race[race.season == year]['round'])])
    
rounds[:5]

[[1950, [1, 2, 3, 4, 5, 6, 7]],
 [1951, [1, 2, 3, 4, 5, 6, 7, 8]],
 [1952, [1, 2, 3, 4, 5, 6, 7, 8]],
 [1953, [1, 2, 3, 4, 5, 6, 7, 8, 9]],
 [1954, [1, 2, 3, 4, 5, 6, 7, 8, 9]]]

## Results DF

For the second dataframe I iterated through each year and each round of my races file to query the Ergast API and get information about all the drivers’ results. I included features such as grid and finishing position of each driver, their teams, and other less relevant variables such as date of birth, nationality and finishing status, which I will explore later to check whether there could be a correlation between the age of the drivers and their performance, if racing in their home country could have any psychological impact, or if some drivers are more prone to crash than others.

In [17]:
results = {'season': [],
          'round':[],
           'circuit_id':[],
          'driver': [],
           'date_of_birth': [],
           'nationality': [],
          'constructor': [],
          'grid': [],
          'time': [],
          'status': [],
          'points': [],
          'podium': [],
          'url': []}

for n in list(range(len(rounds))):
    for i in rounds[n][1]:
    
        url = 'http://ergast.com/api/f1/{}/{}/results.json'
        r = requests.get(url.format(rounds[n][0], i))
        json = r.json()

        for item in json['MRData']['RaceTable']['Races'][0]['Results']:
            try:
                results['season'].append(int(json['MRData']['RaceTable']['Races'][0]['season']))
            except:
                results['season'].append(None)

            try:
                results['round'].append(int(json['MRData']['RaceTable']['Races'][0]['round']))
            except:
                results['round'].append(None)

            try:
                results['circuit_id'].append(json['MRData']['RaceTable']['Races'][0]['Circuit']['circuitId'])
            except:
                results['circuit_id'].append(None)

            try:
                results['driver'].append(item['Driver']['driverId'])
            except:
                results['driver'].append(None)
            
            try:
                results['date_of_birth'].append(item['Driver']['dateOfBirth'])
            except:
                results['date_of_birth'].append(None)
                
            try:
                results['nationality'].append(item['Driver']['nationality'])
            except:
                results['nationality'].append(None)

            try:
                results['constructor'].append(item['Constructor']['constructorId'])
            except:
                results['constructor'].append(None)

            try:
                results['grid'].append(int(item['grid']))
            except:
                results['grid'].append(None)

            try:
                results['time'].append(int(item['Time']['millis']))
            except:
                results['time'].append(None)

            try:
                results['status'].append(item['status'])
            except:
                results['status'].append(None)

            try:
                results['points'].append(int(item['points']))
            except:
                results['points'].append(None)

            try:
                results['podium'].append(int(item['position']))
            except:
                results['podium'].append(None)

            try:
                results['url'].append(json['MRData']['RaceTable']['Races'][0]['url'])
            except:
                results['url'].append(None)

results = pd.DataFrame(results)
print(results.shape)

(25387, 13)


In [18]:
results.head()

Unnamed: 0,season,round,circuit_id,driver,date_of_birth,nationality,constructor,grid,time,status,points,podium,url
0,1950,1,silverstone,farina,1906-10-30,Italian,alfa,1,8003600.0,Finished,9.0,1,http://en.wikipedia.org/wiki/1950_British_Gran...
1,1950,1,silverstone,fagioli,1898-06-09,Italian,alfa,2,8006200.0,Finished,6.0,2,http://en.wikipedia.org/wiki/1950_British_Gran...
2,1950,1,silverstone,reg_parnell,1911-07-02,British,alfa,4,8055600.0,Finished,4.0,3,http://en.wikipedia.org/wiki/1950_British_Gran...
3,1950,1,silverstone,cabantous,1904-10-08,French,lago,6,,+2 Laps,3.0,4,http://en.wikipedia.org/wiki/1950_British_Gran...
4,1950,1,silverstone,rosier,1905-11-05,French,lago,9,,+2 Laps,2.0,5,http://en.wikipedia.org/wiki/1950_British_Gran...


In [19]:
results.tail()

Unnamed: 0,season,round,circuit_id,driver,date_of_birth,nationality,constructor,grid,time,status,points,podium,url
25382,2022,22,yas_marina,mick_schumacher,1999-03-22,German,haas,12,,+1 Lap,0.0,16,http://en.wikipedia.org/wiki/2022_Abu_Dhabi_Gr...
25383,2022,22,yas_marina,kevin_magnussen,1992-10-05,Danish,haas,16,,+1 Lap,0.0,17,http://en.wikipedia.org/wiki/2022_Abu_Dhabi_Gr...
25384,2022,22,yas_marina,hamilton,1985-01-07,British,mercedes,5,,Gearbox,0.0,18,http://en.wikipedia.org/wiki/2022_Abu_Dhabi_Gr...
25385,2022,22,yas_marina,latifi,1995-06-29,Canadian,williams,20,,Collision damage,0.0,19,http://en.wikipedia.org/wiki/2022_Abu_Dhabi_Gr...
25386,2022,22,yas_marina,alonso,1981-07-29,Spanish,alpine,10,,Water leak,0.0,20,http://en.wikipedia.org/wiki/2022_Abu_Dhabi_Gr...


In [20]:
results.to_csv('results.csv', index = False)

## Driver Standings DF

Points are awarded during the Championship based on where drivers and teams finish the race. Only the first 10 drivers finishing are awarded points, with the winner receiving 25 points. The Ergast API provides the number of points, wins and the standing position of each driver and team throughout the Championship. Because the points are awarded after the race, I had to create a lookup function to shift the points from previous races within the same Championship.

In [30]:
driver_standings = {'season': [],
                    'round':[],
                    'driver': [],
                    'driver_points': [],
                    'driver_wins': [],
                   'driver_standings_pos': []}

for n in list(range(len(rounds))):
    for i in rounds[n][1]:
    
        url = 'https://ergast.com/api/f1/{}/{}/driverStandings.json'
        r = requests.get(url.format(rounds[n][0], i),verify=False)
        json = r.json()

        for item in json['MRData']['StandingsTable']['StandingsLists'][0]['DriverStandings']:
            try:
                driver_standings['season'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['season']))
            except:
                driver_standings['season'].append(None)

            try:
                driver_standings['round'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['round']))
            except:
                driver_standings['round'].append(None)
                                         
            try:
                driver_standings['driver'].append(item['Driver']['driverId'])
            except:
                driver_standings['driver'].append(None)
            
            try:
                driver_standings['driver_points'].append(int(item['points']))
            except:
                driver_standings['driver_points'].append(None)
            
            try:
                driver_standings['driver_wins'].append(int(item['wins']))
            except:
                driver_standings['driver_wins'].append(None)
                
            try:
                driver_standings['driver_standings_pos'].append(int(item['position']))
            except:
                driver_standings['driver_standings_pos'].append(None)
            
driver_standings = pd.DataFrame(driver_standings)
print(driver_standings.shape)

















































































(27580, 6)


In [31]:
driver_standings.tail()

Unnamed: 0,season,round,driver,driver_points,driver_wins,driver_standings_pos
27575,2022,22,zhou,6.0,0,18
27576,2022,22,albon,4.0,0,19
27577,2022,22,latifi,2.0,0,20
27578,2022,22,de_vries,2.0,0,21
27579,2022,22,hulkenberg,0.0,0,22


In [32]:
driver_standings = points_calc(driver_standings, 'driver', 'driver_points')

In [33]:
driver_standings = points_calc(driver_standings, 'driver', 'driver_wins')

In [34]:
driver_standings = points_calc(driver_standings, 'driver', 'driver_standings_pos')

In [35]:
driver_standings.head()

Unnamed: 0,season,round,driver,driver_points_after_race,driver_wins_after_race,driver_standings_pos_after_race,driver_points,driver_wins,driver_standings_pos
0,1950,1,farina,9.0,1,1,0.0,0.0,0.0
1,1950,1,fagioli,6.0,0,2,0.0,0.0,0.0
2,1950,1,reg_parnell,4.0,0,3,0.0,0.0,0.0
3,1950,1,cabantous,3.0,0,4,0.0,0.0,0.0
4,1950,1,rosier,2.0,0,5,0.0,0.0,0.0


In [36]:
driver_standings.tail()

Unnamed: 0,season,round,driver,driver_points_after_race,driver_wins_after_race,driver_standings_pos_after_race,driver_points,driver_wins,driver_standings_pos
27575,2022,22,zhou,6.0,0,18,6.0,0.0,18.0
27576,2022,22,albon,4.0,0,19,4.0,0.0,19.0
27577,2022,22,latifi,2.0,0,20,2.0,0.0,20.0
27578,2022,22,de_vries,2.0,0,21,2.0,0.0,21.0
27579,2022,22,hulkenberg,0.0,0,22,0.0,0.0,22.0


In [37]:
driver_standings.to_csv('driver_standings.csv', index = False)

## Constructor standings

The Constructors Championship was awarded for the first time in 1958 so there is no data prior to that year. The data mining process is the same as the driver standings’, eventually applying the same lookup function to get the data before the race.

In [38]:
constructor_rounds = rounds[8:]

constructor_standings = {'season': [],
                    'round':[],
                    'constructor': [],
                    'constructor_points': [],
                    'constructor_wins': [],
                   'constructor_standings_pos': []}

for n in list(range(len(constructor_rounds))):
    for i in constructor_rounds[n][1]:
    
        url = 'https://ergast.com/api/f1/{}/{}/constructorStandings.json'
        r = requests.get(url.format(constructor_rounds[n][0], i),verify=False)
        json = r.json()

        for item in json['MRData']['StandingsTable']['StandingsLists'][0]['ConstructorStandings']:
            try:
                constructor_standings['season'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['season']))
            except:
                constructor_standings['season'].append(None)

            try:
                constructor_standings['round'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['round']))
            except:
                constructor_standings['round'].append(None)
                                         
            try:
                constructor_standings['constructor'].append(item['Constructor']['constructorId'])
            except:
                constructor_standings['constructor'].append(None)
            
            try:
                constructor_standings['constructor_points'].append(int(item['points']))
            except:
                constructor_standings['constructor_points'].append(None)
            
            try:
                constructor_standings['constructor_wins'].append(int(item['wins']))
            except:
                constructor_standings['constructor_wins'].append(None)
                
            try:
                constructor_standings['constructor_standings_pos'].append(int(item['position']))
            except:
                constructor_standings['constructor_standings_pos'].append(None)
            
constructor_standings = pd.DataFrame(constructor_standings)
print(constructor_standings.shape)













































































(12931, 6)




In [39]:
constructor_standings = points_calc(constructor_standings, 'constructor', 'constructor_points')

In [40]:
constructor_standings = points_calc(constructor_standings, 'constructor', 'constructor_wins')

In [41]:
constructor_standings = points_calc(constructor_standings, 'constructor', 'constructor_standings_pos')

In [42]:
constructor_standings.tail()

Unnamed: 0,season,round,constructor,constructor_points_after_race,constructor_wins_after_race,constructor_standings_pos_after_race,constructor_points,constructor_wins,constructor_standings_pos
12926,2022,22,alfa,55.0,0,6,55.0,0.0,6.0
12927,2022,22,aston_martin,55.0,0,7,50.0,0.0,7.0
12928,2022,22,haas,37.0,0,8,37.0,0.0,8.0
12929,2022,22,alphatauri,35.0,0,9,35.0,0.0,9.0
12930,2022,22,williams,8.0,0,10,8.0,0.0,10.0


In [43]:
constructor_standings.to_csv('constructor_standings.csv', index = False)