In [126]:
from urllib.request import urlopen
import numpy as np
import pandas as pd
import csv
import torch, torch.nn as nn, torch.nn.functional as F, torch.optim as optim
import math
import os
from tqdm.auto import tqdm
from io import BytesIO

# Data Preparation

Create a CSV file of information for one race, each row contains info for each lap
- 0th lap will be the driver's best qualifying lap

In [53]:
Sessions = pd.read_csv("Sessions.csv")
races = Sessions.query("session_type == 'Race' and session_name != 'Sprint'")
qualis = Sessions.query("session_type == 'Qualifying' and session_name == 'Qualifying'")



For each Track:
- Need to find all data for qualifying and race (Every lap they put in)
- Create a dataframe of every driver and all the laps they did including whether they pit, retired, etc
- Each row needs to have Lap time

In [132]:
QualiAndRaceSessions= Sessions[Sessions['session_name'].isin(['Qualifying', 'Race'])]
print(QualiAndRaceSessions)    

     circuit_key circuit_short_name country_code  country_key   country_name  \
6             63             Sakhir          BRN           36        Bahrain   
7             63             Sakhir          BRN           36        Bahrain   
11           149             Jeddah          KSA          153   Saudi Arabia   
12           149             Jeddah          KSA          153   Saudi Arabia   
16            10          Melbourne          AUS            5      Australia   
..           ...                ...          ...          ...            ...   
161           15          Catalunya          ESP            1          Spain   
165           19          Spielberg          AUT           17        Austria   
166           19          Spielberg          AUT           17        Austria   
170            2        Silverstone          GBR            2  Great Britain   
171            2        Silverstone          GBR            2  Great Britain   

                      date_end         

In [135]:
# Loop over each track per year
for i in tqdm(range(0,len(QualiAndRaceSessions),2)):
    location = QualiAndRaceSessions.iloc[i]['location']
    circuitkey = QualiAndRaceSessions.iloc[i]['circuit_key']
    QualifyingKey = QualiAndRaceSessions.iloc[i]['session_key']
    RaceKey = QualiAndRaceSessions.iloc[i+1]['session_key']
    
    print(location, QualifyingKey, RaceKey)
    # Gather all data pertaining to qualifying from session_key for each driver
    
    res = urlopen("https://api.openf1.org/v1/laps?csv=true&session_key=" + str(QualifyingKey))
    QualiLaps = pd.read_csv(BytesIO(res.read()))

    res = urlopen("https://api.openf1.org/v1/pit?csv=true&session_key="+str(QualifyingKey))
    QualiPits = pd.read_csv(BytesIO(res.read()))

    res = urlopen("https://api.openf1.org/v1/race_control?csv=true&session_key="+str(QualifyingKey))
    QualiRaceControl = pd.read_csv(BytesIO(res.read()))

    res = urlopen("https://api.openf1.org/v1/stints?csv=true&session_key="+str(QualifyingKey))
    QualiStints = pd.read_csv(BytesIO(res.read()))
    # Create a DataFrame with each row having for each driver: Driver_ID, position, inpit, status, laptime
    # Each Row needs to have Driver_ID, Driver Name, Position(Maybe?), whether they pitted that lap, status, laptime, tire age, compound, If there was a flag,
        
    # Gather all data pertaining to race from session_key for each driver
    res = urlopen("https://api.openf1.org/v1/laps?csv=true&session_key=" + str(RaceKey))
    RaceLaps = pd.read_csv(BytesIO(res.read()))

    res = urlopen("https://api.openf1.org/v1/pit?csv=true&session_key="+str(RaceKey))
    RacePits = pd.read_csv(BytesIO(res.read()))

    res = urlopen("https://api.openf1.org/v1/race_control?csv=true&session_key="+str(RaceKey))
    RaceControl = pd.read_csv(BytesIO(res.read()))

    res = urlopen("https://api.openf1.org/v1/stints?csv=true&session_key="+str(RaceKey))
    RaceStints = pd.read_csv(BytesIO(res.read()))
    break # Break so not spamming api calls while developing

  0%|          | 0/34 [00:00<?, ?it/s]

Sakhir 7768 7953


In [120]:
Piastri = RaceLaps.loc[RaceLaps['driver_number']==81]
print(Piastri.loc[Piastri['lap_number']==14])

print(RaceLaps.loc[RaceLaps['lap_number']==14])

Empty DataFrame
Columns: [date_start, driver_number, duration_sector_1, duration_sector_2, duration_sector_3, i1_speed, i2_speed, is_pit_out_lap, lap_duration, lap_number, meeting_key, segments_sector_1, segments_sector_2, segments_sector_3, session_key, st_speed]
Index: []
                           date_start  driver_number  duration_sector_1  \
260  2023-03-05 15:24:56.258000+00:00              1             31.427   
261  2023-03-05 15:25:58.335000+00:00              2             31.156   
262  2023-03-05 15:26:02.805000+00:00              4             31.728   
263  2023-03-05 15:25:57.226000+00:00             10             31.518   
264  2023-03-05 15:25:07.148000+00:00             11             31.411   
265  2023-03-05 15:25:22.804000+00:00             14             31.869   
266  2023-03-05 15:25:08.539000+00:00             16             52.070   
267  2023-03-05 15:25:24.883000+00:00             18             31.565   
268  2023-03-05 15:25:45.757000+00:00             

Need to get best Qualifying lap time for each driver (This will be their 0th lap time)

Then need to loop over every lap done in the race and add it to the dataframe in a similar way

In [141]:
# Using SilverStone race data since not all races have full data
QualifyingKey = 9554

res = urlopen("https://api.openf1.org/v1/laps?csv=true&session_key=" + str(QualifyingKey))
QualiLaps = pd.read_csv(BytesIO(res.read()))

res = urlopen("https://api.openf1.org/v1/pit?csv=true&session_key="+str(QualifyingKey))
QualiPits = pd.read_csv(BytesIO(res.read()))

res = urlopen("https://api.openf1.org/v1/race_control?csv=true&session_key="+str(QualifyingKey))
QualiRaceControl = pd.read_csv(BytesIO(res.read()))

res = urlopen("https://api.openf1.org/v1/stints?csv=true&session_key="+str(QualifyingKey))
QualiStints = pd.read_csv(BytesIO(res.read()))

RaceKey = 9558
res = urlopen("https://api.openf1.org/v1/laps?csv=true&session_key=" + str(RaceKey))
RaceLaps = pd.read_csv(BytesIO(res.read()))

res = urlopen("https://api.openf1.org/v1/pit?csv=true&session_key="+str(RaceKey))
RacePits = pd.read_csv(BytesIO(res.read()))

res = urlopen("https://api.openf1.org/v1/race_control?csv=true&session_key="+str(RaceKey))
RaceControl = pd.read_csv(BytesIO(res.read()))

res = urlopen("https://api.openf1.org/v1/stints?csv=true&session_key="+str(RaceKey))
RaceStints = pd.read_csv(BytesIO(res.read()))

In [140]:
print(RacePits)

                                date  driver_number  lap_number  meeting_key  \
0   2024-07-07 14:22:36.871000+00:00             24          12         1240   
1   2024-07-07 14:33:20.548000+00:00             16          19         1240   
2   2024-07-07 14:33:28.571000+00:00             11          19         1240   
3   2024-07-07 14:33:50.761000+00:00             31          19         1240   
4   2024-07-07 14:34:16.841000+00:00             24          19         1240   
5   2024-07-07 14:37:31.335000+00:00             31          21         1240   
6   2024-07-07 14:44:10.113000+00:00              1          26         1240   
7   2024-07-07 14:44:12.103000+00:00             55          26         1240   
8   2024-07-07 14:44:29.373000+00:00             27          26         1240   
9   2024-07-07 14:44:31.110000+00:00             18          26         1240   
10  2024-07-07 14:44:53.151000+00:00              3          26         1240   
11  2024-07-07 14:45:13.048000+00:00    

In [156]:
columns = ['CircuitID']
for k in range(20):
    columns.extend([
        f'driver_ID_{k+1}',
        f'position{k+1}',
        f'inPit{k+1}',
        f'status{k+1}',
        f'laptime{k+1}'
    ])
QualiLaps_sorted = QualiLaps.sort_values(by='driver_number', ascending=False)

R_NumLaps = RaceLaps.sort_values(by='lap_number',ascending = False).iloc[0]['lap_number']


laplist = []
for lap_num in range(0,R_NumLaps+1):
    driver_ids = []
    laps = []
    pits = []
    for driver_number in QualiLaps_sorted['driver_number'].unique():
        driver_ids.append(driver_number)
        if(lap_num == 0):
            q_driver_laps = QualiLaps_sorted.loc[QualiLaps_sorted['driver_number'] == driver_number]
            if not q_driver_laps.empty:
                fastest_lap = q_driver_laps.sort_values(by='lap_duration', ascending=True).iloc[0]['lap_duration']
                laps.append(fastest_lap)
                pits.append(False)
            else:
                print("No laps found")
        else:
            r_driver_laps =RaceLaps.loc[RaceLaps['driver_number'] == driver_number]
            if not r_driver_laps.empty:
                lapdata = r_driver_laps.loc[r_driver_laps['lap_number']==lap_num]
                if not lapdata.empty:
                    laptime = lapdata.iloc[0]['lap_duration']
                    pitted = lapdata.iloc[0]['is_pit_out_lap']
                    if not laptime:
                        print("No laps found")
                    if(math.isnan(laptime)):
                        laps.append(0)
                        pits.append(False)
                    else:
                        laps.append(laptime)
                        pits.append(pitted)
                    
                else:
                    laps.append(-1)#Driver retired
                    pits.append(False)
            else:
                laps.append(-1)
                pits.append(False)
        
        
    row = {}
    row['CircuitID'] = circuitkey
    for j in range(len(driver_ids)):
        row[f'driver_ID_{j+1}'] = driver_ids[j]
        row[f'position{j+1}'] = 1
        row[f'inPit{j+1}'] = pits[j]
        row[f'status{j+1}'] = 1
        row[f'laptime{j+1}'] = laps[j]
        
    laplist.append(row)
    


df = pd.DataFrame(laplist)
print(df)

    CircuitID  driver_ID_1  position1  inPit1  status1  laptime1  driver_ID_2  \
0          63           81          1   False        1    86.237           77   
1          63           81          1   False        1     0.000           77   
2          63           81          1   False        1    92.361           77   
3          63           81          1   False        1    91.927           77   
4          63           81          1   False        1    91.919           77   
5          63           81          1   False        1    91.832           77   
6          63           81          1   False        1    92.062           77   
7          63           81          1   False        1    91.925           77   
8          63           81          1   False        1    91.994           77   
9          63           81          1   False        1    92.000           77   
10         63           81          1   False        1    91.811           77   
11         63           81  

In [20]:
QualiLaps_sorted = QualiLaps.sort_values(by='driver_number', ascending=False)
# print(QualiLaps_sorted.loc[QualiLaps_sorted['driver_number']==81].sort_values(by='lap_duration', ascending=True).iloc[0])
for driver_number in QualiLaps_sorted['driver_number'].unique():
    print(driver_number, QualiLaps_sorted.loc[QualiLaps_sorted['driver_number']==driver_number].sort_values(by='lap_duration', ascending=True).iloc[0]['lap_duration'])

81 92.101
77 91.443
63 90.34
55 90.154
44 90.384
31 90.914
27 90.809
24 91.473
23 91.461
22 91.4
21 92.121
20 91.892
18 90.836
16 90.0
14 90.336
11 89.846
10 91.818
4 91.381
2 91.652
1 89.708


In [38]:
DriverQualiStints = {}

# 1) Convert QualiStints to a Map of each driver number to a query on the stints array AND Convert QualiLaps to a numpy array for efficiency
QualiLapsdropped = QualiLaps.drop(columns=['date_start', 'segments_sector_1','segments_sector_2','segments_sector_3'])
# Columns: driver_number, duration_sector_1, duration_sector_2, duration_sector_3, i1_speed, i2_speed, is_pit_out_lap, lap_duration, lap_number, meeting_key, session_key, st_speed
QualiLapsdropped = QualiLapsdropped.to_numpy()

print(QualiStints)

    
# 2) Loop over every lap done by each driver during qualifying
#   2a) Check which stint the lap is in
#   2b) Add to row: Tire_compound (1: soft, 2: medium, 3: hard, 4: unknown, 5: slick, 6: superslick), tire_age_at_start+(lap_num-lap_start)
    
# print(QualiStints.query("driver_number==44"))

# print(QualiLapsCorr)
# QualiLapsCorr.corr()

   compound  driver_number  lap_end  lap_start  meeting_key  session_key  \
0      SOFT              1        1          1         1141         7768   
1      SOFT              4        1          1         1141         7768   
2      SOFT             11        1          1         1141         7768   
3      SOFT             81        1          1         1141         7768   
4      SOFT             27        2          1         1141         7768   
..      ...            ...      ...        ...          ...          ...   
84     SOFT             63       15         13         1141         7768   
85     SOFT             16       17         15         1141         7768   
86     SOFT             27       17         15         1141         7768   
87  UNKNOWN             55       17         16         1141         7768   
88     SOFT             18       18         16         1141         7768   

    stint_number  tyre_age_at_start  
0              1                  0  
1          

Make a Dataset:

- Helper functions for getting information from each id

DataSet Class:

- <strike> Initializes with a directory containing race data files organized by year </strike>
- <strike> 3 methods for setting current year, round and next round to automatically load appropriate file </strike>
- method for processing race data for a given index, transforming into tensors 
    - Handles NaN values by replacing them with 0 and scales features like driver IDs and lap times
    - prepares sequences of lap times up to current lap for model input
    - extracts and scales relevant features from dataset to create input-output pairs for model training
- <strike> method for returning length of dataset, accounts for fact that last lap is a label </strike>


In [None]:
class RaceDataSet(torch.utils.data.Dataset):
    
    def __init__(self, dir):
        self.dir = dir
        self.year = 2023
        self.round = 1
        
        self.currentyear = os.listdir(self.dir+f'{self.year}/')
        if(self.round < len(self.currentyear)):
            self.currentrace = pd.read_csv(self.dir+f'{self.year}/{self.currentyear[self.round-1]}.csv')
        else:
            self.currentrace = pd.read_csv(self.dir+f'{self.year}/{self.currentyear[-1]}.csv')
            
        
    def set_year(self, year):
        self.year = year
        self.currentyear = os.listdir(self.dir+f'{self.year}/')
    
    def set_round(self, round):
        self.round = round
        if(self.year==2025):
            return
        if(self.round < len(self.currentyear)):
            self.currentrace = pd.read_csv(self.dir+f'{self.year}/{self.currentyear[self.round-1]}.csv')
        else:
            self.currentrace = pd.read_csv(self.dir+f'{self.year}/{self.currentyear[-1]}.csv')
            
        
    def nextround(self):
        self.round += 1
        if(self.round <len(self.currentyear)):
            self.set_round(self.round)
        else:
            self.round = 1
            self.set_year(self.year+1)
            self.set_round(self.round)

    def __len__(self):
        return len(self.currentrace)-1
    
    
    

Helper functions for displaying input/output tensors in english
- converts tensor into a chart which shows: driver name, position, inpit, laptime and status

Creating the LSTM Model

In [None]:
class RacePredictor(nn.Module):
    def __init__(self, input_size, output_size, lstm_hids, lstm_layers, dropout):
        super(RacePredictor, self).__init__()
        
        self.input_size = input_size
        self.lstm_hids = lstm_hids
        self.lstm_layers = lstm_layers
        
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=lstm_hids, num_layers=lstm_layers, dropout=dropout, batch_first=True)
        
        self.fc = nn.Linear(in_features=lstm_hids, out_features=output_size)
        
        nn.init.xavier_uniform_(self.fc.weight.data)
        
        for name,params in self.named_parameters():
            if name[:6] == 'weight':
                nn.init.xavier_uniform_(params)
            elif name[:4] == 'bias':
                nn.init.constant_(params, 0.0)
                
    def forward(self, ins, prev_states = None):
        lstm_outs, next_states = self.lstm(ins, prev_states)
        outs = self.fc(lstm_outs)
        return outs, next_states
    
    def zero_states(self):
        hidden_state = torch.zeros(self.lstm_layers,1,self.lstm_hids)
        cell_state = torch.zeros(self.lstm_layers,1,self.lstm_hids)
        return(hidden_state, cell_state)

Helper functions for training the LSTM model
- training method, testing method and method to do both