# Predicting F1 Qualification and Race Outcomes Using Practice Session Data

# Introduction
Formula 1 is a sport where milliseconds matter, and predictive insights can be the difference between victory and defeat. This project leverages data from Free Practice sessions (FP1, FP2, and FP3) to predict the outcome of race day—specifically, the eventual race winner. From a motorsport strategy perspective, we analyze key performance indicators such as lap times, tire compound choices, stint durations, and tire degradation rates to assess how teams and drivers prepare for qualifying and race simulations.

On the data science front, we build a structured pipeline using the FastF1 library to extract, process, and engineer features from historical Grand Prix data. Machine learning models—including classification and regression techniques—are employed to correlate practice performance with final race outcomes. By combining domain-specific race knowledge with predictive analytics, this project aims to anticipate on-track results before the lights go out.

### Importing Libraries

In [1]:
import fastf1
from fastf1.ergast import Ergast
from fastf1 import plotting
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.decomposition import PCA
import shap
import streamlit as st
import warnings

# Suppress every Python warning for the rest of the session (applies globally,
# so deprecation notices from the libraries above will not be shown either).
warnings.filterwarnings('ignore')
# Store FastF1's cache in a local directory next to the notebook.
fastf1.Cache.enable_cache('./fastf1_cache')

# Load Data using legacy Ergast fallback to avoid SSL verification issues
# import urllib3
# urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# import fastf1.ergast.interface as ergast_interface

  from .autonotebook import tqdm as notebook_tqdm


## Load and Extract Data

#### Extracting Practice Data

In [59]:
# Extracting Practice Session Stats
def get_practice_data(seasons):
    """Collect per-driver practice lap statistics for every event of the given seasons.

    For each event, all laps with a recorded lap time from FP1, FP2 and FP3
    are pooled per driver; the fastest and the mean lap time (in seconds) of
    that pool are reported.

    Args:
        seasons: Iterable of season years passed to ``fastf1.get_event_schedule``.

    Returns:
        pandas.DataFrame with one row per (season, event, driver) and the
        columns ``Season``, ``Race``, ``DriverID``, ``FastestPracticeLap`` and
        ``AvgPracticeLap``. The columns are present even when no session
        could be loaded. Sessions that fail to load are reported via
        ``print`` and skipped.
    """
    sessions = ('FP1', 'FP2', 'FP3')
    # Testing events appear under both names in the collected data (the
    # cleaning step later in this notebook has to filter both), so skip
    # both here instead of letting testing laps leak into the output.
    testing_events = ('Pre-Season Test', 'Pre-Season Testing')
    columns = ['Season', 'Race', 'DriverID', 'FastestPracticeLap', 'AvgPracticeLap']

    data = []
    for season in seasons:
        print(f"Season: {season}")
        schedule = fastf1.get_event_schedule(season)
        for _, row in schedule.iterrows():
            race = row['EventName']
            if race in testing_events:
                continue

            # Keyed by the driver identifier used in ``session.drivers``;
            # lap times accumulate across all three practice sessions.
            driver_data = defaultdict(lambda: {
                'LapTimes': [],
                'DriverID': None
            })
            for session in sessions:
                try:
                    practice_sess = fastf1.get_session(season, race, session)
                    practice_sess.load()

                    for driver in practice_sess.drivers:
                        driver_laps = practice_sess.laps.pick_driver(driver)
                        driver_laps = driver_laps[driver_laps['LapTime'].notna()]

                        if driver_laps.empty:
                            continue
                        lap_times = driver_laps['LapTime'].dt.total_seconds().tolist()

                        driver_data[driver]['LapTimes'].extend(lap_times)
                        driver_data[driver]['DriverID'] = practice_sess.get_driver(driver)['DriverId']
                except Exception as e:
                    # Best effort: a missing or unloadable session (e.g. sprint
                    # weekends without FP2/FP3) must not abort the whole crawl.
                    print(f"Skipped {season} {race} {session} due to: {e}")
                    continue

            for stats in driver_data.values():
                lap_times = stats['LapTimes']
                if not lap_times:
                    continue
                data.append({
                    'Season': season,
                    'Race': race,
                    'DriverID': stats['DriverID'],
                    'FastestPracticeLap': min(lap_times),
                    'AvgPracticeLap': sum(lap_times) / len(lap_times),
                })

    return pd.DataFrame(data, columns=columns)

In [2]:
seasons = [2021, 2022, 2023, 2024]
# practice_df =  get_practice_data(seasons)
# practice_df.to_csv('practice_data.csv', index=False)

In [3]:
practice_df = pd.read_csv('practice_data.csv', delimiter=',')
practice_df.head()

Unnamed: 0,Season,Race,DriverID,FastestPracticeLap,AvgPracticeLap
0,2021,Bahrain Grand Prix,ricciardo,91.23,108.376614
1,2021,Bahrain Grand Prix,norris,90.942,110.350455
2,2021,Bahrain Grand Prix,vettel,91.769,108.59951
3,2021,Bahrain Grand Prix,latifi,93.4,108.627216
4,2021,Bahrain Grand Prix,raikkonen,91.862,108.040345


#### Extracting Qualifying Data

In [4]:
def get_qualifying_data(seasons):
    """Collect qualifying times and positions for every event of the given seasons.

    Args:
        seasons: Iterable of season years passed to ``fastf1.get_event_schedule``.

    Returns:
        pandas.DataFrame with one row per entry in each qualifying session's
        results and the columns ``Season``, ``Race``, ``DriverID``,
        ``Q1Time``, ``Q2Time``, ``Q3Time`` (seconds, ``None`` when no time is
        recorded) and ``QualiGridPos`` (``None`` when no position is
        recorded). The columns are present even when nothing could be
        loaded. Events whose session fails to load are reported via
        ``print`` and skipped.
    """
    session = 'Q'
    # Testing events show up under both names in the collected data; they
    # have no qualifying session, so skip them before attempting a load.
    testing_events = ('Pre-Season Test', 'Pre-Season Testing')
    columns = ['Season', 'Race', 'DriverID', 'Q1Time', 'Q2Time', 'Q3Time', 'QualiGridPos']

    def seconds_or_none(value):
        # Missing segment times are NaT in the results table.
        return value.total_seconds() if pd.notna(value) else None

    data = []
    for season in seasons:
        print(f"Season: {season}")
        schedule = fastf1.get_event_schedule(season)
        for _, row in schedule.iterrows():
            race = row['EventName']
            if race in testing_events:
                continue
            print(f"Race: {race}")
            try:
                quali_sess = fastf1.get_session(season, race, session)
                quali_sess.load()
            except Exception as e:
                print(f"Skipped {season} {race} {session} due to: {e}")
                continue

            for _, row_data in quali_sess.results.iterrows():
                position = row_data['Position']
                data.append({
                    'Season': season,
                    'Race': race,
                    'DriverID': row_data['DriverId'],
                    'Q1Time': seconds_or_none(row_data['Q1']),
                    'Q2Time': seconds_or_none(row_data['Q2']),
                    'Q3Time': seconds_or_none(row_data['Q3']),
                    'QualiGridPos': int(position) if pd.notna(position) else None
                })

    return pd.DataFrame(data, columns=columns)

In [5]:
seasons = [2021, 2022, 2023, 2024]
# quali_df = get_qualifying_data(seasons)
# quali_df.to_csv('qualifying_data.csv', index=False)

In [6]:
quali_df = pd.read_csv('qualifying_data.csv', delimiter=',')
quali_df.head()

Unnamed: 0,Season,Race,DriverID,Q1Time,Q2Time,Q3Time,QualiGridPos
0,2021,Bahrain Grand Prix,max_verstappen,90.499,90.318,88.997,1.0
1,2021,Bahrain Grand Prix,hamilton,90.617,90.085,89.385,2.0
2,2021,Bahrain Grand Prix,bottas,91.2,90.186,89.586,3.0
3,2021,Bahrain Grand Prix,leclerc,90.691,90.01,89.678,4.0
4,2021,Bahrain Grand Prix,gasly,90.848,90.513,89.809,5.0


#### Extracting Race Result Label Data

In [7]:
def get_race_labels(seasons):
    """Collect race-day grid and finishing positions for every event of the given seasons.

    Args:
        seasons: Iterable of season years passed to ``fastf1.get_event_schedule``.

    Returns:
        pandas.DataFrame with one row per entry in each race session's
        results and the columns ``Season``, ``Race``, ``DriverID``,
        ``RaceGridPos`` and ``RacePosition`` (``None`` when the result has no
        position). The columns are present even when nothing could be
        loaded. Events whose race session fails to load are reported via
        ``print`` and skipped.
    """
    session = 'R'
    # Testing events show up under both names in the collected data; they
    # have no race session, so skip them before attempting a load.
    testing_events = ('Pre-Season Test', 'Pre-Season Testing')
    columns = ['Season', 'Race', 'DriverID', 'RaceGridPos', 'RacePosition']

    data = []
    for season in seasons:
        schedule = fastf1.get_event_schedule(season)
        for _, row in schedule.iterrows():
            race = row['EventName']
            if race in testing_events:
                continue
            try:
                race_sess = fastf1.get_session(season, race, session)
                race_sess.load()
            except Exception as e:
                print(f"Skipped {season} {race} {session} due to: {e}")
                # Skip this event: falling through would read results from a
                # session that failed to load (or from the previous event).
                continue

            for _, row_data in race_sess.results.iterrows():
                position = row_data['Position']
                data.append({
                    'Season': season,
                    'Race': race,
                    'DriverID': row_data['DriverId'],
                    'RaceGridPos': row_data['GridPosition'],
                    'RacePosition': int(position) if pd.notna(position) else None
                })

    return pd.DataFrame(data, columns=columns)

In [8]:
seasons = [2021, 2022, 2023, 2024]
# race_df = get_race_labels(seasons)
# race_df.to_csv('race_data.csv', index=False)

In [9]:
race_df = pd.read_csv('race_data.csv', delimiter=',')
race_df.head()

Unnamed: 0,Season,Race,DriverID,RaceGridPos,RacePosition
0,2021,Bahrain Grand Prix,hamilton,2.0,1.0
1,2021,Bahrain Grand Prix,max_verstappen,1.0,2.0
2,2021,Bahrain Grand Prix,bottas,3.0,3.0
3,2021,Bahrain Grand Prix,norris,7.0,4.0
4,2021,Bahrain Grand Prix,perez,0.0,5.0


#### Merge All Features + Race Result

In [10]:
def build_features(seasons):
    """Build the combined practice / qualifying / race table for the given seasons.

    Args:
        seasons: Iterable of season years forwarded to the three extractors.

    Returns:
        pandas.DataFrame with one row per practice entry, left-joined with
        that driver's qualifying and race data for the same season and event.
        Practice entries with no qualifying or race match keep NaN in the
        joined columns.
    """
    practice_df = get_practice_data(seasons)
    quali_df = get_qualifying_data(seasons)
    race_df = get_race_labels(seasons)

    # A driver appears once per event, so the join key must include the
    # season and the event. Joining on DriverID alone pairs every practice
    # row with every qualifying/race row of that driver and duplicates the
    # Season/Race columns with _x/_y suffixes.
    merge_keys = ['Season', 'Race', 'DriverID']
    df = practice_df.merge(quali_df, on=merge_keys, how='left')
    df = df.merge(race_df, on=merge_keys, how='left')

    return df

In [11]:
seasons = [2021, 2022, 2023, 2024]
# features_df = build_features(seasons)

In [340]:
# df = practice_df.merge(quali_df, on=['Season', 'Race', 'DriverID'], how='left')
# df = df.merge(race_df, on=['Season', 'Race', 'DriverID'], how='left')

In [12]:
df = pd.read_csv("combined_data.csv", delimiter=",")

In [13]:
df.head()

Unnamed: 0,Season,Race,DriverID,FastestPracticeLap,AvgPracticeLap,Q1Time,Q2Time,Q3Time,QualiGridPos,RaceGridPos,RacePosition
0,2021,Bahrain Grand Prix,ricciardo,91.23,108.376614,90.795,90.222,89.927,6.0,6.0,7.0
1,2021,Bahrain Grand Prix,norris,90.942,110.350455,90.902,90.099,89.974,7.0,7.0,4.0
2,2021,Bahrain Grand Prix,vettel,91.769,108.59951,92.056,,,18.0,20.0,15.0
3,2021,Bahrain Grand Prix,latifi,93.4,108.627216,91.936,,,17.0,17.0,18.0
4,2021,Bahrain Grand Prix,raikkonen,91.862,108.040345,91.547,91.238,,14.0,14.0,11.0


## Feature Engineering

In [14]:
df.head()

Unnamed: 0,Season,Race,DriverID,FastestPracticeLap,AvgPracticeLap,Q1Time,Q2Time,Q3Time,QualiGridPos,RaceGridPos,RacePosition
0,2021,Bahrain Grand Prix,ricciardo,91.23,108.376614,90.795,90.222,89.927,6.0,6.0,7.0
1,2021,Bahrain Grand Prix,norris,90.942,110.350455,90.902,90.099,89.974,7.0,7.0,4.0
2,2021,Bahrain Grand Prix,vettel,91.769,108.59951,92.056,,,18.0,20.0,15.0
3,2021,Bahrain Grand Prix,latifi,93.4,108.627216,91.936,,,17.0,17.0,18.0
4,2021,Bahrain Grand Prix,raikkonen,91.862,108.040345,91.547,91.238,,14.0,14.0,11.0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1938 entries, 0 to 1937
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Season              1938 non-null   int64  
 1   Race                1938 non-null   object 
 2   DriverID            1857 non-null   object 
 3   FastestPracticeLap  1938 non-null   float64
 4   AvgPracticeLap      1938 non-null   float64
 5   Q1Time              1836 non-null   float64
 6   Q2Time              1377 non-null   float64
 7   Q3Time              906 non-null    float64
 8   QualiGridPos        1856 non-null   float64
 9   RaceGridPos         1854 non-null   float64
 10  RacePosition        1854 non-null   float64
dtypes: float64(8), int64(1), object(2)
memory usage: 166.7+ KB


### 1 - Checking for 0 Values in RaceGridPos 

<b>Explanation</b>
- 0 values in the RaceGridPos feature can mean there is an error in the values as a driver cannot have a 0 grid position
- The grid positions can be from 1-20
- After further analysis through research it was determined that the missing values correlated with drivers starting from the Pitlane

In [16]:
# Rows whose recorded race grid position is 0 (not a valid grid slot).
df_error = df[df['RaceGridPos'] == 0]
# Of those, keep only drivers without a Q3 time, and only the columns needed
# to identify the entry and compare qualifying vs. race grid position.
df_error = df_error[df_error['Q3Time'].isna()][['Season', 'Race', 'DriverID', 'QualiGridPos', 'RaceGridPos']]
df_error

Unnamed: 0,Season,Race,DriverID,QualiGridPos,RaceGridPos
7,2021,Bahrain Grand Prix,perez,11.0,0.0
22,2021,Emilia Romagna Grand Prix,vettel,13.0,0.0
134,2021,French Grand Prix,tsunoda,20.0,0.0
227,2021,Hungarian Grand Prix,giovinazzi,14.0,0.0
233,2021,Belgian Grand Prix,raikkonen,19.0,0.0
252,2021,Dutch Grand Prix,latifi,14.0,0.0
256,2021,Dutch Grand Prix,perez,16.0,0.0
280,2021,Italian Grand Prix,tsunoda,17.0,0.0
374,2021,São Paulo Grand Prix,raikkonen,14.0,0.0
544,2022,Emilia Romagna Grand Prix,zhou,14.0,0.0


<b>Note</b>
- The removal of the 0 values was a tedious manual process and the corrected data was saved as final_data.csv
- This is because it required checking each race that has a driver with a zero grid position, determining why the position is 0, and then filling in the actual race grid position, which always differed from the qualifying grid position
- After research it was concluded that the 0 values were because of the driver starting from the pitlane
- In some races the driver would start from the pitlane yet his grid position would remain empty, meaning each driver behind that driver was in their same respective race grid position
- But this was an issue as pitlane start means the drivers are last so each driver had to be moved forward accordingly to correct for the drivers starting from the pitlane


In [19]:
df_final = pd.read_csv("final_data.csv", delimiter=",")

In [21]:
df_error_2 = df_final[df_final['RaceGridPos'] == 0]
df_error_2

Unnamed: 0,Season,Race,DriverID,FastestPracticeLap,AvgPracticeLap,Q1Time,Q2Time,Q3Time,QualiGridPos,RaceGridPos,RacePosition
7,2021,Bahrain Grand Prix,perez,91.503,110.446846,91.165,90.659,,11.0,0.0,5.0
22,2021,Emilia Romagna Grand Prix,vettel,76.389,87.750211,75.459,75.394,,13.0,0.0,15.0
134,2021,French Grand Prix,tsunoda,92.9,116.05724,,,,20.0,0.0,13.0
195,2021,British Grand Prix,perez,88.163,98.340622,87.121,87.073,86.844,5.0,0.0,16.0
227,2021,Hungarian Grand Prix,giovinazzi,79.113,99.988075,77.776,77.583,,14.0,0.0,13.0
233,2021,Belgian Grand Prix,raikkonen,105.967,117.372391,124.452,,,19.0,0.0,18.0
236,2021,Belgian Grand Prix,perez,105.404,123.123034,119.334,116.886,122.112,7.0,0.0,19.0
252,2021,Dutch Grand Prix,latifi,71.083,89.94434,70.093,71.161,,14.0,0.0,16.0
256,2021,Dutch Grand Prix,perez,70.526,90.331038,70.53,,,16.0,0.0,8.0
275,2021,Italian Grand Prix,gasly,81.719,94.752319,81.44,80.556,80.26,6.0,0.0,19.0


In [17]:
# NOTE: this cell needs df_final (loaded from final_data.csv) to exist already;
# the recorded output below is a NameError from running it before that load.
# First, pick relevant columns from the corrected rows
corrected = df_final[['Season', 'Race', 'DriverID', 'RaceGridPos']]

# Left merge keeps every row of df_error; the RaceGridPos column present in
# both frames is split into RaceGridPos_Original (from df_error) and
# RaceGridPos_Corrected_For_Penalties (from df_final).
comparison_df = pd.merge(
    df_error,
    corrected,
    on=['Season', 'Race', 'DriverID'],
    how='left',
    suffixes=('_Original', '_Corrected_For_Penalties')
)

comparison_df

NameError: name 'df_final' is not defined

<b>Observations</b>
- Most drivers with 0 value in the RaceGridPos started from pitlanes so value was changed to the last values
- In the cases of latifi (dutch grand prix 2021) and gasly (belgian grand prix 2022), there were other drivers who also started from the pit lane behind them, namely perez and tsunoda respectively.
- In the case of Tsunoda (Italian grand prix 2021), the RaceGridPos was missing
- Now we will use the df_final as the main dataframe

In [348]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1938 entries, 0 to 1937
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Season              1938 non-null   int64  
 1   Race                1938 non-null   object 
 2   DriverID            1857 non-null   object 
 3   FastestPracticeLap  1938 non-null   float64
 4   AvgPracticeLap      1938 non-null   float64
 5   Q1Time              1836 non-null   float64
 6   Q2Time              1377 non-null   float64
 7   Q3Time              906 non-null    float64
 8   QualiGridPos        1856 non-null   float64
 9   RaceGridPos         1854 non-null   float64
 10  RacePosition        1854 non-null   float64
dtypes: float64(8), int64(1), object(2)
memory usage: 166.7+ KB


### 2 - Removing Pre-Season Test Rows

<b>Note</b>: 
- Since pre-season tests don't have practice sessions, qualifying sessions or final grid positions, including such rows adds no value; they are not viable data points

In [349]:
# Keep only rows whose event name is not one of the two pre-season testing labels.
df = df[~df['Race'].isin(['Pre-Season Test', 'Pre-Season Testing'])]

In [350]:
df.head()

Unnamed: 0,Season,Race,DriverID,FastestPracticeLap,AvgPracticeLap,Q1Time,Q2Time,Q3Time,QualiGridPos,RaceGridPos,RacePosition
0,2021,Bahrain Grand Prix,ricciardo,91.23,108.376614,90.795,90.222,89.927,6.0,6.0,7.0
1,2021,Bahrain Grand Prix,norris,90.942,110.350455,90.902,90.099,89.974,7.0,7.0,4.0
2,2021,Bahrain Grand Prix,vettel,91.769,108.59951,92.056,,,18.0,20.0,15.0
3,2021,Bahrain Grand Prix,latifi,93.4,108.627216,91.936,,,17.0,17.0,18.0
4,2021,Bahrain Grand Prix,raikkonen,91.862,108.040345,91.547,91.238,,14.0,14.0,11.0


In [351]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1898 entries, 0 to 1937
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Season              1898 non-null   int64  
 1   Race                1898 non-null   object 
 2   DriverID            1817 non-null   object 
 3   FastestPracticeLap  1898 non-null   float64
 4   AvgPracticeLap      1898 non-null   float64
 5   Q1Time              1796 non-null   float64
 6   Q2Time              1348 non-null   float64
 7   Q3Time              888 non-null    float64
 8   QualiGridPos        1816 non-null   float64
 9   RaceGridPos         1814 non-null   float64
 10  RacePosition        1814 non-null   float64
dtypes: float64(8), int64(1), object(2)
memory usage: 177.9+ KB


### 3 - Inspecting missing driverID and Removing Nulls 

In [352]:
null_driverid_df = df[df['DriverID'].isnull()]
null_driverid_df

Unnamed: 0,Season,Race,DriverID,FastestPracticeLap,AvgPracticeLap,Q1Time,Q2Time,Q3Time,QualiGridPos,RaceGridPos,RacePosition
59,2021,Portuguese Grand Prix,,81.806,89.152667,,,,,,
75,2021,Spanish Grand Prix,,80.700,92.586750,,,,,,
79,2021,Spanish Grand Prix,,81.887,97.836556,,,,,,
138,2021,French Grand Prix,,97.881,115.508000,,,,,,
162,2021,Styrian Grand Prix,,67.823,88.532333,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1922,2024,Abu Dhabi Grand Prix,,85.471,104.907938,,,,,,
1923,2024,Abu Dhabi Grand Prix,,85.877,106.444842,,,,,,
1924,2024,Abu Dhabi Grand Prix,,86.179,108.647053,,,,,,
1925,2024,Abu Dhabi Grand Prix,,86.121,104.538286,,,,,,


In [353]:
null_driverid_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, 59 to 1928
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Season              81 non-null     int64  
 1   Race                81 non-null     object 
 2   DriverID            0 non-null      object 
 3   FastestPracticeLap  81 non-null     float64
 4   AvgPracticeLap      81 non-null     float64
 5   Q1Time              0 non-null      float64
 6   Q2Time              0 non-null      float64
 7   Q3Time              0 non-null      float64
 8   QualiGridPos        0 non-null      float64
 9   RaceGridPos         0 non-null      float64
 10  RacePosition        0 non-null      float64
dtypes: float64(8), int64(1), object(2)
memory usage: 7.6+ KB


<b>Observations:</b>
- The missing driverIDs have everything missing from Qualifying times, GridPositions and RacePositions
- This lack of driver ID, qualifying data and race data can be attributed to constructors giving reserve drivers only practice sessions, which is quite common in F1
- Thus it is sensible to remove this missing data

In [354]:
# Drop the rows that have no DriverID (shown above to also lack all
# qualifying and race values).
df = df.dropna(subset=['DriverID'])

In [355]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1817 entries, 0 to 1937
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Season              1817 non-null   int64  
 1   Race                1817 non-null   object 
 2   DriverID            1817 non-null   object 
 3   FastestPracticeLap  1817 non-null   float64
 4   AvgPracticeLap      1817 non-null   float64
 5   Q1Time              1796 non-null   float64
 6   Q2Time              1348 non-null   float64
 7   Q3Time              888 non-null    float64
 8   QualiGridPos        1816 non-null   float64
 9   RaceGridPos         1814 non-null   float64
 10  RacePosition        1814 non-null   float64
dtypes: float64(8), int64(1), object(2)
memory usage: 170.3+ KB


### 4 - Inspecting missing raceposition and Handling Missing Values

In [356]:
null_racepos_df = df[df['RacePosition'].isnull()]
null_racepos_df

Unnamed: 0,Season,Race,DriverID,FastestPracticeLap,AvgPracticeLap,Q1Time,Q2Time,Q3Time,QualiGridPos,RaceGridPos,RacePosition
435,2021,Abu Dhabi Grand Prix,mazepin,86.332,106.386891,85.685,,,20.0,,
507,2022,Saudi Arabian Grand Prix,mick_schumacher,90.765,107.55013,90.167,89.92,,14.0,,
1248,2023,Singapore Grand Prix,stroll,93.126,109.927176,93.397,,,20.0,,


<b>Explanations</b>:
- Nikita Mazepin qualified 20th for the Abu Dhabi Grand Prix-2021 but he withdrew before the race as he tested positive for coronavirus. His place on the grid was left vacant
- Mick Schumacher qualified 14th for the Saudi Arabian Grand Prix-2022 but his car was withdrawn following a crash, and he did not take the start. Drivers who qualified behind him gained a grid position as he officially did not progress beyond qualifying
- Lance Stroll crashed out in Q1 of Singapore Grand Prix-2023 and did not partake in the race
- Since this is a small set of data, we can remove these or fill the race positions as last, either way, they will not add any information
- On the contrary, adding a race position as 20 can hinder the model training as it's highly possible that the drivers can have good practice and quali sessions and a 20th position might not be the most appropriate results
- Thus it is feasible to remove the 3 data entries

In [357]:
# Drop the entries with no race result (the three non-starters listed above).
df = df.dropna(subset=['RacePosition'])

In [358]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1814 entries, 0 to 1937
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Season              1814 non-null   int64  
 1   Race                1814 non-null   object 
 2   DriverID            1814 non-null   object 
 3   FastestPracticeLap  1814 non-null   float64
 4   AvgPracticeLap      1814 non-null   float64
 5   Q1Time              1793 non-null   float64
 6   Q2Time              1347 non-null   float64
 7   Q3Time              888 non-null    float64
 8   QualiGridPos        1813 non-null   float64
 9   RaceGridPos         1814 non-null   float64
 10  RacePosition        1814 non-null   float64
dtypes: float64(8), int64(1), object(2)
memory usage: 170.1+ KB


### 5 - Inspecting missing GridPos and Handling Missing Values

In [359]:
null_gridpos_df = df[df['QualiGridPos'].isnull()]
null_gridpos_df

Unnamed: 0,Season,Race,DriverID,FastestPracticeLap,AvgPracticeLap,Q1Time,Q2Time,Q3Time,QualiGridPos,RaceGridPos,RacePosition
98,2021,Monaco Grand Prix,mick_schumacher,73.139,86.716528,,,,,20.0,18.0


<b>Explanations</b>:
- Mick Schumacher Missed Qualifying so was put in 20th position on the race grid for the Monaco Grand Prix-2021
- We can replace the missing QualiGridPos with his RaceGridPos which is last position to fill in the missing value

In [360]:
# Where QualiGridPos is missing, substitute the RaceGridPos of the same row
# (fillna with a Series aligns on the index); existing values are untouched.
df['QualiGridPos'] = df['QualiGridPos'].fillna(df['RaceGridPos'])

In [361]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1814 entries, 0 to 1937
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Season              1814 non-null   int64  
 1   Race                1814 non-null   object 
 2   DriverID            1814 non-null   object 
 3   FastestPracticeLap  1814 non-null   float64
 4   AvgPracticeLap      1814 non-null   float64
 5   Q1Time              1793 non-null   float64
 6   Q2Time              1347 non-null   float64
 7   Q3Time              888 non-null    float64
 8   QualiGridPos        1814 non-null   float64
 9   RaceGridPos         1814 non-null   float64
 10  RacePosition        1814 non-null   float64
dtypes: float64(8), int64(1), object(2)
memory usage: 170.1+ KB


### 6 - Inspecting missing Q1Time and Handling Missing Values

In [362]:
Q1Time_df = df[df['Q1Time'].isnull()]
Q1Time_df

Unnamed: 0,Season,Race,DriverID,FastestPracticeLap,AvgPracticeLap,Q1Time,Q2Time,Q3Time,QualiGridPos,RaceGridPos,RacePosition
31,2021,Emilia Romagna Grand Prix,tsunoda,76.23,90.852872,,,,20.0,20.0,12.0
98,2021,Monaco Grand Prix,mick_schumacher,73.139,86.716528,,,,20.0,20.0,18.0
113,2021,Azerbaijan Grand Prix,stroll,103.682,117.71375,,,,19.0,19.0,19.0
122,2021,Azerbaijan Grand Prix,giovinazzi,102.941,119.273262,,,,20.0,20.0,11.0
133,2021,French Grand Prix,stroll,93.051,112.061625,,,,19.0,19.0,10.0
134,2021,French Grand Prix,tsunoda,92.9,116.05724,,,,20.0,0.0,13.0
222,2021,Hungarian Grand Prix,mick_schumacher,79.406,94.795797,,,,20.0,20.0,12.0
303,2021,Russian Grand Prix,max_verstappen,94.621,107.765458,,,,20.0,20.0,2.0
501,2022,Saudi Arabian Grand Prix,tsunoda,90.415,112.414649,,,,20.0,19.0,19.0
520,2022,Australian Grand Prix,stroll,80.611,97.38176,,,,20.0,19.0,12.0


In [363]:
df[(df["Race"] == "Azerbaijan Grand Prix") & (df["Season"] == 2024)].sort_values(by="QualiGridPos")

Unnamed: 0,Season,Race,DriverID,FastestPracticeLap,AvgPracticeLap,Q1Time,Q2Time,Q3Time,QualiGridPos,RaceGridPos,RacePosition
1773,2024,Azerbaijan Grand Prix,leclerc,102.527,119.922533,102.775,102.056,101.365,1.0,1.0,2.0
1785,2024,Azerbaijan Grand Prix,piastri,102.749,114.441576,103.033,102.598,101.686,2.0,2.0,1.0
1782,2024,Azerbaijan Grand Prix,sainz,102.968,119.543886,103.357,102.503,101.805,3.0,3.0,18.0
1771,2024,Azerbaijan Grand Prix,perez,103.024,116.844818,103.213,102.263,101.813,4.0,4.0,17.0
1783,2024,Azerbaijan Grand Prix,russell,102.514,114.273258,103.139,102.329,101.874,5.0,5.0,3.0
1767,2024,Azerbaijan Grand Prix,max_verstappen,102.862,119.434378,103.097,102.042,102.023,6.0,6.0,5.0
1780,2024,Azerbaijan Grand Prix,hamilton,103.301,118.742793,103.089,102.765,102.289,7.0,19.0,9.0
1772,2024,Azerbaijan Grand Prix,alonso,103.474,120.943231,103.472,102.426,102.369,8.0,7.0,6.0
1779,2024,Azerbaijan Grand Prix,colapinto,103.238,114.728733,103.138,102.473,102.53,9.0,8.0,8.0
1776,2024,Azerbaijan Grand Prix,albon,103.194,118.546485,102.899,102.84,102.859,10.0,9.0,7.0


<b>Explanations</b>:
- Most drivers missing Q1time are actually missing all times for qualifying apart from albon (2024 dutch grand prix) and gasly (2024 Azerbaijan grand prix)
- Albon was disqualified from qualifying
- Most of these drivers started 20th or 19th apart from gasly who qualified 15th and started 18th (2024 Azerbaijan grand prix)
- Thus it is safe to remove these data points as they add no value to the prediction

In [364]:
# Drop every row without a Q1 time, then display the remaining frame.
df = df.dropna(subset=['Q1Time'])
df

Unnamed: 0,Season,Race,DriverID,FastestPracticeLap,AvgPracticeLap,Q1Time,Q2Time,Q3Time,QualiGridPos,RaceGridPos,RacePosition
0,2021,Bahrain Grand Prix,ricciardo,91.230,108.376614,90.795,90.222,89.927,6.0,6.0,7.0
1,2021,Bahrain Grand Prix,norris,90.942,110.350455,90.902,90.099,89.974,7.0,7.0,4.0
2,2021,Bahrain Grand Prix,vettel,91.769,108.599510,92.056,,,18.0,20.0,15.0
3,2021,Bahrain Grand Prix,latifi,93.400,108.627216,91.936,,,17.0,17.0,18.0
4,2021,Bahrain Grand Prix,raikkonen,91.862,108.040345,91.547,91.238,,14.0,14.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...
1933,2024,Abu Dhabi Grand Prix,stroll,84.531,106.555658,83.729,83.784,,13.0,13.0,14.0
1934,2024,Abu Dhabi Grand Prix,tsunoda,84.343,111.313000,83.735,83.419,,11.0,11.0,12.0
1935,2024,Abu Dhabi Grand Prix,albon,84.269,105.476520,83.821,,,16.0,18.0,11.0
1936,2024,Abu Dhabi Grand Prix,sainz,83.871,100.196750,83.487,82.985,82.824,3.0,3.0,2.0


In [365]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1793 entries, 0 to 1937
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Season              1793 non-null   int64  
 1   Race                1793 non-null   object 
 2   DriverID            1793 non-null   object 
 3   FastestPracticeLap  1793 non-null   float64
 4   AvgPracticeLap      1793 non-null   float64
 5   Q1Time              1793 non-null   float64
 6   Q2Time              1345 non-null   float64
 7   Q3Time              887 non-null    float64
 8   QualiGridPos        1793 non-null   float64
 9   RaceGridPos         1793 non-null   float64
 10  RacePosition        1793 non-null   float64
dtypes: float64(8), int64(1), object(2)
memory usage: 168.1+ KB


In [366]:
df

Unnamed: 0,Season,Race,DriverID,FastestPracticeLap,AvgPracticeLap,Q1Time,Q2Time,Q3Time,QualiGridPos,RaceGridPos,RacePosition
0,2021,Bahrain Grand Prix,ricciardo,91.230,108.376614,90.795,90.222,89.927,6.0,6.0,7.0
1,2021,Bahrain Grand Prix,norris,90.942,110.350455,90.902,90.099,89.974,7.0,7.0,4.0
2,2021,Bahrain Grand Prix,vettel,91.769,108.599510,92.056,,,18.0,20.0,15.0
3,2021,Bahrain Grand Prix,latifi,93.400,108.627216,91.936,,,17.0,17.0,18.0
4,2021,Bahrain Grand Prix,raikkonen,91.862,108.040345,91.547,91.238,,14.0,14.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...
1933,2024,Abu Dhabi Grand Prix,stroll,84.531,106.555658,83.729,83.784,,13.0,13.0,14.0
1934,2024,Abu Dhabi Grand Prix,tsunoda,84.343,111.313000,83.735,83.419,,11.0,11.0,12.0
1935,2024,Abu Dhabi Grand Prix,albon,84.269,105.476520,83.821,,,16.0,18.0,11.0
1936,2024,Abu Dhabi Grand Prix,sainz,83.871,100.196750,83.487,82.985,82.824,3.0,3.0,2.0


### 7 - Adding Feature of Qualifying Stage Reached

<b>Explanation</b>
- Mostly due to penalties and missing qualifying times for sessions, it is difficult to discern the qualifying pace of drivers
- Thus determining which stage of Qualifying they reached can be an important indicator of quali pace

In [367]:
def compute_quali_stage(df):
    """Return a copy of ``df`` with an inferred ``QualiStage`` column.

    The stage is 3, 2 or 1 (reached Q3, reached Q2, eliminated in Q1). A
    recorded time in a segment proves the driver reached it; when the time
    is missing, the stage is inferred from the driver's per-event rank in
    the previous segment together with ``QualiGridPos``.

    The input frame is not modified. The returned copy also carries the
    helper columns ``Q1_Rank`` and ``Q2_Rank``.
    """
    staged = df.copy()

    # Rank the Q1 and Q2 times inside each (Season, Race) event; the fastest
    # time gets rank 1 and ties share the lowest rank.
    for time_col, rank_col in (('Q1Time', 'Q1_Rank'), ('Q2Time', 'Q2_Rank')):
        staged[rank_col] = staged.groupby(['Season', 'Race'])[time_col].rank(method='min')

    def stage_for(entry):
        # A Q3 time is direct evidence of reaching Q3.
        if pd.notna(entry['Q3Time']):
            return 3

        if pd.notna(entry['Q2Time']):
            # No Q3 time, but a top-10 Q2 rank and a top-10 grid slot
            # suggest the driver advanced without setting a Q3 lap.
            advanced = entry['Q2_Rank'] <= 10 and entry['QualiGridPos'] <= 10
            return 3 if advanced else 2

        # No Q2 time: a top-15 Q1 rank and a top-15 grid slot suggest the
        # driver advanced to Q2 without setting a lap there.
        if pd.notna(entry['Q1Time']) and entry['Q1_Rank'] <= 15 and entry['QualiGridPos'] <= 15:
            return 2

        return 1

    staged['QualiStage'] = staged.apply(stage_for, axis=1)
    return staged

In [368]:
df = compute_quali_stage(df)

In [369]:
# Checking drivers who reached q3 but did not set a time in q3
# i.e. rows where QualiStage was inferred as 3 rather than proven by a Q3 time.
df_q3 = df[df['QualiStage'] == 3]
df_q3[df_q3['Q3Time'].isna()][['Season', 'Race', 'DriverID', 'QualiGridPos', 'RaceGridPos']]

Unnamed: 0,Season,Race,DriverID,QualiGridPos,RaceGridPos
30,2021,Emilia Romagna Grand Prix,stroll,10.0,10.0
230,2021,Belgian Grand Prix,norris,10.0,15.0
518,2022,Australian Grand Prix,alonso,10.0,10.0
548,2022,Emilia Romagna Grand Prix,sainz,10.0,4.0
679,2022,Austrian Grand Prix,gasly,10.0,14.0
705,2022,French Grand Prix,kevin_magnussen,10.0,20.0
711,2022,French Grand Prix,sainz,9.0,19.0
767,2022,Dutch Grand Prix,stroll,10.0,10.0
784,2022,Italian Grand Prix,alonso,10.0,6.0
899,2022,São Paulo Grand Prix,leclerc,10.0,5.0


In [370]:
# Q1_Rank / Q2_Rank were only needed to infer QualiStage; remove them and
# display the resulting frame.
df = df.drop(['Q1_Rank', 'Q2_Rank'], axis=1)
df

Unnamed: 0,Season,Race,DriverID,FastestPracticeLap,AvgPracticeLap,Q1Time,Q2Time,Q3Time,QualiGridPos,RaceGridPos,RacePosition,QualiStage
0,2021,Bahrain Grand Prix,ricciardo,91.230,108.376614,90.795,90.222,89.927,6.0,6.0,7.0,3
1,2021,Bahrain Grand Prix,norris,90.942,110.350455,90.902,90.099,89.974,7.0,7.0,4.0,3
2,2021,Bahrain Grand Prix,vettel,91.769,108.599510,92.056,,,18.0,20.0,15.0,1
3,2021,Bahrain Grand Prix,latifi,93.400,108.627216,91.936,,,17.0,17.0,18.0,1
4,2021,Bahrain Grand Prix,raikkonen,91.862,108.040345,91.547,91.238,,14.0,14.0,11.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1933,2024,Abu Dhabi Grand Prix,stroll,84.531,106.555658,83.729,83.784,,13.0,13.0,14.0,2
1934,2024,Abu Dhabi Grand Prix,tsunoda,84.343,111.313000,83.735,83.419,,11.0,11.0,12.0,2
1935,2024,Abu Dhabi Grand Prix,albon,84.269,105.476520,83.821,,,16.0,18.0,11.0,1
1936,2024,Abu Dhabi Grand Prix,sainz,83.871,100.196750,83.487,82.985,82.824,3.0,3.0,2.0,3


### 8 - Separate features and target columns

In [None]:
X = df.drop('RacePosition', axis=1)