In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold, cross_val_score, cross_validate
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

warnings.filterwarnings('ignore')

In [13]:
df = pd.read_csv('data/cleaned_f1_data.csv')

Data Preparation

In [17]:
def preprocess_stints(df):
    # Drop outdated columns (ignore if missing)
    df.drop(columns=['RaceName', 'TyresChange', 'LapStint', 'StintNum', 'StintID'], inplace=True, errors='ignore')

    # Convert Time column to timedelta
    df['Time'] = pd.to_timedelta(df['Time'], errors='coerce')

    # Detect tire compound changes
    df['TireChange'] = df['Compound'] != df.groupby(['Driver', 'RaceID'])['Compound'].shift(1)
    df['TireChange'] = df['TireChange'].fillna(False)
    df['IsPitLap'] = df['PitInTime'].notnull()
    df['NumPitStops'] = (
        df.groupby(['Year', 'RaceName', 'Driver'])['IsPitLap'].cumsum()
    )

    df['IsRedFlagTireSwap'] = df['TiresChange'] & ~df['IsPitLap']
    # Assign stint numbers
    df['StintNum'] = df.groupby(['Driver', 'RaceID'])['TireChange'].cumsum().astype(int)  # start at 
    # Lap number within the stint
    df['LapStint'] = df.groupby(['Driver', 'RaceID', 'StintNum']).cumcount() + 1

    # Stint start time = first lap time in that stint
    df['StintStartTime'] = df.groupby(['Driver', 'StintNum'])['Time'].transform('first')

    # Time on tire = time since stint start
    df['TimeOnTireSeconds'] = (df['Time'] - df['StintStartTime']).dt.total_seconds()

    return df


In [18]:
df = preprocess_stints(df)

In [19]:
df[['Driver', 'RaceID', 'LapNumber', 'Time', 'StintStartTime', 'TimeOnTireSeconds', 'Compound', 'LapStint', 'StintNum', 'LapTime', 'Position']].head(50)


Unnamed: 0,Driver,RaceID,LapNumber,Time,StintStartTime,TimeOnTireSeconds,Compound,LapStint,StintNum,LapTime,Position
0,NOR,2024_Singapore_Grand_Prix,1.0,0 days 00:58:41.675000,0 days 00:58:41.675000,0.0,MEDIUM,1,1,100.424,1.0
1,VER,2024_Singapore_Grand_Prix,1.0,0 days 00:58:42.664000,0 days 00:58:42.664000,0.0,MEDIUM,1,1,101.413,2.0
2,HAM,2024_Singapore_Grand_Prix,1.0,0 days 00:58:43.953000,0 days 00:58:43.953000,0.0,SOFT,1,1,102.702,3.0
3,RUS,2024_Singapore_Grand_Prix,1.0,0 days 00:58:45.073000,0 days 00:58:45.073000,0.0,MEDIUM,1,1,103.822,4.0
4,PIA,2024_Singapore_Grand_Prix,1.0,0 days 00:58:46.519000,0 days 00:58:46.519000,0.0,MEDIUM,1,1,105.268,5.0
5,HUL,2024_Singapore_Grand_Prix,1.0,0 days 00:58:46.974000,0 days 00:58:46.974000,0.0,MEDIUM,1,1,105.723,6.0
6,ALO,2024_Singapore_Grand_Prix,1.0,0 days 00:58:47.635000,0 days 00:58:47.635000,0.0,MEDIUM,1,1,106.384,7.0
7,LEC,2024_Singapore_Grand_Prix,1.0,0 days 00:58:48.341000,0 days 00:58:48.341000,0.0,MEDIUM,1,1,107.09,8.0
8,COL,2024_Singapore_Grand_Prix,1.0,0 days 00:58:48.947000,0 days 00:58:48.947000,0.0,MEDIUM,1,1,107.696,9.0
9,PER,2024_Singapore_Grand_Prix,1.0,0 days 00:58:49.401000,0 days 00:58:49.401000,0.0,MEDIUM,1,1,108.15,10.0


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77799 entries, 0 to 77798
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype          
---  ------             --------------  -----          
 0   Driver             77799 non-null  object         
 1   Team               77799 non-null  object         
 2   LapNumber          77799 non-null  float64        
 3   LapTime            77799 non-null  float64        
 4   Compound           77799 non-null  object         
 5   TrackStatus        77799 non-null  int64          
 6   Position           77799 non-null  float64        
 7   Time               77799 non-null  timedelta64[ns]
 8   AirTemp            77799 non-null  float64        
 9   Humidity           77799 non-null  float64        
 10  Pressure           77799 non-null  float64        
 11  Rainfall           77799 non-null  bool           
 12  TrackTemp          77799 non-null  float64        
 13  WindSpeed          77799 non-null  float64    

In [12]:
df.groupby(['RaceID', 'Driver'])['LapTime'].apply(lambda x : (x > 200).sum()).sort_values(ascending=False).head(50)

RaceID                         Driver
2024_Monaco_Grand_Prix         SAI       1
                               RIC       1
                               BOT       1
                               GAS       1
                               HAM       1
                               LEC       1
                               NOR       1
                               PIA       1
                               RUS       1
2022_Monaco_Grand_Prix         LAT       1
2024_Monaco_Grand_Prix         SAR       1
                               STR       1
                               TSU       1
2022_Monaco_Grand_Prix         STR       1
2024_Monaco_Grand_Prix         VER       1
                               ZHO       1
                               ALO       1
                               ALB       1
2022_British_Grand_Prix        OCO       1
                               TSU       1
2022_Pre-Season_Track_Session  TSU       1
                               OCO       1
2024_Australian_

In [55]:
target = 'LapTime'
features = [
    'Driver', 'Team', 'Compound', 'TrackStatus', 'Position',
    'AirTemp', 'Humidity', 'WindSpeed', 'WindDirection',
    'IsPitLap', 'NumPitStops',
    'StintNum', 'LapStint', 'TimeOnTireSeconds'
]

In [56]:
categorical_cols = ['Driver', 'Team', 'Compound', 'RaceID']

In [57]:
df_label = df.copy()
for col in categorical_cols:
    le = LabelEncoder()  # create a new instance each time
    df_label[col] = le.fit_transform(df_label[col].astype(str))

In [59]:
X = df[features]
y = df[target]
gkf = GroupKFold(n_splits=5)

model = RandomForestRegressor(n_estimators=100, random_state=42)

cv_results = cross_validate(
    model, X, y,
    cv=gkf.split(X, y, groups=df_label['RaceID']),
    scoring=['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error'],
    return_train_score=True
)
print("Avg R²:", cv_results['test_r2'].mean())
print("Avg MAE:", -cv_results['test_neg_mean_absolute_error'].mean())
print("Avg RMSE:", -cv_results['test_neg_root_mean_squared_error'].mean())

Avg R²: -0.41763935532077123
Avg MAE: 11.626696290616389
Avg RMSE: 27.72941043324547
