In [1]:
# Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Data Access
merged_data = pd.read_csv("C:/Users/yunus/Downloads/Formula1-2/output.csv", low_memory=False)


In [3]:
# Checking data summary
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23660 entries, 0 to 23659
Data columns (total 96 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EventName           23660 non-null  object 
 1   Year                23660 non-null  int64  
 2   Time_x              23660 non-null  object 
 3   Driver              23660 non-null  object 
 4   DriverNumber_left   23660 non-null  int64  
 5   LapTime             23510 non-null  object 
 6   LapNumber           23660 non-null  float64
 7   Stint               23660 non-null  float64
 8   PitOutTime          697 non-null    object 
 9   PitInTime           706 non-null    object 
 10  Sector1Time         23199 non-null  object 
 11  Sector2Time         23634 non-null  object 
 12  Sector3Time         23610 non-null  object 
 13  Sector1SessionTime  23147 non-null  object 
 14  Sector2SessionTime  23634 non-null  object 
 15  Sector3SessionTime  23610 non-null  object 
 16  Spee

In [4]:
# Data Preprocessing
# 1. Handling Missing Values
# Drop columns with very high missingness and repeated data due to merge
merged_data.drop(['Deleted', 'FastF1Generated', 'IsAccurate', 'Message', 'IsPersonalBest', 'Q1', 'Q2', 'Q3','PitOutTime', 'PitInTime', 
                  'DeletedReason', 'HeadshotUrl', 'Time_right2', 'Status_left3', 'Days_left3', 'Time', 'Status_right3', 'Days_right3',
                 'Time_y', 'Date_left', 'Source_left', 'Time_left', 'SessionTime_left', 'Days_left', 'Date_right', 'Status_left', 
                  'Source_right', 'Time_right', 'SessionTime_right', 'Days_right', 'DriverNumber_right', 'BroadcastName', 'DriverId', 
                  'TeamName', 'TeamColor', 'TeamId', 'FirstName', 'LastName', 'FullName', 'CountryCode', 'Time_left2', 'Status_right', 
                   'LapStartDate', 'Days_x', 'Days_y'],
                 axis=1, inplace=True)


In [5]:
# Save the model ready dataframe to CSV
merged_data.to_csv('modeldata.csv', index=False)

In [6]:
# ML MODELS
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [7]:
# Checking data summary
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23660 entries, 0 to 23659
Data columns (total 51 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EventName           23660 non-null  object 
 1   Year                23660 non-null  int64  
 2   Time_x              23660 non-null  object 
 3   Driver              23660 non-null  object 
 4   DriverNumber_left   23660 non-null  int64  
 5   LapTime             23510 non-null  object 
 6   LapNumber           23660 non-null  float64
 7   Stint               23660 non-null  float64
 8   Sector1Time         23199 non-null  object 
 9   Sector2Time         23634 non-null  object 
 10  Sector3Time         23610 non-null  object 
 11  Sector1SessionTime  23147 non-null  object 
 12  Sector2SessionTime  23634 non-null  object 
 13  Sector3SessionTime  23610 non-null  object 
 14  SpeedI1             20305 non-null  float64
 15  SpeedI2             23634 non-null  float64
 16  Spee

In [8]:
# Selecting relevant columns for modeling
target_columns = ['Number_of_Laps', 'Used_Tire_Compound', 'Avg_Lap_Time', 'Number_of_Stints']

feature_columns = ['EventName', 'Driver', 'GridPosition', 'Year', 'TyreLife', 'FreshTyre', 'TrackStatus', 'Rainfall',
                   'Speed', 'Throttle', 'Brake', 'DRS', 'ClassifiedPosition', 'AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindSpeed']

# Cleaning data: dropping rows with missing target values and converting 'Avg_Lap_Time' to seconds
data_cleaned = merged_data.dropna()
data_cleaned['Avg_Lap_Time'] = pd.to_timedelta(data_cleaned['Avg_Lap_Time']).dt.total_seconds()

# Splitting data for each target
X = data_cleaned[feature_columns]

# Convert categorical columns to category dtype
X['Driver'] = X['Driver'].astype('category')
X['EventName'] = X['EventName'].astype('category')
X['Year'] = X['Year'].astype('category')
X['ClassifiedPosition'] = X['ClassifiedPosition'].astype('category')
X['FreshTyre'] = X['FreshTyre'].astype('category')
X['Brake'] = X['Brake'].astype('category')
X['Rainfall'] = X['Rainfall'].astype('category')
X['TrackStatus'] = X['TrackStatus'].astype('category')
X['DRS'] = X['DRS'].astype('category')

# 'Driver', 'EventName', 'Year', 'ClassifiedPosition', 'FreshTyre', 'Brake', 'Rainfall', 'TrackStatus', 'DRS'

# Splitting the data for each model
y_laps = data_cleaned['Number_of_Laps']
y_tires = data_cleaned['Used_Tire_Compound']
y_avg_lap_time = data_cleaned['Avg_Lap_Time']
y_stints = data_cleaned['Number_of_Stints']

X_train_laps, X_test_laps, y_train_laps, y_test_laps = train_test_split(X, y_laps, test_size=0.2, random_state=42)
X_train_tires, X_test_tires, y_train_tires, y_test_tires = train_test_split(X, y_tires, test_size=0.2, random_state=42)
X_train_avg_lap_time, X_test_avg_lap_time, y_train_avg_lap_time, y_test_avg_lap_time = train_test_split(X, y_avg_lap_time, test_size=0.2, random_state=42)
X_train_stints, X_test_stints, y_train_stints, y_test_stints = train_test_split(X, y_stints, test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['Avg_Lap_Time'] = pd.to_timedelta(data_cleaned['Avg_Lap_Time']).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Driver'] = X['Driver'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['EventName'] = X['EventName'].astype('category')
A value is tr

In [9]:
# Checking data summary
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17249 entries, 1 to 23658
Data columns (total 51 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EventName           17249 non-null  object 
 1   Year                17249 non-null  int64  
 2   Time_x              17249 non-null  object 
 3   Driver              17249 non-null  object 
 4   DriverNumber_left   17249 non-null  int64  
 5   LapTime             17249 non-null  object 
 6   LapNumber           17249 non-null  float64
 7   Stint               17249 non-null  float64
 8   Sector1Time         17249 non-null  object 
 9   Sector2Time         17249 non-null  object 
 10  Sector3Time         17249 non-null  object 
 11  Sector1SessionTime  17249 non-null  object 
 12  Sector2SessionTime  17249 non-null  object 
 13  Sector3SessionTime  17249 non-null  object 
 14  SpeedI1             17249 non-null  float64
 15  SpeedI2             17249 non-null  float64
 16  SpeedFL  

In [10]:
cat_cols = ['Driver', 'EventName', 'Year', 'ClassifiedPosition', 'FreshTyre', 'Brake', 'Rainfall', 'TrackStatus', 'DRS']

In [11]:
# Model 1: Predicting Number_of_Stints (Regression) - XGBoost
model_stints_xgb = XGBRegressor(n_estimators= 3500,learning_rate= 0.05,max_depth= 5, objective='reg:squarederror', 
                              eval_metric= 'rmse', enable_categorical=True, random_state=42)
model_stints_xgb.fit(X_train_stints, y_train_stints)
y_pred_stints_xgb = model_stints_xgb.predict(X_test_stints)

stints_mse_xgb = mean_squared_error(y_test_stints, y_pred_stints_xgb)
stints_mse_xgb

3.715100858079041e-05

In [220]:
# Model 1.2: Predicting Number_of_Stints (Regression) - LightGBM
model_stints_lgbm = lgb.LGBMRegressor(n_estimators= 3500,learning_rate= 0.08,max_depth= 9, objective='regression', 
                              random_state=42, verbose=-1)
model_stints_lgbm.fit(X_train_stints, y_train_stints)
y_pred_stints_lgbm = model_stints_lgbm.predict(X_test_stints)

stints_mse_lgbm = mean_squared_error(y_test_stints, y_pred_stints_lgbm)
stints_mse_lgbm

6.130215285626466e-05

In [13]:
# Model 1.3: Predicting Number_of_Stints (Regression) - CatBoost
model_stints_catboost = CatBoostRegressor(iterations= 3500,learning_rate= 0.05,depth= 5,objective='RMSE',
            eval_metric= 'RMSE',verbose= 500,border_count=254,task_type='CPU',use_best_model=False)
model_stints_catboost.fit(X_train_stints, y_train_stints, cat_features = cat_cols)
y_pred_stints_catboost = model_stints_catboost.predict(X_test_stints)

stints_mse_catboost = mean_squared_error(y_test_stints, y_pred_stints_catboost)
stints_mse_catboost

0:	learn: 0.7215556	total: 159ms	remaining: 9m 15s
500:	learn: 0.1359065	total: 11.8s	remaining: 1m 10s
1000:	learn: 0.0942985	total: 23.7s	remaining: 59.1s
1500:	learn: 0.0797973	total: 35.5s	remaining: 47.3s
2000:	learn: 0.0719597	total: 47.1s	remaining: 35.3s
2500:	learn: 0.0658441	total: 58.9s	remaining: 23.5s
3000:	learn: 0.0611871	total: 1m 10s	remaining: 11.8s
3499:	learn: 0.0569347	total: 1m 22s	remaining: 0us


0.0015488171162580046

In [230]:
# Model 2: Predicting Used_Tire_Compound (Classification) - LightGBM

# Define and train the LightGBM classifier
model_tires_lgbm = LGBMClassifier(n_estimators= 3500,learning_rate= 0.08,max_depth= 5, objective='multiclass',random_state=42, 
                                  verbose=-1) # verbose=-1 disable warnings
model_tires_lgbm.fit(X_train_tires, y_train_tires)

# Make predictions
y_pred_tires_lgbm = model_tires_lgbm.predict(X_test_tires)

tires_accuracy_lgbm = accuracy_score(y_test_tires, y_pred_tires_lgbm)
tires_accuracy_lgbm

0.9643478260869566

In [81]:
# Model 2.2: Predicting Used_Tire_Compound (Classification) - CatBoost

# Define and train the CatBoost classifier
model_tires_catboost = CatBoostClassifier(iterations= 3500,learning_rate= 0.08,depth= 9,objective='MultiClass',
            eval_metric= 'MultiClass',verbose= 500,border_count=254,task_type='CPU',use_best_model=False)

#iterations=1000, random_state=42, verbose=100
model_tires_catboost.fit(X_train_tires, y_train_tires, cat_features=cat_cols)
y_pred_tires_catboost = model_tires_catboost.predict(X_test_tires)

tires_accuracy_catboost = accuracy_score(y_test_tires, y_pred_tires_catboost)
tires_accuracy_catboost

0:	learn: 1.8523789	total: 581ms	remaining: 33m 51s
500:	learn: 0.1552995	total: 6m 14s	remaining: 37m 19s
1000:	learn: 0.0755135	total: 12m 56s	remaining: 32m 17s
1500:	learn: 0.0449241	total: 20m 1s	remaining: 26m 39s
2000:	learn: 0.0300849	total: 27m 20s	remaining: 20m 28s
2500:	learn: 0.0219279	total: 34m 54s	remaining: 13m 56s
3000:	learn: 0.0166036	total: 41m 43s	remaining: 6m 56s
3499:	learn: 0.0132254	total: 48m 17s	remaining: 0us


0.9414492753623188

In [16]:
# Model 3: Predicting Number_of_Laps (Regression) - XGBoost
model_laps_xgb = XGBRegressor(n_estimators= 3500,learning_rate= 0.05,max_depth= 5, objective='reg:squarederror', 
                              eval_metric= 'rmse', enable_categorical=True, random_state=42)
# n_estimators= 3500,learning_rate= 0.05,max_depth= 5, objective='reg:squarederror', eval_metric= 'rmse', enable_categorical=True, random_state=42
model_laps_xgb.fit(X_train_laps, y_train_laps)
y_pred_laps_xgb = model_laps_xgb.predict(X_test_laps)

laps_mse_xgb = mean_squared_error(y_test_laps, y_pred_laps_xgb)
laps_mse_xgb

6.987340431963285

In [206]:
# Model 3.2: Predicting Number_of_Laps (Regression) - LightGBM
model_laps_lgbm = lgb.LGBMRegressor(n_estimators= 3500,learning_rate= 0.07,max_depth= 9, objective='regression', metric='rmse',
                              random_state=42, verbose=-1, num_leaves=45, subsample=0.8, colsample_bytree=0.9)
model_laps_lgbm.fit(X_train_laps, y_train_laps)
y_pred_laps_lgbm = model_laps_lgbm.predict(X_test_laps)

laps_mse_lgbm = mean_squared_error(y_test_laps, y_pred_laps_lgbm)
laps_mse_lgbm

7.385198664031037

In [80]:
# Model 3.3: Predicting Number_of_Laps (Regression) - CatBoost
model_laps_catboost = CatBoostRegressor(iterations= 3500,learning_rate= 0.08,depth= 9,objective='RMSE',
            eval_metric= 'RMSE',verbose= 500,border_count=254,task_type='CPU',use_best_model=False)
model_laps_catboost.fit(X_train_laps, y_train_laps, cat_features=cat_cols)
y_pred_laps_catboost = model_laps_catboost.predict(X_test_laps)

laps_mse_catboost = mean_squared_error(y_test_laps, y_pred_laps_catboost)
laps_mse_catboost

0:	learn: 14.2776231	total: 61.1ms	remaining: 3m 33s
500:	learn: 2.9159303	total: 28.5s	remaining: 2m 50s
1000:	learn: 1.9527293	total: 57.7s	remaining: 2m 24s
1500:	learn: 1.4724966	total: 1m 27s	remaining: 1m 56s
2000:	learn: 1.1565328	total: 1m 56s	remaining: 1m 27s
2500:	learn: 0.9311863	total: 2m 26s	remaining: 58.5s
3000:	learn: 0.7715122	total: 2m 56s	remaining: 29.3s
3499:	learn: 0.6484202	total: 3m 25s	remaining: 0us


8.374876171248118

In [78]:
# Model 4: Predicting Avg_Lap_Time (Regression) - XGBoost
model_avg_lap_time_xgb = XGBRegressor(n_estimators= 3500,learning_rate= 0.05,max_depth= 5, objective='reg:squarederror', 
                              eval_metric= 'rmse', enable_categorical=True, random_state=42)
model_avg_lap_time_xgb.fit(X_train_avg_lap_time, y_train_avg_lap_time)
y_pred_avg_lap_time_xgb = model_avg_lap_time_xgb.predict(X_test_avg_lap_time)

avg_lap_time_mse_xgb = mean_squared_error(y_test_avg_lap_time, y_pred_avg_lap_time_xgb)
avg_lap_time_mse_xgb

0.566850452081473

In [178]:
# Model 4.2: Predicting Avg_Lap_Time (Regression) - LightGBM
model_avg_lap_time_lgbm = lgb.LGBMRegressor(n_estimators= 3500,learning_rate= 0.08,max_depth= 9, objective='regression', metric='rmse',
                              random_state=42, verbose=-1, num_leaves=45, subsample=0.6, colsample_bytree=0.9)
model_avg_lap_time_lgbm.fit(X_train_avg_lap_time, y_train_avg_lap_time)
y_pred_avg_lap_time_lgbm = model_avg_lap_time_lgbm.predict(X_test_avg_lap_time)

avg_lap_time_mse_lgbm = mean_squared_error(y_test_avg_lap_time, y_pred_avg_lap_time_lgbm)
avg_lap_time_mse_lgbm

0.6029215315678983

In [74]:
# Model 4.3: Predicting Avg_Lap_Time (Regression) - CatBoost
model_avg_lap_time_catboost = CatBoostRegressor(iterations= 3500,learning_rate= 0.08,depth= 9,objective='RMSE',
            eval_metric= 'RMSE',verbose= 500,border_count=254,task_type='CPU',use_best_model=False)
model_avg_lap_time_catboost.fit(X_train_avg_lap_time, y_train_avg_lap_time, cat_features=cat_cols)
y_pred_avg_lap_time_catboost = model_avg_lap_time_catboost.predict(X_test_avg_lap_time)

avg_lap_time_mse_catboost = mean_squared_error(y_test_avg_lap_time, y_pred_avg_lap_time_catboost)
avg_lap_time_mse_catboost

0:	learn: 8.9189860	total: 55.1ms	remaining: 3m 12s
500:	learn: 0.6157220	total: 27.6s	remaining: 2m 45s
1000:	learn: 0.3622087	total: 1m 1s	remaining: 2m 34s
1500:	learn: 0.2511445	total: 1m 30s	remaining: 2m
2000:	learn: 0.1895521	total: 2m	remaining: 1m 29s
2500:	learn: 0.1509009	total: 2m 29s	remaining: 59.9s
3000:	learn: 0.1238245	total: 3m	remaining: 30s
3499:	learn: 0.1025507	total: 3m 29s	remaining: 0us


0.5919152330647612

In [232]:
# Saving the models
import joblib

joblib.dump(model_stints_xgb, 'model_stints.pkl')
joblib.dump(model_tires_lgbm, 'model_tires.pkl')
joblib.dump(model_laps_xgb, 'model_laps.pkl')
joblib.dump(model_avg_lap_time_xgb, 'model_avg_lap_time.pkl')

['model_avg_lap_time.pkl']

In [236]:
# load the models 
loaded_model_stints = joblib.load('model_stints.pkl')
loaded_model_tires = joblib.load('model_tires.pkl')
loaded_model_laps = joblib.load('model_laps.pkl')
loaded_model_avg_lap_time = joblib.load('model_avg_lap_time.pkl')

In [24]:
data_cleaned['EventName'].unique(), data_cleaned['Driver'].unique(), data_cleaned['GridPosition'].unique(), data_cleaned['Year'].unique()

(array(['Bahrain Grand Prix', 'Saudi Arabian Grand Prix',
        'Australian Grand Prix', 'Japanese Grand Prix',
        'Chinese Grand Prix', 'Miami Grand Prix',
        'Emilia Romagna Grand Prix', 'Monaco Grand Prix',
        'Canadian Grand Prix', 'Spanish Grand Prix', 'Austrian Grand Prix',
        'British Grand Prix', 'Hungarian Grand Prix', 'Belgian Grand Prix',
        'Dutch Grand Prix', 'Mexican Grand Prix'], dtype=object),
 array(['VER', 'PER', 'SAI', 'LEC', 'RUS', 'NOR', 'HAM', 'PIA', 'ALO',
        'STR', 'ZHO', 'MAG', 'RIC', 'TSU', 'ALB', 'HUL', 'OCO', 'GAS',
        'BOT', 'SAR', 'BEA', 'VAN', 'HAR', 'SIR', 'VET', 'RAI', 'GRO',
        'ERI', 'KVY', 'GIO', 'KUB', 'LAT', 'MAZ', 'MSC'], dtype=object),
 array([ 1.,  5.,  4.,  2.,  3.,  7.,  9.,  8.,  6., 12., 17., 15., 14.,
        11., 13., 10., 19., 20., 16., 18.,  0.]),
 array([2024, 2018, 2019, 2021, 2022, 2023], dtype=int64))

In [25]:
"""
Current Formula 1 Drivers (2024 season):
VER – Max Verstappen
PER – Sergio Pérez
SAI – Carlos Sainz
LEC – Charles Leclerc
RUS – George Russell
NOR – Lando Norris
HAM – Lewis Hamilton
PIA – Oscar Piastri
ALO – Fernando Alonso
STR – Lance Stroll
ZHO – Zhou Guanyu
MAG – Kevin Magnussen
RIC – Daniel Ricciardo
TSU – Yuki Tsunoda
ALB – Alex Albon
HUL – Nico Hülkenberg
OCO – Esteban Ocon
GAS – Pierre Gasly
BOT – Valtteri Bottas
SAR – Logan Sargeant
"""

'\nCurrent Formula 1 Drivers (2024 season):\nVER – Max Verstappen\nPER – Sergio Pérez\nSAI – Carlos Sainz\nLEC – Charles Leclerc\nRUS – George Russell\nNOR – Lando Norris\nHAM – Lewis Hamilton\nPIA – Oscar Piastri\nALO – Fernando Alonso\nSTR – Lance Stroll\nZHO – Zhou Guanyu\nMAG – Kevin Magnussen\nRIC – Daniel Ricciardo\nTSU – Yuki Tsunoda\nALB – Alex Albon\nHUL – Nico Hülkenberg\nOCO – Esteban Ocon\nGAS – Pierre Gasly\nBOT – Valtteri Bottas\nSAR – Logan Sargeant\n'

In [26]:
data_cleaned['TyreLife'].unique(), data_cleaned['FreshTyre'].unique(), data_cleaned['TrackStatus'].unique(), data_cleaned['Rainfall'].unique()

(array([ 5.,  6.,  8., 10., 13., 19.,  1.,  2.,  4., 11., 16., 18.,  3.,
         9., 15., 17.,  7., 14., 20., 21., 22., 23., 12., 24., 25., 26.,
        27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
        40., 41., 42., 43., 44., 45., 46., 47., 48., 50., 49., 51., 52.,
        53., 54., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., 66.,
        67., 68., 69., 70., 71., 72., 73., 74., 75., 76., 77., 55., 78.]),
 array([False,  True]),
 array([   1,   12,   21,   41,   24,  126,    6, 1267,   71,  671,   26,
        2671,    4,  124,   64,   16,  167,  412, 2167,   67], dtype=int64),
 array([False, True], dtype=object))

In [27]:
data_cleaned['Speed'].unique(), data_cleaned['Throttle'].unique(), data_cleaned['Brake'].unique(), data_cleaned['DRS'].unique()

(array([282, 235, 224, 202, 142, 208, 205, 262, 218, 278, 161, 144, 200,
         87, 193,  83, 177, 248, 265, 285, 247, 169, 155, 258, 204, 210,
        182,  89, 281, 295, 233, 279, 192, 260, 273, 240, 250, 259, 245,
        263, 293, 106, 199, 242, 246, 133,  77, 160, 122, 219,  90, 146,
        203, 226, 215, 167, 143,  85, 288, 272, 238, 261, 251, 179, 101,
        254, 187, 253, 287, 286, 190, 175, 178, 134, 236, 188, 163, 128,
        294, 216, 231, 117, 197, 120, 104, 211, 234, 274, 277, 292, 257,
        118, 244,  80, 166, 283, 172, 139, 267, 276, 291, 151, 168, 113,
        147, 138,  86, 176, 189, 137,  92, 280, 230, 119,  73, 221,  93,
        165, 132, 275, 156, 173, 256,  71, 131,  81, 180, 225,  94, 298,
        129, 154, 252, 266, 130, 290, 268, 184, 127,  84,  76, 159, 209,
        237, 227, 206, 150, 223, 158, 110, 153, 162, 243, 171, 183, 212,
        112, 270,  97, 241,  88, 196, 140, 228, 152, 114, 222, 108, 181,
        164,  72, 141, 232,  79, 249, 239, 170, 229

In [28]:
data_cleaned['ClassifiedPosition'].unique(), data_cleaned['AirTemp'].unique()

(array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
        '13', '14', '15', '16', '17', '18', '19', '20', 'R', 'D'],
       dtype=object),
 array([18.3, 18.2, 18.1, 18. , 17.9, 17.8, 17.6, 17.7, 25.4, 25.6, 25.7,
        25.5, 25.3, 20.5, 20.6, 20.7, 20.8, 20.9, 21. , 21.1, 21.4, 21.3,
        21.2, 21.5, 21.8, 21.7, 21.6, 21.9, 22. , 22.5, 22.3, 22.1, 20.3,
        20.1, 19.8, 19.5, 22.4, 22.2, 20.4, 20. , 19.7, 19.6, 19.9, 18.7,
        18.8, 19. , 18.9, 18.6, 18.5, 18.4, 28.6, 28.7, 28.9, 29. , 28.8,
        28.5, 28.4, 28.2, 28.1, 28.3, 27.9, 28. , 25.1, 25. , 24.9, 24.8,
        25.2, 25.8, 25.9, 24.7, 26. , 16.3, 16.2, 16. , 15.9, 16.5, 17. ,
        17.3, 17.5, 17.4, 17.2, 17.1, 19.1, 16.1, 16.8, 24.1, 23.9, 24. ,
        24.3, 24.2, 24.4, 23.8, 24.6, 24.5, 23.7, 23.6, 29.2, 29.3, 29.4,
        29.5, 29.6, 29.7, 29.8, 29.9, 30. , 30.2, 30.3, 30.4, 30.6, 30.1,
        30.5, 16.4, 16.7, 16.6, 15.6, 15.2, 15.1, 15. , 14.9, 14.6, 14.5,
        14.8, 15.3, 15.4, 

In [29]:
data_cleaned['Humidity'].unique(), data_cleaned['Pressure'].unique(), data_cleaned['TrackTemp'].unique(), data_cleaned['WindSpeed'].unique()

(array([49. , 50. , 51. , 62. , 61. , 63. , 64. , 65. , 66. , 45. , 44. ,
        43. , 42. , 37. , 39. , 40. , 38. , 47. , 48. , 52. , 56. , 57. ,
        41. , 46. , 53. , 54. , 55. , 67. , 68. , 59. , 58. , 60. , 75. ,
        76. , 78. , 77. , 80. , 79. , 72. , 74. , 73. , 70. , 71. , 69. ,
        36. , 35. , 34. , 33. , 32. , 53.2, 53.4, 52.8, 50.1, 49.8, 49.1,
        48.5, 48.3, 47.9, 47.5, 47.1, 46.7, 46.9, 46.3, 47.4, 48.4, 46.4,
        46.5, 46.8, 45.7, 45.9, 46.1, 45.4, 45.2, 54.2, 47.8, 49.4, 49.6,
        47.7, 44.8, 45.5, 46.6, 46.2, 45.6, 44.9, 44.7, 45.3, 44.6, 43.9,
        43.6, 42.1, 44.5, 45.1, 47.3, 47.6, 43.8, 43.1, 44.2, 44.1, 44.4,
        47.2, 42.9, 43.5, 31.2, 30.5, 31. , 31.1, 31.9, 32.1, 31.6, 32.4,
        31.5, 30.8, 31.8, 32.6, 31.7, 31.3, 31.4, 30.7, 30.6, 30. , 30.1,
        29.9, 30.4, 29.4, 29.2, 28.2, 28.5, 27.7, 27.9, 28.8, 28.1, 27.8,
        28. , 27.3, 29.8, 29.6, 28.3, 27.6, 30.9, 27.4, 29.1, 32.9, 26. ,
        27. , 25. , 23. , 22. , 24. , 

In [238]:
# Test inputs to make predictions (example inputs)
test_input = pd.DataFrame({
    'EventName': ['Mexican Grand Prix'],     # Categorical - event name
    'Driver': ['NOR'],                       # Categorical - driver name
    'GridPosition': [10],                     # Integer - starting position
    'Year': [2024],                          # Categorical - race year
    'TyreLife': [10],                         # Integer - tire age in laps
    'FreshTyre': ['False'],                   # Boolean - fresh tires (True means fresh)
    'TrackStatus': [2671],                      # Categorical - track status (1: clear)
    'Rainfall': ['True'],                   # Boolean - rainfall (True means rainy)
    'Speed': [285.0],                        # Float - car speed in km/h
    'Throttle': [85],                        # Integer - throttle percentage
    'Brake': ['False'],                       # Boolean - brake applied (False: no)
    'DRS': [1],                              # Categorical - DRS active (1: yes)
    'ClassifiedPosition': [1],               # Categorical - current race position
    'AirTemp': [30],                       # Float - air temperature in Celsius
    'Humidity': [80.0],                      # Float - humidity percentage
    'Pressure': [990.0],                    # Float - air pressure in hPa
    'TrackTemp': [35],                     # Float - track temperature in Celsius
    'WindSpeed': [1]                       # Float - wind speed in km/h
})

# Convert categorical columns to 'category' dtype
test_input['Driver'] = test_input['Driver'].astype('category')
test_input['EventName'] = test_input['EventName'].astype('category')
test_input['ClassifiedPosition'] = test_input['ClassifiedPosition'].astype('category')
test_input['FreshTyre'] = test_input['FreshTyre'].astype('category')
test_input['Brake'] = test_input['Brake'].astype('category')
test_input['Rainfall'] = test_input['Rainfall'].astype('category')
test_input['Year'] = test_input['Year'].astype('category')
test_input['TrackStatus'] = test_input['TrackStatus'].astype('category')
test_input['DRS'] = test_input['DRS'].astype('category')

In [240]:
# Predictions from loaded models
# Model 1 - Number of Stints
pred_stints = loaded_model_stints.predict(test_input)
pred_stints = round(pred_stints[0])  # Rounding the prediction

print(f'Predicted Number of Stints: {pred_stints}')

# (pred_stints, pred_tires, pred_laps, pred_avg_lap_time)

Predicted Number of Stints: 3


In [242]:
# Model 2- Used Tire Compound
pred_tires = loaded_model_tires.predict(test_input)
print(f'Predicted Tire Compounds Used: {pred_tires}')

Predicted Tire Compounds Used: ['MEDIUM']


In [244]:
# Model 3 - Lap Number per Stint
pred_laps = loaded_model_laps.predict(test_input)
pred_laps = round(pred_laps[0])  # Rounding the prediction

print(f'Predicted Number of Laps within Each Stint: {pred_laps}')

Predicted Number of Laps within Each Stint: 18


In [246]:
# Model 4 - Average Lap Time per Stint
pred_avg_lap_time = loaded_model_avg_lap_time.predict(test_input)
print(f'Predicted Average Lap Time per Stint: {pred_avg_lap_time}')

Predicted Average Lap Time per Stint: [84.31518]
