In [2]:
import pandas as pd

In [8]:
lap_times = pd.read_csv('lap_times.csv')
print(lap_times.head())

pit_stops = pd.read_csv('pit_stops.csv')
print(pit_stops.head())

races = races = pd.read_csv('races.csv')[["raceId", "circuitId"]]
print(races.head())

   raceId  driverId  lap  position      time  milliseconds
0     841        20    1         1  1:38.109         98109
1     841        20    2         1  1:33.006         93006
2     841        20    3         1  1:32.713         92713
3     841        20    4         1  1:32.803         92803
4     841        20    5         1  1:32.342         92342
   raceId  driverId  stop  lap      time duration  milliseconds
0     841       153     1    1  17:05:23   26.898         26898
1     841        30     1    1  17:05:52   25.021         25021
2     841        17     1   11  17:20:48   23.426         23426
3     841         4     1   12  17:22:34   23.251         23251
4     841        13     1   13  17:24:10   23.842         23842
   raceId  circuitId
0       1          1
1       2          2
2       3         17
3       4          3
4       5          4


In [9]:
#Data Preprocessing

lap_times = lap_times.merge(races, on='raceId', how='left')
lap_times['normalized_time'] = lap_times['milliseconds'] / lap_times.groupby('raceId')['milliseconds'].transform('min')
lap_times = lap_times.merge(pit_stops[['raceId', 'driverId', 'lap']], on=['raceId', 'driverId', 'lap'], how='left', indicator=True)
lap_times['is_pit_stop'] = lap_times['_merge'] == 'both'
lap_times.drop(columns=['_merge'], inplace=True)
# Example: Filter out laps with normalized_time > 1.5 (assuming it's an anomaly)
lap_times = lap_times[lap_times['normalized_time'] <= 1.5]


In [10]:
#Feature Engineering

features = ['circuitId', 'lap', 'is_pit_stop']

lap_times['lag_1'] = lap_times.groupby(['raceId', 'driverId'])['normalized_time'].shift(1)
lap_times['lag_2'] = lap_times.groupby(['raceId', 'driverId'])['normalized_time'].shift(2)
features += ['lag_1', 'lag_2']
print(lap_times.head())

   raceId  driverId  lap  position      time  milliseconds  circuitId  \
0     841        20    1         1  1:38.109         98109          1   
1     841        20    2         1  1:33.006         93006          1   
2     841        20    3         1  1:32.713         92713          1   
3     841        20    4         1  1:32.803         92803          1   
4     841        20    5         1  1:32.342         92342          1   

   normalized_time  is_pit_stop     lag_1     lag_2  
0         1.103005        False       NaN       NaN  
1         1.045634        False  1.103005       NaN  
2         1.042340        False  1.045634  1.103005  
3         1.043352        False  1.042340  1.045634  
4         1.038169        False  1.043352  1.042340  


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Drop NaNs and prepare data
lap_times.dropna(subset=features + ['normalized_time'], inplace=True)
X = lap_times[features]
y = lap_times['normalized_time']

# Split data and train the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('MSE:', mean_squared_error(y_test, y_pred))


MSE: 0.0033871243821748196


In [8]:
from sklearn.ensemble import RandomForestRegressor

# Train a Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict using the Random Forest model
y_pred_rf = rf_model.predict(X_test)

# Calculate the MSE for the Random Forest model
mse_rf = mean_squared_error(y_test, y_pred_rf)
print('Random Forest MSE:', mse_rf)

# For comparison, print the MSE of the linear regression model if you haven't already
y_pred_lr = model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
print('Linear Regression MSE:', mse_lr)

# Optionally, you could calculate the improvement in MSE
improvement = mse_lr - mse_rf
print('MSE Improvement:', improvement)


Random Forest MSE: 0.001960514475127411
Linear Regression MSE: 0.0033871715948670562
MSE Improvement: 0.0014266571197396453


In [9]:
from sklearn.ensemble import GradientBoostingRegressor

gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
print('Gradient Boosting MSE:', mse_gb)


Gradient Boosting MSE: 0.002491870310213923


In [19]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print('XGBoost MSE:', mse_xgb)


XGBoost MSE: 0.0019985357516995247


In [14]:
import lightgbm as lgb

lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
lgb_model.fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
print('LightGBM MSE:', mse_lgb)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 642
[LightGBM] [Info] Number of data points in the train set: 412496, number of used features: 5
[LightGBM] [Info] Start training from score 1.061508
LightGBM MSE: 0.0021038731094620514


In [10]:
from sklearn.neural_network import MLPRegressor

nn_model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
nn_model.fit(X_train, y_train)
y_pred_nn = nn_model.predict(X_test)
mse_nn = mean_squared_error(y_test, y_pred_nn)
print('Neural Network MSE:', mse_nn)


Neural Network MSE: 0.009784947713594403
