In [2]:
# Goal: predict the probability of a podium finish (or, alternatively, the expected finishing position) from samples of the form ([lap number, time per lap, pit stops?, grid number], podium?)
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [3]:
# Load the raw tables from the local data/ directory.
# The drivers table is an Excel workbook and is keyed by driverId;
# the remaining three tables are plain CSV files.
drivers = pd.read_excel('data/drivers.xlsx')
drivers = drivers.set_index('driverId')
lapTimes, results, races = (
    pd.read_csv(csv_path)
    for csv_path in ('data/lapTimes.csv', 'data/results.csv', 'data/races.csv')
)

In [30]:
# add elapsed column
def add_elapsed(lap_times, race_id):
    """Return one race's laps, ordered by driver and lap, with an 'elapsed' column.

    'elapsed' is the time in seconds a driver had accumulated *before* the lap
    in that row: the sum of that driver's earlier 'milliseconds' values divided
    by 1000. Each driver's first row is therefore always 0.

    Parameters
    ----------
    lap_times : pandas.DataFrame
        Must contain 'raceId', 'driverId', 'lap' and 'milliseconds' columns.
    race_id : int
        Value of 'raceId' to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ('driverId', 'lap'); the input is not modified.
    """
    race_laps = (
        lap_times[lap_times['raceId'] == race_id]
        .sort_values(['driverId', 'lap'])
        .copy()  # own the data before adding a column to the filtered frame
    )
    # Running total per driver minus the current lap = time spent on earlier laps.
    # Grouping by driver guarantees one driver's total never leaks into the next
    # driver's rows; the earlier row-by-row loop relied on meeting a lap == 1 row
    # to undo that, and never reset the very last row of the frame.
    ms_including_current = race_laps.groupby('driverId')['milliseconds'].cumsum()
    race_laps['elapsed'] = (ms_including_current - race_laps['milliseconds']) / 1000
    return race_laps


lapTimes = add_elapsed(lapTimes, race_id=948)  # TODO: loop over all races considered

In [81]:
# features [results: grid, lap number (should increase confidence), position, lap time / min lap time for raceId, elapsed time / min for raceId]
# label [results: positionOrder]
# because of elapsed time, for 2nd lap onwards

# for one race, then all in 2012-2016, predict on 2017
race_id = 948
race_laps = lapTimes[lapTimes['raceId'] == race_id]

# Per-lap minima across all drivers, indexed by lap number.
lap_groups = race_laps.groupby('lap')
fastest_ms_by_lap = lap_groups['milliseconds'].min()
least_elapsed_by_lap = lap_groups['elapsed'].min()

# Positional lists kept under their original names: slot 0 is a placeholder and
# the remaining slots follow the laps in ascending order. They only line up with
# the lap number when laps are numbered 1..N without gaps, so the features below
# use the lap-indexed Series instead of positional indexing.
lap_mins = [None] + fastest_ms_by_lap.tolist()
elapsed_mins = [None] + least_elapsed_by_lap.tolist()
# NOTE(review): the header comment lists "elapsed time / min for raceId", but the
# feature actually used is the raw 'elapsed' value and elapsed_mins is never
# applied. Left as-is so the feature matrix is unchanged -- confirm the intent.

# One results row per driver for this race; only the columns needed here, so the
# merge cannot clash with the per-lap 'position' column.
race_results = results.loc[
    results['raceId'] == race_id, ['driverId', 'grid', 'positionOrder']
]

# A single validated join replaces two full scans of `results` per lap row.
# validate='many_to_one' raises if a driver has more than one result row.
# The stable sort keeps rows within a lap in a reproducible order.
feature_rows = (
    race_laps[race_laps['lap'] >= 2]
    .sort_values('lap', kind='stable')
    .merge(race_results, on='driverId', how='left', validate='many_to_one')
)

unmatched = feature_rows['grid'].isna() | feature_rows['positionOrder'].isna()
if unmatched.any():
    raise ValueError(
        'no grid/positionOrder in results for driverId(s): '
        + str(sorted(feature_rows.loc[unmatched, 'driverId'].unique().tolist()))
    )

# Lap time relative to the fastest time any driver set on that same lap.
feature_rows['lap_ratio'] = (
    feature_rows['milliseconds'] / feature_rows['lap'].map(fastest_ms_by_lap)
)

# Plain lists, one entry per (driver, lap) row, all in the same row order.
X = feature_rows[['lap', 'position', 'lap_ratio', 'elapsed']].to_numpy().tolist()
grid_x = feature_rows['grid'].astype(int).tolist()
y = feature_rows['positionOrder'].isin([1, 2, 3]).tolist()  # True = finished on the podium

In [73]:
# Fit a logistic regression on the per-lap feature rows (X) and podium labels (y).
clf = LogisticRegression()
clf = clf.fit(X, y)  # fit() returns the fitted estimator itself
# TODO: add a normally distributed random variable, scaled up by a lap constant, to the probability

In [84]:
# Baseline model that sees only the starting grid slot.
# grid_x and y hold one entry per (driver, lap) row, so every driver's grid slot
# and label are repeated once for each lap that driver contributed.
grid_clf = LogisticRegression().fit(
    np.reshape(grid_x, (-1, 1)),  # single feature -> column of shape (n_samples, 1)
    y,
)

In [86]:
# Class probabilities for a single sample with grid == 1;
# the output columns follow the order of grid_clf.classes_.
grid_clf.predict_proba(np.array([[1]]))

array([[1.89115072e-05, 9.99981088e-01]])