In [286]:
# predict probability of podium based on data in form ([lap number, time per lap, pit stops?, grid number], podium?) -> or expected position
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [217]:
# load data
drivers = pd.read_excel('data/drivers.xlsx').set_index('driverId')
lapTimes = pd.read_csv('data/lapTimes.csv')
results = pd.read_csv('data/results.csv')
races = pd.read_csv('data/races.csv')

In [285]:
# for one race, then all in 2012-2016, predict on 2017
# features [results: grid, lap number (should increase confidence), position, lap time / min lap time for raceId, elapsed time / min for raceId]
# label [results: positionOrder]
# because of elapsed time, for 2nd lap onwards

race_ids = list(range(948, 969)) + list(range(931, 946)) + list(range(900, 919)) + list(range(880, 900)) + list(range(860, 880))
X = []
X_grid = []
y = []
y_pos = []

for race_id in race_ids:
    if race_id == 917:
        continue
    lapData = lapTimes[lapTimes['raceId'] == race_id].sort_values(['driverId', 'lap']) #FOR LOOP FOR ALL RACES CONSIDERED
    lapData['elapsed'] = np.zeros(len(lapData))
    for i in range(len(lapData) - 1):
        if lapData.iloc[i]['lap'] == 1:
            lapData.iloc[i, lapData.columns.get_loc('elapsed')] = 0
        lapData.iloc[i+1, lapData.columns.get_loc('elapsed')] = \
        lapData.iloc[i, lapData.columns.get_loc('elapsed')] + lapData.iloc[i, lapData.columns.get_loc('milliseconds')] / 1000
        lap_mins = [None]

    for g in (lap_groups := lapData[lapData['raceId'] == race_id].groupby('lap')).groups:
        lap_mins.append(lap_groups.get_group(g)['milliseconds'].min())

    elapsed_mins = [None]
    for g in lap_groups.groups:
        elapsed_mins.append(lap_groups.get_group(g)['elapsed'].min())

    for (index, row) in lapData[lapData['raceId'] == race_id].sort_values('lap').iterrows():
        if (row['lap'] >= 2):
            X.append([
                row['lap'] / lapData[lapData['raceId'] == race_id]['lap'].max(),
                row['position'],
                row['milliseconds'] / lap_mins[row['lap']],
                row['elapsed'] / elapsed_mins[row['lap']]
            ])
            X_grid.append(int(results[(results['raceId'] == race_id) & (results['driverId'] == row['driverId'])]['grid']))
            y.append(int(results[(results['raceId'] == race_id) & (results['driverId'] == row['driverId'])]['positionOrder']) in (1,2,3))
            y_pos.append(int(results[(results['raceId'] == race_id) & (results['driverId'] == row['driverId'])]['positionOrder']))

In [283]:
# logistic regression classifier to predict chance of winning for each initial grid position (cannot take % since uneven data)
clf_grid = LogisticRegression(fit_intercept=False).fit([[x] for x in X_grid], y)

for i in range(1,21):
    print(f'Grid Pos {i}: {round(clf_grid.predict_proba([[i]])[0][1], 3)}')

Grid Pos 1: 0.438
Grid Pos 2: 0.378
Grid Pos 3: 0.321
Grid Pos 4: 0.269
Grid Pos 5: 0.223
Grid Pos 6: 0.183
Grid Pos 7: 0.149
Grid Pos 8: 0.12
Grid Pos 9: 0.096
Grid Pos 10: 0.076
Grid Pos 11: 0.06
Grid Pos 12: 0.048
Grid Pos 13: 0.038
Grid Pos 14: 0.03
Grid Pos 15: 0.023
Grid Pos 16: 0.018
Grid Pos 17: 0.014
Grid Pos 18: 0.011
Grid Pos 19: 0.009
Grid Pos 20: 0.007


In [313]:
# fit SGDClassifier to predict final position
reg = make_pipeline(StandardScaler(), SGDClassifier(loss='hinge', penalty='l2', max_iter=1000))
reg.fit(X, y_pos)
reg.predict([[.9, 2, 1.04, 1.04]])

array([4])