In [438]:
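# Goal: predict the probability of a podium -- or the expected finishing position -- from per-lap data of the form ([lap number, time per lap, pit stops?, grid number], podium?).
# NOTE: the binary label actually built below is "finished first" (positionOrder == 1), not "podium".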
import numpy as np
import pandas as pd
import pickle

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier

In [439]:
# Read the source tables from the local data/ directory.
races = pd.read_csv('data/races.csv')
results = pd.read_csv('data/results.csv')
lapTimes = pd.read_csv('data/lapTimes.csv')
# The driver table is the only Excel file; it is keyed by driverId.
drivers = pd.read_excel('data/drivers.xlsx').set_index('driverId')

In [440]:
# Build the per-lap training set for the races listed in race_ids
# (the 2012-2016 seasons, per the original note).
#
# One sample is emitted per (race, driver, lap) row of lapTimes with lap >= 2.
# Lap 1 is excluded because every driver's elapsed time is 0 at its start, so
# the elapsed-time ratio is undefined there.
#
#   X      : [lap / highest lap number recorded in the race,
#             position on this lap,
#             lap time / fastest time set on this lap,
#             elapsed time at lap start / smallest elapsed time on this lap]
#   X_grid : starting grid slot of the driver          (results.grid)
#   y      : True when the driver finished first       (results.positionOrder == 1)
#   y_pos  : finishing position of the driver          (results.positionOrder)
#
# Samples are ordered by race (in race_ids order), then lap, then driverId.

race_ids = list(range(948, 969)) + list(range(931, 946)) + list(range(900, 919)) + list(range(880, 900)) + list(range(860, 880))
X = []
X_grid = []
y = []
y_pos = []

for race_id in race_ids:
    # NOTE(review): race 917 is deliberately skipped; the reason is not recorded in this notebook.
    if race_id == 917:
        continue

    lapData = lapTimes[lapTimes['raceId'] == race_id].sort_values(['driverId', 'lap'])
    if lapData.empty:
        continue

    # Elapsed race time (seconds) at the START of each lap = sum of that driver's
    # earlier laps. Grouping by driver guarantees one driver's running total can
    # never leak into the next driver's rows.
    lap_ms = lapData['milliseconds']
    lapData['elapsed'] = (lap_ms.groupby(lapData['driverId']).cumsum() - lap_ms) / 1000

    last_lap = lapData['lap'].max()

    # Attach grid / finishing position with a single validated join instead of
    # filtering `results` three times for every lap row.
    race_results = results.loc[results['raceId'] == race_id, ['driverId', 'grid', 'positionOrder']]
    samples = (
        lapData[lapData['lap'] >= 2]
        .sort_values('lap', kind='stable')  # stable => deterministic (lap, driverId) order
        .merge(race_results, on='driverId', how='left', validate='many_to_one')
    )
    if samples.empty:
        continue

    unmatched = samples['positionOrder'].isna()
    if unmatched.any():
        missing_drivers = sorted(samples.loc[unmatched, 'driverId'].unique().tolist())
        raise ValueError(f'race {race_id}: no results row for driverId(s) {missing_drivers}')

    # Per-lap minima are keyed by the actual lap number, so gaps in the lap
    # numbering cannot shift them onto the wrong lap.
    by_lap = samples.groupby('lap')
    features = np.column_stack([
        samples['lap'] / last_lap,
        samples['position'],
        samples['milliseconds'] / by_lap['milliseconds'].transform('min'),
        samples['elapsed'] / by_lap['elapsed'].transform('min'),
    ]).astype(float)
    if not np.isfinite(features).all():
        raise ValueError(f'race {race_id}: non-finite feature values (zero lap-time or elapsed-time minimum?)')

    X.extend(features.tolist())
    X_grid.extend(samples['grid'].astype(int).tolist())
    y.extend((samples['positionOrder'] == 1).tolist())
    y_pos.extend(samples['positionOrder'].astype(int).tolist())

In [441]:
# Win probability as a function of starting grid slot alone.
# A logistic regression is fitted instead of taking raw win percentages per
# slot because the data is uneven (original author's note). X_grid / y hold
# one entry per lap sample, not one per race entry.
# With fit_intercept=False the model is sigmoid(w * grid), so grid 0 maps to 0.5.
grid_column = np.asarray(X_grid).reshape(-1, 1)
clf_grid = LogisticRegression(fit_intercept=False).fit(grid_column, y)

# Column 1 of predict_proba is the probability of the True ("won") class.
grid_slots = np.arange(1, 21)
win_probs = clf_grid.predict_proba(grid_slots.reshape(-1, 1))[:, 1]
for grid_pos, win_prob in zip(grid_slots, win_probs):
    print(f'Grid Pos {grid_pos}: {round(win_prob, 3)}')

Grid Pos 1: 0.351
Grid Pos 2: 0.226
Grid Pos 3: 0.136
Grid Pos 4: 0.079
Grid Pos 5: 0.044
Grid Pos 6: 0.024
Grid Pos 7: 0.013
Grid Pos 8: 0.007
Grid Pos 9: 0.004
Grid Pos 10: 0.002
Grid Pos 11: 0.001
Grid Pos 12: 0.001
Grid Pos 13: 0.0
Grid Pos 14: 0.0
Grid Pos 15: 0.0
Grid Pos 16: 0.0
Grid Pos 17: 0.0
Grid Pos 18: 0.0
Grid Pos 19: 0.0
Grid Pos 20: 0.0


In [442]:
# Two multi-class classifiers over the finishing position (y_pos), both trained
# on the per-lap feature rows in X.

# Linear model: features are standardised first, then fitted with SGD.
# random_state is pinned (same seed as the MLP below) so that the fit -- and
# the score reported further down -- is reproducible across kernel restarts.
sgd_clf = make_pipeline(
    StandardScaler(),
    SGDClassifier(loss='modified_huber', penalty='elasticnet', max_iter=10000, random_state=1),
)
sgd_clf.fit(X, y_pos)

# Neural network: one hidden layer of 12 units, trained on the unscaled features.
nn_clf = MLPClassifier(hidden_layer_sizes=(12,), random_state=1, max_iter=100)
nn_clf.fit(X, y_pos)



In [446]:
# Probe the neural network with one hand-written sample, in the feature order
# used for X: [lap fraction, position, lap-time ratio, elapsed-time ratio].
# Author's observation: the network performs better and more consistently than
# the SGD model, since its probabilities are better spread out.
vec = [.1, 5, 1, 1.01]
probe = [vec]
nn_probs = nn_clf.predict_proba(probe)[0]  # one probability per finishing position class
nn_pred = nn_clf.predict(probe)
print(nn_probs)
print(nn_pred)
# Accuracy on the same rows the model was fitted on (training accuracy).
nn_clf.score(X, y_pos)

[0.00673825 0.04320008 0.05123266 0.10059823 0.15279608 0.17470421
 0.1287309  0.07513394 0.04845333 0.04529304 0.02358048 0.01645719
 0.01883897 0.0086729  0.01108026 0.00791394 0.01088938 0.00889445
 0.0176549  0.02107401 0.00892152 0.01301186 0.00409496 0.00203444]
[6]


0.27339317228684745

In [444]:
# Run the probe sample `vec` (defined in an earlier cell) through the SGD
# pipeline for comparison.
sgd_probs = sgd_clf.predict_proba([vec])
sgd_pred = sgd_clf.predict([vec])
print(sgd_probs)
print(sgd_pred)
# Accuracy on the same rows the pipeline was fitted on (training accuracy).
sgd_clf.score(X, y_pos)

[[0.         0.09106643 0.09779911 0.14169046 0.11784386 0.14033768
  0.12027855 0.0608513  0.         0.01901675 0.04922827 0.06057154
  0.02147929 0.0125017  0.02383935 0.         0.         0.
  0.         0.04349572 0.         0.         0.         0.        ]]
[4]


0.16127104009715923

In [445]:
# Persist the fitted neural network; get_data and send_data load it from here.
model_path = 'models/exp_pos_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(nn_clf, model_file)