In [7]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor

In [4]:
data = pd.read_csv("f1_clean.csv")
data.head()

Unnamed: 0,raceId,race,year,circuit,constructor,driver,qualifyingPos,position,teammate
0,337,Bahrain Grand Prix,2010,Bahrain International Circuit,Ferrari,ALO,3,1,OTHER
1,337,Bahrain Grand Prix,2010,Bahrain International Circuit,McLaren,HAM,4,3,OTHER
2,337,Bahrain Grand Prix,2010,Bahrain International Circuit,Red Bull,VET,1,4,OTHER
3,337,Bahrain Grand Prix,2010,Bahrain International Circuit,Williams,HUL,13,14,OTHER
4,338,Australian Grand Prix,2010,Albert Park Grand Prix Circuit,Ferrari,ALO,3,4,OTHER


## Create a model using all the data

In [6]:
# only use data from before 2023, so that we can predict on 2023 data
# prev_seasons = data.loc[data['year'] != 2023]
# prev_seasons

Unnamed: 0,raceId,race,year,circuit,constructor,driver,qualifyingPos,position,teammate
0,337,Bahrain Grand Prix,2010,Bahrain International Circuit,Ferrari,ALO,3,1,OTHER
1,337,Bahrain Grand Prix,2010,Bahrain International Circuit,McLaren,HAM,4,3,OTHER
2,337,Bahrain Grand Prix,2010,Bahrain International Circuit,Red Bull,VET,1,4,OTHER
3,337,Bahrain Grand Prix,2010,Bahrain International Circuit,Williams,HUL,13,14,OTHER
4,338,Australian Grand Prix,2010,Albert Park Grand Prix Circuit,Ferrari,ALO,3,4,OTHER
...,...,...,...,...,...,...,...,...,...
2457,1096,Abu Dhabi Grand Prix,2022,Yas Marina Circuit,AlphaTauri,GAS,17,14,TSU
2458,1096,Abu Dhabi Grand Prix,2022,Yas Marina Circuit,Alfa Romeo,BOT,18,15,ZHO
2459,1096,Abu Dhabi Grand Prix,2022,Yas Marina Circuit,Haas F1 Team,MAG,16,17,OTHER
2460,1096,Abu Dhabi Grand Prix,2022,Yas Marina Circuit,Mercedes,HAM,5,18,RUS


In [23]:
# apply one hot encoding to race, constuctor, driver, circuit, teammate

# idea maybe turn year into the numbers of year ago like negatives?

one_hot_encoded_data = pd.get_dummies(data, columns = ['race', 'circuit', 'constructor', 'driver', 'teammate'])
one_hot_encoded_data

Unnamed: 0,raceId,year,qualifyingPos,position,race_70th Anniversary Grand Prix,race_Abu Dhabi Grand Prix,race_Australian Grand Prix,race_Austrian Grand Prix,race_Azerbaijan Grand Prix,race_Bahrain Grand Prix,...,teammate_PER,teammate_RAI,teammate_RIC,teammate_RUS,teammate_SAI,teammate_STR,teammate_TSU,teammate_VER,teammate_VET,teammate_ZHO
0,337,2010,3,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,337,2010,4,3,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,337,2010,1,4,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,337,2010,13,14,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,338,2010,3,4,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2733,1117,2023,9,15,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2734,1117,2023,30,17,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2735,1117,2023,13,30,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2736,1117,2023,16,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# only use the 2023 data for testing 
one_hot_training = one_hot_encoded_data.loc[one_hot_encoded_data['year'] != 2023]
one_hot_training

Unnamed: 0,raceId,year,qualifyingPos,position,race_70th Anniversary Grand Prix,race_Abu Dhabi Grand Prix,race_Australian Grand Prix,race_Austrian Grand Prix,race_Azerbaijan Grand Prix,race_Bahrain Grand Prix,...,teammate_PER,teammate_RAI,teammate_RIC,teammate_RUS,teammate_SAI,teammate_STR,teammate_TSU,teammate_VER,teammate_VET,teammate_ZHO
0,337,2010,3,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,337,2010,4,3,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,337,2010,1,4,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,337,2010,13,14,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,338,2010,3,4,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2457,1096,2022,17,14,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2458,1096,2022,18,15,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2459,1096,2022,16,17,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2460,1096,2022,5,18,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [26]:
X_train = one_hot_training.drop('position',axis = 1)
X_train

Unnamed: 0,raceId,year,qualifyingPos,race_70th Anniversary Grand Prix,race_Abu Dhabi Grand Prix,race_Australian Grand Prix,race_Austrian Grand Prix,race_Azerbaijan Grand Prix,race_Bahrain Grand Prix,race_Belgian Grand Prix,...,teammate_PER,teammate_RAI,teammate_RIC,teammate_RUS,teammate_SAI,teammate_STR,teammate_TSU,teammate_VER,teammate_VET,teammate_ZHO
0,337,2010,3,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,337,2010,4,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,337,2010,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,337,2010,13,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,338,2010,3,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2457,1096,2022,17,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2458,1096,2022,18,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2459,1096,2022,16,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2460,1096,2022,5,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [27]:
y_train = one_hot_training['position']
y_train

0        1
1        3
2        4
3       14
4        4
        ..
2457    14
2458    15
2459    17
2460    18
2461    30
Name: position, Length: 2462, dtype: int64

In [28]:
# model 
regr = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)


In [29]:
this_season = one_hot_encoded_data.loc[one_hot_encoded_data['year'] == 2023]
this_season

Unnamed: 0,raceId,year,qualifyingPos,position,race_70th Anniversary Grand Prix,race_Abu Dhabi Grand Prix,race_Australian Grand Prix,race_Austrian Grand Prix,race_Azerbaijan Grand Prix,race_Bahrain Grand Prix,...,teammate_PER,teammate_RAI,teammate_RIC,teammate_RUS,teammate_SAI,teammate_STR,teammate_TSU,teammate_VER,teammate_VET,teammate_ZHO
2462,1098,2023,1,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
2463,1098,2023,2,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2464,1098,2023,5,3,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2465,1098,2023,4,4,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2466,1098,2023,7,5,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2733,1117,2023,9,15,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2734,1117,2023,30,17,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2735,1117,2023,13,30,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2736,1117,2023,16,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
one_hot_testing = one_hot_encoded_data.loc[one_hot_encoded_data['year'] == 2023]
one_hot_testing

Unnamed: 0,raceId,year,qualifyingPos,position,race_70th Anniversary Grand Prix,race_Abu Dhabi Grand Prix,race_Australian Grand Prix,race_Austrian Grand Prix,race_Azerbaijan Grand Prix,race_Bahrain Grand Prix,...,teammate_PER,teammate_RAI,teammate_RIC,teammate_RUS,teammate_SAI,teammate_STR,teammate_TSU,teammate_VER,teammate_VET,teammate_ZHO
2462,1098,2023,1,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
2463,1098,2023,2,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2464,1098,2023,5,3,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2465,1098,2023,4,4,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2466,1098,2023,7,5,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2733,1117,2023,9,15,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2734,1117,2023,30,17,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2735,1117,2023,13,30,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2736,1117,2023,16,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
X_test = one_hot_testing.drop('position',axis = 1)
X_test

Unnamed: 0,raceId,year,qualifyingPos,race_70th Anniversary Grand Prix,race_Abu Dhabi Grand Prix,race_Australian Grand Prix,race_Austrian Grand Prix,race_Azerbaijan Grand Prix,race_Bahrain Grand Prix,race_Belgian Grand Prix,...,teammate_PER,teammate_RAI,teammate_RIC,teammate_RUS,teammate_SAI,teammate_STR,teammate_TSU,teammate_VER,teammate_VET,teammate_ZHO
2462,1098,2023,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2463,1098,2023,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2464,1098,2023,5,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2465,1098,2023,4,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2466,1098,2023,7,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2733,1117,2023,9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2734,1117,2023,30,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2735,1117,2023,13,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2736,1117,2023,16,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
y_test = one_hot_testing['position']
y_test

2462     1
2463     2
2464     3
2465     4
2466     5
        ..
2733    15
2734    17
2735    30
2736    30
2737    30
Name: position, Length: 276, dtype: int64

In [38]:
regr.predict(X_test[:20])

array([ 7.60208368,  7.75972642, 10.85184397,  9.6650395 ,  7.90025389,
       11.21804633,  8.79322274, 14.83650359, 16.83007549, 15.72666548,
       16.21438309, 18.06436429, 16.09936173, 15.03702781, 12.93861643,
       12.82359616,  8.84679933,  8.33998106, 14.59775349, 10.53120974])

## Create model for each circuit

In [55]:
# get all the unique circuit values
current_circuits = data.loc[data['year'] == 2023].circuit.unique()
current_circuits

array(['Bahrain International Circuit', 'Jeddah Corniche Circuit',
       'Albert Park Grand Prix Circuit', 'Baku City Circuit',
       'Circuit de Monaco', 'Circuit de Barcelona-Catalunya',
       'Circuit Gilles Villeneuve', 'Red Bull Ring',
       'Silverstone Circuit', 'Hungaroring',
       'Circuit de Spa-Francorchamps', 'Autodromo Nazionale di Monza',
       'Marina Bay Street Circuit', 'Suzuka Circuit',
       'Circuit of the Americas', 'Autódromo Hermanos Rodríguez'],
      dtype=object)

In [62]:
for circuit in current_circuits: 
    circuit_data = data.loc[data['circuit'] == circuit]

    # one hot encode the categorical variables
    one_hot_encoded_data = pd.get_dummies(circuit_data, columns = ['race', 'circuit', 'constructor', 'driver', 'teammate'])

    # split the data into previous seasons for training and current season for testing 
    one_hot_training = one_hot_encoded_data.loc[one_hot_encoded_data['year'] != 2023]
    one_hot_testing = one_hot_encoded_data.loc[one_hot_encoded_data['year'] == 2023]
    
    # get the training and testing data
    X_train = one_hot_training.drop('position',axis = 1)
    y_train = one_hot_training['position']

    X_test = one_hot_testing.drop('position',axis = 1)
    y_test = one_hot_testing['position']

    # create the model
    regr = MLPRegressor(hidden_layer_sizes=(500,),random_state=1, max_iter=1000).fit(X_train, y_train)


    print(circuit)
    print(y_test[:20])
    print(regr.predict(X_test[:20]))

    



Bahrain International Circuit
2462     1
2463     2
2464     3
2465     4
2466     5
2467     6
2468     7
2469     8
2470     9
2471    10
2472    11
2473    13
2474    15
2475    16
2476    17
2477    30
2478    30
Name: position, dtype: int64
[25.07903975 25.15169919 25.22349015 24.99772907 25.24999075 25.24188272
 25.19396253 25.17496588 25.64822967 25.58165751 25.38169759 25.4402136
 25.51725534 25.29012406 25.32454966 25.16226497 24.89682962]
Jeddah Corniche Circuit
2479     1
2480     2
2481     3
2482     4
2483     5
2484     6
2485     7
2486     8
2487     9
2488    10
2489    11
2490    12
2491    13
2492    17
2493    18
2494    30
2495    30
Name: position, dtype: int64
[13.76314556 13.84518428 13.82121394 13.80334929 13.81086909 13.73589143
 13.79297731 13.91378454 13.95904225 14.02197827 13.91943824 14.01176196
 13.82336247 14.10411079 14.00128187 14.17473007 14.00806506]
Albert Park Grand Prix Circuit
2496     1
2497     2
2498     3
2499     4
2500     5
2501     6
25