In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
import itertools

In [2]:
data_raw = pd.read_csv("f1_clean.csv")
data_raw.head()

Unnamed: 0,raceId,race,year,circuit,constructor,driver,qualifyingPos,position,points,teammate
0,337,Bahrain Grand Prix,2010,Bahrain International Circuit,Ferrari,ALO,3,1,25.0,OTHER
1,337,Bahrain Grand Prix,2010,Bahrain International Circuit,McLaren,HAM,4,3,15.0,OTHER
2,337,Bahrain Grand Prix,2010,Bahrain International Circuit,Red Bull,VET,1,4,12.0,OTHER
3,337,Bahrain Grand Prix,2010,Bahrain International Circuit,Williams,HUL,13,14,0.0,OTHER
4,338,Australian Grand Prix,2010,Albert Park Grand Prix Circuit,Ferrari,ALO,3,4,12.0,OTHER


In [3]:
data_train = data_raw[["year", "circuit", "constructor", "driver", "qualifyingPos", "teammate", "position", "points"]]
data_train

Unnamed: 0,year,circuit,constructor,driver,qualifyingPos,teammate,position,points
0,2010,Bahrain International Circuit,Ferrari,ALO,3,OTHER,1,25.0
1,2010,Bahrain International Circuit,McLaren,HAM,4,OTHER,3,15.0
2,2010,Bahrain International Circuit,Red Bull,VET,1,OTHER,4,12.0
3,2010,Bahrain International Circuit,Williams,HUL,13,OTHER,14,0.0
4,2010,Albert Park Grand Prix Circuit,Ferrari,ALO,3,OTHER,4,12.0
...,...,...,...,...,...,...,...,...
2733,2023,Autódromo Hermanos Rodríguez,Alfa Romeo,BOT,9,ZHO,15,0.0
2734,2023,Autódromo Hermanos Rodríguez,Aston Martin,STR,0,ALO,17,0.0
2735,2023,Autódromo Hermanos Rodríguez,Aston Martin,ALO,13,STR,0,0.0
2736,2023,Autódromo Hermanos Rodríguez,Haas F1 Team,MAG,16,HUL,0,0.0


In [4]:
def standard_scaler_scale(data):
    return (data - data.mean()) / data.std()

def standard_scaler_unscale(data, mean, std):
    return data * std + mean

In [5]:
circuits = data_train.circuit.unique()
drivers = data_train.driver.unique()
teammates = data_train.teammate.unique()
constructors = data_train.constructor.unique()
# qualifying_positions = data_train.qualifyingPos.unique()
qualifying_positions = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
all_list = [circuits, drivers, teammates, qualifying_positions, constructors]
 
res = list(itertools.product(*all_list))

all_permutations = pd.DataFrame(res, columns=['circuit', 'driver', 'teammate', 'qualifyingPos', 'constructor'])
cleaned_permutations = all_permutations.loc[all_permutations['driver'] != all_permutations['teammate']]
cleaned_permutations


Unnamed: 0,circuit,driver,teammate,qualifyingPos,constructor
0,Bahrain International Circuit,ALO,OTHER,1,Ferrari
1,Bahrain International Circuit,ALO,OTHER,1,McLaren
2,Bahrain International Circuit,ALO,OTHER,1,Red Bull
3,Bahrain International Circuit,ALO,OTHER,1,Williams
4,Bahrain International Circuit,ALO,OTHER,1,Alfa Romeo
...,...,...,...,...,...
1755395,Jeddah Corniche Circuit,ZHO,TSU,20,Alpine F1 Team
1755396,Jeddah Corniche Circuit,ZHO,TSU,20,AlphaTauri
1755397,Jeddah Corniche Circuit,ZHO,TSU,20,Aston Martin
1755398,Jeddah Corniche Circuit,ZHO,TSU,20,Mercedes


In [8]:
dnf_circuit = pd.read_csv('f1_dnf_circuit.csv')
dnf_circuit

Unnamed: 0,circuit,percentDnf
0,Albert Park Grand Prix Circuit,0.25
1,Autodromo Enzo e Dino Ferrari,0.176471
2,Autodromo Nazionale di Monza,0.135294
3,Autódromo Hermanos Rodríguez,0.138211
4,Autódromo José Carlos Pace,0.136691
5,Bahrain International Circuit,0.131148
6,Baku City Circuit,0.209091
7,Circuit Gilles Villeneuve,0.107914
8,Circuit de Barcelona-Catalunya,0.115607
9,Circuit de Monaco,0.154839


In [19]:
current_circuits = data_train.loc[data_train['year'] == 2023]['circuit'].unique()
current_drivers = data_train.loc[data_train['year'] == 2023]['driver'].unique()

In [20]:
all_preds = []

# create a dataframe to store the results
final_results = pd.DataFrame(columns=['circuit', 'driver', 'pred_pos'])

for circuit in current_circuits: 

    # create training data
    circuit_data = data_train.loc[(data_train['circuit'] == circuit) & (data_train['year'] !=2023)]

    # combine the circuit data and permutations 
    cleaned_circuit_data = circuit_data.drop(['position', 'year', 'points'],axis = 1)

    # track the split for when combining the cleaned and the permutations
    len_cleaned_circuit_data = cleaned_circuit_data.shape[0]

    # remove drivers/teammates that didn't drive this year
    not_driving = ['GRO', 'VET', 'RAI']

    current_permutations = cleaned_permutations[(~cleaned_permutations['driver'].isin(not_driving))&(~cleaned_permutations['teammate'].isin(not_driving))]

    # add the permutation data to the circuit data
    combined_circuit_permutations = cleaned_circuit_data.append(current_permutations, ignore_index=True)

    # one hot encode all the data
    one_hot_encoded_data = pd.get_dummies(combined_circuit_permutations, columns = ['circuit', 'constructor', 'driver', 'teammate'])

    # get the training data
    data_train_scaled = one_hot_encoded_data.copy()[:len_cleaned_circuit_data]
    data_train_scaled.qualifyingPos = standard_scaler_scale(data_train_scaled.qualifyingPos)

    X_train = data_train_scaled
    y_train = standard_scaler_scale(circuit_data['position'])

    # get the testing data
    X_test = one_hot_encoded_data.copy()[len_cleaned_circuit_data:]
    # make sure the testing data is just per circuit 
    X_test = X_test.loc[X_test['circuit_'+circuit] == 1]
    X_test.qualifyingPos = standard_scaler_scale(X_test.qualifyingPos)

    # create the circuit specific model
    regr = MLPRegressor(
        hidden_layer_sizes=(500, 250, 100, 50), 
        max_iter=500,
        random_state=203043043
        ).fit(X_train, y_train)

    # randomly add in dnf 
    dnf_rate = dnf_circuit.loc[dnf_circuit['circuit'] == circuit]['percentDnf'].iloc[0]

    num_values_to_replace = int(dnf_rate * len(X_test['qualifyingPos']))
    indices_to_replace = np.random.choice(X_test.index, num_values_to_replace, replace=False)
    X_test.loc[indices_to_replace, 'qualifyingPos'] = 0

    # predict and unscale the predicted positions
    pred_vals = standard_scaler_unscale(regr.predict(X_test), circuit_data.position.mean(), 
                                            circuit_data.position.std()).astype(np.int32)
    X_test['all_preds'] = pred_vals     

    for driver in current_drivers:

        # find the median position for each driver
        med_pos = X_test.loc[X_test['driver_'+driver] == 1]['all_preds'].median()
        # add the results to the dataframe
        final_results = final_results.append({'circuit': circuit, 'driver': driver, 'pred_pos': med_pos}, ignore_index=True)  

final_results['pred_points'] = final_results['pred_pos']   
final_results['pred_points'] = final_results['pred_points'].replace({1.0: 25, 2.0: 18, 3.0: 15, 4.0: 12, 5.0: 10, 6.0: 8, 7.0: 6, 8.0: 4, 9.0: 2, 10.0: 1})



In [21]:
final_results

Unnamed: 0,circuit,driver,pred_pos,pred_points
0,Bahrain International Circuit,VER,6.0,8.0
1,Bahrain International Circuit,PER,9.0,2.0
2,Bahrain International Circuit,ALO,6.0,8.0
3,Bahrain International Circuit,SAI,3.0,15.0
4,Bahrain International Circuit,HAM,7.0,6.0
...,...,...,...,...
283,Autódromo Hermanos Rodríguez,ZHO,9.0,2.0
284,Autódromo Hermanos Rodríguez,NOR,4.0,12.0
285,Autódromo Hermanos Rodríguez,OCO,7.0,6.0
286,Autódromo Hermanos Rodríguez,LEC,5.0,10.0


In [22]:
final_results.groupby('driver').sum().sort_values(by=['pred_points'], ascending=False)

Unnamed: 0_level_0,pred_pos,pred_points
driver,Unnamed: 1_level_1,Unnamed: 2_level_1
HAM,66.0,202.0
SAI,74.0,187.0
LEC,73.0,187.0
ALO,75.0,184.0
VER,82.0,160.0
PER,83.0,158.0
RIC,87.0,157.0
TSU,108.0,153.0
HUL,95.0,151.0
GAS,98.0,146.0


## Permutation Significance Test

In [31]:
# do a permutation test using kendalltau
from scipy.stats import kendalltau

def kendall_permutation_test(observed, predicted, num_permutations=150000):
    name_to_rank = {name: rank for rank, name in enumerate(observed, start=1)}

    observed_ranks = np.array([name_to_rank[name] for name in observed])
    predicted_ranks = np.array([name_to_rank[name] for name in predicted])

    observed_tau, _ = kendalltau(observed_ranks, predicted_ranks)

    permuted_taus = np.zeros(num_permutations)

    for i in range(num_permutations):
        permuted_predicted_ranks = np.random.permutation(predicted_ranks)
        permuted_tau, _ = kendalltau(observed_ranks, permuted_predicted_ranks)
        permuted_taus[i] = permuted_tau

    p_value = np.mean(np.abs(permuted_taus) >= np.abs(observed_tau))

    return observed_tau, p_value

actual_rankings = list(data_train.loc[data_train['year'] == 2023].groupby('driver').sum().sort_values(by=['points'], ascending=False).index)
predicted_rankings = list(final_results.groupby('driver').sum().sort_values(by=['pred_points'], ascending=False).index)

observed_tau, p_value = kendall_permutation_test(actual_rankings, predicted_rankings)
print(f"Observed Kendall's Tau: {observed_tau}")
print(f"P-value: {p_value}")


Observed Kendall's Tau: 0.33333333333333337
P-value: 0.057806666666666666


## ignore below

In [14]:
circuit = 'Bahrain International Circuit'
all_preds = []
final_results = pd.DataFrame(columns=['circuit', 'driver', 'pred_pos'])

# create training data
circuit_data = data_train.loc[(data_train['circuit'] == circuit) & (data_train['year'] !=2023)]

# combine the circuit data and permutations 
cleaned_circuit_data = circuit_data.drop(['position', 'year', 'points'],axis = 1)

# track the split for when combining the cleaned and the permutations
len_cleaned_circuit_data = cleaned_circuit_data.shape[0]

# remove drivers/teammates that didn't drive this year
not_driving = ['GRO', 'VET', 'RAI']

current_permutations = cleaned_permutations[(~cleaned_permutations['driver'].isin(not_driving))&(~cleaned_permutations['teammate'].isin(not_driving))]

# add the permutation data to the circuit data
combined_circuit_permutations = cleaned_circuit_data.append(current_permutations, ignore_index=True)

# one hot encode all the data
one_hot_encoded_data = pd.get_dummies(combined_circuit_permutations, columns = ['circuit', 'constructor', 'driver', 'teammate'])

# get the training data
data_train_scaled = one_hot_encoded_data.copy()[:len_cleaned_circuit_data]
data_train_scaled.qualifyingPos = standard_scaler_scale(data_train_scaled.qualifyingPos)

X_train = data_train_scaled
y_train = standard_scaler_scale(circuit_data['position'])

# get the testing data
X_test = one_hot_encoded_data.copy()[len_cleaned_circuit_data:]
X_test = X_test.loc[X_test['circuit_'+circuit] == 1]
X_test.qualifyingPos = standard_scaler_scale(X_test.qualifyingPos)

# create the circuit specific model
regr = MLPRegressor(
    hidden_layer_sizes=(500, 250, 100, 50), 
    max_iter=500,
    random_state=203043043
    ).fit(X_train, y_train)

# randomly add in dnf 
dnf_rate = dnf_circuit.loc[dnf_circuit['circuit'] == circuit]['percentDnf'].iloc[0]

num_values_to_replace = int(dnf_rate * len(X_test['qualifyingPos']))
indices_to_replace = np.random.choice(X_test.index, num_values_to_replace, replace=False)
X_test.loc[indices_to_replace, 'qualifyingPos'] = 0

# predict and unscale the predicted positions
pred_vals = standard_scaler_unscale(regr.predict(X_test), circuit_data.position.mean(), 
                                        circuit_data.position.std()).astype(np.int32)
X_test['all_preds'] = pred_vals     

for driver in drivers:

    # find the median position for each driver
    med_pos = X_test.loc[X_test['driver_'+driver] == 1]['all_preds'].median()
    # add the results to the dataframe
    final_results = final_results.append({'circuit': circuit, 'driver': driver, 'pred_pos': med_pos}, ignore_index=True)  

final_results['pred_points'] = final_results['pred_pos']   
final_results['pred_points'] = final_results['pred_points'].replace({1.0: 25, 2.0: 18, 3.0: 15, 4.0: 12, 5.0: 10, 6.0: 8, 7.0: 6, 8.0: 4, 9.0: 2, 10.0: 1})







In [18]:
circuit = 'Bahrain International Circuit'
# iterate through entire dataset

all_preds = []

# create a dataframe to store the results
final_results = pd.DataFrame(columns=['circuit', 'driver', 'pred_pos'])

# for circuit in current_circuits: 
# create model specific to a circuit
circuit_data = data_train.loc[(data_train['circuit'] == circuit)]
# circuit_training_drivers = data_train
one_hot_encoded_data = pd.get_dummies(circuit_data, columns = ['circuit', 'constructor', 'driver', 'teammate'])
data_train_scaled = one_hot_encoded_data.copy()
data_train_scaled.qualifyingPos = standard_scaler_scale(one_hot_encoded_data.qualifyingPos)
data_train_scaled.position = standard_scaler_scale(one_hot_encoded_data.position)

train_set = data_train_scaled[data_train_scaled.year != 2023]
test_set = data_train_scaled[data_train_scaled.year == 2023]

X_train = train_set.drop(['position', 'year', 'points'],axis = 1)
y_train = train_set['position']

# circuit_training_drivers = X_train.driver.unique().str.split()

regr = MLPRegressor(
    hidden_layer_sizes=(500, 250, 100, 50), 
    max_iter=500,
    random_state=203043043
    ).fit(X_train, y_train)


# get the data specific to the circuit

# remove columns that are in test set but are not in training set 
circuit_training_drivers = data_train.driver.loc[(data_train['circuit'] == circuit) & (data_train['year'] != 2023)].unique()
circuit_training_teammates = data_train.teammate.loc[(data_train['circuit'] == circuit) & (data_train['year'] != 2023)].unique()

# if there are drivers in the permutation but not in the training set then remove them
circuit_permutation = cleaned_permutations.loc[(cleaned_permutations['circuit'] == circuit)]
circuit_permutation = circuit_permutation[circuit_permutation['driver'].isin(circuit_training_drivers)]
circuit_permutation = circuit_permutation[circuit_permutation['teammate'].isin(circuit_training_teammates)]
# do one hot encoding requires all possible combinations to be same as training set
one_hot_circuit = pd.get_dummies(circuit_permutation, columns = ['circuit', 'constructor', 'driver', 'teammate'])



# need to randomly make a couple qualifying positions dnq
dnf_rate = dnf_circuit.loc[dnf_circuit['circuit'] == circuit]['percentDnf'].iloc[0]

num_values_to_replace = int(dnf_rate * len(one_hot_circuit['qualifyingPos']))
indices_to_replace = np.random.choice(one_hot_circuit.index, num_values_to_replace, replace=False)
one_hot_circuit.loc[indices_to_replace, 'qualifyingPos'] = 0

# predict on the circuit
pred_vals = standard_scaler_unscale(regr.predict(one_hot_circuit), one_hot_encoded_data.position.mean(), 
                                        one_hot_encoded_data.position.std()).astype(np.int32)
one_hot_circuit['all_preds'] = pred_vals     

for driver in circuit_training_drivers:

    # find the median position for each driver
    med_pos = one_hot_circuit.loc[one_hot_circuit['driver_'+driver] == 1]['all_preds'].median()
    # add the results to the dataframe
    final_results = final_results.append({'circuit': circuit, 'driver': driver, 'pred_pos': med_pos}, ignore_index=True)  

final_results['pred_points'] = final_results['pred_pos']   
final_results['pred_points'] = final_results['pred_points'].replace({1.0: 25, 2.0: 18, 3.0: 15, 4.0: 12, 5.0: 10, 6.0: 8, 7.0: 6, 8.0: 4, 9.0: 2, 10.0: 1})

In [15]:
final_results

Unnamed: 0,circuit,driver,pred_pos,pred_points
0,Bahrain International Circuit,ALO,6.0,8.0
1,Bahrain International Circuit,HAM,7.0,6.0
2,Bahrain International Circuit,VET,,
3,Bahrain International Circuit,HUL,9.0,2.0
4,Bahrain International Circuit,PER,9.0,2.0
5,Bahrain International Circuit,RAI,,
6,Bahrain International Circuit,RIC,6.0,8.0
7,Bahrain International Circuit,GRO,,
8,Bahrain International Circuit,BOT,7.0,6.0
9,Bahrain International Circuit,MAG,6.0,8.0
