In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import svm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor

np.set_printoptions(precision=4)

In [2]:
# The following exercise will study the Accuracy Score of various Regression Models to predict a Team Change for any given Driver based on the factors WOE and IV defined as Strong Predictors.

# As always, let's deploy our Dataset first. 

data = pd.read_csv('/Users/alejandropalacios/Desktop/Ironhack/Data Analytics Bootcamp/Advanced Data Analysis Techniques/Project/F1-Grand-Prix-Predictor/Data Manipulation/Modeling/Modeled Databases/team_change_dataset.csv')

data.head()

Unnamed: 0,driver,season,round,circuit_id,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,nationality,...,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,qualifying_time,driver_age,wins_percentage,change_teams
0,baldi,1983,1,jacarepagua,False,False,True,False,False,Italian,...,0,0,0,0,0,0,1.454,29,,1
1,serra,1983,1,jacarepagua,False,False,True,False,False,Brazilian,...,0,0,0,0,0,0,5.293,26,0.0,1
2,surer,1983,1,jacarepagua,False,False,True,False,False,Swiss,...,0,0,0,0,0,0,3.796,31,,1
3,manfred_winkelhock,1983,1,jacarepagua,False,False,True,False,False,German,...,0,0,0,0,0,0,6.481,31,,1
4,patrese,1983,1,jacarepagua,False,False,True,False,False,Italian,...,0,0,0,0,0,0,1.286,28,,1


In [3]:
# We must get dummies for the categorical variables dataset in order to properly run our Model.

df_woeiv = data[['season','round','constructor_standings_pos','constructor_wins','driver','qualifying_time','constructor','podium','change_teams']]

df_dum = pd.get_dummies(df_woeiv, columns = ['constructor','driver'] )

for col in df_dum.columns:
    if 'constructor' in col and df_dum[col].sum() < 140:
        df_dum.drop(col, axis = 1, inplace = True)
    
    elif 'driver' in col and df_dum[col].sum() < 140:
        df_dum.drop(col, axis = 1, inplace = True)
    else:
        pass

df_dum.to_csv('/Users/alejandropalacios/Desktop/Ironhack/Data Analytics Bootcamp/Advanced Data Analysis Techniques/Project/F1-Grand-Prix-Predictor/Data Manipulation/Modeling/Modeled Databases/team_change_dummies_dataset.csv', index = False)

In [4]:
# Now let's read our Dummified Data.

df = pd.read_csv('/Users/alejandropalacios/Desktop/Ironhack/Data Analytics Bootcamp/Advanced Data Analysis Techniques/Project/F1-Grand-Prix-Predictor/Data Manipulation/Modeling/Modeled Databases/team_change_dummies_dataset.csv')

In [5]:
# Fill the null rows with the mean of respective columns
df = df.fillna(df.mean())

In [6]:
# Send our Dependent Variable to the end of the Dataset.
df = df[[c for c in df if c not in ['change_teams']] + ['change_teams']]

df.head()

Unnamed: 0,season,round,constructor_standings_pos,constructor_wins,qualifying_time,podium,constructor_arrows,constructor_bar,constructor_benetton,constructor_brabham,...,driver_raikkonen,driver_ralf_schumacher,driver_ricciardo,driver_rosberg,driver_senna,driver_trulli,driver_vettel,driver_villeneuve,driver_webber,change_teams
0,1983,1,0,0,1.454,21,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1983,1,0,0,5.293,8,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1983,1,0,0,3.796,5,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1983,1,0,0,6.481,14,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1983,1,0,0,1.286,23,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# Let's First Split our data into Training and Testing Sets. The Train set contains all races from 1983 to 2018. The Test set consists of all 21 races in the season of 2019.

train = df[df.season <2019]
X_train = train.drop(['change_teams'], axis = 1)
y_train = train.change_teams

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

In [8]:
def score_regression(model):
    score = 0
    for circuit in df[df.season == 2019]['round'].unique():

        test = df[(df.season == 2019) & (df['round'] == circuit)]
        X_test = test.drop(['change_teams'], axis = 1)
        y_test = test.change_teams

        #scaling
        X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

        # make predictions
        prediction_df = pd.DataFrame(model.predict(X_test), columns = ['results'])
        prediction_df['change_teams'] = y_test.reset_index(drop = True)
        prediction_df['actual'] = prediction_df.change_teams.map(lambda x: 1 if x == 1 else 0)
        prediction_df.sort_values('results', ascending = True, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)
        prediction_df['predicted'] = prediction_df.index
        prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)

        score += precision_score(prediction_df.actual, prediction_df.predicted)

    model_score = score / df[df.season == 2019]['round'].unique().max()
    return model_score

In [9]:
comparison_dict ={'model':[],
                  'params': [],
                  'score': []}

In [10]:
# Linear Regression

params={'fit_intercept': ['True', 'False']}

for fit_intercept in params['fit_intercept']:
    model_params = (fit_intercept)
    model = LinearRegression(fit_intercept = fit_intercept)
    model.fit(X_train, y_train)
            
    model_score = score_regression(model)
            
    comparison_dict['model'].append('linear_regression')
    comparison_dict['params'].append(model_params)
    comparison_dict['score'].append(model_score)

In [11]:
# Random Forest Regressor

params={'criterion': ['mse'],
        'max_features': [0.8, 'auto', None],
        'max_depth': list(np.linspace(5, 55, 26)) + [None]}

for criterion in params['criterion']:
    for max_features in params['max_features']:
        for max_depth in params['max_depth']:
            model_params = (criterion, max_features, max_depth)
            model = RandomForestRegressor(criterion = criterion,
                                          max_features = max_features, max_depth = max_depth, random_state = 1)
            model.fit(X_train, y_train)
            
            model_score = score_regression(model)
            
            comparison_dict['model'].append('random_forest_regressor')
            comparison_dict['params'].append(model_params)
            comparison_dict['score'].append(model_score)

In [12]:
# Support Vector Machines

params={'gamma': np.logspace(-4, -1, 10),
        'C': np.logspace(-2, 1, 10),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid']} 

for gamma in params['gamma']:
    for c in params['C']:
        for kernel in params['kernel']:
            model_params = (gamma, c, kernel)
            model = svm.SVR(gamma = gamma, C = c, kernel = kernel)
            model.fit(X_train, y_train)
            
            model_score = score_regression(model)
            
            comparison_dict['model'].append('svm_regressor')
            comparison_dict['params'].append(model_params)
            comparison_dict['score'].append(model_score)

In [13]:
# Neural network

params={'hidden_layer_sizes': [(80,20,40,5), (75,30,50,10,3)], 
        'activation': ['identity', 'relu','logistic', 'tanh',], 
        'solver': ['lbfgs','sgd', 'adam'], 
        'alpha': np.logspace(-4,1,20)} 

for hidden_layer_sizes in params['hidden_layer_sizes']:
    for activation in params['activation']:
        for solver in params['solver']:
            for alpha in params['alpha']:
                model_params = (hidden_layer_sizes, activation, solver, alpha )
                model = MLPRegressor(hidden_layer_sizes = hidden_layer_sizes,
                                      activation = activation, solver = solver, alpha = alpha, random_state = 1)
                model.fit(X_train, y_train)

                model_score = score_regression(model)

                comparison_dict['model'].append('nn_regressor')
                comparison_dict['params'].append(model_params)
                comparison_dict['score'].append(model_score)

In [14]:
pd.DataFrame(comparison_dict).groupby('model')['score'].max()

model
linear_regression          0.047619
nn_regressor               0.095238
random_forest_regressor    0.047619
svm_regressor              0.047619
Name: score, dtype: float64