In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import  LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score

In [2]:
# The following exercise will study the Accuracy Score of various Regression Models to predict a Team Change for any given Driver based on the factors WOE and IV defined as Strong Predictors.

# As always, let's deploy our Dataset first. 

data = pd.read_csv('/Users/alejandropalacios/Desktop/Ironhack/Data Analytics Bootcamp/Advanced Data Analysis Techniques/Project/F1-Grand-Prix-Predictor/Data Manipulation/Modeling/Modeled Databases/team_change_dataset_shift.csv')

data.head()

Unnamed: 0,driver,season,round,circuit_id,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,nationality,...,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,qualifying_time,driver_age,wins_percentage,change_teams
0,baldi,1983,1,jacarepagua,False,False,True,False,False,Italian,...,0,0,0,0,0,0,1.454,29,,1
1,serra,1983,1,jacarepagua,False,False,True,False,False,Brazilian,...,0,0,0,0,0,0,5.293,26,0.0,1
2,surer,1983,1,jacarepagua,False,False,True,False,False,Swiss,...,0,0,0,0,0,0,3.796,31,,1
3,manfred_winkelhock,1983,1,jacarepagua,False,False,True,False,False,German,...,0,0,0,0,0,0,6.481,31,,1
4,patrese,1983,1,jacarepagua,False,False,True,False,False,Italian,...,0,0,0,0,0,0,1.286,28,,1


In [3]:
# We must get dummies for the categorical variables dataset in order to properly run our Model.

df_woeiv = data[['season','constructor_standings_pos','constructor_wins','driver','qualifying_time','constructor','podium','change_teams']]

df_dum = pd.get_dummies(df_woeiv, columns = ['constructor','driver'] )

for col in df_dum.columns:
    if 'constructor' in col and df_dum[col].sum() < 140:
        df_dum.drop(col, axis = 1, inplace = True)
    
    elif 'driver' in col and df_dum[col].sum() < 140:
        df_dum.drop(col, axis = 1, inplace = True)
    else:
        pass

df_dum.to_csv('/Users/alejandropalacios/Desktop/Ironhack/Data Analytics Bootcamp/Advanced Data Analysis Techniques/Project/F1-Grand-Prix-Predictor/Data Manipulation/Modeling/Modeled Databases/team_change_dummies_dataset.csv', index = False)

In [4]:
# Now let's actually do some good good stuff.   

df = pd.read_csv('/Users/alejandropalacios/Desktop/Ironhack/Data Analytics Bootcamp/Advanced Data Analysis Techniques/Project/F1-Grand-Prix-Predictor/Data Manipulation/Modeling/Modeled Databases/team_change_dummies_dataset.csv')


# Moreover, the following Variables will be used based on our results (in order of Predictive Power):

"""

- constructor_standings_pos
- constructor_wins
- driver
- qualifying_time
- constructor
- podium

"""

'\n\n- constructor_standings_pos\n- constructor_wins\n- driver\n- qualifying_time\n- constructor\n- podium\n\n'

In [5]:
df = df[[c for c in df if c not in ['change_teams']] + ['change_teams']]

In [6]:
# Train the Model

train = df[df.season < 2000]
test = df[df.season >= 2000]

X_train = train.drop('change_teams',axis=1)
y_train = train.change_teams

X_test = test.drop('change_teams',axis=1)
y_test = test.change_teams

In [7]:
# Logistic Regression

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

print("\nConfusion Matrix:\n",confusion_matrix(y_test,y_pred))
print("\nModel Accuracy Score:\n",accuracy_score(y_test,y_pred))
print("\nModel Precision Score:\n",precision_score(y_test,y_pred))
print("\nModel Recall Score:\n",recall_score(y_test,y_pred))



Confusion Matrix:
 [[6935    6]
 [ 170  326]]

Model Accuracy Score:
 0.9763345434987226

Model Precision Score:
 0.9819277108433735

Model Recall Score:
 0.657258064516129


In [8]:
# Decision Tree Classifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train,y_train)
y_pred = tree_model.predict(X_test)

print("\nConfusion Matrix:\n",confusion_matrix(y_test,y_pred))
print("\nModel Accuracy Score:\n",accuracy_score(y_test, y_pred))
print("\nModel Precision Score:\n",precision_score(y_test,y_pred))
print("\nModel Recall Score:\n",recall_score(y_test,y_pred))


Confusion Matrix:
 [[6709  232]
 [ 124  372]]

Model Accuracy Score:
 0.9521312357133253

Model Precision Score:
 0.6158940397350994

Model Recall Score:
 0.75


In [9]:
# Linear Discriminant Analysis

linear_model = LinearDiscriminantAnalysis()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)

print("\nConfusion Matrix:\n",confusion_matrix(y_test,y_pred))
print("\nModel Accuracy Score:\n",accuracy_score(y_test, y_pred))
print("\nModel Precision Score:\n",precision_score(y_test,y_pred))
print("\nModel Recall Score:\n",recall_score(y_test,y_pred))


Confusion Matrix:
 [[6019  922]
 [ 249  247]]

Model Accuracy Score:
 0.8425440365738873

Model Precision Score:
 0.21129170230966637

Model Recall Score:
 0.49798387096774194


In [10]:
# Random Forest Classifier

forest_model = RandomForestClassifier(n_jobs=2, random_state=0)
forest_model.fit(X_train, y_train)
y_pred = forest_model.predict(X_test)

print("\nConfusion Matrix:\n",confusion_matrix(y_test,y_pred))
print("\nModel Accuracy Score:\n",accuracy_score(y_test, y_pred))
print("\nModel Precision Score:\n",precision_score(y_test,y_pred))
print("\nModel Recall Score:\n",recall_score(y_test,y_pred))


Confusion Matrix:
 [[6937    4]
 [ 134  362]]

Model Accuracy Score:
 0.9814441306978621

Model Precision Score:
 0.9890710382513661

Model Recall Score:
 0.7298387096774194


In [11]:
# Support Vector Classifier

svc_model = SVC()
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)

print("\nConfusion Matrix:\n",confusion_matrix(y_test,y_pred))
print("\nModel Accuracy Score:\n",accuracy_score(y_test, y_pred))
print("\nModel Precision Score:\n",precision_score(y_test,y_pred))
print("\nModel Recall Score:\n",recall_score(y_test,y_pred))


Confusion Matrix:
 [[6941    0]
 [ 496    0]]

Model Accuracy Score:
 0.9333064407691274

Model Precision Score:
 0.0

Model Recall Score:
 0.0


In [12]:
# K Neighbors Classifier

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

print("\nConfusion Matrix:\n",confusion_matrix(y_test,y_pred))
print("\nModel Accuracy Score:\n",accuracy_score(y_test, y_pred))
print("\nModel Precision Score:\n",precision_score(y_test,y_pred))
print("\nModel Recall Score:\n",recall_score(y_test,y_pred))


Confusion Matrix:
 [[6930   11]
 [ 283  213]]

Model Accuracy Score:
 0.9604679306171844

Model Precision Score:
 0.9508928571428571

Model Recall Score:
 0.42943548387096775
