In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Exercises

Use the cross validation techniques discussed in the lesson to figure out what kind of model works best with the mpg dataset used in the lesson.

In [2]:
df = data('mpg')
df

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
...,...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize


In [4]:
#change trans column to auto and manual
df.trans = np.where(df.trans.str.startswith('auto'),'auto','manual')
df.trans.value_counts()

auto      157
manual     77
Name: trans, dtype: int64

In [12]:
df['fl'].value_counts()

r    168
p     52
e      8
d      5
c      1
Name: fl, dtype: int64

In [13]:
#split into X and y
X = df[['displ', 'year', 'cyl', 'cty', 'hwy']]
y = df.trans

In [14]:
#split into trains and tests
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42)

In [22]:
tree = DecisionTreeClassifier(random_state=42)
tree_params = {'criterion':['gini', 'entropy', 'log_loss'],
                'max_depth':range(1,21),
                'min_samples_leaf':range(1,11)}
forest = RandomForestClassifier(random_state=42)
forest_params = {'criterion':['gini', 'entropy', 'log_loss'],
                'max_depth':range(1,21),
                'min_samples_leaf':range(1,11)}
knn = KNeighborsClassifier()
knn_params = {'n_neighbors':range(1,21),
                'weights':['uniform', 'distance'],
                'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
                'p':[1,2]}
log = LogisticRegression(random_state=42,max_iter=500)
log_params = {'penalty':['l1', 'l2', 'elasticnet'],
                'C':range(1,11),
                'solver':['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']}

In [17]:
def grids(model,params,X_train,y_train,cv=5):
    grid = GridSearchCV(model,params,cv=cv)
    grid.fit(X_train,y_train)
    best_model = grid.best_estimator_
    best_score = grid.best_score_
    best_params = grid.best_params_
    return best_model,best_score,best_params

In [18]:
grids(tree,tree_params,X_train,y_train)

(DecisionTreeClassifier(criterion='entropy', max_depth=9, min_samples_leaf=4,
                        random_state=42),
 0.7257142857142858,
 {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 4})

In [19]:
grids(forest,forest_params,X_train,y_train)

(RandomForestClassifier(max_depth=9, random_state=42),
 0.7142857142857143,
 {'criterion': 'gini', 'max_depth': 9, 'min_samples_leaf': 1})

In [20]:
grids(knn,knn_params,X_train,y_train)

(KNeighborsClassifier(n_neighbors=19, weights='distance'),
 0.7542857142857142,
 {'algorithm': 'auto', 'n_neighbors': 19, 'p': 2, 'weights': 'distance'})

In [23]:
grids(log,log_params,X_train,y_train)

500 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



(LogisticRegression(C=1, max_iter=500, penalty='l1', random_state=42,
                    solver='saga'),
 0.6685714285714287,
 {'C': 1, 'penalty': 'l1', 'solver': 'saga'})