**Use the cross validation techniques discussed in the lesson to figure out what kind of model works best with the cars dataset used in the lesson.**

In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MinMaxScaler

from pydataset import data

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


In [36]:
mpg = data('mpg')

mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [37]:
#Prep data

mpg.trans = np.where(mpg.trans.str.startswith('auto'), 'auto', 'manual')

In [38]:
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto,f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual,f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual,f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto,f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto,f,16,26,p,compact


In [39]:
dummy_features = ['manufacturer', 'model', 'drv', 'fl', 'class']

dummy_df = pd.get_dummies(mpg[dummy_features], dummy_na=False, drop_first=[True, True, True, True, True])

mpg = pd.concat([mpg, dummy_df], axis=1)

mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,...,fl_d,fl_e,fl_p,fl_r,class_compact,class_midsize,class_minivan,class_pickup,class_subcompact,class_suv
1,audi,a4,1.8,1999,4,auto,f,18,29,p,...,0,0,1,0,1,0,0,0,0,0
2,audi,a4,1.8,1999,4,manual,f,21,29,p,...,0,0,1,0,1,0,0,0,0,0
3,audi,a4,2.0,2008,4,manual,f,20,31,p,...,0,0,1,0,1,0,0,0,0,0
4,audi,a4,2.0,2008,4,auto,f,21,30,p,...,0,0,1,0,1,0,0,0,0,0
5,audi,a4,2.8,1999,6,auto,f,16,26,p,...,0,0,1,0,1,0,0,0,0,0


In [40]:
mpg = mpg.drop(columns=dummy_features)

In [41]:
mpg.head()

Unnamed: 0,displ,year,cyl,trans,cty,hwy,manufacturer_chevrolet,manufacturer_dodge,manufacturer_ford,manufacturer_honda,...,fl_d,fl_e,fl_p,fl_r,class_compact,class_midsize,class_minivan,class_pickup,class_subcompact,class_suv
1,1.8,1999,4,auto,18,29,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1.8,1999,4,manual,21,29,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
3,2.0,2008,4,manual,20,31,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
4,2.0,2008,4,auto,21,30,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
5,2.8,1999,6,auto,16,26,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [86]:
features = list(mpg.columns.values)#.remove('trans')

In [87]:
features.remove('trans')

In [88]:
features

['displ',
 'year',
 'cyl',
 'cty',
 'hwy',
 'manufacturer_chevrolet',
 'manufacturer_dodge',
 'manufacturer_ford',
 'manufacturer_honda',
 'manufacturer_hyundai',
 'manufacturer_jeep',
 'manufacturer_land rover',
 'manufacturer_lincoln',
 'manufacturer_mercury',
 'manufacturer_nissan',
 'manufacturer_pontiac',
 'manufacturer_subaru',
 'manufacturer_toyota',
 'manufacturer_volkswagen',
 'model_a4',
 'model_a4 quattro',
 'model_a6 quattro',
 'model_altima',
 'model_c1500 suburban 2wd',
 'model_camry',
 'model_camry solara',
 'model_caravan 2wd',
 'model_civic',
 'model_corolla',
 'model_corvette',
 'model_dakota pickup 4wd',
 'model_durango 4wd',
 'model_expedition 2wd',
 'model_explorer 4wd',
 'model_f150 pickup 4wd',
 'model_forester awd',
 'model_grand cherokee 4wd',
 'model_grand prix',
 'model_gti',
 'model_impreza awd',
 'model_jetta',
 'model_k1500 tahoe 4wd',
 'model_land cruiser wagon 4wd',
 'model_malibu',
 'model_maxima',
 'model_mountaineer 4wd',
 'model_mustang',
 'model_nav

In [89]:
#features = list(mpg.columns.values).remove('trans')
X, y = mpg[features], mpg.trans

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)

In [90]:
mms = MinMaxScaler()

In [91]:
features_to_scale = ['displ', 'year', 'cyl', 'cty', 'hwy']

X_train[features_to_scale] = mms.fit_transform(X_train[features_to_scale])
X_test[features_to_scale] = mms.transform(X_test[features_to_scale])

In [92]:
X_train.head()

Unnamed: 0,displ,year,cyl,cty,hwy,manufacturer_chevrolet,manufacturer_dodge,manufacturer_ford,manufacturer_honda,manufacturer_hyundai,...,fl_d,fl_e,fl_p,fl_r,class_compact,class_midsize,class_minivan,class_pickup,class_subcompact,class_suv
178,0.444444,1.0,0.5,0.269231,0.25,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
203,0.203704,1.0,0.0,0.307692,0.3125,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
173,0.166667,1.0,0.0,0.423077,0.46875,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
141,0.62963,0.0,1.0,0.153846,0.15625,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
13,0.222222,0.0,0.5,0.307692,0.40625,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [93]:
X_test.head()

Unnamed: 0,displ,year,cyl,cty,hwy,manufacturer_chevrolet,manufacturer_dodge,manufacturer_ford,manufacturer_honda,manufacturer_hyundai,...,fl_d,fl_e,fl_p,fl_r,class_compact,class_midsize,class_minivan,class_pickup,class_subcompact,class_suv
78,0.444444,0.0,0.5,0.192308,0.15625,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
97,0.555556,1.0,1.0,0.230769,0.34375,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
137,0.703704,1.0,1.0,0.115385,0.1875,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
127,0.574074,1.0,1.0,0.0,0.0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
151,0.314815,0.0,0.5,0.192308,0.15625,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [94]:
tree = DecisionTreeClassifier()

cross_val_score(tree, X_train, y_train, cv=5)

array([0.65714286, 0.62857143, 0.74285714, 0.54285714, 0.6       ])

In [95]:
cross_val_score(tree, X_train, y_train, cv=5).mean()

0.6457142857142857

In [96]:
params = {'max_depth': range(1,11),
         'min_samples_leaf': range(1,11)}

In [97]:
grid = GridSearchCV(tree, params, cv=5)

In [98]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(1, 11),
                         'min_samples_leaf': range(1, 11)})

In [100]:
grid.cv_results_['mean_test_score']

array([0.66285714, 0.66285714, 0.66285714, 0.66285714, 0.66285714,
       0.66285714, 0.66285714, 0.66285714, 0.66285714, 0.66285714,
       0.62857143, 0.62857143, 0.62857143, 0.63428571, 0.62857143,
       0.62857143, 0.62857143, 0.62857143, 0.62857143, 0.62857143,
       0.63428571, 0.63428571, 0.63428571, 0.64571429, 0.65142857,
       0.66857143, 0.68      , 0.66285714, 0.65714286, 0.66285714,
       0.64571429, 0.62857143, 0.60571429, 0.59428571, 0.60571429,
       0.58857143, 0.60571429, 0.60571429, 0.59428571, 0.58857143,
       0.62285714, 0.62285714, 0.60571429, 0.60571429, 0.58857143,
       0.62285714, 0.58285714, 0.58857143, 0.58857143, 0.59428571,
       0.64      , 0.63428571, 0.63428571, 0.58857143, 0.6       ,
       0.62285714, 0.58285714, 0.59428571, 0.58857143, 0.59428571,
       0.6       , 0.58857143, 0.60571429, 0.6       , 0.62285714,
       0.62857143, 0.58857143, 0.59428571, 0.6       , 0.59428571,
       0.62285714, 0.61714286, 0.62857143, 0.61142857, 0.60571