# Cross Validation and Grid Search for Model Selection in Python

### Source
https://stackabuse.com/cross-validation-and-grid-search-for-model-selection-in-python/

### Load libraries

In [2]:
import pandas as pd
import numpy as np

input_file = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'

df = pd.read_csv(input_file, sep=';')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [0]:
X = df.iloc[:, 0:11].values
y = df.iloc[:, 11].values

### Normalization

In [5]:
for col in df.columns:
  print(col, df[col].min(), df[col].max())

fixed acidity 4.6 15.9
volatile acidity 0.12 1.58
citric acid 0.0 1.0
residual sugar 0.9 15.5
chlorides 0.012 0.611
free sulfur dioxide 1.0 72.0
total sulfur dioxide 6.0 289.0
density 0.9900700000000001 1.00369
pH 2.74 4.01
sulphates 0.33 2.0
alcohol 8.4 14.9
quality 3 8


因為每個欄位的範圍都不同，所以要做 normalization

In [0]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Cross validation

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(n_estimators=300, random_state=0)
cross_val = cross_val_score(estimator=clf, X=X, y=y, cv=5)

print(cross_val)
print(cross_val.mean())
print(cross_val.std())

[0.50931677 0.53271028 0.62305296 0.58176101 0.57097792]
0.5635637868664212
0.03954680133032447


### Grid search

In [12]:
from sklearn.model_selection import GridSearchCV

grid_param = {
    'n_estimators': [100, 300, 500, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

grid = GridSearchCV(estimator=clf,
                    param_grid = grid_param,
                    scoring='accuracy',
                    cv=5,
                    n_jobs=-1)

grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=None,
                                              oob_score=False, random_state=0,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'bootst

In [13]:
best_parameters = grid.best_params_
print(best_parameters)

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 1000}


In [14]:
best_results = grid.best_score_
print(best_results)

0.6622361219702892
