### cross_validation_and_grid_search

학습 날짜 : 2019-06-15

학습 내용 : Cross Validation and Grid Search for Model Selection in Python

학습 사이트 : https://stackabuse.com/cross-validation-and-grid-search-for-model-selection-in-python

참고 : https://datascienceschool.net/view-notebook/266d699d748847b3a3aa7b9805b846ae/

In [1]:
import pandas as pd
import numpy as np

In [5]:
# Load the red-wine quality table from the working directory and preview it.
dataset = pd.read_csv(filepath_or_buffer='wineQualityReds.csv')
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,2,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,3,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,4,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,5,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
# Remove the 'Unnamed: 0' column (1-based row numbers carried in from the
# CSV); it is not a feature.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because the column has already been dropped.
dataset.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [9]:
# Preview the first five rows again to confirm the column was removed.
dataset.head(n=5)

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [11]:
# Columns 0-10 are the eleven input measurements; column 11 is `quality`,
# the label to predict.
X = dataset.iloc[:, :11].to_numpy()
y = dataset.iloc[:, 11].to_numpy()

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,   # hold out 20% of the rows as the test split
    random_state=0,  # fixed seed so the split is reproducible
)

In [16]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test split never influences the statistics.
feature_scaler = StandardScaler().fit(X_train)
X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)

In [17]:
from sklearn.ensemble import RandomForestClassifier
# Baseline model: a 300-tree random forest with a fixed seed.
classifier = RandomForestClassifier(
    n_estimators=300,
    random_state=0,
)

In [18]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the baseline classifier on the training split;
# the result holds one score per fold.
all_accuracies = cross_val_score(classifier, X_train, y_train, cv=5)

In [19]:
# Display the per-fold scores returned by cross_val_score (cv=5 -> five values).
all_accuracies

array([0.62548263, 0.6459144 , 0.65882353, 0.67716535, 0.68503937])

In [20]:
# Average score across the cross-validation folds.
print(np.mean(all_accuracies))

0.6584850552381998


In [21]:
# Spread of the fold scores (population standard deviation, ddof=0).
print(np.std(all_accuracies))

0.021454051350514182


### grid search

In [23]:
from sklearn.model_selection import GridSearchCV

In [43]:
# Hyper-parameter search space: 5 forest sizes x 2 split criteria = 10 candidates.
grid_param = {
    # number of trees in the forest
    'n_estimators': [100, 300, 500, 800, 1000],
    # function used to measure the quality of a split
    'criterion': ['gini', 'entropy'],
}

In [44]:
# Exhaustive search over grid_param: every candidate is scored by 5-fold
# cross-validated accuracy. n_jobs=-1 asks for all available processors.
gd_sr = GridSearchCV(
    classifier,
    grid_param,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

In [45]:
# Run the search on the scaled training split (the test split is not used here).
gd_sr.fit(X=X_train, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=None,
                                              oob_score=False, random_state=0,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
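             param_grid={'criterion': ['gini', 'entropy'],
                         'n_estimators': [100, 300, 500, 800, 1000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)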

In [46]:
# Parameter combination from grid_param that the fitted search selected as best
# under the configured scoring ('accuracy', cv=5).
best_parameters = gd_sr.best_params_
print(best_parameters)

{'criterion': 'entropy', 'n_estimators': 100}


In [47]:
# Cross-validated score achieved by the selected parameter combination; this is
# a validation score from the training split, not a test-split score.
best_result = gd_sr.best_score_
print(best_result)

0.6606724003127443
