# Chapter 9: Demo GridSearchCV & RandomSearch

In [None]:
# Import scikit-learn dataset library
from sklearn import datasets

# Load the Iris dataset from scikit-learn's datasets module
iris = datasets.load_iris()

In [None]:
# Inspect the type of the object returned by load_iris()
type(iris)

In [None]:
# Print the species label names (setosa, versicolor, virginica)
print(iris.target_names)

In [None]:
# Print the names of the four features
print(iris.feature_names)

In [None]:
# Show the first 5 records of the iris feature matrix
print(iris.data[:5])

In [None]:
# Print the first 5 iris labels (0: setosa, 1: versicolor, 2: virginica)
print(iris.target[:5])

In [None]:
# Build a DataFrame from the iris arrays: one column per feature
# (taken column-by-column from iris.data) plus the label codes from iris.target.
import pandas as pd
data = pd.DataFrame({
    **{
        name: iris.data[:, position]
        for position, name in enumerate(
            ('sepal length', 'sepal width', 'petal length', 'petal width')
        )
    },
    'species': iris.target,
})
data.head()

In [None]:
# Features: the two petal measurements only; target: the species code
X = data.loc[:, ['petal length', 'petal width']]
y = data.loc[:, 'species']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as the test set; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# GridSearchCV

In [None]:
# Use Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Hyper-parameter grid for the random forest search.
# NOTE: 'auto' is intentionally not listed under max_features. For classifiers
# it was an alias of 'sqrt' (a duplicate candidate); it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where every fit using it fails.
param_grid = {
    'n_estimators': [30, 50, 100, 150, 200, 250, 300],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'criterion': ["gini", "entropy"],
}

In [None]:
from datetime import datetime
from datetime import timedelta

In [None]:
# Record the current time as the start of the timed section
start_time = datetime.now()

In [None]:
# Grid search over param_grid for a random forest, using 5-fold cross-validation
CV_rfc = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=5,
)

In [None]:
# Run the grid search on the training split
CV_rfc.fit(X_train, y_train)

In [None]:
# Record the current time as the end of the timed section
end_time = datetime.now()

In [None]:
# Elapsed time between start_time and end_time in whole seconds
# (floor division by one second drops the microsecond part)
dt = end_time - start_time
seconds_1 = dt // timedelta(seconds=1)
print(seconds_1)

In [None]:
# Show the parameter combination selected by the grid search
print(CV_rfc.best_params_)

In [None]:
# Predict on the test dataset
y_pred_1=CV_rfc.predict(X_test)

In [None]:
from sklearn import metrics

In [None]:
# Accuracy of the grid-search model on the test set
grid_accuracy = metrics.accuracy_score(y_test, y_pred_1)
print("Accuracy:", grid_accuracy)

In [None]:
# With petal length = 5 and petal width = 2, which species is predicted?
# The search was fitted on a DataFrame, so the sample is passed with the same
# column names; a bare list makes scikit-learn warn that
# "X does not have valid feature names".
CV_rfc.predict(pd.DataFrame({'petal length': [5], 'petal width': [2]}))

# Sử dụng Random Search

In [None]:
# Use random search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
# Candidate values sampled by the randomized search.
# NOTE: 'auto' is intentionally not listed under max_features. For classifiers
# it was an alias of 'sqrt' (a duplicate candidate); it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where every fit using it fails.
param_dist = {
    "n_estimators": [30, 50, 100, 150, 200, 250, 300],
    "max_features": ['sqrt', 'log2'],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

In [None]:
# Record the current time as the start of the timed section
start_time = datetime.now()

In [None]:
# Randomized search over param_dist for a random forest, using 5-fold
# cross-validation; random_state fixes which candidates are sampled
forest_random = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=param_dist,
    cv=5,
    random_state=1,
)

In [None]:
# Run the randomized search on the training split
forest_random.fit(X_train,y_train)

In [None]:
# Record the current time as the end of the timed section
end_time = datetime.now()

In [None]:
# Elapsed time between start_time and end_time in whole seconds
# (floor division by one second drops the microsecond part)
dt = end_time - start_time
seconds_2 = dt // timedelta(seconds=1)
print(seconds_2)

In [None]:
# Keep a handle on the best estimator found by the randomized search and display it
forest_random_best = forest_random.best_estimator_ 
forest_random_best

In [None]:
# Show the parameter combination selected by the randomized search
print("Best Model Parameter: ",forest_random.best_params_)

In [None]:
# Predict on the test dataset
# NOTE(review): this reuses the name y_pred_1, overwriting the predictions
# stored earlier from the grid-search model.
y_pred_1=forest_random.predict(X_test)

In [None]:
# Accuracy on the test set for the predictions currently held in y_pred_1
random_accuracy = metrics.accuracy_score(y_test, y_pred_1)
print("Accuracy:", random_accuracy)

In [None]:
# With petal length = 5 and petal width = 2, which species is predicted?
# The search was fitted on a DataFrame, so the sample is passed with the same
# column names; a bare list makes scikit-learn warn that
# "X does not have valid feature names".
forest_random.predict(pd.DataFrame({'petal length': [5], 'petal width': [2]}))