In [1]:
%load_ext autoreload
%autoreload 2

!python --version

Python 3.7.4


In [2]:
import sys

sys.path.append('..')

In [3]:
from sklearn.metrics import classification_report

# Hyper-Parameter Tunning Using Nature Inspired Algorithms 

Load the [Covertype](https://archive.ics.uci.edu/ml/datasets/Covertype) dataset.

In [4]:
import random

from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_covtype

X, y = fetch_covtype(return_X_y=True)

# shrink the dataset to the 1% of its original size
rows_id = random.sample(range(0, len(X) - 1), (int)(len(X) * 0.01))

X = X[rows_id, :]
y = y[rows_id]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'train size - {len(X_train)}\ntest size - {len(X_test)}')

train size - 4648
test size - 1162


Define the parameter grid.

In [5]:
param_grid = { 
    'n_estimators': range(20, 180, 20), 
    'max_depth': [2, 4, 6, 8, 10], 
    'min_samples_split': range(2, 10, 2) 
}

### GridSearch

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier

In [7]:
%%time

clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(clf, param_grid, cv=5, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 800 out of 800 | elapsed:  4.1min finished


CPU times: user 3min 49s, sys: 1.47 s, total: 3min 50s
Wall time: 4min 6s


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=42,
                                  

In [8]:
grid_search.best_params_

{'max_depth': 10, 'min_samples_split': 4, 'n_estimators': 60}

In [9]:
clf = RandomForestClassifier(**grid_search.best_params_, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.71      0.69      0.70       400
           2       0.74      0.84      0.79       571
           3       0.62      0.84      0.71        73
           4       0.00      0.00      0.00         6
           5       0.00      0.00      0.00        20
           6       1.00      0.15      0.26        47
           7       0.95      0.44      0.61        45

    accuracy                           0.73      1162
   macro avg       0.57      0.42      0.44      1162
weighted avg       0.72      0.73      0.71      1162



  _warn_prf(average, modifier, msg_start, len(result))


### NatureInspiredSearch

In [10]:
from sklearn_nature_inspired_algorithms.model_selection.nature_inspired_search_cv import NatureInspiredSearchCV

In [11]:
%%time

clf = RandomForestClassifier(random_state=42)

nia_search = NatureInspiredSearchCV(
    clf,
    param_grid,
    cv=5,
    verbose=0,
    max_n_gen=100,
    max_stagnating_gen=5,
    scoring='accuracy')

nia_search.fit(X_train, y_train)

CPU times: user 18.8 s, sys: 224 ms, total: 19.1 s
Wall time: 21 s




NatureInspiredSearchCV(cv=5, error_score=nan,
                       estimator=RandomForestClassifier(bootstrap=True,
                                                        ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features='auto',
                                                        max_leaf_nodes=None,
                                                        max_samples=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                             

In [12]:
nia_search.best_params_

{'n_estimators': 80, 'max_depth': 10, 'min_samples_split': 4}

In [13]:
clf = RandomForestClassifier(**nia_search.best_params_, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.71      0.69      0.70       400
           2       0.74      0.84      0.79       571
           3       0.63      0.84      0.72        73
           4       0.00      0.00      0.00         6
           5       0.00      0.00      0.00        20
           6       0.67      0.13      0.21        47
           7       0.95      0.47      0.63        45

    accuracy                           0.72      1162
   macro avg       0.53      0.42      0.43      1162
weighted avg       0.71      0.72      0.71      1162



  _warn_prf(average, modifier, msg_start, len(result))
