# Selecting Optimal Parameter
## Introduction

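This notebook demonstrates `OOB_ParamGridSearch`, a parameter grid search for random forests that ranks every parameter combination by its out-of-bag (OOB) error. It is organized as follows: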
1. OOB_ParamGridSearch function
    - Load example data and necessary packages
    - Decomposing code for testing
      + fit
      + fit_and_score
      + oob_score_accuracy

#### Load example data and necessary packages

In [None]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid
import numpy as np
import pandas as pd

from copy import deepcopy
import joblib
from sklearn.datasets import load_iris
from sklearn.model_selection import ParameterGrid

# Example data: the iris dataset shipped with scikit-learn.
iris = load_iris()
X, y = iris.data, iris.target

# Default random forest used as the example estimator.
clf = RandomForestClassifier()

# Hyper-parameter grid to search: 3 x 1 x 2 = 6 combinations.
param_grid = dict(
    n_estimators=[20, 30, 100],
    criterion=['gini'],
    max_depth=[2, 3],
)


####  Full function of OOB_ParamGridSearch

In [None]:
class OOB_ParamGridSearch:
    """Grid search that ranks parameter combinations by out-of-bag (OOB) error.

    Every combination produced by ``ParameterGrid(param_grid)`` is fitted on
    the full training data and scored with ``1 - OOB accuracy``; no separate
    validation split is made.

    Parameters
    ----------
    estimator : estimator object
        Template that is deep-copied for every candidate. It must provide
        ``set_params`` and ``fit`` and, once fitted, ``estimators_`` and
        ``classes_``. It is expected to be a bootstrapped forest classifier
        such as ``RandomForestClassifier``.
    param_grid : dict or list of dict
        Grid handed to ``ParameterGrid``.
    n_jobs : int, default=-1
        Forwarded to ``joblib.Parallel``.

    Attributes
    ----------
    best_index_ : int
        Position (in grid order) of the candidate with the lowest OOB error.
    best_score_ : float
        Lowest OOB error found.
    best_params_ : dict
        Parameter combination that achieved ``best_score_``.
    cv_results_ : pandas.DataFrame
        The table returned by :meth:`fit`.

    All four attributes are set by :meth:`fit`.
    """

    def __init__(self,
                 estimator,
                 param_grid,
                 n_jobs = -1):

        self.n_jobs = n_jobs
        self.estimator = estimator
        self.param_grid = param_grid

    def fit(self,
            X_train,
            y_train):
        """Fit one estimator per grid point and rank them by OOB error.

        Parameters
        ----------
        X_train : array-like or pandas.DataFrame
            Training features.
        y_train : array-like or pandas.Series
            Training labels.

        Returns
        -------
        pandas.DataFrame
            One row per parameter combination with an ``OOB_Error_Score``
            column followed by the parameter columns, sorted by ascending
            error.
        """
        params_iterable = list(ParameterGrid(self.param_grid))
        parallel = joblib.Parallel(n_jobs=self.n_jobs)

        # Each candidate works on its own deep copy so the template estimator
        # held by this object is never modified.
        output = parallel(
            joblib.delayed(self.fit_and_score)(deepcopy(self.estimator),
                                               X_train, y_train, parameters)
            for parameters in params_iterable)

        # Keep the winning candidate on the instance instead of computing it
        # and throwing it away.
        scores = np.asarray(output, dtype=np.float64)
        best_index = int(np.argmin(scores))
        self.best_index_ = best_index
        self.best_score_ = float(scores[best_index])
        self.best_params_ = params_iterable[best_index]

        cv_results = pd.concat(
            [pd.DataFrame(output, columns=['OOB_Error_Score']),
             pd.DataFrame(params_iterable)],
            axis=1)

        cv_results = (cv_results.
                      sort_values(['OOB_Error_Score'], ascending=True).
                      reset_index(drop=True))

        self.cv_results_ = cv_results
        return cv_results

    def fit_and_score(self,
                      estimator,
                      X_train,
                      y_train,
                      parameters):
        """Configure ``estimator``, fit it and return its OOB error.

        Parameters
        ----------
        estimator : estimator object
            Estimator to configure and fit; it is modified in place.
        X_train, y_train : array-like
            Training data.
        parameters : dict
            Parameters applied with ``set_params``. ``random_state`` defaults
            to 1 unless ``parameters`` supplies its own value.

        Returns
        -------
        float
            ``1 - OOB accuracy`` of the fitted estimator.
        """
        # ``random_state=1`` is only a default: spreading ``parameters`` last
        # lets a grid that tunes ``random_state`` override it instead of
        # raising "got multiple values for keyword argument".
        estimator.set_params(**{'random_state': 1, **parameters})
        estimator.fit(X_train, y_train)

        return 1 - self.oob_score_accuracy(estimator, X_train, y_train)

    def oob_score_accuracy(self,
                           rf,
                           X_train,
                           y_train):
        """Compute the out-of-bag accuracy of a fitted forest.

        For every tree the rows it did not draw in its bootstrap sample are
        regenerated from the tree's ``random_state``; the tree's class
        probabilities for those rows are summed over all trees and the class
        with the largest total is the OOB prediction.

        Parameters
        ----------
        rf : fitted forest
            Must expose ``estimators_`` and ``classes_``; ``max_samples`` is
            honoured when present.
        X_train : array-like or pandas.DataFrame
            The data ``rf`` was fitted on.
        y_train : array-like or pandas.Series
            The labels ``rf`` was fitted on.

        Returns
        -------
        float
            Fraction of rows whose OOB prediction equals the label.
        """
        # Private scikit-learn helpers; imported here so the dependency on
        # the private module stays local to this method.
        from sklearn.ensemble._forest import (
            _generate_unsampled_indices,
            _get_n_samples_bootstrap,
        )

        X = X_train.values if isinstance(X_train, pd.DataFrame) else np.asarray(X_train)
        y = y_train.values if isinstance(y_train, pd.Series) else y_train
        # Plain lists would make ``y == predicted`` a single bool, and a
        # column vector would broadcast to a matrix; normalise to 1-D.
        y = np.asarray(y).ravel()

        n_samples = len(X)
        # Columns of predict_proba follow rf.classes_, so size the vote
        # matrix from the forest rather than from the labels passed in.
        classes = np.asarray(rf.classes_)
        predictions = np.zeros((n_samples, len(classes)))

        # The bootstrap size is the same for every tree and must respect the
        # forest's ``max_samples``; otherwise the regenerated indices would
        # not match the samples the trees were actually trained on.
        n_samples_bootstrap = _get_n_samples_bootstrap(
            n_samples, getattr(rf, 'max_samples', None))

        for tree in rf.estimators_:
            unsampled_indices = _generate_unsampled_indices(
                tree.random_state, n_samples, n_samples_bootstrap)

            tree_preds = tree.predict_proba(X[unsampled_indices, :])
            predictions[unsampled_indices] += tree_preds

        # Rows that were never out-of-bag keep an all-zero vote row, so
        # argmax falls back to the first class for them.
        predicted_class = classes[np.argmax(predictions, axis=1)]

        return np.mean(y == predicted_class)

# Run the search sequentially (n_jobs=1) on the example data; the table
# returned by fit() is the cell's output.
oob_gridsearch = OOB_ParamGridSearch(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    n_jobs=1,
)

oob_gridsearch.fit(X_train=X, y_train=y)


### Decomposing code for testing
+ fit
+ _fit_and_score
+ oob_score_accuracy

In [None]:
def fit(X, y, param_grid):
    """Fit a random forest per grid point and rank the grid by OOB error.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Training features.
    y : array-like or pandas.Series
        Training labels.
    param_grid : dict or list of dict
        Grid handed to ``ParameterGrid``.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination with an ``OOB_Error_Score`` column
        followed by the parameter columns, sorted by ascending error.
    """
    params_iterable = list(ParameterGrid(param_grid))

    parallel = joblib.Parallel(n_jobs=1)

    # Every candidate gets its own freshly constructed forest, so no copy of
    # it is needed.
    output = parallel(
        joblib.delayed(_fit_and_score)(RandomForestClassifier(), X, y, parameters)
        for parameters in params_iterable)

    cv_results = pd.concat(
        [pd.DataFrame(output, columns=['OOB_Error_Score']),
         pd.DataFrame(params_iterable)],
        axis=1)

    return (cv_results.
            sort_values(['OOB_Error_Score'], ascending=True).
            reset_index(drop=True))

def _fit_and_score(estimator, X, y, parameters):
    """Configure ``estimator``, fit it on ``(X, y)`` and return its OOB error.

    Parameters
    ----------
    estimator : estimator object
        Estimator to configure and fit; it is modified in place.
    X, y : array-like
        Training data.
    parameters : dict
        Parameters applied with ``set_params``. ``random_state`` defaults to
        1 unless ``parameters`` supplies its own value.

    Returns
    -------
    float
        ``1 - OOB accuracy`` of the fitted estimator.
    """
    # ``random_state=1`` is only a default: spreading ``parameters`` last lets
    # a grid that tunes ``random_state`` override it instead of raising
    # "got multiple values for keyword argument".
    estimator.set_params(**{'random_state': 1, **parameters})
    estimator.fit(X, y)

    return 1 - oob_score_accuracy(estimator, X, y)


def oob_score_accuracy(rf, X, y):
    """Compute the out-of-bag accuracy of a fitted forest.

    For every tree the rows it did not draw in its bootstrap sample are
    regenerated from the tree's ``random_state``; the tree's class
    probabilities for those rows are summed over all trees and the class with
    the largest total is the OOB prediction.

    Parameters
    ----------
    rf : fitted forest
        Must expose ``estimators_`` and ``classes_``; ``max_samples`` is
        honoured when present.
    X : array-like or pandas.DataFrame
        The data ``rf`` was fitted on.
    y : array-like or pandas.Series
        The labels ``rf`` was fitted on.

    Returns
    -------
    float
        Fraction of rows whose OOB prediction equals the label.
    """
    # Private scikit-learn helpers; imported here so the dependency on the
    # private module stays local to this function.
    from sklearn.ensemble._forest import (
        _generate_unsampled_indices,
        _get_n_samples_bootstrap,
    )

    X = X.values if isinstance(X, pd.DataFrame) else np.asarray(X)
    y = y.values if isinstance(y, pd.Series) else y
    # Plain lists would make ``y == predicted`` a single bool, and a column
    # vector would broadcast to a matrix; normalise to 1-D.
    y = np.asarray(y).ravel()

    n_samples = len(X)
    # Columns of predict_proba follow rf.classes_, so size the vote matrix
    # from the forest rather than from the labels passed in.
    classes = np.asarray(rf.classes_)
    predictions = np.zeros((n_samples, len(classes)))

    # The bootstrap size is the same for every tree and must respect the
    # forest's ``max_samples``; otherwise the regenerated indices would not
    # match the samples the trees were actually trained on.
    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples, getattr(rf, 'max_samples', None))

    for tree in rf.estimators_:
        unsampled_indices = _generate_unsampled_indices(
            tree.random_state, n_samples, n_samples_bootstrap)

        tree_preds = tree.predict_proba(X[unsampled_indices, :])
        predictions[unsampled_indices] += tree_preds

    # Rows that were never out-of-bag keep an all-zero vote row, so argmax
    # falls back to the first class for them.
    predicted_class = classes[np.argmax(predictions, axis=1)]

    return np.mean(y == predicted_class)

# Keep the ranked table in `a` and end the cell with the bare expression so
# Jupyter renders the DataFrame with its rich display instead of the
# plain-text output of print().
a = fit(X, y, param_grid)
a