# Selecting Optimal Parameter
## Introduction

This notebook walks through the code used to select optimal hyperparameters. It is organised into the following sections:
1. OOB_ParamGridSearch function -> gridsearch_model.py
    - Full function of OOB_ParamGridSearch
    - Decomposing code for testing
      + fit
      + fit_and_score
      + oob_score_accuracy
2. Evaluation function -> eval_metrics.py  

####  Full function of OOB_ParamGridSearch

In [14]:
from sklearn.model_selection import ParameterGrid
from copy import deepcopy
import numpy as np
import pandas as pd
import joblib
import eval_metrics

class OOB_ParamGridSearch:
    """
    Grid search that ranks hyperparameter candidates by their out-of-bag (OOB) error.

    Every combination in ``param_grid`` is fitted once on the full training data and
    scored on the samples each tree did not see during bootstrapping, so no separate
    validation split is needed. The candidate with the smallest OOB error wins.
    """

    # Metrics for which a smaller value is better. Their value is already an error,
    # so it must not be turned into ``1 - score`` before minimising.
    _LOWER_IS_BETTER = frozenset({"mse", "rmse"})

    def __init__(self,
                 estimator,
                 param_grid,
                 seed,
                 n_jobs=-1,
                 refit=True,
                 task="regression",
                 metric="mse",
                 model_path="/exeh_4/yuping/123.pkl"):
        """
        Initializes the OOB_ParamGridSearch class.

        :param estimator (callable): The model class (or factory) to search over. It is called as
            ``estimator(rseed=..., **params)`` and the result must provide
            ``fit(X, y, feature_weight=None)`` and a ``model`` attribute exposing ``estimators_``.
        :param param_grid (dict or list of dicts): The parameter grid to search over.
        :param seed (int): The random seed passed to the estimator as ``rseed``.
        :param n_jobs (int, optional): The number of jobs to run in parallel. Defaults to -1.
        :param refit (bool, optional): Indicates whether to refit the model with the best hyperparameters. Defaults to True.
        :param task (str, optional): The task type, either "classification" or "regression". Defaults to "regression".
        :param metric (str, optional): The evaluation metric to use. Defaults to "mse".
        :param model_path (str or None, optional): Where the refitted model is saved via ``save_model``.
            The default keeps the location that used to be hard-coded; pass None to skip saving.
        """
        self.n_jobs = n_jobs
        self.seed = seed
        self.estimator = estimator
        self.param_grid = param_grid
        self.refit = refit
        self.task = task
        self.metric = metric
        self.model_path = model_path

    def fit(self,
            X_train,
            y_train):
        """
        Fits the model with the given training data using the parameter grid search.

        Sets ``best_score_`` (lowest OOB error), ``best_param_`` (the matching parameter dict),
        ``cv_results`` (one row per candidate, sorted by ascending OOB error) and, when
        ``refit`` is true, ``cv_model`` (the estimator refitted with the best parameters).

        :param X_train (array-like): The input features for training.
        :param y_train (array-like): The target values for training.

        :return self (object): Returns self.
        """
        params_iterable = list(ParameterGrid(self.param_grid))
        parallel = joblib.Parallel(n_jobs=self.n_jobs)

        output = parallel(
            joblib.delayed(self.fit_and_score)(deepcopy(self.estimator), X_train, y_train, parameters)
            for parameters in params_iterable)

        output_array = np.array(output, dtype=np.float64)

        # nanargmin: a candidate whose score came out as NaN must never be picked as the best one.
        best_index = np.nanargmin(output_array)
        self.best_score_ = output_array[best_index]
        self.best_param_ = params_iterable[best_index]

        cv_results = pd.DataFrame(output, columns=['OOB_Error_Score'])
        df_params = pd.DataFrame(params_iterable)
        cv_results = pd.concat([cv_results, df_params], axis=1)
        cv_results["params"] = params_iterable
        self.cv_results = (cv_results.
                           sort_values(['OOB_Error_Score'], ascending=True).
                           reset_index(drop=True))

        if self.refit:
            # Final fit with best hyperparameters, using the same seed the candidates were
            # scored with so the refitted model matches the one that was evaluated.
            self.cv_model = deepcopy(self.estimator)(rseed=self.seed, **self.best_param_)
            self.cv_model.fit(X_train, y_train, feature_weight=None)
            if self.model_path is not None:
                self.cv_model.save_model(self.model_path)

        return self

    def fit_and_score(self,
                      estimator,
                      X_train,
                      y_train,
                      parameters):
        """
        Fits the model and calculates the out-of-bag (OOB) error score.

        :param estimator (callable): The estimator class or factory.
        :param X_train (array-like): The input features for training.
        :param y_train (array-like): The target values for training.
        :param parameters (dict): The hyperparameters to use for fitting the model.

        :return oob_error (float): The calculated out-of-bag error score (lower is better).
        """
        train_model = estimator(rseed=self.seed, **parameters)
        train_model.fit(X_train, y_train, feature_weight=None)
        oob_score = self.oob_score_accuracy(train_model, X_train, y_train, task=self.task, metric=self.metric)

        return self._score_to_error(oob_score)

    def _score_to_error(self, score):
        """
        Converts a metric value into an error that ``fit`` can minimise.

        :param score (float): The metric value returned by the evaluation function.

        :return error (float): ``score`` itself for error metrics (mse, rmse), ``1 - score`` otherwise.
        """
        if self.metric in self._LOWER_IS_BETTER:
            return score
        return 1 - score

    def oob_score_accuracy(self,
                           rf,
                           X_train,
                           y_train,
                           task,
                           metric):
        """
        Calculates the out-of-bag (OOB) score of a fitted forest.

        For every tree, the samples left out of its bootstrap draw are regenerated from the
        tree's ``random_state`` and predicted by that tree only; the per-sample predictions are
        then aggregated and passed to ``eval_metrics.get_evaluation_report``.

        :param rf (object): The fitted model; its ``model`` attribute must expose ``estimators_``.
        :param X_train (array-like): The input features for training.
        :param y_train (array-like): The target values for training.
        :param task (str): The task type, either "classification" or "regression".
        :param metric (str): The evaluation metric to use.

        :return oob_score (float): The value of ``metric`` computed on the OOB predictions.
        :raises ValueError: In the regression branch, if no sample was out-of-bag for any tree.
        """
        # NOTE(review): these are private scikit-learn helpers and may change between releases.
        from sklearn.ensemble._forest import _generate_unsampled_indices, _get_n_samples_bootstrap

        X = X_train.values if isinstance(X_train, pd.DataFrame) else np.asarray(X_train)
        y = y_train.values if isinstance(y_train, pd.Series) else np.asarray(y_train)

        n_samples = len(X)
        # Same for every tree, so computed once instead of once per tree.
        n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
        trees = rf.model.estimators_

        if task == "classification":
            n_classes = len(np.unique(y))
            # Summed (not averaged) class probabilities over the trees that left each sample out.
            predictions = np.zeros((n_samples, n_classes))
            for tree in trees:
                unsampled_indices = _generate_unsampled_indices(tree.random_state, n_samples, n_samples_bootstrap)
                predictions[unsampled_indices] += tree.predict_proba(X[unsampled_indices, :])

            return eval_metrics.get_evaluation_report(predictions, y, task, metric)

        predictions = np.zeros(n_samples)
        n_predictions = np.zeros(n_samples)
        for tree in trees:
            unsampled_indices = _generate_unsampled_indices(tree.random_state, n_samples, n_samples_bootstrap)
            predictions[unsampled_indices] += tree.predict(X[unsampled_indices, :])
            n_predictions[unsampled_indices] += 1

        # With few trees a sample can end up in every bootstrap draw; it then has no OOB
        # prediction and dividing by its zero count would inject NaN into the metric.
        covered = n_predictions > 0
        if not covered.any():
            raise ValueError("No out-of-bag samples available to compute the OOB score.")
        oob_mean = predictions[covered] / n_predictions[covered]

        return eval_metrics.get_evaluation_report(oob_mean, y[covered], task, metric)

In [15]:
from sklearn.datasets import make_regression
import model as im

# Synthetic regression data: 4 features, of which 2 are informative.
X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False)

# 3 x 2 = 6 hyperparameter combinations to evaluate.
param_grid = dict(
    n_estimators=[20, 30, 100],
    max_depth=[2, 3],
)

# Single-process search (n_jobs=1) that refits the best candidate at the end.
oob_gridsearch = OOB_ParamGridSearch(
    estimator=im.IterativeRFRegression,
    param_grid=param_grid,
    seed=123,
    n_jobs=1,
    refit=True,
    task="regression",
    metric="mse",
)

oob_gridsearch.fit(X_train=X, y_train=y)

<__main__.OOB_ParamGridSearch at 0x7fe42d286790>

### Decomposing code for testing
+ fit
+ fit_and_score
+ oob_score_accuracy

In [17]:
from sklearn.model_selection import ParameterGrid
from sklearn import datasets
from copy import deepcopy
import numpy as np
import pandas as pd
import joblib
import model as im


# Iris data loaded as pandas objects (as_frame=True)
dataset = datasets.load_iris(as_frame=True)

# Features and target, unpacked in one step
X, y = dataset['data'], dataset['target']

# 3 x 2 = 6 hyperparameter combinations to evaluate
param_grid = dict(
    n_estimators=[20, 30, 100],
    max_depth=[2, 3],
)

def fit(X, y, param_grid):
    """
    Score every combination in ``param_grid`` by OOB error and return the ranked results.

    Standalone version of the grid-search ``fit`` step, kept for testing: each parameter
    combination is fitted with ``im.IterativeRFClassifier`` through ``_fit_and_score``.

    :param X (array-like): The input features for training.
    :param y (array-like): The target values for training.
    :param param_grid (dict or list of dicts): The parameter grid to search over.

    :return cv_results (pandas.DataFrame): One row per candidate with the column
        'OOB_Error_Score' followed by one column per hyperparameter, sorted by
        ascending OOB error.
    """
    params_iterable = list(ParameterGrid(param_grid))

    parallel = joblib.Parallel(n_jobs=1)
    output = parallel(
        joblib.delayed(_fit_and_score)(deepcopy(im.IterativeRFClassifier), X, y, parameters)
        for parameters in params_iterable)

    # Scores and parameters share the candidate order, so they can be joined column-wise.
    cv_results = pd.concat(
        [pd.DataFrame(output, columns=['OOB_Error_Score']), pd.DataFrame(params_iterable)],
        axis=1)

    return (cv_results.
            sort_values(['OOB_Error_Score'], ascending=True).
            reset_index(drop=True))

def _fit_and_score(estimator, X, y, parameters):
    """
    Fit one candidate and return its out-of-bag error.

    :param estimator (callable): Model class; called as ``estimator(rseed=1, **parameters)``.
    :param X (array-like): The input features for training.
    :param y (array-like): The target values for training.
    :param parameters (dict): The hyperparameters of this candidate.

    :return oob_error (float): One minus the OOB accuracy of the fitted model.
    """
    fitted = estimator(rseed=1, **parameters)
    fitted.fit(X, y, feature_weight=None)

    return 1 - oob_score_accuracy(fitted, X, y)


def oob_score_accuracy(rf, X, y):
    """
    Compute the out-of-bag (OOB) classification accuracy of a fitted forest.

    For every tree, the samples left out of its bootstrap draw are regenerated from the
    tree's ``random_state`` and predicted by that tree only. The class probabilities are
    summed per sample and the class with the highest total is the OOB prediction.

    :param rf (object): Fitted model whose ``model`` attribute exposes ``estimators_`` and ``classes_``.
    :param X (array-like): The input features the model was trained on.
    :param y (array-like): The target values the model was trained on.

    :return oob_score (float): Fraction of samples whose OOB prediction equals the true label.
    """
    # NOTE(review): these are private scikit-learn helpers and may change between releases.
    from sklearn.ensemble._forest import _generate_unsampled_indices, _get_n_samples_bootstrap

    # Coerce to arrays: comparing two plain lists with == yields a single bool,
    # which would turn the accuracy below into 0.0 or 1.0.
    X = X.values if isinstance(X, pd.DataFrame) else np.asarray(X)
    y = y.values if isinstance(y, pd.Series) else np.asarray(y)

    forest = rf.model
    n_samples = len(X)
    n_classes = len(np.unique(y))
    # Same for every tree, so computed once instead of once per tree.
    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)

    predictions = np.zeros((n_samples, n_classes))
    for tree in forest.estimators_:
        unsampled_indices = _generate_unsampled_indices(tree.random_state, n_samples, n_samples_bootstrap)
        predictions[unsampled_indices] += tree.predict_proba(X[unsampled_indices, :])

    # A sample that was never out-of-bag keeps an all-zero row and falls back to the first class.
    predicted_class = np.asarray(forest.classes_)[np.argmax(predictions, axis=1)]

    return np.mean(y == predicted_class)

# Run the decomposed grid search on X, y and print the candidates ranked by OOB error.
oob_gridsearch = fit(X, y, param_grid)
print(oob_gridsearch)

   OOB_Error_Score  max_depth  n_estimators
0         0.033333          3           100
1         0.040000          3            30
2         0.046667          3            20
3         0.053333          2            30
4         0.060000          2            20
5         0.060000          2           100


#### Evaluation Function

In [1]:
import sklearn
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

# Synthetic regression data: 4 features, of which 2 are informative.
X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False)

# Shallow forest fitted on the full data; fit() returns the estimator itself.
regr = RandomForestRegressor(max_depth=2, random_state=0).fit(X, y)

# In-sample predictions, used below to exercise the evaluation function.
prediction = regr.predict(X)

def get_evaluation_report(y_pred, y_true, task, metric):
    """
    Compute a single evaluation metric for the given predictions.

    Only the requested metric is evaluated, so a metric that does not apply to the
    data (or is unavailable in the installed scikit-learn) cannot break another one.

    :param y_pred: predicted values; for classification these are scores, either 1-D
        (score of the positive class) or 2-D with one column per class
    :param y_true: true values
    :param task: ML task to solve; 'classification' selects the classification metrics,
        any other value selects the regression metrics
    :param metric: metric to compute: 'auroc' or 'aupr' for classification,
        'mse', 'rmse' or 'r2_score' for regression

    :return: value of the requested metric (a single number for 1-D targets)
    :raises KeyError: if ``metric`` is not available for ``task``
    """
    classification_metrics = ('auroc', 'aupr')
    regression_metrics = ('mse', 'rmse', 'r2_score')

    supported = classification_metrics if task == 'classification' else regression_metrics
    if metric not in supported:
        raise KeyError(metric)

    # Explicit submodule import: a bare ``import sklearn`` does not guarantee that
    # ``sklearn.metrics`` has been loaded.
    from sklearn import metrics as sk_metrics

    if task == 'classification':
        labels = np.asarray(y_true).ravel()
        y_score = np.asarray(y_pred, dtype=float)
        classes = np.unique(labels)

        if len(classes) > 2:
            # One-vs-rest indicator matrix so the scores can be micro-averaged over classes.
            # NOTE(review): assumes the columns of y_pred follow the sorted class labels - confirm.
            targets = (labels[:, None] == classes[None, :]).astype(int)
            average = 'micro'
        else:
            # Binary case: the larger label is treated as the positive class.
            targets = (labels == classes[-1]).astype(int)
            if y_score.ndim == 2:
                y_score = y_score[:, -1]
            # 'binary' is not an accepted value here; the average is ignored for binary targets.
            average = 'macro'

        scorer = sk_metrics.roc_auc_score if metric == 'auroc' else sk_metrics.average_precision_score
        # Both scorers take the scores as ``y_score`` (not ``y_pred``); pass them positionally.
        return scorer(targets, y_score, average=average)

    if metric == 'r2_score':
        return sk_metrics.r2_score(y_true=y_true, y_pred=y_pred)

    if metric == 'rmse':
        # ``squared=False`` was deprecated and later removed from mean_squared_error;
        # prefer the dedicated function when the installed scikit-learn provides it.
        rmse_fn = getattr(sk_metrics, 'root_mean_squared_error', None)
        if rmse_fn is not None:
            return rmse_fn(y_true=y_true, y_pred=y_pred)
        return sk_metrics.mean_squared_error(y_true=y_true, y_pred=y_pred, squared=False)

    return sk_metrics.mean_squared_error(y_true=y_true, y_pred=y_pred)


# Mean squared error of the forest's predictions on the same data it was fitted on.
get_evaluation_report(prediction, y, task="regression",metric="mse")

232.4546273335677