# Selecting Optimal Parameter
## Introduction

This notebook is organised into the following sections, each covering one component of the parameter-selection code.
- OOB_ParamGridSearch function (gridsearch_model.py)
    - Full function of OOB_ParamGridSearch
    - Decomposing code for testing
      + fit
      + fit_and_score
      + oob_score_accuracy
- Evaluation function (eval_metrics.py) 
- RF_OOB_Dataset function (dataset_model.py)
    - from_config
    - get_samples
    - get_features
    - shuffle_data
    - train_test_split
    - generate_labels
    - save_pipeline
    - load_pipeline
- REFACTORED RF_OOB_Dataset function (dataset_model.py)
- Group_shuffle_split

###  OOB_ParamGridSearch function

In [3]:
from sklearn.model_selection import ParameterGrid
from copy import deepcopy
import numpy as np
import pandas as pd
import joblib
import eval_metrics

class OOB_ParamGridSearch:
    """
    Exhaustive hyperparameter search ranked by the out-of-bag (OOB) error.

    Every parameter combination in ``param_grid`` is fitted once and ranked by
    the error returned from ``fit_and_score``; the smallest error wins.
    """

    # Metrics for which a smaller value means a better model. They are used
    # directly as the error; every other metric is treated as a score and is
    # converted to an error with ``1 - score``.
    _LOWER_IS_BETTER_METRICS = frozenset({"mse", "rmse"})

    def __init__(self,
                 estimator,
                 param_grid,
                 seed,
                 n_jobs=-1,
                 refit=True,
                 task="regression",
                 metric="mse"):
        """
        Initializes the OOB_ParamGridSearch class.

        :param estimator (callable): The estimator factory; it is called as
            ``estimator(rseed=seed, **params)`` to build each candidate model.
        :param param_grid (dict or list of dicts): The parameter grid to search over.
        :param seed (int): The random seed, forwarded to every model for reproducibility.
        :param n_jobs (int, optional): The number of jobs to run in parallel. Defaults to -1.
        :param refit (bool, optional): Indicates whether to refit the model with the best hyperparameters. Defaults to True.
        :param task (str, optional): The task type, either "classification" or "regression". Defaults to "regression".
        :param metric (str, optional): The evaluation metric to use. Defaults to "mse".
        """
        self.n_jobs = n_jobs
        self.seed = seed
        self.estimator = estimator
        self.param_grid = param_grid
        self.refit = refit
        self.task = task
        self.metric = metric

    def fit(self,
            X_train,
            y_train):
        """
        Fits the model with the given training data using the parameter grid search.

        After the call the following attributes are set: ``best_score_`` (the
        lowest OOB error), ``best_param_`` (the matching parameters),
        ``cv_results`` (one row per combination, sorted by ascending error) and,
        when ``refit`` is true, ``cv_model`` (a model refitted with
        ``best_param_``).

        :param X_train (array-like): The input features for training.
        :param y_train (array-like): The target values for training.

        :return self (object): Returns self.
        """
        params_iterable = list(ParameterGrid(self.param_grid))
        parallel = joblib.Parallel(n_jobs=self.n_jobs)

        # One OOB error per parameter combination, in grid order.
        output = parallel(
            joblib.delayed(self.fit_and_score)(deepcopy(self.estimator), X_train, y_train, parameters)
            for parameters in params_iterable)

        output_array = np.array(output)

        best_index = np.argmin(output_array)
        self.best_score_ = output_array[best_index]
        self.best_param_ = params_iterable[best_index]

        cv_results = pd.DataFrame(output, columns=['OOB_Error_Score'])
        df_params = pd.DataFrame(params_iterable)
        cv_results = pd.concat([cv_results, df_params], axis=1)
        cv_results["params"] = params_iterable
        self.cv_results = (cv_results.
                           sort_values(['OOB_Error_Score'], ascending=True).
                           reset_index(drop=True))

        if self.refit:
            # Final fit with best hyperparameters
            cv_model = deepcopy(self.estimator)(rseed=self.seed, **self.best_param_)
            cv_model.fit(X_train, y_train, feature_weight=None)
            self.cv_model = cv_model

        return self

    def fit_and_score(self,
                      estimator,
                      X_train,
                      y_train,
                      parameters):
        """
        Fits the model and calculates the out-of-bag (OOB) error score.

        For the metrics in ``_LOWER_IS_BETTER_METRICS`` the metric value itself
        is the error. For any other metric the error is ``1 - score``, so that
        minimising the returned value always selects the better model.

        :param estimator (callable): The estimator factory.
        :param X_train (array-like): The input features for training.
        :param y_train (array-like): The target values for training.
        :param parameters (dict): The hyperparameters to use for fitting the model.

        :return oob_error (float): The calculated out-of-bag error score.
        """
        train_model = estimator(rseed=self.seed, **parameters)
        train_model.fit(X_train, y_train, feature_weight=None)
        oob_score = self.oob_score_accuracy(train_model, X_train, y_train, task=self.task, metric=self.metric)

        if self.metric in self._LOWER_IS_BETTER_METRICS:
            # Already an error: "1 - mse" would make argmin pick the worst model.
            return oob_score

        return 1 - oob_score

    def oob_score_accuracy(self,
                           rf,
                           X_train,
                           y_train,
                           task,
                           metric):
        """
        Calculates the out-of-bag (OOB) value of the requested metric.

        Each tree of ``rf.model`` predicts only the samples it did not see in
        its bootstrap draw. For classification the per-class probabilities are
        summed over trees; for regression the predictions are averaged.

        :param rf (object): The fitted random forest wrapper; the forest is read from ``rf.model``.
        :param X_train (array-like): The input features for training.
        :param y_train (array-like): The target values for training.
        :param task (str): The task type, either "classification" or "regression".
        :param metric (str): The evaluation metric to use.

        :return oob_score (float): The value of ``metric`` on the OOB predictions.
        """
        import warnings

        from sklearn.ensemble._forest import _generate_unsampled_indices, _get_n_samples_bootstrap

        X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
        y = y_train.values if isinstance(y_train, pd.Series) else y_train

        is_classification = task == "classification"
        n_samples = len(X)
        # Same for every tree, so computed once.
        n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)

        if is_classification:
            n_classes = len(np.unique(y))
            predictions = np.zeros((n_samples, n_classes))
        else:
            predictions = np.zeros(n_samples)
            n_predictions = np.zeros(n_samples)

        for tree in rf.model.estimators_:
            unsampled_indices = _generate_unsampled_indices(tree.random_state, n_samples, n_samples_bootstrap)
            X_oob = X[unsampled_indices, :]

            if is_classification:
                predictions[unsampled_indices] += tree.predict_proba(X_oob)
            else:
                predictions[unsampled_indices] += tree.predict(X_oob)
                n_predictions[unsampled_indices] += 1

        if not is_classification:
            # A sample drawn into every bootstrap has no OOB prediction; dividing
            # by its zero count would put NaN into the predictions. Mirror
            # scikit-learn's forest behaviour: warn and keep the zero prediction.
            never_oob = n_predictions == 0
            if never_oob.any():
                warnings.warn(
                    "Some inputs do not have OOB predictions; too few trees may "
                    "have been used to compute a reliable OOB estimate.",
                    UserWarning,
                )
                n_predictions[never_oob] = 1

            predictions /= n_predictions

        return eval_metrics.get_evaluation_report(predictions, y, task, metric)

In [None]:
from sklearn.datasets import make_regression
import model as im

# Synthetic regression problem: 4 features, 2 of them informative.
X, y = make_regression(n_features=4,
                       n_informative=2,
                       random_state=0,
                       shuffle=False)

# Candidate hyperparameters; every combination is evaluated.
param_grid = dict(n_estimators=[20, 30, 100], max_depth=[2, 3])

# OOB-driven grid search over the iterative RF regressor, single process.
oob_gridsearch = OOB_ParamGridSearch(
    estimator=im.IterativeRFRegression,
    param_grid=param_grid,
    seed=123,
    n_jobs=1,
    refit=True,
    task="regression",
    metric="mse",
)

oob_gridsearch.fit(X, y)
oob_gridsearch.cv_results

print(X)
print(y)

### Decomposing code for testing
+ fit
+ fit_and_score
+ oob_score_accuracy

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn import datasets
from copy import deepcopy
import numpy as np
import pandas as pd
import joblib
import model as im


# Iris data, loaded with as_frame=True.
dataset = datasets.load_iris(as_frame=True)

# Features and target, pulled out of the loaded bundle.
X, y = dataset['data'], dataset['target']

# Hyperparameter combinations to evaluate.
param_grid = dict(n_estimators=[20, 30, 100], max_depth=[2, 3])

def fit(X, y, param_grid):
    """
    Evaluate every parameter combination and return the ranked OOB errors.

    Each combination in ``param_grid`` is scored by ``_fit_and_score`` using
    ``im.IterativeRFClassifier`` as the estimator factory, in a single job.

    :param X (array-like): The input features for training.
    :param y (array-like): The target values for training.
    :param param_grid (dict or list of dicts): The parameter grid to search over.

    :return cv_results (pd.DataFrame): One row per combination with the column
        'OOB_Error_Score' followed by one column per hyperparameter, sorted by
        ascending error with a fresh 0..n-1 index.
    """
    params_iterable = list(ParameterGrid(param_grid))

    parallel = joblib.Parallel(n_jobs=1)

    # One OOB error per parameter combination, in grid order.
    output = parallel(
        joblib.delayed(_fit_and_score)(deepcopy(im.IterativeRFClassifier), X, y, parameters)
        for parameters in params_iterable)

    # Error column first, then the hyperparameter columns side by side.
    cv_results = pd.DataFrame(output, columns=['OOB_Error_Score'])
    df_params = pd.DataFrame(params_iterable)
    cv_results = pd.concat([cv_results, df_params], axis=1)

    return (cv_results.
            sort_values(['OOB_Error_Score'], ascending=True).
            reset_index(drop=True))

def _fit_and_score(estimator, X, y, parameters):
    """
    Fit one candidate model and return its out-of-bag error.

    :param estimator (callable): Factory called as ``estimator(rseed=1, **parameters)``.
    :param X (array-like): The input features for training.
    :param y (array-like): The target values for training.
    :param parameters (dict): The hyperparameters of this candidate.

    :return (float): ``1 - oob_score_accuracy(model, X, y)``.
    """
    candidate = estimator(rseed=1, **parameters)
    candidate.fit(X, y, feature_weight=None)

    oob_accuracy = oob_score_accuracy(candidate, X, y)
    return 1 - oob_accuracy


def oob_score_accuracy(rf, X, y):
    """
    Compute the out-of-bag classification accuracy of a fitted forest.

    Each tree of ``rf.model`` predicts class probabilities only for the samples
    outside its bootstrap draw; the probabilities are summed per sample and the
    class with the largest total is the OOB prediction.

    :param rf (object): Fitted wrapper exposing the forest as ``rf.model``.
    :param X (array-like or pd.DataFrame): The training features.
    :param y (array-like or pd.Series): The training labels.

    :return (float): Fraction of samples whose OOB prediction equals the label.
    """
    from sklearn.ensemble._forest import _generate_unsampled_indices, _get_n_samples_bootstrap

    features = X.values if isinstance(X, pd.DataFrame) else X
    targets = y.values if isinstance(y, pd.Series) else y

    forest = getattr(rf, "model")
    n_rows = len(features)
    vote_totals = np.zeros((n_rows, len(np.unique(targets))))

    for member in forest.estimators_:
        bootstrap_size = _get_n_samples_bootstrap(n_rows, n_rows)
        oob_rows = _generate_unsampled_indices(member.random_state, n_rows, bootstrap_size)
        # Accumulate this tree's probabilities on the rows it never trained on.
        vote_totals[oob_rows] += member.predict_proba(features[oob_rows, :])

    winning_columns = np.argmax(vote_totals, axis=1)
    oob_labels = [forest.classes_[column] for column in winning_columns]

    return np.mean(targets == oob_labels)

# Run the decomposed grid search and show the ranked results.
oob_gridsearch = fit(X=X, y=y, param_grid=param_grid)
print(oob_gridsearch)

### Evaluation Function

In [None]:
import sklearn
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

# Synthetic regression data and a shallow forest fitted on all of it.
X, y = make_regression(n_features=4,
                       n_informative=2,
                       random_state=0,
                       shuffle=False)
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X, y)

# In-sample predictions, used below to exercise the evaluation helper.
prediction = regr.predict(X)

def get_evaluation_report(y_pred, y_true, task, metric):
    """
    Compute one evaluation metric for the given predictions.

    Only the requested metric is evaluated, so a metric that cannot be computed
    for the given inputs does not break requests for the other metrics.

    :param y_pred: predicted values (regression) or predicted scores (classification)
    :param y_true: true values
    :param task: ML task to solve; 'classification' selects the classification
        metrics, any other value selects the regression metrics
    :param metric: name of the metric to compute: 'auroc' or 'aupr' for
        classification; 'mse', 'rmse' or 'r2_score' otherwise

    :return: value of the requested metric
    :raises KeyError: if ``metric`` is not available for ``task``
    """
    # Imported explicitly: a bare `import sklearn` does not guarantee that the
    # `sklearn.metrics` submodule has been loaded.
    from sklearn import metrics as sk_metrics

    if task == 'classification':
        # scikit-learn ignores `average` for binary targets, and 'binary' is not
        # an accepted value for these two functions, so the library default
        # ('macro') is passed in the binary case.
        average = 'micro' if len(np.unique(y_true)) > 2 else 'macro'
        # NOTE(review): for multiclass y_true, roc_auc_score additionally needs
        # an explicit `multi_class` argument and normalised probabilities —
        # confirm how multiclass targets should be handled.
        metric_functions = {
            'auroc': lambda: sk_metrics.roc_auc_score(y_true=y_true, y_score=y_pred, average=average),
            'aupr': lambda: sk_metrics.average_precision_score(y_true=y_true, y_score=y_pred, average=average),
        }
    else:
        metric_functions = {
            'mse': lambda: sk_metrics.mean_squared_error(y_true=y_true, y_pred=y_pred),
            # Square root taken here rather than via `squared=False`, which newer
            # scikit-learn releases no longer accept (same value for
            # single-output targets).
            'rmse': lambda: np.sqrt(sk_metrics.mean_squared_error(y_true=y_true, y_pred=y_pred)),
            'r2_score': lambda: sk_metrics.r2_score(y_true=y_true, y_pred=y_pred),
        }

    return metric_functions[metric]()


# MSE of the in-sample predictions computed above.
get_evaluation_report(y_pred=prediction, y_true=y, task="regression", metric="mse")

### RF_OOB_Dataset function

In [None]:
from abc import ABC, abstractmethod
from  pathlib import Path
import joblib
import numpy as np
import pandas as pd
import yaml
import sys

class ExpressionDataset(ABC):
    """
    Abstract interface that every dataset in this project implements.

    All methods are abstract; the default bodies raise NotImplementedError when
    reached through ``super()``.
    """

    @abstractmethod
    def __init__(self):
        """
        Abstract initializer; does nothing.
        """

    @classmethod
    @abstractmethod
    def from_config(cls):
        """
        Alternate constructor building an ExpressionDataset object
        """
        raise NotImplementedError()

    @abstractmethod
    def get_samples(self):
        """
        Return the ids of all samples in the dataset
        """
        raise NotImplementedError()

    @abstractmethod
    def get_features(self):
        """
        Return the ids of all features in the dataset
        """
        raise NotImplementedError()

    @abstractmethod
    def generate_labels(self):
        """
        Build the y matrix for the given phenotype trait
        """
        raise NotImplementedError()

    @abstractmethod
    def save_pipeline(self):
        """
        Save a version of the pipeline
        """
        raise NotImplementedError()

    @abstractmethod
    def load_pipeline(self):
        """
        Load a version of the pipeline
        """
        raise NotImplementedError()
    

class TrainTestSplit(ExpressionDataset):
    """
    Abstract extension of ExpressionDataset declaring the train-test splitting API.

    NOTE(review): the concrete ``train_test_split`` in RF_OOB_Dataset takes
    ``(X, y, seed, test_size)`` rather than the parameters declared here —
    confirm which signature is intended.
    """

    @abstractmethod
    def shuffle_data(self, X, y, seed):
        """
        Randomly shuffle the samples in X and y
        """
        raise NotImplementedError()

    @abstractmethod
    def train_test_split(self,
                         train_fraction,
                         test_fraction,
                         seed):
        """
        Split the dataset into two portions,
        in the manner of scikit-learn's `train_test_split` function
        """
        raise NotImplementedError()

    
class RF_OOB_Dataset(TrainTestSplit):
    """
    A class containing logic used by all the types of gwas datasets for computing out of bag score.
    RF_OOB_Dataset inherits from TrainTestSplit (and, through it, ExpressionDataset).
    """

    def __init__(self,
                 gwas_gen_dir,
                 label_df_dir,
                 env_df_dir):
        """
        Read the input tables and build the combined feature table.

        :param gwas_gen_dir: path of a comma-separated table; its 'FID' and 'IID'
            columns are dropped and the rest is stored as ``all_gen_df``
        :param label_df_dir: path of a tab-separated table stored as ``label_df``
        :param env_df_dir: path of a tab-separated table stored as ``env_df``
        """
        self.all_gen_df = pd.read_csv(gwas_gen_dir, sep=",")
        self.all_gen_df = self.all_gen_df.drop(['FID', 'IID'], axis=1)
        self.env_df = pd.read_csv(env_df_dir, sep="\t")

        # Feature table: columns of all_gen_df followed by columns of env_df.
        self.all_gwas_df = pd.concat([self.all_gen_df, self.env_df], axis=1)
        self.label_df = pd.read_csv(label_df_dir, sep="\t")

    @classmethod
    def from_config(class_object,
                    config_file,
                    weight_tissue):
        """
        A function to create a new object from paths to its data

        :param config_file: loaded configuration; the keys 'data_dir',
            'phentoype_dir' and 'env_dir' are read from its 'dataset' section
        :param weight_tissue: tissue name, used as the sub-directory and inside
            the file name of the expression table

        :return: a new instance of the class
        """
        data_dir = Path(config_file['dataset']['data_dir'])
        gwas_df_dir = data_dir / weight_tissue / ("predict_expression_" + weight_tissue + "_output.csv")

        return class_object(gwas_df_dir, config_file['dataset']['phentoype_dir'], config_file['dataset']['env_dir'])

    def get_samples(self):
        """
        Return the row index of the combined feature table as a list
        """
        return list(self.all_gwas_df.index)

    def get_features(self):
        """
        Return the column names of the combined feature table as a list
        """
        return list(self.all_gwas_df.columns)

    def shuffle_data(self, X, y, seed):
        """
        Random shuffle of the samples in X and y

        The same permutation is applied to both arrays. A private RandomState is
        used so the global NumPy random state is left untouched; for a given seed
        it yields the same permutation as seeding the global generator would.

        :param X: array indexed along its first axis
        :param y: array with the same number of rows as X
        :param seed: seed of the permutation

        :return: the shuffled X and y
        """
        rng = np.random.RandomState(seed)
        idx = np.arange(X.shape[0])
        rng.shuffle(idx)

        return X[idx], y[idx]

    def train_test_split(self, X, y, seed, test_size=0.2):
        """
        Split the data into train and test sets

        The data are shuffled with ``shuffle_data`` first; the leading rows form
        the train set and the trailing rows the test set.

        :param X: feature array
        :param y: label array
        :param seed: seed passed to ``shuffle_data``
        :param test_size: fraction of rows placed in the test set; must be
            non-zero because the split index divides by it

        :return: X_train, X_test, y_train, y_test
        """
        X, y = self.shuffle_data(X, y, seed)
        split_i = len(y) - int(len(y) // (1 / test_size))
        X_train, X_test = X[:split_i], X[split_i:]
        y_train, y_test = y[:split_i], y[split_i:]

        return X_train, X_test, y_train, y_test

    def generate_labels(self, phen_trait):
        """
        Select the label column for the given phenotype trait

        :param phen_trait: name of a column of ``label_df``

        :return: single-column DataFrame holding that trait
        """
        y_given_phen = self.label_df.loc[:, [phen_trait]]

        return y_given_phen

    @staticmethod
    def save_pipeline(pipeline_to_save, save_file_name):
        """
        Save the version of the pipeline with joblib
        """
        joblib.dump(pipeline_to_save, save_file_name)

    @staticmethod
    def load_pipeline(pipline_file_path):
        """
        Load the version of the pipeline with joblib

        :param pipline_file_path: path of a file written by ``save_pipeline``

        :return: the loaded object
        """
        # NOTE: joblib.load unpickles the file; only load files from trusted sources.
        pipeline = joblib.load(filename=pipline_file_path)
        return pipeline


# Load the run configuration; abort with a message if it cannot be read.
try:
    with open("/exeh_4/yuping/Epistasis_Interaction/02_Select_Parameter_Model/config.yaml") as infile:
        load_configure = yaml.safe_load(infile)
except Exception:
    sys.stderr.write("Please specify valid yaml file.")
    sys.exit(1)


# The instance gets its own name: assigning it to `RF_OOB_Dataset` would shadow
# the class, so the class could not be instantiated again afterwards.
rf_oob_dataset = RF_OOB_Dataset.from_config(config_file=load_configure,
                                            weight_tissue="Brain_Amygdala")

y_given_raw_df = rf_oob_dataset.generate_labels("BDS_Total")


# Work on plain arrays for the split.
X_raw_df = rf_oob_dataset.all_gwas_df.values if isinstance(rf_oob_dataset.all_gwas_df, pd.DataFrame) else rf_oob_dataset.all_gwas_df
y_raw_df = y_given_raw_df.values if isinstance(y_given_raw_df, pd.DataFrame) else y_given_raw_df


X_train_raw_df, X_test_raw_df, y_train_raw_df, y_test_raw_df = rf_oob_dataset.train_test_split(X_raw_df,
                                                                                               y_raw_df, seed=1)

# Wrap the split arrays back into labelled DataFrames.
X_train_df, X_test_df = pd.DataFrame(X_train_raw_df, columns=rf_oob_dataset.all_gwas_df.columns), pd.DataFrame(X_test_raw_df, columns=rf_oob_dataset.all_gwas_df.columns)
y_train_df, y_test_df = pd.DataFrame(y_train_raw_df, columns=["BDS_Total"]), pd.DataFrame(y_test_raw_df, columns=["BDS_Total"])

# Positions of missing labels in the training split.
print(np.where(y_train_df.BDS_Total.isnull()))

#### <span style="color:orange">REFACTORED: the RF_OOB_Dataset class has been refactored into the `dataset_model.py` module.</span>


In [None]:
from abc import ABC, abstractmethod
from  pathlib import Path
import joblib
import numpy as np
import pandas as pd
import yaml
import sys


class ExpressionDataset(ABC):
    """
    Abstract interface that every dataset in this project implements.

    All methods are abstract; the default bodies raise NotImplementedError when
    reached through ``super()``.
    """

    @abstractmethod
    def __init__(self):
        """
        Abstract initializer; does nothing.
        """

    @classmethod
    @abstractmethod
    def from_config(cls):
        """
        Alternate constructor building an ExpressionDataset object
        """
        raise NotImplementedError()

    @abstractmethod
    def get_samples(self):
        """
        Return the ids of all samples in the dataset
        """
        raise NotImplementedError()

    @abstractmethod
    def get_features(self):
        """
        Return the ids of all features in the dataset
        """
        raise NotImplementedError()

    @abstractmethod
    def generate_labels(self):
        """
        Build the y matrix for the given phenotype trait
        """
        raise NotImplementedError()

    @abstractmethod
    def save_pipeline(self):
        """
        Save a version of the pipeline
        """
        raise NotImplementedError()

    @abstractmethod
    def load_pipeline(self):
        """
        Load a version of the pipeline
        """
        raise NotImplementedError()
     
class TrainTestSplitMixin():
    """
    A mixin class providing train-test splitting functionality
    """

    def shuffle_data(self, X, y, seed):
        """
        Random shuffle of the samples in X and y

        The same permutation is applied to both arrays. A private RandomState is
        used so the global NumPy random state is left untouched; for a given seed
        it yields the same permutation as seeding the global generator would.

        :param X: array indexed along its first axis
        :param y: array with the same number of rows as X
        :param seed: seed of the permutation

        :return: the shuffled X and y
        """
        rng = np.random.RandomState(seed)
        idx = np.arange(X.shape[0])
        rng.shuffle(idx)

        return X[idx], y[idx]

    def train_test_split(self, X, y, seed, test_size=0.2):
        """
        Split the data into train and test sets

        The data are shuffled with ``shuffle_data`` first; the leading rows form
        the train set and the trailing rows the test set.

        :param X: feature array
        :param y: label array
        :param seed: seed passed to ``shuffle_data``
        :param test_size: fraction of rows placed in the test set; must be
            non-zero because the split index divides by it

        :return: X_train, X_test, y_train, y_test
        """
        X, y = self.shuffle_data(X, y, seed)
        split_i = len(y) - int(len(y) // (1 / test_size))
        X_train, X_test = X[:split_i], X[split_i:]
        y_train, y_test = y[:split_i], y[split_i:]

        return X_train, X_test, y_train, y_test

class GroupShuffleSplitMixin():
    """
    A mixin class providing group shuffle splitting functionality
    """

    def group_shuffle_split(self, X, y, groups, seed, n_splits=1, test_size=0.2):
        """
        Split the data into train and test sets while keeping groups intact

        Uses scikit-learn's GroupShuffleSplit, so all samples sharing a group
        label land on the same side of the split.

        :param X: feature array, indexable by an integer index array
        :param y: label array, indexable by an integer index array
        :param groups: group label of every sample
        :param seed: random state of the splitter
        :param n_splits: number of splits the splitter is configured with; only
            the first split is returned
        :param test_size: size of the test side, forwarded to the splitter

        :return: X_train, X_test, y_train, y_test
        """
        from sklearn.model_selection import GroupShuffleSplit

        # `test_size` must go to the splitter's `test_size`: passing it as
        # `train_size` would invert the proportions of the two sets.
        gss = GroupShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
        split = gss.split(X, y, groups=groups)
        train_ids, test_ids = next(split)

        X_train, X_test = X[train_ids], X[test_ids]
        y_train, y_test = y[train_ids], y[test_ids]

        return X_train, X_test, y_train, y_test
    

class RF_OOB_Dataset(ExpressionDataset, TrainTestSplitMixin, GroupShuffleSplitMixin):
    """
    Dataset used for computing out-of-bag scores on the gwas tables.

    Implements the ExpressionDataset interface and gains its splitting helpers
    from TrainTestSplitMixin and GroupShuffleSplitMixin.
    """

    def __init__(self, gwas_gen_dir, label_df_dir, env_df_dir):
        """
        Read the input tables and build the combined feature table.

        :param gwas_gen_dir: path of a comma-separated table; its 'FID' and 'IID'
            columns are dropped and the rest is stored as ``all_gen_df``
        :param label_df_dir: path of a tab-separated table stored as ``label_df``
        :param env_df_dir: path of a tab-separated table stored as ``env_df``
        """
        raw_gen_table = pd.read_csv(gwas_gen_dir, sep=",")
        self.all_gen_df = raw_gen_table.drop(['FID', 'IID'], axis=1)
        self.env_df = pd.read_csv(env_df_dir, sep="\t")

        # Feature table: columns of all_gen_df followed by columns of env_df.
        feature_parts = [self.all_gen_df, self.env_df]
        self.all_gwas_df = pd.concat(feature_parts, axis=1)

        self.label_df = pd.read_csv(label_df_dir, sep="\t")

    @classmethod
    def from_config(cls, config_file, weight_tissue):
        """
        Alternate constructor building the object from the paths in a config.

        :param config_file: loaded configuration; the keys 'data_dir',
            'phentoype_dir' and 'env_dir' are read from its 'dataset' section
        :param weight_tissue: tissue name, used as the sub-directory and inside
            the file name of the expression table
        """
        dataset_cfg = config_file['dataset']
        file_name = "predict_expression_" + weight_tissue + "_output.csv"
        gwas_df_dir = Path(dataset_cfg['data_dir']) / weight_tissue / file_name

        label_path = config_file['dataset']['phentoype_dir']
        env_path = config_file['dataset']['env_dir']
        return cls(gwas_df_dir, label_path, env_path)

    def get_samples(self):
        """
        Return the row index of the combined feature table as a list
        """
        sample_index = self.all_gwas_df.index
        return list(sample_index)

    def get_features(self):
        """
        Return the column names of the combined feature table as a list
        """
        feature_index = self.all_gwas_df.columns
        return list(feature_index)

    def generate_labels(self, phen_trait):
        """
        Select the label column of the given phenotype trait

        :return: single-column DataFrame holding that trait
        """
        return self.label_df.loc[:, [phen_trait]]

    @staticmethod
    def save_pipeline(pipeline_to_save, save_file_name):
        """
        Persist a pipeline object to ``save_file_name`` with joblib
        """
        joblib.dump(pipeline_to_save, save_file_name)

    @staticmethod
    def load_pipeline(pipeline_file_path):
        """
        Load and return a pipeline object previously stored with joblib
        """
        return joblib.load(filename=pipeline_file_path)



# for train-test splitting
# Load the run configuration; abort with a message if it cannot be read.
try:
    with open("/exeh_4/yuping/Epistasis_Interaction/02_Select_Parameter_Model/config.yaml") as infile:
        load_configure = yaml.safe_load(infile)
except Exception:
    sys.stderr.write("Please specify valid yaml file.")
    sys.exit(1)

rf_oob_dataset = RF_OOB_Dataset.from_config(config_file=load_configure,
                                            weight_tissue="Brain_Amygdala")

y_given_raw_df = rf_oob_dataset.generate_labels("BDS_Total")
X_raw_df = rf_oob_dataset.all_gwas_df.values if isinstance(rf_oob_dataset.all_gwas_df, pd.DataFrame) else rf_oob_dataset.all_gwas_df
y_raw_df = y_given_raw_df.values if isinstance(y_given_raw_df, pd.DataFrame) else y_given_raw_df
X_train_raw_df, X_test_raw_df, y_train_raw_df, y_test_raw_df = rf_oob_dataset.train_test_split(X_raw_df, y_raw_df, seed=1)

# group-shuffle split
# The rows have different lengths, so the array must be built with dtype=object;
# NumPy >= 1.24 raises ValueError for ragged input otherwise.
X = np.array([[1, 2], [3], [4], [5, 6], [7, 8], [9, 10], [11, 12]], dtype=object)
y = np.array([0, 1, 1, 0, 1, 0, 1])
groups = np.array([1, 4, 5, 2, 2, 3, 3])
# Overwrites the train/test arrays produced by the plain split above.
X_train_raw_df, X_test_raw_df, y_train_raw_df, y_test_raw_df = rf_oob_dataset.group_shuffle_split(X, y, groups, seed=123, n_splits=1, test_size=0.2)

In [None]:
from sklearn.model_selection import train_test_split

# Example data
# The rows have different lengths, so the array must be built with dtype=object;
# NumPy >= 1.24 raises ValueError for ragged input otherwise.
X = np.array([[1, 2], [3], [4], [5, 6], [7, 8], [9, 10], [11, 12]], dtype=object)
y = np.array([0, 1, 1, 0, 1, 0, 1])
groups = np.array([1, 4, 5, 2, 2, 3, 3])

# Create a dictionary to store the indices of samples for each group
group_indices = {}
for idx, group in enumerate(groups):
    group_indices.setdefault(group, []).append(idx)

# Shuffle the indices within each group.
# A locally seeded generator keeps the demo reproducible and independent of
# whatever state earlier cells left in the global NumPy RNG.
rng = np.random.RandomState(6)
for members in group_indices.values():
    rng.shuffle(members)

# Combine the shuffled indices of all groups
shuffled_indices = np.concatenate(list(group_indices.values()))

# Split the shuffled indices into train and test sets
# NOTE(review): this splits individual indices, so samples of one group can end
# up on both sides — confirm whether that is intended for this comparison.
train_indices, test_indices = train_test_split(shuffled_indices, test_size=0.2, random_state=6)
X_train = X[train_indices]
X_test = X[test_indices]

print(X_train)
print(X_test)