# Preprocessing: impute gene expression genotype and phenotype environmental variables
## Introduction

This notebook contains multiple sections that describe the functions utilized in the analysis.
- RF_OOB_Dataset function (dataset_model.py)
    - from_config
    - get_samples
    - get_features
    - shuffle_data
    - train_test_split
    - generate_labels
    - save_pipeline
    - load_pipeline
- Feature Engineering Encoder and Imputer
  - NumericalImputer function
    - Numerical missing value imputer for numerical variables
    - Using mean to impute missing value 
  - CategoricalImputer_Education
    - Categorical missing value imputer for Education (FatherEducation, MotherEducation)
    - Using mode to impute missing value
    - create a new variable called TotalEducation 
        - at or above Median(average of FatherEducation and MotherEducation) = 1
        - below Median(average of FatherEducation and MotherEducation) = 0 
  - CatgoricalEncoder_Income
    - String to numbers categorical encoder for Income
    - {"<1000": "0", "10001~20000": "1", "20001~30000": "2", "30001~40000": "3", "40001~50000": "4", ">50001": "5"}

============================================================================================================================================================

### 1. RF_OOB_Dataset function

In [None]:
from abc import ABC, abstractmethod
from  pathlib import Path
import joblib
import numpy as np
import pandas as pd
import yaml
import sys
import dataset_model as dm

class ExpressionDataset(ABC):
    """
    Abstract interface that every dataset in this project implements.

    All methods below are abstract, so this class cannot be instantiated
    directly; concrete subclasses must override each of them.
    """

    @abstractmethod
    def __init__(self):
        """
        Set up the dataset (provided by the concrete subclass).
        """

    @classmethod
    @abstractmethod
    def from_config(class_object):
        """
        Alternate constructor that builds an ExpressionDataset object.
        """
        raise NotImplementedError

    @abstractmethod
    def get_samples(self):
        """
        Return the ids of all samples held by the dataset.
        """
        raise NotImplementedError

    @abstractmethod
    def get_features(self):
        """
        Return the ids of all features held by the dataset.
        """
        raise NotImplementedError

    @abstractmethod
    def generate_labels(self):
        """
        Build the y matrix for the requested phenotype trait.
        """
        raise NotImplementedError

    @abstractmethod
    def save_pipeline(self):
        """
        Persist a version of the pipeline.
        """
        raise NotImplementedError

    @abstractmethod
    def load_pipeline(self):
        """
        Restore a previously saved version of the pipeline.
        """
        raise NotImplementedError
    
    

class TrainTestSplit(ExpressionDataset):
    """
    Abstract extension of ExpressionDataset that adds the train/test
    splitting API.
    """

    # NOTE(review): RF_OOB_Dataset.train_test_split in this file takes
    # (X, y, seed, test_size=0.2), which differs from the signature declared
    # here -- confirm which one is the intended contract.
    @abstractmethod
    def train_test_split(self, train_fraction, test_fraction, seed):
        """
        Divide the dataset into two portions, in the spirit of
        scikit-learn's `train_test_split` function.
        """
        raise NotImplementedError

    @abstractmethod
    def shuffle_data(self, X, y, seed):
        """
        Randomly reorder the samples in X and y.
        """
        raise NotImplementedError
    
    
class RF_OOB_Dataset(TrainTestSplit):
    """
    A class containing logic used by all the types of gwas datasets for
    computing out of bag score.

    Inherits the dataset API from ExpressionDataset and the splitting API
    from TrainTestSplit.

    Attributes set by ``__init__``:
        all_gen_df:  table read from ``gwas_gen_dir`` with the 'FID' and
                     'IID' columns removed.
        env_df:      table read from ``env_df_dir``.
        all_gwas_df: ``all_gen_df`` and ``env_df`` concatenated column-wise.
        label_df:    table read from ``label_df_dir``.
    """

    def __init__(self,
                 gwas_gen_dir,
                 label_df_dir,
                 env_df_dir):
        """
        Load the three input tables and assemble the feature matrix.

        Parameters
        ----------
        gwas_gen_dir : path to a comma-separated file that contains 'FID'
            and 'IID' columns (both are dropped after loading).
        label_df_dir : path to a tab-separated file holding the labels.
        env_df_dir : path to a tab-separated file whose columns are appended
            to the table from ``gwas_gen_dir``.
        """
        self.all_gen_df = pd.read_csv(gwas_gen_dir, sep=",")
        self.all_gen_df = self.all_gen_df.drop(['FID', 'IID'], axis=1)
        self.env_df = pd.read_csv(env_df_dir, sep="\t")

        # Column-wise concatenation; pandas aligns the rows on the index.
        self.all_gwas_df = pd.concat([self.all_gen_df, self.env_df], axis=1)
        self.label_df = pd.read_csv(label_df_dir, sep="\t")

    @classmethod
    def from_config(class_object,
                    config_file,
                    weight_tissue):
        """
        Create a new object from the paths stored in a configuration mapping.

        Parameters
        ----------
        config_file : mapping with a 'dataset' section providing the keys
            'data_dir', 'phentoype_dir' and 'env_dir'.
        weight_tissue : tissue name; used both as the sub-directory of
            'data_dir' and inside the expected file name
            ``predict_expression_<weight_tissue>_output.csv``.

        Returns
        -------
        A new instance of the class this method is called on.
        """
        dataset_cfg = config_file['dataset']
        data_dir = Path(dataset_cfg['data_dir'])
        gwas_df_dir = data_dir / weight_tissue / ("predict_expression_" + weight_tissue + "_output.csv")

        # NOTE(review): the key is spelled 'phentoype_dir'; presumably this
        # matches the YAML file -- confirm before renaming it.
        return class_object(gwas_df_dir, dataset_cfg['phentoype_dir'], dataset_cfg['env_dir'])

    def get_samples(self):
        """
        Return the index labels of ``all_gwas_df`` as a list.
        """
        return list(self.all_gwas_df.index)

    def get_features(self):
        """
        Return the column labels of ``all_gwas_df`` as a list.
        """
        return list(self.all_gwas_df.columns)

    def shuffle_data(self, X, y, seed):
        """
        Apply the same random permutation to the rows of X and y.

        X and y are indexed with an integer array, so they are assumed to be
        NumPy arrays (or objects with the same positional indexing).

        Note: this reseeds NumPy's global random generator with ``seed``.
        """
        np.random.seed(seed)
        idx = np.arange(X.shape[0])
        np.random.shuffle(idx)

        return X[idx], y[idx]

    def train_test_split(self, X, y, seed, test_size=0.2):
        """
        Shuffle the data and split it into train and test sets.

        Parameters
        ----------
        X, y : arrays with one row per sample (see ``shuffle_data``).
        seed : seed forwarded to ``shuffle_data``.
        test_size : fraction of the samples, between 0 and 1 inclusive, that
            goes into the test set. The test set holds
            ``floor(len(y) * test_size)`` samples.

        Returns
        -------
        X_train, X_test, y_train, y_test

        Raises
        ------
        ValueError
            If ``test_size`` lies outside the interval [0, 1].
        """
        if not 0.0 <= test_size <= 1.0:
            raise ValueError(
                "test_size must be between 0 and 1, got {!r}".format(test_size)
            )

        X, y = self.shuffle_data(X, y, seed)

        # Multiply instead of floor-dividing by the reciprocal: 1 / test_size
        # is rarely exact in binary floating point (e.g. 10 // (1 / 0.3) is
        # 2.0, not 3.0) and fails outright for test_size == 0. The tiny
        # epsilon keeps products such as 100 * 0.29 from being floored one
        # sample short.
        n_test = int(len(y) * test_size + 1e-9)
        split_i = len(y) - n_test

        X_train, X_test = X[:split_i], X[split_i:]
        y_train, y_test = y[:split_i], y[split_i:]

        return X_train, X_test, y_train, y_test

    def generate_labels(self, phen_trait):
        """
        Return the label column for the given phenotype trait.

        Parameters
        ----------
        phen_trait : name of a column in ``label_df``.

        Returns
        -------
        A single-column DataFrame containing ``label_df[phen_trait]``.
        """
        y_given_phen = self.label_df.loc[:, [phen_trait]]

        return y_given_phen

    @staticmethod
    def save_pipeline(pipeline_to_save, save_file_name):
        """
        Save a version of the pipeline to ``save_file_name`` with joblib.
        """
        joblib.dump(pipeline_to_save, save_file_name)

    @staticmethod
    def load_pipeline(pipline_file_path):
        """
        Load a pipeline previously written with ``save_pipeline``.

        SECURITY: joblib files are pickle-based, so loading one can execute
        arbitrary code. Only load files from a trusted source.
        """
        loaded_pipeline = joblib.load(filename=pipline_file_path)
        return loaded_pipeline

# Load the YAML configuration. Only I/O and YAML parsing errors are treated
# as "invalid config"; the underlying cause is reported instead of hidden,
# and any other exception surfaces with its traceback.
try:
    with open("/exeh_4/yuping/Epistasis_Interaction/01_Preprocessing/config.yaml") as infile:
        load_configure = yaml.safe_load(infile)
except (OSError, yaml.YAMLError) as err:
    sys.stderr.write(f"Please specify valid yaml file. ({err})\n")
    sys.exit(1)


# NOTE(review): this rebinds the name RF_OOB_Dataset from the class to an
# instance. The name is kept because later cells call methods through it.
RF_OOB_Dataset = dm.RF_OOB_Dataset.from_config(config_file=load_configure,
                                               weight_tissue="Brain_Amygdala")

# Labels for the phenotype of interest (single-column table).
y_given_raw_df = RF_OOB_Dataset.generate_labels("BDS_Total")


# train_test_split indexes positionally, so hand it plain arrays.
X_raw_df = RF_OOB_Dataset.all_gwas_df.values if isinstance(RF_OOB_Dataset.all_gwas_df, pd.DataFrame) else RF_OOB_Dataset.all_gwas_df
y_raw_df = y_given_raw_df.values if isinstance(y_given_raw_df, pd.DataFrame) else y_given_raw_df


X_train_raw_df, X_test_raw_df, y_train_raw_df, y_test_raw_df = RF_OOB_Dataset.train_test_split(X_raw_df,
                                                                                               y_raw_df, seed=1)

# Wrap the shuffled arrays back into labelled DataFrames.
X_train_df, X_test_df = pd.DataFrame(X_train_raw_df, columns=RF_OOB_Dataset.all_gwas_df.columns), pd.DataFrame(X_test_raw_df, columns=RF_OOB_Dataset.all_gwas_df.columns)
y_train_df, y_test_df = pd.DataFrame(y_train_raw_df, columns=["BDS_Total"]), pd.DataFrame(y_test_raw_df, columns=["BDS_Total"])

### 2. Feature Engineering Encoder and Imputer
    - NumericalImputer function
    - CategoricalImputer_Education
    - CatgoricalEncoder_Income
    

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
import pandas as pd
import utils


class NumericalImputer(BaseEstimator, TransformerMixin):
    """
    Numerical missing value imputer.

    Learns the mean of each selected column in ``fit`` and uses it to fill
    the missing values of that column in ``transform``.

    Parameters
    ----------
    variable : a column name or a list of column names to impute. A single
        (non-list) value is wrapped in a one-element list.

    Attributes
    ----------
    variables : list of the column names to impute.
    imputer_dict_ : dict mapping each column name to its fitted mean
        (available after ``fit``).
    """

    def __init__(self, variable=None):
        # scikit-learn's get_params()/clone()/repr look up an attribute named
        # after every __init__ parameter, so keep the argument unchanged.
        self.variable = variable
        # Normalised list form used by fit/transform.
        self.variables = variable if isinstance(variable, list) else [variable]

    def fit(self, X, y=None):
        """
        Compute the mean of every selected column of X. ``y`` is ignored.

        Returns
        -------
        self
        """
        self.imputer_dict_ = {
            feature: X[feature].mean() for feature in self.variables
        }
        return self

    def transform(self, X):
        """
        Return a copy of X whose selected columns have their missing values
        replaced by the means learned in ``fit``.
        """
        X = X.copy()
        for feature in self.variables:
            # Assign the result back: calling fillna(inplace=True) on the
            # column selection is chained assignment and does not reliably
            # update X (it is a no-op under pandas Copy-on-Write).
            X[feature] = X[feature].fillna(self.imputer_dict_[feature])

        return X
            
class CategoricalImputer_Education(BaseEstimator, TransformerMixin):
    """
    Categorical missing value imputer for Education.

    Treats the string "#NULL!" as a missing value, fills missing entries of
    each selected column with that column's mode (learned in ``fit``), and
    adds a binary 'TotalEducation' column derived from 'MotherEducation' and
    'FatherEducation'.

    Parameters
    ----------
    variables : a column name or a list of column names to impute. A single
        (non-list) value is wrapped in a one-element list.

    Attributes
    ----------
    imputer_dict : dict mapping each column name to its fitted mode
        (available after ``fit``).
    """

    # String that stands for a missing answer in the raw columns.
    _MISSING_TOKEN = "#NULL!"

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """
        Learn the mode of every selected column. ``y`` is ignored.

        The placeholder is mapped to a missing value on a temporary Series,
        so X itself is left untouched by ``fit``.

        Returns
        -------
        self
        """
        self.imputer_dict = {}
        for feature in self.variables:
            observed = X[feature].replace(self._MISSING_TOKEN, pd.NA)
            # mode() skips missing values; take the first mode on ties.
            self.imputer_dict[feature] = observed.mode()[0]
        return self

    def transform(self, X):
        """
        Impute the selected columns and add the 'TotalEducation' column.

        X must contain 'MotherEducation' and 'FatherEducation', and their
        values must be convertible to int once imputed. X is modified in
        place and also returned.
        """
        for feature in self.variables:
            # Clean the placeholder here as well, so data that never went
            # through fit() (e.g. a test split) is handled too.
            cleaned = X[feature].replace(self._MISSING_TOKEN, pd.NA)
            X[feature] = cleaned.fillna(self.imputer_dict[feature])

        # Average of the two parents' education levels.
        total = (X['MotherEducation'].astype(int) + X['FatherEducation'].astype(int)) / 2

        # NOTE(review): the median is taken from the frame being transformed,
        # not learned in fit(); confirm this is intended for held-out data.
        median = total.median()

        # 0 when strictly below the median, 1 otherwise.
        X['TotalEducation'] = (~(total < median)).astype(int)

        return X

class CatgoricalEncoder_Income(BaseEstimator, TransformerMixin):
    """
    String to numbers categorical encoder for Income.

    Maps each income bracket label to a digit string ("0" to "5"). Values
    without a mapping become missing and are then filled with the most
    frequent encoded value of the column.

    Parameters
    ----------
    variables : a column name or a list of column names to encode. A single
        (non-list) value is wrapped in a one-element list.

    Attributes
    ----------
    imputer_dict : dict mapping each column name to its bracket-to-code
        dictionary (available after ``fit``).
    """

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X=None, y=None):
        """
        Build the fixed bracket-to-code mapping for every selected column.

        Nothing is learned from the data. ``X`` and ``y`` are accepted only
        so the estimator can be called as ``fit(X, y)``, which is how
        scikit-learn's Pipeline invokes it.

        Returns
        -------
        self
        """
        self.imputer_dict = {}
        for feature in self.variables:
            self.imputer_dict[feature] = {"<1000": "0",
                                          "10001~20000": "1",
                                          "20001~30000": "2",
                                          "30001~40000": "3",
                                          "40001~50000": "4",
                                          ">50001": "5"}
        return self

    def transform(self, X, y=None):
        """
        Encode the selected columns and fill values that had no mapping.

        X is modified in place and also returned. ``y`` is ignored.
        """
        for feature in self.variables:
            # Labels missing from the mapping (including "#NULL!") come out
            # of map() as missing values.
            encoded = X[feature].map(self.imputer_dict[feature])

            if encoded.isnull().any():
                # Fill with the most frequent encoded value of this column.
                encoded = encoded.fillna(encoded.mode()[0])

            # Assign back explicitly; in-place calls on a column selection do
            # not reliably write through to X.
            X[feature] = encoded

        return X
   

# Preprocessing steps, listed in the order the pipeline applies them.
education_imputer = CategoricalImputer_Education(
    variables=['FatherEducation', 'MotherEducation']
)
income_encoder = CatgoricalEncoder_Income(variables=['Income'])

pipeline = make_pipeline(education_imputer, income_encoder)

# Presumably assembles the path of the pickle file from these parts;
# see utils.construct_filename for the exact layout.
save_file_name = utils.construct_filename(
    "/exeh_4/yuping/Epistasis_Interaction/01_Preprocessing/results",
    "output",
    ".pkl",
    "pipline",
    "version1",
)

# Persist the (unfitted) pipeline object with joblib.
RF_OOB_Dataset.save_pipeline(pipeline, save_file_name)