# Preprocessing: impute gene expression, genotype, and phenotype environmental variables
## Introduction

This notebook contains multiple sections that describe the functions utilized in the analysis.
- Feature Engineering Encoder and Imputer
  - NumericalImputer function
    - Missing-value imputer for numerical variables
    - Uses the mean to impute missing values
  - CategoricalImputer_Education
    - Categorical missing value imputer for Education (FatherEducation, MotherEducation)
    - Using mode to impute missing value
    - Creates a new variable called TotalEducation
        - at or above the median of (FatherEducation + MotherEducation) / 2 = 1
        - below the median of (FatherEducation + MotherEducation) / 2 = 0
  - CatgoricalEncoder_Income
    - String to numbers categorical encoder for Income
    - {"<1000": "0", "10001~20000": "1", "20001~30000": "2", "30001~40000": "3", "40001~50000": "4", ">50001": "5"}

============================================================================================================================================================

### Feature Engineering Encoder and Imputer
    - NumericalImputer function
    - CategoricalImputer_Education
    - CatgoricalEncoder_Income
    

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
import pandas as pd
import utils
import dataset_model as dm



class NumericalImputer(BaseEstimator, TransformerMixin):
    """
    Numerical missing value imputer.

    Learns the mean of each configured column in ``fit`` and uses it to fill
    missing entries of that column in ``transform``.

    Parameters
    ----------
    variable : column label or list of column labels, optional
        Column(s) to impute. A single label is wrapped in a list.

    Attributes
    ----------
    variables : list
        ``variable`` normalised to a list.
    imputer_dict_ : dict
        Column label -> mean learned by ``fit``.
    """

    def __init__(self, variable=None):
        # Keep the constructor argument under its own name as well:
        # BaseEstimator.get_params (used by repr, clone and Pipeline) reads an
        # attribute named after every __init__ parameter.
        self.variable = variable
        self.variables = variable if isinstance(variable, list) else [variable]

    def fit(self, X, y=None):
        """
        Learn the mean of every configured column of ``X``.

        ``y`` is accepted for pipeline compatibility and is not used.
        Returns ``self``.
        """
        self.imputer_dict_ = {
            feature: X[feature].mean() for feature in self.variables
        }
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` whose configured columns have missing values
        replaced by the means learned in ``fit``. ``X`` itself is not modified.
        """
        X = X.copy()
        for feature in self.variables:
            # Assign the filled column back rather than calling
            # fillna(inplace=True) on X[feature]: the in-place call operates on
            # a temporary Series and does not update X under pandas
            # copy-on-write.
            X[feature] = X[feature].fillna(self.imputer_dict_[feature])
        return X
            
class CategoricalImputer_Education(BaseEstimator, TransformerMixin):
    """
    Categorical missing value imputer for Education.

    Treats the literal string "#NULL!" as missing, fills missing values in the
    configured columns with the mode learned in ``fit``, and derives a binary
    ``TotalEducation`` column from ``MotherEducation`` and ``FatherEducation``.

    Parameters
    ----------
    variables : column label or list of column labels, optional
        Column(s) to impute. A single label is wrapped in a list.

    Attributes
    ----------
    imputer_dict : dict
        Column label -> mode learned by ``fit``.
    """

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def _with_missing_marked(self, X):
        """Return a copy of ``X`` with "#NULL!" in the configured columns set to NA."""
        X = X.copy()
        for feature in self.variables:
            # Assign back instead of replace(inplace=True) on X[feature], which
            # acts on a temporary Series and is not guaranteed to reach X.
            X[feature] = X[feature].replace("#NULL!", pd.NA)
        return X

    def fit(self, X, y=None):
        """
        Learn the mode of every configured column, ignoring "#NULL!" entries.

        Works on a copy, so the caller's ``X`` is left untouched. ``y`` is
        accepted for pipeline compatibility and is not used. Returns ``self``.

        Raises
        ------
        ValueError
            If a configured column has no non-missing value to take a mode from.
        """
        X = self._with_missing_marked(X)
        self.imputer_dict = {}
        for feature in self.variables:
            modes = X[feature].mode()
            if modes.empty:
                raise ValueError(
                    f"Cannot compute the mode of {feature!r}: the column has no non-missing values."
                )
            # mode() can return several values on ties; the first one is used.
            self.imputer_dict[feature] = modes[0]
        return self

    def transform(self, X):
        """
        Impute the configured columns and add ``TotalEducation``.

        Returns a copy of ``X`` in which:
        - "#NULL!" and missing entries of the configured columns are replaced
          by the mode learned in ``fit``;
        - ``TotalEducation`` is 1 when the row's mean of ``MotherEducation``
          and ``FatherEducation`` (each converted with ``int``) is at or above
          the median of that mean, and 0 when it is below.

        NOTE: the median is taken over the rows passed to this call, not
        learned in ``fit``.
        """
        # The "#NULL!" marker must be handled here too; otherwise it survives
        # on any frame other than the one seen by fit and int() fails on it.
        X = self._with_missing_marked(X)
        for feature in self.variables:
            X[feature] = X[feature].fillna(self.imputer_dict[feature])

        # Column-wise equivalent of averaging int(mother) and int(father) per row.
        mean_education = (X['MotherEducation'].map(int) + X['FatherEducation'].map(int)) / 2
        median = mean_education.median()
        X['TotalEducation'] = (mean_education >= median).astype(int)

        return X

class CatgoricalEncoder_Income(BaseEstimator, TransformerMixin):
    """
    String to numbers categorical encoder for Income.

    Maps income bracket labels to the string codes "0".."5" and fills every
    value that is not a known bracket label with the most frequent code.

    Parameters
    ----------
    variables : column label or list of column labels, optional
        Column(s) to encode. A single label is wrapped in a list.

    Attributes
    ----------
    imputer_dict : dict
        Column label -> {bracket label: code} mapping, set by ``fit``.
    """

    # NOTE(review): the lowest bracket is "<1000" while the next one starts at
    # 10001 -- confirm this label matches the raw data.
    _INCOME_CODES = {"<1000": "0",
                     "10001~20000": "1",
                     "20001~30000": "2",
                     "30001~40000": "3",
                     "40001~50000": "4",
                     ">50001": "5"}

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X=None, y=None):
        """
        Set up the bracket-to-code mapping for every configured column.

        Nothing is learned from the data. ``X`` and ``y`` are accepted so the
        encoder can be called as ``fit(X, y)`` (e.g. by a pipeline) and are
        not used. Returns ``self``.
        """
        # Each column gets its own dict, so editing one mapping does not
        # affect the others.
        self.imputer_dict = {
            feature: dict(self._INCOME_CODES) for feature in self.variables
        }
        return self

    def transform(self, X, y=None):
        """
        Return a copy of ``X`` with the configured columns encoded.

        Values that are not keys of the mapping (including "#NULL!" and
        missing values) become missing after mapping and are then filled with
        the most frequent code among the rows passed to this call.

        Raises
        ------
        ValueError
            If a configured column contains no recognised bracket label, so
            there is no code to fill with.
        """
        X = X.copy()
        for feature in self.variables:
            encoded = X[feature].map(self.imputer_dict[feature])

            if encoded.isnull().any():
                modes = encoded.mode()
                if modes.empty:
                    raise ValueError(
                        f"Cannot encode {feature!r}: no value matches a known income bracket."
                    )
                encoded = encoded.fillna(modes[0])

            # Assign the finished column back; in-place edits on X[feature]
            # act on a temporary Series and may not reach X.
            X[feature] = encoded

        return X
   

# Preprocessing steps, in execution order: education imputation, then income encoding.
education_step = CategoricalImputer_Education(variables=['FatherEducation', 'MotherEducation'])
income_step = CatgoricalEncoder_Income(variables=['Income'])
pipeline = make_pipeline(education_step, income_step)

# Arguments for the project helper that builds the output file name.
# The strings are kept exactly as they are (including the "pipline" spelling),
# because they determine the name of the file on disk.
filename_parts = (
    "/exeh_4/yuping/Epistasis_Interaction/01_Preprocessing/results",
    "output",
    ".pkl",
    "pipline",
    "version1",
)
save_file_name = utils.construct_filename(*filename_parts)

# The pipeline has not been fitted at this point; it is handed to the project
# helper together with the target file name.
dm.RF_OOB_Dataset.save_pipeline(pipeline, save_file_name)

Pipeline(steps=[('categoricalimputer_education',
                 CategoricalImputer_Education(variables=['FatherEducation',
                                                         'MotherEducation'])),
                ('catgoricalencoder_income',
                 CatgoricalEncoder_Income(variables=['Income']))])
