## Introduction

This notebook contains multiple sections that describe the functions utilized in the analysis.
- Feature Engineering Encoder and Imputer (run_making_pipline.py)
  - NumericalImputer function
    - Numerical missing-value imputer for numerical variables
    - Uses the column mean to impute missing values
  - CategoricalImputer_Education
    - Categorical missing value imputer for Education (FatherEducation, MotherEducation)
    - Using mode to impute missing value
    - Creates a new variable called TotalEducation
        - at or above the median of the average of FatherEducation and MotherEducation = 1
        - below the median of the average of FatherEducation and MotherEducation = 0
        - Next step: try missForest or MICE for better imputation
  - CatgoricalEncoder_Income
    - String to numbers categorical encoder for Income
    - {"<1000": "0", "10001~20000": "1", "20001~30000": "2", "30001~40000": "3", "40001~50000": "4", ">50001": "5"}
  - NormalizeDataTransformer
- Regress out genetic relationship structure (regress_out_grm.py)
============================================================================================================================================================

### Feature Engineering Encoder and Imputer
    - NumericalImputer function
    - CategoricalImputer_Education
    - CatgoricalEncoder_Income
    

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
import pandas as pd
import dataset_model as dm
import yaml
import sys

class NumericalImputer(BaseEstimator, TransformerMixin):
    """
    Numerical missing value imputer.

    ``fit`` records the mean of every selected column; ``transform`` fills the
    missing values of those columns with the recorded means.

    Parameters
    ----------
    variable : str or list of str, default=None
        Column name(s) to impute. A single name is wrapped in a list.

    Attributes
    ----------
    variables : list
        Normalised list of the column names taken from ``variable``.
    imputer_dict_ : dict
        Column name -> mean computed by ``fit``.
    """

    def __init__(self, variable=None):
        # BaseEstimator.get_params() (and therefore clone() and repr()) reads
        # an attribute named after every __init__ parameter, so the raw
        # argument has to be kept under the name ``variable`` as well.
        self.variable = variable
        if not isinstance(variable, list):
            self.variables = [variable]
        else:
            self.variables = variable

    def fit(self, X, y=None):
        """Learn the mean of each configured column of ``X``; ``y`` is ignored."""
        self.imputer_dict_ = {
            feature: X[feature].mean() for feature in self.variables
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values replaced by the learned means."""
        X = X.copy()
        for feature in self.variables:
            # Assign the result back instead of using ``inplace=True`` on the
            # selected column: the latter is chained assignment and does not
            # reach the frame when pandas Copy-on-Write is active.
            X[feature] = X[feature].fillna(self.imputer_dict_[feature])
        return X
            
class CategoricalImputer_Education(BaseEstimator, TransformerMixin):
    """
    Categorical missing value imputer for Education.

    ``fit`` learns, for every configured column, its most frequent value and,
    from the imputed data, the median of the parents' average education.
    ``transform`` fills missing values with the learned modes and adds a
    binary ``TotalEducation`` column: 0 when the average of
    ``MotherEducation`` and ``FatherEducation`` is below the learned median,
    1 otherwise.

    Parameters
    ----------
    variables : str or list of str, default=None
        Column name(s) to impute. A single name is wrapped in a list.

    Attributes
    ----------
    imputer_dict : dict
        Column name -> mode learned by ``fit``.
    total_education_median_ : float
        Median of the average parental education seen during ``fit``.

    Notes
    -----
    ``transform`` modifies the frame it receives and returns that same
    object, as the original implementation did.
    """

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def _imputed_as_int(self, X, column):
        """Return ``X[column]`` with the learned mode filled in, cast to int."""
        series = X[column]
        if column in self.imputer_dict:
            series = series.fillna(self.imputer_dict[column])
        return series.astype(int)

    def _average_education(self, X):
        """Return the per-row mean of mother's and father's education level."""
        mother = self._imputed_as_int(X, 'MotherEducation')
        father = self._imputed_as_int(X, 'FatherEducation')
        return (mother + father) / 2

    def fit(self, X, y=None):
        """Learn the per-column modes and the TotalEducation cut-off from ``X``."""
        self.imputer_dict = {
            feature: X[feature].mode()[0] for feature in self.variables
        }
        # The cut-off is learned here, on the fitting data, so that every
        # frame transformed afterwards (train or test) is binarised against
        # the same threshold. ``X`` itself is left untouched by ``fit``.
        self.total_education_median_ = self._average_education(X).median()
        return self

    def transform(self, X):
        """Impute the configured columns and add the binary ``TotalEducation``."""
        for feature in self.variables:
            X[feature] = X[feature].fillna(self.imputer_dict[feature])

        average = self._average_education(X)
        # "not below the median" keeps the original rule
        # ``0 if x < median else 1`` for every input, ties included.
        X['TotalEducation'] = (~(average < self.total_education_median_)).astype(int)

        return X

class CatgoricalEncoder_Income(BaseEstimator, TransformerMixin):
    """
    String to numbers categorical encoder for Income.

    Every configured column is mapped from its income-bracket label to an
    ordinal code (stored as a string, "0" to "5"). Labels that are not part of
    the mapping (for example "#NULL!") become missing and are then replaced by
    the most frequent code.

    Parameters
    ----------
    variables : str or list of str, default=None
        Column name(s) to encode. A single name is wrapped in a list.

    Attributes
    ----------
    imputer_dict : dict
        Column name -> {label: code} mapping.
    fill_values_ : dict
        Column name -> most frequent code seen in ``fit`` (only for columns
        where ``fit`` received data containing at least one known label).

    Notes
    -----
    ``transform`` modifies the frame it receives and returns that same
    object, as the original implementation did.
    """

    # NOTE(review): the first bracket is "<1000" while the next one starts at
    # 10001, so a label such as "<10000" would not be matched and would be
    # imputed with the mode instead -- confirm the label against the raw data.
    _INCOME_CODES = {"<1000": "0",
                     "10001~20000": "1",
                     "20001~30000": "2",
                     "30001~40000": "3",
                     "40001~50000": "4",
                     ">50001": "5"}

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X=None, y=None):
        """
        Build the label mapping and, when ``X`` is given, learn the fill codes.

        ``X`` is optional so that the historical call ``fit()`` /
        ``fit(y=...)`` keeps working, while the scikit-learn style call
        ``fit(X, y)`` no longer raises ``TypeError``.
        """
        self.imputer_dict = {
            feature: dict(self._INCOME_CODES) for feature in self.variables
        }
        self.fill_values_ = {}
        if X is not None:
            for feature in self.variables:
                modes = X[feature].map(self.imputer_dict[feature]).mode()
                if not modes.empty:
                    self.fill_values_[feature] = modes[0]
        return self

    def transform(self, X, y=None):
        """Encode the configured columns of ``X`` and fill unmapped entries."""
        # Estimators fitted without data have no learned fill codes.
        learned = getattr(self, "fill_values_", {})
        for feature in self.variables:
            encoded = X[feature].map(self.imputer_dict[feature])

            if encoded.isnull().any():
                if feature in learned:
                    fill_value = learned[feature]
                else:
                    # Nothing learned in fit: fall back to the mode of the
                    # data being transformed, as the original code did.
                    modes = encoded.mode()
                    fill_value = modes[0] if not modes.empty else None
                if fill_value is not None:
                    encoded = encoded.fillna(fill_value)

            X[feature] = encoded

        return X
   

# Education imputation followed by income encoding.
pipeline = make_pipeline(
    CategoricalImputer_Education(variables=['FatherEducation', 'MotherEducation']),
    CatgoricalEncoder_Income(variables=['Income']),
)

# Load the preprocessing configuration; stop the run when the file cannot be
# opened or is not valid YAML.
try:
    with open("/exeh_4/yuping/Epistasis_Interaction/01_Preprocessing/config.yaml") as infile:
        load_configure = yaml.safe_load(infile)
except (OSError, yaml.YAMLError) as err:
    sys.stderr.write(f"Please specify valid yaml file. ({err})\n")
    sys.exit(1)

GTEX_Dataset = dm.GTEX_raw_Dataset.from_config(config_file=load_configure,
                                               weight_tissue="Brain_Amygdala")

y_given_raw_df = GTEX_Dataset.generate_labels("BDS_Total")

# The split helper is fed plain arrays when the sources are DataFrames.
X_raw_df = GTEX_Dataset.all_gwas_df.values if isinstance(GTEX_Dataset.all_gwas_df, pd.DataFrame) else GTEX_Dataset.all_gwas_df
y_raw_df = y_given_raw_df.values if isinstance(y_given_raw_df, pd.DataFrame) else y_given_raw_df

X_train_raw_df, X_test_raw_df, y_train_raw_df, y_test_raw_df = GTEX_Dataset.train_test_split(X_raw_df, y_raw_df, seed=1)

# Re-attach the column names that were dropped by the conversion to arrays.
feature_columns = GTEX_Dataset.all_gwas_df.columns
X_train_df = pd.DataFrame(X_train_raw_df, columns=feature_columns)
X_test_df = pd.DataFrame(X_test_raw_df, columns=feature_columns)

# "#NULL!" marks a missing education level. The result is assigned back
# because ``replace(..., inplace=True)`` on a selected column is chained
# assignment and does not update the frame under pandas Copy-on-Write.
for frame in (X_train_df, X_test_df):
    for column in ('MotherEducation', 'FatherEducation'):
        frame[column] = frame[column].replace("#NULL!", pd.NA)

y_train_df = pd.DataFrame(y_train_raw_df, columns=["BDS_Total"])
y_test_df = pd.DataFrame(y_test_raw_df, columns=["BDS_Total"])

# Keep the transformed frames instead of discarding the return values.
X_train_encoded_df = pipeline.fit_transform(X_train_df)
X_test_encoded_df = pipeline.transform(X_test_df)
# Last expression of the cell: shows the transformed test set in the notebook.
X_test_encoded_df

Unnamed: 0,ENSG00000001561.6,ENSG00000002933.3,ENSG00000004534.10,ENSG00000004975.7,ENSG00000005020.8,ENSG00000006042.7,ENSG00000006128.7,ENSG00000006282.15,ENSG00000006432.11,ENSG00000006744.14,...,ENSG00000273340.1,ENSG00000273381.1,ENSG00000273448.1,ENSG00000273449.1,ENSG00000273487.1,ENSG00000273492.1,FatherEducation,MotherEducation,Income,TotalEducation
0,-0.782407,0.224835,-0.152156,-0.317023,0.53921,-0.189654,0.124765,-2.149951,0.264409,1.319007,...,-0.291407,0.944426,0.593712,-0.601255,0.768297,0.90182,3,3,4,0
1,-0.522555,0.561864,-0.322371,-0.077216,-0.20033,-0.024786,0.185901,-2.020906,-0.422715,1.804942,...,0.300662,1.499659,0.786927,-1.202431,0.267172,0.566421,2,2,5,0
2,-1.029837,0.313268,-0.250289,-0.438109,0.240681,-0.168159,0.379419,-2.131486,-0.291576,1.33047,...,-0.061943,1.914681,0.602343,-0.540497,0.443978,0.286228,2,4,2,0
3,-0.223879,0.090533,-0.044679,-0.03981,0.236208,0.067001,0.403702,-2.040622,-0.2571,0.722356,...,0.305864,1.941359,0.729119,-0.488783,0.735341,0.773101,3,1,1,0
4,-0.586527,0.128612,0.150189,-0.307449,0.800393,-0.17848,0.393404,-2.004131,-0.276851,1.868274,...,-0.590855,1.638066,0.276953,-0.601347,0.476412,-0.720112,5,5,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,-0.451969,0.74445,0.321903,-0.850006,0.244734,-0.069555,0.21996,-0.855746,-0.45907,1.866159,...,-0.145041,1.256244,0.547743,-0.601255,0.332628,0.838252,2,2,5,0
205,-0.584144,0.737156,-0.470361,0.047838,0.72757,-0.003729,0.247059,-2.242692,-0.188346,1.301629,...,0.420549,1.666644,0.590527,-0.601255,0.516255,1.028859,2,2,1,0
206,-0.77332,0.101317,0.355903,-0.09747,0.134052,0.153946,0.27935,-2.152645,-0.597546,1.005348,...,0.074323,1.838915,0.601343,-1.202511,0.240834,0.285242,6,6,5,1
207,-0.916734,0.865083,-0.252489,-0.027691,0.714889,-0.074797,-0.218718,-2.113658,-0.274916,1.324228,...,-0.024676,1.915742,0.463355,-0.601255,-0.064424,0.808381,4,3,5,1


In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import sys
import yaml
import dataset_model as dm

class NormalizeDataTransformer(BaseEstimator, TransformerMixin):
    """
    Min-max scale every column of a DataFrame to the range [0, 1].

    The transformer is stateless: ``fit`` learns nothing and the minimum and
    maximum are taken from the data passed to ``transform``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame in which every column of ``X`` is min-max scaled."""
        return X.apply(self.standardize_column, axis=0)

    def standardize_column(self, col):
        """
        Rescale one column to ``(col - min) / (max - min)``.

        A constant column (range 0) is returned as all zeros instead of
        dividing by zero.
        """
        min_val = col.min()
        col_range = col.max() - min_val

        if col_range == 0:
            # Build the zeros from the column itself so that a Series keeps
            # its index and name; the previous bare ndarray dropped both and
            # made the two branches return different types.
            zeros = (col - min_val) * 0.0
            # Missing entries of a constant column were reported as 0 before;
            # keep that. Plain arrays have no ``fillna`` and need none here.
            return zeros.fillna(0.0) if hasattr(zeros, "fillna") else zeros

        return (col - min_val) / col_range

# Single-step pipeline: min-max normalisation of every column.
pipeline = make_pipeline(
    NormalizeDataTransformer()
)

# Load the preprocessing configuration; stop the run when the file cannot be
# opened or is not valid YAML.
try:
    with open("/exeh_4/yuping/Epistasis_Interaction/01_Preprocessing/config.yaml") as infile:
        load_configure = yaml.safe_load(infile)
except (OSError, yaml.YAMLError) as err:
    sys.stderr.write(f"Please specify valid yaml file. ({err})\n")
    sys.exit(1)

GTEX_Dataset = dm.GTEX_raw_Dataset.from_config(config_file=load_configure,
                                               weight_tissue="Brain_Amygdala")

X_raw_df = GTEX_Dataset.all_gen_df
X_train_processed = pipeline.fit_transform(X_raw_df)

### Regress out genetic relationship structure 

In [1]:
import numpy as np
import pandas as pd
import sys
import yaml
import dataset_model as dm


def compute_expected_value(grm, y, ridge=0.01):
    """
    Compute the expected value using GBLUP (Genomic Best Linear Unbiased Prediction).

    Solves the mixed model equations for ``y = 1*mu + Z*u + e`` with one
    record per individual (``Z`` is the identity) and returns ``mu + u``::

        [ n    1'          ] [mu]   [ 1'y ]
        [ 1    I + G^{-1}  ] [u ] = [  y  ]

    ``G`` is ``grm`` with ``ridge`` added to its diagonal. ``G^{-1}`` enters
    the equations unscaled.

    Parameters
    ----------
    grm : array-like of shape (n, n)
        Genomic relationship matrix. It is copied, never modified.
    y : array-like of shape (n,) or (n, k)
        Phenotype values, one row per individual.
    ridge : float, default=0.01
        Amount added to the diagonal of ``grm`` so that it can be inverted.

    Returns
    -------
    numpy.ndarray
        Expected values with the same shape as ``y``.

    Raises
    ------
    ValueError
        If ``grm`` is not square or ``y`` does not have one row per
        individual.
    numpy.linalg.LinAlgError
        If the regularised matrix or the equation system is singular.
    """
    # Work on a float copy: the caller's matrix stays untouched and integer
    # input is accepted.
    g = np.array(grm, dtype=float)
    if g.ndim != 2 or g.shape[0] != g.shape[1]:
        raise ValueError(f"grm must be a square matrix, got shape {g.shape}")
    n = g.shape[0]

    y_arr = np.asarray(y, dtype=float)
    is_vector = y_arr.ndim == 1
    if is_vector:
        y_arr = y_arr[:, np.newaxis]
    if y_arr.ndim != 2 or y_arr.shape[0] != n:
        raise ValueError(
            f"y must have {n} rows to match grm, got shape {np.shape(y)}"
        )

    # Regularise the diagonal, otherwise G may not be invertible.
    g[np.diag_indices(n)] += ridge
    g_inv = np.linalg.inv(g)

    # Coefficient matrix of the mixed model equations. With Z = I the blocks
    # 1'1, 1'Z, Z'1 and Z'Z reduce to n, a row of ones, a column of ones and I.
    coeff = np.empty((n + 1, n + 1))
    coeff[0, 0] = n
    coeff[0, 1:] = 1.0
    coeff[1:, 0] = 1.0
    coeff[1:, 1:] = np.eye(n) + g_inv

    # Right-hand side: 1'y stacked on Z'y (= y).
    rhs = np.vstack((y_arr.sum(axis=0, keepdims=True), y_arr))
    gblup = np.linalg.solve(coeff, rhs)

    # First row holds the overall mean, the remaining rows the breeding values.
    expected_value = gblup[0] + gblup[1:]

    return expected_value.ravel() if is_vector else expected_value

# Load the preprocessing configuration; stop the run when the file cannot be
# opened or is not valid YAML.
try:
    with open("/exeh_4/yuping/Epistasis_Interaction/01_Preprocessing/config.yaml") as infile:
        load_configure = yaml.safe_load(infile)
except (OSError, yaml.YAMLError) as err:
    sys.stderr.write(f"Please specify valid yaml file. ({err})\n")
    sys.exit(1)


GTEX_Dataset = dm.GTEX_raw_Dataset.from_config(config_file=load_configure,
                                               weight_tissue="Brain_Amygdala")

# generate phenotype label
y_given_raw_df = GTEX_Dataset.generate_labels("CCR_Total")
# impute missing value with mean value; assign the result back because
# ``fillna(..., inplace=True)`` on a selected column is chained assignment and
# does not update the frame under pandas Copy-on-Write
mean_value = y_given_raw_df["CCR_Total"].mean()
y_given_raw_df["CCR_Total"] = y_given_raw_df["CCR_Total"].fillna(mean_value)
y_raw = y_given_raw_df.values if isinstance(y_given_raw_df, pd.DataFrame) else y_given_raw_df

# load GRM
grm = GTEX_Dataset.gene_cor_matrix
# get expected_value; a float copy is passed so the matrix held by the dataset
# is never modified by the computation
expected_value = compute_expected_value(np.array(grm, dtype=float), y_raw)
# subtract genetic relationship structure from phenotype
y_residual = y_raw - expected_value
y_residual_df = pd.DataFrame(y_residual, columns=["CCR_Total"])