## Introduction

This notebook contains multiple sections that describe the functions utilized in the analysis.
- Feature Engineering Encoder and Imputer (run_making_pipline.py)
  - NumericalImputer function
    - Numerical missing-value imputer for numerical variables
    - Uses the column mean to impute missing values
  - CategoricalImputer_Education
    - Categorical missing value imputer for Education (FatherEducation, MotherEducation)
    - Using mode to impute missing value
    - creates a new variable called TotalEducation
        - at or above Median((FatherEducation + MotherEducation) / 2) = 1
        - below Median((FatherEducation + MotherEducation) / 2) = 0
        - next step: try missForest or MICE for better imputation
  - CatgoricalEncoder_Income
    - String to numbers categorical encoder for Income
    - {"<1000": "0", "10001~20000": "1", "20001~30000": "2", "30001~40000": "3", "40001~50000": "4", ">50001": "5"}
  - NormalizeDataTransformer
- Regress out genetic relationship structure (regress_out_grm.py)
  
============================================================================================================================================================

### Feature Engineering Encoder and Imputer
    - NumericalImputer function
    - CategoricalImputer_Education
    - CatgoricalEncoder_Income
    - NormalizeDataTransformer
    

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
import pandas as pd
import dataset_model as dm
import yaml
import sys

class NumericalImputer(BaseEstimator, TransformerMixin):
    """
    Numerical missing value imputer.

    Replaces missing values in the selected columns with the column mean
    observed during ``fit``.

    Parameters
    ----------
    variable : str or list of str
        Column name, or list of column names, to impute.

    Attributes
    ----------
    variables : list
        ``variable`` normalised to a list.
    imputer_dict_ : dict
        Maps each column name to the mean learned in ``fit``.
    """

    def __init__(self, variable=None):
        # BaseEstimator.get_params() reads an attribute named exactly like
        # each constructor argument, so the raw value has to be stored as
        # ``self.variable``; without it get_params()/clone()/repr() fail.
        self.variable = variable
        # List form used by fit/transform (kept under its original name).
        self.variables = variable if isinstance(variable, list) else [variable]

    def fit(self, X, y=None):
        """Learn the mean of every selected column of ``X``; ``y`` is ignored."""
        self.imputer_dict_ = {
            feature: X[feature].mean() for feature in self.variables
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values replaced by the learned means."""
        X = X.copy()
        for feature in self.variables:
            # Assign the result back instead of calling fillna(inplace=True)
            # on the column selection: the selection can be a temporary
            # object, in which case the in-place fill never reaches ``X``.
            X[feature] = X[feature].fillna(self.imputer_dict_[feature])

        return X
            
class CategoricalImputer_Education(BaseEstimator, TransformerMixin):
    """
    Categorical missing value imputer for Education.

    ``fit`` learns the mode of every column in ``variables`` and the median of
    the parents' average education on the (imputed) training data.
    ``transform`` fills missing values with those modes and adds a binary
    ``TotalEducation`` column: 0 when the average of ``MotherEducation`` and
    ``FatherEducation`` is below the learned median, 1 otherwise.

    Parameters
    ----------
    variables : str or list of str
        Column name(s) whose missing values are filled with the mode.

    Attributes
    ----------
    imputer_dict : dict
        Maps each column in ``variables`` to the mode learned in ``fit``.
    total_education_median_ : float
        Threshold learned in ``fit`` and used to binarise ``TotalEducation``.
    """

    def __init__(self, variables=None):
        self.variables = variables if isinstance(variables, list) else [variables]

    @staticmethod
    def _mean_parent_education(X):
        """Return the per-row average of MotherEducation and FatherEducation as numbers."""
        parents = X[['MotherEducation', 'FatherEducation']].astype(int)
        return parents.sum(axis=1) / 2

    def fit(self, X, y=None):
        """Learn the fill values and the TotalEducation threshold from ``X``.

        ``X`` must contain ``MotherEducation`` and ``FatherEducation`` in
        addition to the columns listed in ``variables``; ``y`` is ignored.
        """
        self.imputer_dict = {
            feature: X[feature].mode()[0] for feature in self.variables
        }

        # The threshold is learned here, on a scratch copy of the training
        # data, so that every frame passed to transform() is binarised with
        # the same cut-off instead of its own median.
        parents = X[['MotherEducation', 'FatherEducation']].copy()
        for feature in self.variables:
            if feature in parents.columns:
                parents[feature] = parents[feature].fillna(self.imputer_dict[feature])
        self.total_education_median_ = self._mean_parent_education(parents).median()
        return self

    def transform(self, X):
        """Impute the education columns and add ``TotalEducation``.

        ``X`` is modified in place and also returned.
        """
        for feature in self.variables:
            X[feature] = X[feature].fillna(self.imputer_dict[feature])

        total = self._mean_parent_education(X)
        # 0 strictly below the training median, 1 otherwise.
        X['TotalEducation'] = (~(total < self.total_education_median_)).astype(int)

        return X

class CatgoricalEncoder_Income(BaseEstimator, TransformerMixin):
    """
    String to numbers categorical encoder for Income.

    Every column in ``variables`` is mapped through ``INCOME_CODES``. Values
    that are not keys of that table become missing after mapping and are then
    filled with the most frequent code.

    Parameters
    ----------
    variables : str or list of str
        Column name(s) to encode.

    Attributes
    ----------
    imputer_dict : dict
        Maps each column name to its category -> code table.
    fill_values_ : dict
        Maps each column name to the most frequent code seen in ``fit``
        (only populated when ``fit`` receives data).
    """

    # NOTE(review): "<1000" next to "10001~20000" looks like it may be a typo
    # for a 10000 boundary - confirm against the raw data before changing it.
    INCOME_CODES = {"<1000": "0",
                    "10001~20000": "1",
                    "20001~30000": "2",
                    "30001~40000": "3",
                    "40001~50000": "4",
                    ">50001": "5"}

    def __init__(self, variables=None):
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X=None, y=None):
        """Build the code tables and, when ``X`` is given, learn the fill codes.

        ``X`` is accepted as the first argument so that the estimator can be
        fitted through ``Pipeline.fit(X, y)`` / ``fit_transform(X, y)``, which
        pass both ``X`` and ``y`` positionally.
        """
        self.imputer_dict = {
            feature: dict(self.INCOME_CODES) for feature in self.variables
        }

        self.fill_values_ = {}
        if X is not None:
            for feature in self.variables:
                modes = X[feature].map(self.imputer_dict[feature]).mode()
                if not modes.empty:
                    self.fill_values_[feature] = modes.iloc[0]
        return self

    def transform(self, X, y=None):
        """Encode the income columns of ``X``.

        ``X`` is modified in place and also returned.
        """
        learned_fills = getattr(self, "fill_values_", {})
        for feature in self.variables:
            # Anything that is not a key of the table (including "#NULL!")
            # comes out of map() as missing, so no separate replace is needed.
            encoded = X[feature].map(self.imputer_dict[feature])

            if encoded.isnull().any():
                fill_value = learned_fills.get(feature)
                if fill_value is None:
                    # Nothing learned in fit: use the mode of this frame.
                    fill_value = encoded.mode()[0]
                encoded = encoded.fillna(fill_value)

            # Assign back explicitly; an in-place fill on ``X[feature]`` may
            # act on a temporary object and leave ``X`` unchanged.
            X[feature] = encoded

        return X
   

# Education imputation followed by income encoding.
pipeline = make_pipeline(
    CategoricalImputer_Education(variables=['FatherEducation', 'MotherEducation']),
    CatgoricalEncoder_Income(variables=['Income'])
)


try:
    with open("/exeh_4/yuping/Epistasis_Interaction/01_Preprocessing/config.yaml") as infile:
        load_configure = yaml.safe_load(infile)
except (OSError, UnicodeDecodeError, yaml.YAMLError) as err:
    # Only file-access and YAML-parsing failures are reported as a bad config
    # file; the underlying cause is included so the problem can be located.
    sys.stderr.write(f"Please specify valid yaml file. ({err})\n")
    sys.exit(1)


GTEX_Dataset = dm.GTEX_raw_Dataset.from_config(config_file=load_configure,
                                               weight_tissue="Brain_Amygdala")


y_given_raw_df = GTEX_Dataset.generate_labels("BDS_Total")

# Hand plain arrays to the splitter when the inputs are DataFrames.
X_raw_df = GTEX_Dataset.all_gwas_df.values if isinstance(GTEX_Dataset.all_gwas_df, pd.DataFrame) else GTEX_Dataset.all_gwas_df
y_raw_df = y_given_raw_df.values if isinstance(y_given_raw_df, pd.DataFrame) else y_given_raw_df


X_train_raw_df, X_test_raw_df, y_train_raw_df, y_test_raw_df = GTEX_Dataset.train_test_split(X_raw_df, y_raw_df, seed=1)

# Re-attach the column names that were dropped by the conversion to arrays.
feature_columns = GTEX_Dataset.all_gwas_df.columns
X_train_df = pd.DataFrame(X_train_raw_df, columns=feature_columns)
X_test_df = pd.DataFrame(X_test_raw_df, columns=feature_columns)

# Turn the "#NULL!" placeholder into a real missing value. The result is
# assigned back to the column: replace(..., inplace=True) on a column
# selection can act on a temporary object and leave the frame untouched.
for frame in (X_train_df, X_test_df):
    for column in ('MotherEducation', 'FatherEducation'):
        frame[column] = frame[column].replace("#NULL!", pd.NA)


y_train_df, y_test_df = pd.DataFrame(y_train_raw_df, columns=["BDS_Total"]), pd.DataFrame(y_test_raw_df, columns=["BDS_Total"])

# Keep the transformed frames instead of discarding the return values.
X_train_encoded_df = pipeline.fit_transform(X_train_df)
X_test_encoded_df = pipeline.transform(X_test_df)
# Last expression of the cell: displays the transformed test frame.
X_test_encoded_df

In [118]:
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import sys
import yaml
import dataset_model as dm

class NormalizeDataTransformer(TransformerMixin, BaseEstimator):
    """Rescale each DataFrame column with the minimum and maximum seen in ``fit``."""

    def __init__(self):
        # Per-column extrema; populated by fit().
        self.column_max = self.column_min = None

    def fit(self, X, y=None):
        """Record the maximum and minimum of ``X`` (``y`` is ignored)."""
        self.column_max, self.column_min = X.max(), X.min()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every column rescaled."""
        rescaled = X.copy()
        for name in X.columns:
            rescaled[name] = self.standardize_column(X[name], name)
        return rescaled

    def standardize_column(self, col, col_name):
        """Rescale ``col`` as (value - min) / (max - min) using the fitted extrema.

        A column whose fitted maximum equals its fitted minimum is returned
        as all zeros.
        """
        upper = self.column_max[col_name]
        lower = self.column_min[col_name]
        span = upper - lower

        if span == 0:
            return np.zeros(len(col))
        return (col - lower) / span

# Pipeline with the min-max normalisation step only.
pipeline = make_pipeline(
    NormalizeDataTransformer()
)

try:
    with open("/exeh_4/yuping/Epistasis_Interaction/01_Preprocessing/config.yaml") as infile:
        load_configure = yaml.safe_load(infile)
except (OSError, UnicodeDecodeError, yaml.YAMLError) as err:
    # Only file-access and YAML-parsing failures are reported as a bad config
    # file; the underlying cause is included so the problem can be located.
    sys.stderr.write(f"Please specify valid yaml file. ({err})\n")
    sys.exit(1)

GTEX_Dataset = dm.GTEX_raw_Dataset.from_config(config_file=load_configure,
                                               weight_tissue="Brain_Amygdala")

X_raw_df = GTEX_Dataset.all_gen_df
X_train_processed = pipeline.fit_transform(X_raw_df)

### Regress out genetic relationship structure 

In [116]:
import numpy as np
import pandas as pd
import sys
import yaml
import dataset_model as dm




def compute_expected_value(grm, y, ridge=0.01):
    """
    Compute the expected value using GBLUP (Genomic Best Linear Unbiased Prediction)

    Solves the mixed-model equations with an intercept and one random effect
    per individual (identity design matrix), then returns intercept plus
    random effect for every individual.

    Parameters
    ----------
    grm : array-like, shape (n, n)
        Genetic relationship matrix. It is not modified.
    y : array-like with n values
        Phenotype, one value per individual; it is flattened.
    ridge : float, default 0.01
        Amount added to the diagonal of ``grm`` before inversion so that the
        matrix is invertible.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Expected phenotype value of each individual.

    Raises
    ------
    ValueError
        If ``y`` does not contain exactly one value per row of ``grm``.
    """
    grm = np.asarray(grm, dtype=float)
    n = grm.shape[0]

    y_vec = np.asarray(y, dtype=float).ravel()
    if y_vec.size != n:
        raise ValueError(
            "y must contain one value per row of grm "
            "(got {} values for {} rows)".format(y_vec.size, n)
        )

    # Build a regularised copy: an in-place ``grm += ...`` would change the
    # caller's matrix (stacking the ridge on every call) and would fail for
    # integer arrays.
    grm_inv = np.linalg.inv(grm + ridge * np.eye(n))

    # Mixed model equations. The design matrix Z is the identity, so
    # 1'Z and Z'1 are vectors of ones, Z'Z is the identity and 1'1 equals n.
    coeff = np.empty((n + 1, n + 1))
    coeff[0, 0] = n
    coeff[0, 1:] = 1.0
    coeff[1:, 0] = 1.0
    coeff[1:, 1:] = np.eye(n) + grm_inv

    # Right-hand side: [1'y, Z'y] = [sum(y), y]
    rhs = np.concatenate(([y_vec.sum()], y_vec))
    gblup = np.linalg.solve(coeff, rhs)

    # Expected value = intercept + random effect of each individual
    return gblup[0] + gblup[1:]

try:
    with open("/exeh_4/yuping/Epistasis_Interaction/01_Preprocessing/config.yaml") as infile:
        load_configure = yaml.safe_load(infile)
except (OSError, UnicodeDecodeError, yaml.YAMLError) as err:
    # Only file-access and YAML-parsing failures are reported as a bad config
    # file; the underlying cause is included so the problem can be located.
    sys.stderr.write(f"Please specify valid yaml file. ({err})\n")
    sys.exit(1)


GTEX_Dataset = dm.GTEX_raw_Dataset.from_config(config_file=load_configure,
                                               weight_tissue="Brain_Amygdala")

# generate phenotype label
y_given_raw_df = GTEX_Dataset.generate_labels("CWR_Total")
# impute missing value with mean value; the filled column is assigned back
# because fillna(inplace=True) on a column selection can act on a temporary
# object and leave the frame unchanged
mean_value = y_given_raw_df["CWR_Total"].mean()
y_given_raw_df["CWR_Total"] = y_given_raw_df["CWR_Total"].fillna(mean_value)
y_raw = y_given_raw_df.values if isinstance(y_given_raw_df, pd.DataFrame) else y_given_raw_df

# load GRM
grm = GTEX_Dataset.gene_cor_matrix
# get predicted_y
expected_value = compute_expected_value(grm, y_raw)
# subtract genetic relationship structure from phenotype
y_residual = y_raw.flatten() - expected_value

In [117]:
# Show the values returned by compute_expected_value for the CWR_Total phenotype.
print(expected_value)

[88.28437573 75.41608993 93.67250658 ... 55.21262859 87.79299875
 67.02674958]
