## ETL pipeline for predictor and phenotype

### ETL pipeline for phenotype

In [1]:
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

import pandas as pd
import numpy as np
import pandas as pd
import gzip
import pickle
import preprocessing_utils as pp



def compute_expected_value(y, grm, ridge=0.01):
    """
    Compute the expected value using GBLUP (Genomic Best Linear Unbiased Prediction)

    Builds and solves the mixed model equations for an overall mean plus one
    random effect per record (the design matrix Z is the identity), then
    returns ``mean + effect`` for every record.

    Parameters
    ----------
    y : array-like of length n
        Observed values, one per row/column of ``grm``.
    grm : array-like of shape (n, n)
        Relationship matrix. It is NOT modified: the function works on a
        float copy, so the same matrix can safely be reused across calls
        (e.g. inside ``DataFrame.apply``).
    ridge : float, default 0.01
        Amount added to the diagonal of ``grm`` before inversion so that a
        singular matrix becomes invertible.

    Returns
    -------
    numpy.ndarray of length n
        Expected value for every record.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularised matrix or the coefficient matrix is singular.
    """
    # Float copy: the original in-place "+=" changed the caller's matrix on
    # every call (and failed outright for integer matrices).
    grm = np.array(grm, dtype=float)
    n = grm.shape[0]
    y = np.asarray(y, dtype=float)

    ones = np.ones(n)
    # Add a small amount to the diagonal of G,
    # otherwise G may not be invertible (e.g. in small examples).
    grm_inv = np.linalg.inv(grm + ridge * np.eye(n))

    # Construct Z (identity: one record per random effect)
    Z = np.eye(n)

    # Build mixed model solution equations
    coeff = np.zeros((n + 1, n + 1))
    coeff[0, 0] = np.matmul(ones.T, ones)
    coeff[0, 1:] = np.matmul(ones.T, Z)
    coeff[1:, 0] = np.matmul(Z.T, ones)
    coeff[1:, 1:] = np.matmul(Z.T, Z) + grm_inv

    # Compute the right-hand side
    crossprod_ones_y = np.atleast_1d(np.dot(ones.T, y)).ravel()
    crossprod_Z_y = np.dot(Z.T, y).ravel()
    RHS = np.concatenate((crossprod_ones_y, crossprod_Z_y), axis=0)

    # gblup[0] is the overall mean, gblup[1:] are the per-record effects
    gblup = np.linalg.solve(coeff, RHS)

    # Compute expected value
    expected_value = gblup[0] + np.matmul(Z, gblup[1:])

    return expected_value


class NumericalImputer(BaseEstimator, TransformerMixin):
    """
    Numerical missing value imputer

    Learns the mean of every configured column in ``fit`` and uses it to
    fill the missing entries of those columns in ``transform``.

    Parameters
    ----------
    variables : str or list of str
        Column name(s) to impute. A single name is wrapped in a list.
    """

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Store the mean of the non-missing values of every column."""
        self.imputer_dict_ = {
            feature: X[feature].dropna().mean() for feature in self.variables
        }
        return self

    def transform(self, X):
        """
        Return a new DataFrame holding only the configured columns, with
        missing values replaced by the means learned in ``fit``.
        The input ``X`` is left untouched.
        """
        # Selecting with a list yields a new frame; filling with a
        # {column: value} dict avoids the chained ``inplace=True`` call on a
        # temporary Series, which does not write back under pandas
        # Copy-on-Write.
        return X[self.variables].fillna(value=self.imputer_dict_)

# Load the phenotype table (read as tab-separated, despite the .csv extension).
y = pd.read_csv("/mnt/data/share/yuping/data/phenotype_info.csv", sep="\t")
# One-step pipeline that imputes the "CWR_Total" column using the imputer
# from preprocessing_utils (presumably the same mean imputer as the
# NumericalImputer class in this cell -- TODO confirm).
pipeline = make_pipeline(
       pp.NumericalImputer(variables="CWR_Total")
)

   
df = pipeline.fit_transform(y)
# Load the gzip-compressed, pickled genetic correlation matrix.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load files from a trusted source.
gene_cor_dir = "/exeh_4/yuping/Epistasis_Interaction/00_Generate_Data/results/genetic_correlation.pkl.gz"
with gzip.open(gene_cor_dir, 'rb') as f:
    gene_cor_matrix = pickle.load(f)  

# Compute the expected value using a function 'compute_expected_value' and the 'grm'
# (axis=0: the function is called once per column of df, always with the same matrix object).
result = df.apply(lambda column: compute_expected_value(column, gene_cor_matrix), axis=0)

### ETL pipeline for predictor

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline

import pandas as pd
import numpy as np
import pandas as pd
import sys
import yaml
import dataset_model as dm
import  preprocessing_utils as pp

class CategorialEncoder_Education(BaseEstimator, TransformerMixin):
    """
    Categorical missing value imputer for Education

    Entries equal to the placeholder string "#NULL!" (and ordinary missing
    values) are replaced by the most frequent value of the column learned
    in ``fit``. ``transform`` additionally derives a binary
    'TotalEducation' column from 'MotherEducation' and 'FatherEducation'.

    Parameters
    ----------
    variables : str or list of str
        Column name(s) to impute. A single name is wrapped in a list.
    """

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """
        Learn the most frequent valid value of every configured column.

        ``X`` is not modified. The "#NULL!" placeholder is excluded before
        the mode is taken so it can never become the fill value.
        """
        self.imputer_dict = {}
        for feature in self.variables:
            # ``replace`` works for any dtype, unlike ``.str.contains`` which
            # raises on non-string columns.
            observed = X[feature].replace("#NULL!", pd.NA).dropna()
            self.imputer_dict[feature] = observed.mode()[0]
        return self

    def transform(self, X):
        """
        Impute the configured columns and add 'TotalEducation'.

        'TotalEducation' is 0 where the average of the integer-converted
        'MotherEducation' and 'FatherEducation' is below the median of that
        average (computed on ``X`` itself), and 1 otherwise.

        Returns a new DataFrame; the input ``X`` is left untouched.
        """
        X = X.copy()
        for feature in self.variables:
            # Clean the placeholder here as well, so that transform does not
            # rely on fit having altered the very same DataFrame object.
            cleaned = X[feature].replace("#NULL!", pd.NA)
            X[feature] = cleaned.fillna(self.imputer_dict[feature])

        # Vectorised column arithmetic instead of a row-wise ``apply``, which
        # builds one Series per row and is very slow on wide frames.
        total = (X['MotherEducation'].astype(int)
                 + X['FatherEducation'].astype(int)) / 2
        median = total.median()
        X['TotalEducation'] = np.where(total < median, 0, 1)

        return X

class CategorialEncoder_Income(BaseEstimator, TransformerMixin):
    """
    String to numbers categorical encoder for Income

    Maps the income bracket labels to the string codes "0", "1" and "2".
    Any value without a mapping (including the "#NULL!" placeholder) becomes
    missing and is then filled with the most frequent code of the column.

    Parameters
    ----------
    variables : str or list of str
        Column name(s) to encode. A single name is wrapped in a list.
    """

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X=None, y=None):
        """
        Build the label -> code mapping for every configured column.

        ``X`` and ``y`` are accepted for scikit-learn API compatibility and
        ignored; the mapping is fixed. (The previous signature had no ``X``
        parameter, so ``fit(X, y)`` raised ``TypeError``.)
        """
        self.imputer_dict = {
            feature: {"<1000": "0",
                      "10001~50000": "1",
                      ">50001": "2"}
            for feature in self.variables
        }
        return self

    def transform(self, X, y=None):
        """
        Encode the configured columns and impute unmapped entries.

        Returns a new DataFrame; the input ``X`` is left untouched.
        """
        X = X.copy()
        for feature in self.variables:
            encoded = X[feature].map(self.imputer_dict[feature])

            # ``map`` already turned every unmapped label (such as "#NULL!")
            # into a missing value, so only the fill step is needed. Assign
            # the result instead of using a chained ``inplace=True`` call,
            # which does not write back under pandas Copy-on-Write.
            if encoded.isnull().any():
                encoded = encoded.fillna(encoded.dropna().mode()[0])

            X[feature] = encoded
        return X


class NormalizeDataTransformer(BaseEstimator, TransformerMixin):
    """
    Z-score standardize specific DataFrame columns

    Parameters
    ----------
    column_names : iterable of column labels
        Columns that ``transform`` standardizes; all others are copied as-is.
    """

    def __init__(self, column_names=None):
        self.column_names = column_names

    def fit(self, X, y=None):
        # Nothing is learned here; mean and std are computed in transform.
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns standardized."""
        result = X.copy()

        for name in self.column_names:
            # Statistics are taken from the untouched input column.
            result[name] = self.standardize_column(X[name])

        return result

    def standardize_column(self, col):
        """Z-score standardize a dataframe column"""
        mu = col.mean()
        sigma = col.std()

        # A constant column has zero spread: return zeros rather than divide by 0.
        return np.zeros(len(col)) if sigma == 0 else (col - mu) / sigma
     
def extract(data_set):
    """Return the ``all_gwas_df`` attribute of ``data_set``."""
    return data_set.all_gwas_df

def transform(df):
    """
    Run the predictor preprocessing pipeline on ``df`` and return its output.

    Steps, in order: normalize every column from position 4 onwards, encode
    the two parental education columns, then encode the income column.
    """
    # Columns at positions 0-3 are excluded from normalization.
    normalized_columns = df.columns.tolist()[4:]

    steps = (
        pp.NormalizeDataTransformer(column_names=normalized_columns),
        pp.CategorialEncoder_Education(variables=['FatherEducation', 'MotherEducation']),
        pp.CategorialEncoder_Income(variables=['Income']),
    )

    return make_pipeline(*steps).fit_transform(df)
   
def load_data(dataset):
    """Extract the raw table from ``dataset`` and return its transformed version."""
    return transform(extract(dataset))


if __name__ == "__main__":
    config_path = "/exeh_4/yuping/Epistasis_Interaction/01_Preprocessing/config.yaml"

    # Only the failures that mean "the YAML file is missing or invalid" are
    # handled here; any other error propagates with its traceback instead of
    # being hidden behind a generic message.
    try:
        with open(config_path) as infile:
            load_configure = yaml.safe_load(infile)
    except (OSError, yaml.YAMLError) as err:
        sys.stderr.write(
            "Please specify valid yaml file. ({}: {})\n".format(config_path, err)
        )
        sys.exit(1)

    # Build the dataset from the loaded configuration for one tissue.
    GTEX_Dataset = dm.GTEX_raw_Dataset.from_config(config_file=load_configure,
                                                   weight_tissue="Brain_Amygdala")

    load_data(dataset=GTEX_Dataset)

['Ageyr', 'ENSG00000001561.6', 'ENSG00000002933.3', 'ENSG00000004534.10', 'ENSG00000004975.7', 'ENSG00000005020.8', 'ENSG00000006042.7', 'ENSG00000006128.7', 'ENSG00000006282.15', 'ENSG00000006432.11', 'ENSG00000006744.14', 'ENSG00000007171.12', 'ENSG00000007516.9', 'ENSG00000008277.10', 'ENSG00000008282.3', 'ENSG00000008283.11', 'ENSG00000008517.12', 'ENSG00000008735.10', 'ENSG00000008838.13', 'ENSG00000009724.12', 'ENSG00000010219.9', 'ENSG00000010610.5', 'ENSG00000011295.11', 'ENSG00000011376.5', 'ENSG00000012232.4', 'ENSG00000013306.11', 'ENSG00000013573.12', 'ENSG00000013725.10', 'ENSG00000015133.14', 'ENSG00000015592.12', 'ENSG00000019186.5', 'ENSG00000021488.8', 'ENSG00000022840.11', 'ENSG00000023041.7', 'ENSG00000023228.9', 'ENSG00000023516.7', 'ENSG00000024862.12', 'ENSG00000025423.7', 'ENSG00000025434.14', 'ENSG00000026103.15', 'ENSG00000027001.7', 'ENSG00000029559.5', 'ENSG00000029639.6', 'ENSG00000033178.8', 'ENSG00000034239.6', 'ENSG00000034971.10', 'ENSG00000036565.10', '