In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import math
import re
import sys

# Self-defined functions
from pred_utils import balanced_over_sampling

In [2]:
# Gut 16S abundance table; the first column of the file becomes the row index.
gut_16s_df = pd.read_table("./data/gut_16s_abundance.txt", index_col=0)

# Metabolome abundance table, read the same way, with the listed non-feature
# columns removed so that only the remaining columns are used downstream.
metabolome_df = pd.read_table(
    "./data/metabolome_abundance.txt",
    index_col=0,
).drop(columns=['SubjectID', 'CollectionDate', 'CL1', 'CL2', 'CL3', 'CL4'])

In [3]:
from sklearn.preprocessing import StandardScaler


def _standardize(frame):
    """Scale every column of `frame` with a freshly fitted StandardScaler.

    Returns the scaled ndarray together with a DataFrame that wraps the same
    values under the original row index and column labels.
    """
    scaled_values = StandardScaler().fit_transform(frame)
    return scaled_values, pd.DataFrame(scaled_values, index=frame.index, columns=frame.columns)


# NOTE(review): each scaler is fit on every row of its table. prediction_original_data
# later selects train/val/test rows from these same frames, so validation and test rows
# contribute to the scaling statistics -- confirm this is intended.
gut_16s_df_array, gut_16s_df = _standardize(gut_16s_df)

metabolome_df_array, metabolome_df = _standardize(metabolome_df)

In [4]:
def _fit_and_score_logistic(X_train, y_train, X_val, y_val, X_test, y_test,
                            l1_ratio, max_iter, random_seed):
    """Fit one elastic-net logistic regression and evaluate it on two held-out splits.

    Returns
    -------
    tuple
        ``(val_result, test_result)``; each result is
        ``(predicted labels, predict_proba output, accuracy from score())``.
    """
    # Kept as a function-scope import so this cell stays self-contained.
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression(random_state=random_seed, max_iter=max_iter, penalty='elasticnet',
                               solver='saga', l1_ratio=l1_ratio)
    model.fit(X_train, y_train)

    def evaluate(X, y):
        return model.predict(X), model.predict_proba(X), model.score(X, y)

    return evaluate(X_val, y_val), evaluate(X_test, y_test)


def prediction_original_data(covariate, predict_method, l1_ratio=0.5, max_iter=3000, random_seed=123):
    """Predict `covariate` from the original (standardized) abundance tables.

    Two independent models are trained, one on the gut 16S table and one on the
    metabolome table, each on an over-sampled training split, and both are
    evaluated on the validation and test splits.

    The function reads the module-level frames ``gut_16s_df`` and
    ``metabolome_df`` and the per-covariate split files under
    ``./feature_data/<covariate>/{train,val,test}/``.

    Parameters
    ----------
    covariate : str
        Name of the target column in the split files; also the name of the
        sub-directory of ``./feature_data`` that holds them.
    predict_method : {'logistic', 'elasticnet', 'svr'}
        Only 'logistic' is implemented.
    l1_ratio : float
        Elastic-net mixing parameter passed to LogisticRegression.
    max_iter : int
        Iteration limit passed to LogisticRegression.
    random_seed : int
        ``random_state`` passed to LogisticRegression.

    Returns
    -------
    tuple of four tuples
        ``(gut_val, metabolome_val, gut_test, metabolome_test)``; each element is
        ``(predicted labels, predict_proba output, accuracy)``.

    Raises
    ------
    NotImplementedError
        If `predict_method` is 'elasticnet' or 'svr'.
    ValueError
        If `predict_method` is not one of the documented names.
    """
    # Fail fast, before any file I/O or over-sampling. Previously these cases fell
    # through and returned None, which surfaced later as an unrelated TypeError.
    if predict_method in ('elasticnet', 'svr'):
        raise NotImplementedError(
            "predict_method {!r} is not implemented yet; use 'logistic'.".format(predict_method))
    if predict_method != 'logistic':
        raise ValueError(
            "Unknown predict_method {!r}; expected one of 'logistic', 'elasticnet', 'svr'."
            .format(predict_method))

    # NOTE: pickle files can execute arbitrary code when loaded; only read split
    # files that were produced by this project.
    feat = {
        (omics, split): pd.read_pickle(
            './feature_data/{0}/{1}/{2}_feat_subj_{1}.pkl'.format(covariate, split, omics))
        for split in ('train', 'val', 'test')
        for omics in ('gut_16s', 'metabolome')
    }

    def split_arrays(abundance_df, omics, split):
        # Rows of BOTH abundance tables are selected with the gut 16S split index,
        # while the labels come from the omics-specific split file in its own row order.
        # NOTE(review): this assumes the metabolome split files list the same samples
        # in the same order as the gut 16S ones -- confirm.
        sample_index = feat[('gut_16s', split)].index
        X = np.array(abundance_df.loc[sample_index, :])
        y = feat[(omics, split)].loc[:, covariate].values
        return X, y

    X_train_g, y_train_g = split_arrays(gut_16s_df, 'gut_16s', 'train')
    X_val_g, y_val_g = split_arrays(gut_16s_df, 'gut_16s', 'val')
    X_test_g, y_test_g = split_arrays(gut_16s_df, 'gut_16s', 'test')

    X_train_m, y_train_m = split_arrays(metabolome_df, 'metabolome', 'train')
    X_val_m, y_val_m = split_arrays(metabolome_df, 'metabolome', 'val')
    X_test_m, y_test_m = split_arrays(metabolome_df, 'metabolome', 'test')

    # Only the training split is over-sampled; validation and test stay untouched.
    # NOTE(review): random_seed is not forwarded to balanced_over_sampling; if that
    # helper is stochastic the results are not fully controlled by random_seed.
    X_balanced_train_g, y_balanced_train_g = balanced_over_sampling(X_train_g, y_train_g)
    X_balanced_train_m, y_balanced_train_m = balanced_over_sampling(X_train_m, y_train_m)

    gut_val, gut_test = _fit_and_score_logistic(
        X_balanced_train_g, y_balanced_train_g, X_val_g, y_val_g, X_test_g, y_test_g,
        l1_ratio, max_iter, random_seed)
    metabolome_val, metabolome_test = _fit_and_score_logistic(
        X_balanced_train_m, y_balanced_train_m, X_val_m, y_val_m, X_test_m, y_test_m,
        l1_ratio, max_iter, random_seed)

    return gut_val, metabolome_val, gut_test, metabolome_test
    

## Prediction from original data for categorical covariates

In [5]:
predict_original_iris = prediction_original_data(
    covariate='IR_IS_classification',
    predict_method='logistic',
    max_iter=5000,
)

# The result holds four (predictions, probabilities, accuracy) tuples ordered as
# microbiome/validation, metabolome/validation, microbiome/test, metabolome/test;
# element 2 of each tuple is the accuracy that fills slots {1}..{4} of the report.
print(
    'Validation:\nPrediction accuracy for original microbiome data ({0}) = {1:.2%}\n'
    'Prediction accuracy for original metabolome data ({0}) = {2:.2%}\n\n'
    'Testing:\nPrediction accuracy for original microbiome data ({0}) = {3:.2%}\n'
    'Prediction accuracy for original metabolome data ({0}) = {4:.2%}'
    .format('IR_IS_classification', *(split_result[2] for split_result in predict_original_iris))
)

Validation:
Prediction accuracy for original microbiome data (IR_IS_classification) = 81.71%
Prediction accuracy for original metabolome data (IR_IS_classification) = 98.78%

Testing:
Prediction accuracy for original microbiome data (IR_IS_classification) = 72.29%
Prediction accuracy for original metabolome data (IR_IS_classification) = 93.98%


In [6]:
predict_original_sex = prediction_original_data(
    covariate='Sex',
    predict_method='logistic',
    max_iter=5000,
)

# The result holds four (predictions, probabilities, accuracy) tuples ordered as
# microbiome/validation, metabolome/validation, microbiome/test, metabolome/test;
# element 2 of each tuple is the accuracy that fills slots {1}..{4} of the report.
print(
    'Validation:\nPrediction accuracy for original microbiome data ({0}) = {1:.2%}\n'
    'Prediction accuracy for original metabolome data ({0}) = {2:.2%}\n\n'
    'Testing:\nPrediction accuracy for original microbiome data ({0}) = {3:.2%}\n'
    'Prediction accuracy for original metabolome data ({0}) = {4:.2%}'
    .format('Sex', *(split_result[2] for split_result in predict_original_sex))
)

Validation:
Prediction accuracy for original microbiome data (Sex) = 63.89%
Prediction accuracy for original metabolome data (Sex) = 97.22%

Testing:
Prediction accuracy for original microbiome data (Sex) = 66.97%
Prediction accuracy for original metabolome data (Sex) = 99.08%


In [7]:
predict_original_race = prediction_original_data(
    covariate='Race',
    predict_method='logistic',
    max_iter=5000,
)

# The result holds four (predictions, probabilities, accuracy) tuples ordered as
# microbiome/validation, metabolome/validation, microbiome/test, metabolome/test;
# element 2 of each tuple is the accuracy that fills slots {1}..{4} of the report.
print(
    'Validation:\nPrediction accuracy for original microbiome data ({0}) = {1:.2%}\n'
    'Prediction accuracy for original metabolome data ({0}) = {2:.2%}\n\n'
    'Testing:\nPrediction accuracy for original microbiome data ({0}) = {3:.2%}\n'
    'Prediction accuracy for original metabolome data ({0}) = {4:.2%}'
    .format('Race', *(split_result[2] for split_result in predict_original_race))
)

Validation:
Prediction accuracy for original microbiome data (Race) = 57.94%
Prediction accuracy for original metabolome data (Race) = 94.39%

Testing:
Prediction accuracy for original microbiome data (Race) = 66.97%
Prediction accuracy for original metabolome data (Race) = 97.25%
