In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import math
import re
import sys

# Self-defined functions
from pred_utils import balanced_over_sampling

In [2]:
def _load_embedding_split(covariate, modality, split):
    """Load the embeddings and labels of one modality for one data split.

    Parameters
    ----------
    covariate : str
        Sub-directory name under ``./feature_data`` and the column to read
        from the subject-level table.
    modality : str
        ``'gut_16s'`` or ``'metabolome'``; selects the pickle file names.
    split : str
        ``'train'``, ``'val'`` or ``'test'``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``np.array`` of the embedding pickle and
        ``y`` is the ``covariate`` column values of the subject pickle.
    """
    split_dir = './feature_data/{}/{}'.format(covariate, split)
    # NOTE(review): unpickling can execute arbitrary code -- only point this at
    # feature files produced by this project.
    features = pd.read_pickle('{}/feat_{}_{}.pkl'.format(split_dir, modality, split))
    subjects = pd.read_pickle('{}/{}_feat_subj_{}.pkl'.format(split_dir, modality, split))
    return np.array(features), subjects.loc[:, covariate].values


def _fit_logistic_and_evaluate(X_train, y_train, eval_sets, l1_ratio, random_seed):
    """Fit an elastic-net penalised logistic regression and score it on each evaluation set.

    Returns a list with one ``(predicted labels, predicted probabilities, accuracy)``
    tuple per ``(X, y)`` pair in ``eval_sets``, in the same order.
    """
    # Kept function-local so importing this cell does not require scikit-learn
    # until a logistic model is actually fitted.
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression(random_state=random_seed, max_iter=1000, penalty='elasticnet',
                               solver='saga', l1_ratio=l1_ratio)
    model.fit(X_train, y_train)
    return [(model.predict(X), model.predict_proba(X), model.score(X, y)) for X, y in eval_sets]


def prediction_embeddings_MB_SupCon(covariate, predict_method, l1_ratio=0.5, random_seed=123):
    """Predict a covariate from the saved microbiome and metabolome embeddings.

    For each modality (gut 16S and metabolome) the train/val/test embeddings
    are read from ``./feature_data/<covariate>/<split>/``, the training set is
    passed through ``balanced_over_sampling`` (imported from ``pred_utils``;
    presumably balances the classes -- confirm there), and a separate
    classifier is fitted on the resampled training data.

    Parameters
    ----------
    covariate : str
        Name of the covariate; used both as the data sub-directory and as the
        label column of the subject-level tables.
    predict_method : str
        Only ``'logistic'`` is implemented (elastic-net penalised logistic
        regression, ``saga`` solver, ``max_iter=1000``).
    l1_ratio : float, default 0.5
        Elastic-net mixing parameter forwarded to ``LogisticRegression``.
    random_seed : int, default 123
        ``random_state`` forwarded to ``LogisticRegression``.

    Returns
    -------
    tuple
        Four ``(predicted labels, predicted probabilities, accuracy)`` tuples,
        in the order: validation/microbiome, validation/metabolome,
        test/microbiome, test/metabolome.

    Raises
    ------
    NotImplementedError
        If ``predict_method`` is ``'elasticnet'``, which is reserved but has no
        implementation yet.
    ValueError
        If ``predict_method`` is any other unsupported value.
    """
    # Validate before any file is read so a typo fails immediately with a clear
    # message instead of silently returning None after all the loading work.
    if predict_method == 'elasticnet':
        raise NotImplementedError("predict_method='elasticnet' is not implemented yet")
    if predict_method != 'logistic':
        raise ValueError(
            "Unsupported predict_method {!r}; expected 'logistic'".format(predict_method))

    modalities = ('gut_16s', 'metabolome')
    data = {
        modality: {split: _load_embedding_split(covariate, modality, split)
                   for split in ('train', 'val', 'test')}
        for modality in modalities
    }

    # Oversample both training sets before fitting anything (microbiome first),
    # matching the original call order in case the sampler draws from a shared RNG.
    balanced_train = {modality: balanced_over_sampling(*data[modality]['train'])
                      for modality in modalities}

    results = {}
    for modality in modalities:
        X_balanced, y_balanced = balanced_train[modality]
        results[modality] = _fit_logistic_and_evaluate(
            X_balanced, y_balanced,
            [data[modality]['val'], data[modality]['test']],
            l1_ratio, random_seed)

    val_g, test_g = results['gut_16s']
    val_m, test_m = results['metabolome']
    return val_g, val_m, test_g, test_m
    

## Prediction for MB-SupCon with categorical covariates

In [3]:
# Logistic-regression prediction of insulin resistance/sensitivity class from the embeddings.
predict_iris = prediction_embeddings_MB_SupCon(predict_method='logistic',
                                               covariate='IR_IS_classification')
# The result holds four (labels, probabilities, accuracy) tuples ordered
# val/microbiome, val/metabolome, test/microbiome, test/metabolome; index 2 is the accuracy.
print(
    (
        'Validation:\nPrediction accuracy for microbiome embedding ({0}) = {1:.2%}\n'
        'Prediction accuracy for metabolome embedding ({0}) = {2:.2%}\n\n'
        'Testing:\nPrediction accuracy for microbiome embedding ({0}) = {3:.2%}\n'
        'Prediction accuracy for metabolome embedding ({0}) = {4:.2%}'
    ).format('IR_IS_classification', *(outcome[2] for outcome in predict_iris))
)

Validation:
Prediction accuracy for microbiome embedding (IR_IS_classification) = 84.15%
Prediction accuracy for metabolome embedding (IR_IS_classification) = 96.34%

Testing:
Prediction accuracy for microbiome embedding (IR_IS_classification) = 79.52%
Prediction accuracy for metabolome embedding (IR_IS_classification) = 93.98%


In [4]:
# Logistic-regression prediction of sex from the embeddings.
predict_sex = prediction_embeddings_MB_SupCon(predict_method='logistic', covariate='Sex')
# The result holds four (labels, probabilities, accuracy) tuples ordered
# val/microbiome, val/metabolome, test/microbiome, test/metabolome; index 2 is the accuracy.
print(
    (
        'Validation:\nPrediction accuracy for microbiome embedding ({0}) = {1:.2%}\n'
        'Prediction accuracy for metabolome embedding ({0}) = {2:.2%}\n\n'
        'Testing:\nPrediction accuracy for microbiome embedding ({0}) = {3:.2%}\n'
        'Prediction accuracy for metabolome embedding ({0}) = {4:.2%}'
    ).format('Sex', *(outcome[2] for outcome in predict_sex))
)

Validation:
Prediction accuracy for microbiome embedding (Sex) = 83.33%
Prediction accuracy for metabolome embedding (Sex) = 98.15%

Testing:
Prediction accuracy for microbiome embedding (Sex) = 81.65%
Prediction accuracy for metabolome embedding (Sex) = 98.17%


In [5]:
# Logistic-regression prediction of race from the embeddings.
predict_race = prediction_embeddings_MB_SupCon(predict_method='logistic', covariate='Race')
# The result holds four (labels, probabilities, accuracy) tuples ordered
# val/microbiome, val/metabolome, test/microbiome, test/metabolome; index 2 is the accuracy.
print(
    (
        'Validation:\nPrediction accuracy for microbiome embedding ({0}) = {1:.2%}\n'
        'Prediction accuracy for metabolome embedding ({0}) = {2:.2%}\n\n'
        'Testing:\nPrediction accuracy for microbiome embedding ({0}) = {3:.2%}\n'
        'Prediction accuracy for metabolome embedding ({0}) = {4:.2%}'
    ).format('Race', *(outcome[2] for outcome in predict_race))
)

Validation:
Prediction accuracy for microbiome embedding (Race) = 72.90%
Prediction accuracy for metabolome embedding (Race) = 94.39%

Testing:
Prediction accuracy for microbiome embedding (Race) = 77.98%
Prediction accuracy for metabolome embedding (Race) = 97.25%


## Prediction for MB-SupCon with continuous covariates