In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import math
import re
import sys

# Self-defined functions
from pred_utils import balanced_over_sampling

In [23]:
def split_by_indexes(feat_g, feat_m, g_feat_subj, m_feat_subj, index):
    """Restrict all four inputs to the rows labelled by ``index``.

    Every input is sliced with ``.loc[index, :]``, so each must be a
    label-indexed pandas object that contains every label in ``index``.

    Parameters
    ----------
    feat_g, feat_m, g_feat_subj, m_feat_subj
        Objects supporting ``.loc[index, :]`` row selection.
    index
        Row labels to keep; the order of the labels is the order of the
        returned rows.

    Returns
    -------
    tuple
        The four row subsets, in the same order as the arguments.
    """
    sources = (feat_g, feat_m, g_feat_subj, m_feat_subj)
    return tuple(source.loc[index, :] for source in sources)

In [24]:
def _fit_logistic_and_evaluate(X_train, y_train, X_val, y_val, X_test, y_test, l1_ratio, random_seed):
    """Fit an elastic-net logistic regression and evaluate it on two held-out splits.

    Returns a pair ``(validation, test)``; each element is the triple
    ``(predicted labels, predicted class probabilities, mean accuracy)``.
    """
    # Kept as a local import (as in the original code) so scikit-learn is only
    # required when the logistic method is actually used.
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression(random_state=random_seed, max_iter=1000, penalty='elasticnet',
                               solver='saga', l1_ratio=l1_ratio)
    model.fit(X_train, y_train)

    def _evaluate(X, y):
        return model.predict(X), model.predict_proba(X), model.score(X, y)

    return _evaluate(X_val, y_val), _evaluate(X_test, y_test)


def prediction_embeddings_unsup_simCLR(covariate, predict_method, l1_ratio=0.5, random_seed=123):
    """Predict ``covariate`` from the simCLR gut-16S and metabolome embeddings.

    The train/val/test row labels are read from
    ``./data/index/<covariate>/indexes_{train,val,test}.txt`` and the embeddings
    and subject tables from ``./feature_data/simCLR/all/*.pkl``. One classifier
    is fitted per modality on the training split after it has been passed
    through ``balanced_over_sampling``; the validation and test splits are
    scored as they are, without resampling.

    Parameters
    ----------
    covariate : str
        Name of the target column in the subject tables; also the name of the
        sub-directory of ``./data/index`` holding the split index files.
    predict_method : {'logistic', 'elasticnet', 'svr'}
        Prediction method. Only ``'logistic'`` is implemented.
    l1_ratio : float, default 0.5
        Elastic-net mixing parameter passed to ``LogisticRegression``.
    random_seed : int, default 123
        ``random_state`` passed to ``LogisticRegression``.

    Returns
    -------
    tuple
        Four triples ``(predicted labels, predicted probabilities, accuracy)``
        in the order: microbiome/validation, metabolome/validation,
        microbiome/test, metabolome/test.

    Raises
    ------
    NotImplementedError
        If ``predict_method`` is ``'elasticnet'`` or ``'svr'``.
    ValueError
        If ``predict_method`` is not one of the documented methods.
    """
    # Validate before any file I/O: previously an unimplemented or misspelled
    # method loaded everything and then silently returned None.
    if predict_method in ('elasticnet', 'svr'):
        raise NotImplementedError(
            "predict_method={!r} is not implemented yet; use 'logistic'".format(predict_method))
    if predict_method != 'logistic':
        raise ValueError(
            "predict_method must be one of 'logistic', 'elasticnet', 'svr'; got {!r}".format(predict_method))

    # genfromtxt squeezes a one-entry file to a 0-d array; atleast_1d keeps the
    # row selection list-like so .loc always yields a 2-D subset.
    split_indexes = {
        split: np.atleast_1d(
            np.genfromtxt('./data/index/{}/indexes_{}.txt'.format(covariate, split), dtype='str'))
        for split in ('train', 'val', 'test')
    }

    # NOTE(review): unpickling can execute arbitrary code; these files must be
    # trusted, locally produced artefacts.
    feat_gut_16s_all = pd.read_pickle('./feature_data/simCLR/all/feat_gut_16s_all.pkl')
    feat_metabolome_all = pd.read_pickle('./feature_data/simCLR/all/feat_metabolome_all.pkl')
    gut_16s_feat_subj_all = pd.read_pickle('./feature_data/simCLR/all/gut_16s_feat_subj_all.pkl')
    metabolome_feat_subj_all = pd.read_pickle('./feature_data/simCLR/all/metabolome_feat_subj_all.pkl')

    # data[modality][split] -> (X, y); 'g' is gut 16S, 'm' is metabolome.
    data = {'g': {}, 'm': {}}
    for split, index in split_indexes.items():
        feat_g, feat_m, subj_g, subj_m = split_by_indexes(
            feat_gut_16s_all, feat_metabolome_all, gut_16s_feat_subj_all, metabolome_feat_subj_all, index)
        data['g'][split] = (np.array(feat_g), subj_g.loc[:, covariate].values)
        data['m'][split] = (np.array(feat_m), subj_m.loc[:, covariate].values)

    # Only the training split is resampled. Both modalities are resampled
    # before any model is fitted, preserving the original call order.
    # NOTE(review): presumably this oversamples minority classes - see pred_utils.
    X_balanced_train_g, y_balanced_train_g = balanced_over_sampling(*data['g']['train'])
    X_balanced_train_m, y_balanced_train_m = balanced_over_sampling(*data['m']['train'])

    result_g_val, result_g_test = _fit_logistic_and_evaluate(
        X_balanced_train_g, y_balanced_train_g,
        data['g']['val'][0], data['g']['val'][1],
        data['g']['test'][0], data['g']['test'][1],
        l1_ratio, random_seed)
    result_m_val, result_m_test = _fit_logistic_and_evaluate(
        X_balanced_train_m, y_balanced_train_m,
        data['m']['val'][0], data['m']['val'][1],
        data['m']['test'][0], data['m']['test'][1],
        l1_ratio, random_seed)

    return result_g_val, result_m_val, result_g_test, result_m_test
    

## Prediction on embeddings of simCLR

In [28]:
# Logistic-regression evaluation of the simCLR embeddings for the
# 'IR_IS_classification' covariate.
predict_iris = prediction_embeddings_unsup_simCLR(
    covariate='IR_IS_classification',
    predict_method='logistic',
)
# The result holds four (labels, probabilities, accuracy) triples ordered as
# val/microbiome, val/metabolome, test/microbiome, test/metabolome; only the
# accuracy (position 2) of each is reported.
print(
    'Validation:\nPrediction accuracy for microbiome embedding (simCLR; {0}) = {1:.2%}\n'
    'Prediction accuracy for metabolome embedding (simCLR; {0}) = {2:.2%}\n\n'
    'Testing:\nPrediction accuracy for microbiome embedding (simCLR; {0}) = {3:.2%}\n'
    'Prediction accuracy for metabolome embedding (simCLR; {0}) = {4:.2%}'.format(
        'IR_IS_classification',
        *(triple[2] for triple in predict_iris)
    )
)

Validation:
Prediction accuracy for microbiome embedding (simCLR; IR_IS_classification) = 71.95%
Prediction accuracy for metabolome embedding (simCLR; IR_IS_classification) = 68.29%

Testing:
Prediction accuracy for microbiome embedding (simCLR; IR_IS_classification) = 68.67%
Prediction accuracy for metabolome embedding (simCLR; IR_IS_classification) = 71.08%


In [29]:
# Logistic-regression evaluation of the simCLR embeddings for the 'Sex' covariate.
predict_sex = prediction_embeddings_unsup_simCLR(
    covariate='Sex',
    predict_method='logistic',
)
# Report only the accuracy (position 2) of each of the four result triples:
# val/microbiome, val/metabolome, test/microbiome, test/metabolome.
print(
    'Validation:\nPrediction accuracy for microbiome embedding (simCLR; {0}) = {1:.2%}\n'
    'Prediction accuracy for metabolome embedding (simCLR; {0}) = {2:.2%}\n\n'
    'Testing:\nPrediction accuracy for microbiome embedding (simCLR; {0}) = {3:.2%}\n'
    'Prediction accuracy for metabolome embedding (simCLR; {0}) = {4:.2%}'.format(
        'Sex',
        *(triple[2] for triple in predict_sex)
    )
)

Validation:
Prediction accuracy for microbiome embedding (simCLR; Sex) = 56.48%
Prediction accuracy for metabolome embedding (simCLR; Sex) = 63.89%

Testing:
Prediction accuracy for microbiome embedding (simCLR; Sex) = 67.89%
Prediction accuracy for metabolome embedding (simCLR; Sex) = 66.06%


In [30]:
# Logistic-regression evaluation of the simCLR embeddings for the 'Race' covariate.
predict_race = prediction_embeddings_unsup_simCLR(
    covariate='Race',
    predict_method='logistic',
)
# Report only the accuracy (position 2) of each of the four result triples:
# val/microbiome, val/metabolome, test/microbiome, test/metabolome.
print(
    'Validation:\nPrediction accuracy for microbiome embedding (simCLR; {0}) = {1:.2%}\n'
    'Prediction accuracy for metabolome embedding (simCLR; {0}) = {2:.2%}\n\n'
    'Testing:\nPrediction accuracy for microbiome embedding (simCLR; {0}) = {3:.2%}\n'
    'Prediction accuracy for metabolome embedding (simCLR; {0}) = {4:.2%}'.format(
        'Race',
        *(triple[2] for triple in predict_race)
    )
)

Validation:
Prediction accuracy for microbiome embedding (simCLR; Race) = 53.27%
Prediction accuracy for metabolome embedding (simCLR; Race) = 53.27%

Testing:
Prediction accuracy for microbiome embedding (simCLR; Race) = 49.54%
Prediction accuracy for metabolome embedding (simCLR; Race) = 37.61%
