In [2]:
#base
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import random
#keras
import keras.backend as K
import tensorflow as tf 
#sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
#scipy
from scipy.spatial import distance
from scipy.stats import gmean
from scipy import interpolate 
#biopython
import skbio
from io import StringIO

In [3]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting/ML libraries used below.
warnings.filterwarnings('ignore')
# Apply seaborn's default theme to all following matplotlib figures.
sns.set_theme()

# 1. READ AND PREPARE DATA

In [4]:
# Load the input table; the first CSV column is used as the row index.
# The column labels are later passed as `otu_ids` to the UniFrac metric.
df = pd.read_csv('rarefied_double_interpolated_feces_male_otu.csv', index_col = [0])

In [None]:
def split_data(df, train_fraction = .85, random_state = 42):
    """Randomly split the rows of ``df`` into a train and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows are the samples to split.
    train_fraction : float, default 0.85
        Share of rows drawn for the train frame (truncated to an integer
        row count); the remaining rows form the test frame.
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample`` so the split is reproducible.

    Returns
    -------
    (X_train, X_test) : tuple of pandas.DataFrame
        ``X_train`` holds the sampled rows (in sampled order), ``X_test``
        every row of ``df`` whose index label was not sampled.

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError('train_fraction must be between 0 and 1, got %r' % (train_fraction,))

    train_samples = int(df.shape[0] * train_fraction)

    X_train = df.sample(train_samples, random_state = random_state)
    # Test rows are selected by index label, so labels are expected to be unique.
    X_test = df[~df.index.isin(X_train.index)]

    return X_train, X_test

In [None]:
#split data to train and test
# Split the loaded table into the train and test frames used by every later cell.
x_train, x_test = split_data(df)

In [None]:
# Min-max scale every column; the scaler learns its ranges from the train
# rows only and the same fitted ranges are then applied to the test rows.
scaler = MinMaxScaler()
scaler.fit(x_train)

normalized_train = scaler.transform(x_train)
normalized_test = scaler.transform(x_test)

In [None]:
# Compare the distribution of all table values before and after min-max scaling.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')` is its
# documented replacement. The 2-D inputs are flattened explicitly and NaNs are
# dropped, which is what distplot did internally with such input.
fig, (ax_raw, ax_scaled) = plt.subplots(1, 2, figsize = [10, 4])

raw_values = np.asarray(x_train, dtype = float).ravel()
scaled_values = np.asarray(normalized_train, dtype = float).ravel()

sns.histplot(raw_values[~np.isnan(raw_values)], kde = True, stat = 'density', ax = ax_raw)
ax_raw.set_title('train data (raw)')

sns.histplot(scaled_values[~np.isnan(scaled_values)], kde = True, stat = 'density', ax = ax_scaled)
ax_scaled.set_title('train data (min-max scaled)')

# 2. TRAIN AND TEST AUTOENCODER

In [None]:
# Read the tree file with scikit-bio; the resulting `tree` is used as a
# module-level global by the weighted UniFrac metric in `Metrics`.
tree_file = 'tree.nwk'
tree = skbio.tree.TreeNode.read(tree_file)

class Metrics(tf.keras.callbacks.Callback):
    """Keras callback that adds distance metrics to the epoch logs.

    After every epoch the model predicts ``validation_data[0]``; prediction and
    target (``validation_data[1]``) are mapped back to the original scale with
    ``scaler.inverse_transform``, negative values are clipped to 0, and the
    per-sample mean of three distances is written into ``logs`` under
    ``<prefix>_cosine_distance``, ``<prefix>_bray_curtis`` and
    ``<prefix>_weighted_unifrac``.

    Parameters
    ----------
    validation_data : tuple
        ``(inputs, targets)`` pair evaluated at the end of each epoch.
    scaler : fitted scaler
        Object exposing ``inverse_transform``; used to undo the normalisation.
    prefix : str, default 'val'
        Prefix of the keys written into ``logs``.

    Notes
    -----
    The weighted UniFrac metric reads the notebook-level globals ``df`` (its
    columns are used as OTU ids) and ``tree``.
    """

    def __init__(self, validation_data, scaler, prefix = 'val'):
        # Let the Keras base class initialise its own attributes first.
        super().__init__()
        self.validation_data = validation_data  # attach the evaluation data
        self.scaler = scaler
        self.prefix = prefix

    def on_epoch_end(self, batch, logs=None):
        """Compute the distance metrics and store them in ``logs``.

        ``batch`` receives the first positional argument Keras passes to this
        hook (the epoch index); it is not used.
        """
        # Avoid a shared mutable default; Keras normally supplies the dict.
        if logs is None:
            logs = {}

        predict = np.asarray(self.model.predict(self.validation_data[0]))
        targ = self.validation_data[1]

        # Use the scaler handed to this callback, not the notebook global.
        predict_denorm = self.scaler.inverse_transform(predict)
        predict_denorm[predict_denorm<0] = 0
        target_denorm = self.scaler.inverse_transform(targ)
        target_denorm[target_denorm<0] = 0

        n_samples = target_denorm.shape[0]

        def mean_over_samples(dist):
            # Average a row-wise distance over all evaluated samples.
            return np.array([dist(target_denorm[i], predict_denorm[i]) for i in range(n_samples)]).mean()

        logs[f'{self.prefix}_cosine_distance'] = mean_over_samples(distance.cosine)
        logs[f'{self.prefix}_bray_curtis'] = mean_over_samples(distance.braycurtis)
        logs[f'{self.prefix}_weighted_unifrac'] = mean_over_samples(
            lambda u, v: skbio.diversity.beta.weighted_unifrac(u, v, otu_ids=df.columns, tree=tree)
        )

### Train the model and evaluate it on test data

In [None]:
# Dense autoencoder: input -> 504 (ReLU) -> 64 (latent) -> 504 (ReLU) -> input.
# Later cells address these layers by position (ae.layers[-3] is the latent
# layer, ae.layers[-2] and ae.layers[-1] form the decoder), so the layer order
# must not change.
input_size = normalized_train.shape[1]
input_data = tf.keras.Input(shape=(input_size,))

encoded = tf.keras.layers.Dense(504, activation='relu')(input_data)

# Latent layer: no activation is passed, i.e. it stays linear.
encoded = tf.keras.layers.Dense(64)(encoded)

decoded = tf.keras.layers.Dense(504, activation='relu')(encoded)
# Output layer: same width as the input, linear activation.
decoded = tf.keras.layers.Dense(input_size, activation='linear')(decoded)

# Build and compile the model; it is trained to reproduce its input under MSE loss.
autoencoder = tf.keras.Model(input_data, decoded)

autoencoder.compile(
    loss = tf.keras.losses.MeanSquaredError(),   
    optimizer='Adam'
)

# Two callback instances add the distance metrics to the epoch logs:
# 'val_*' keys are computed on the test data, 'train_*' keys on the train data.
metrics = Metrics(
    validation_data = (normalized_test, normalized_test),
    scaler=scaler,
    prefix='val'
)
train_metrics = Metrics(
    validation_data = (normalized_train, normalized_train),
    scaler=scaler,
    prefix = 'train'
)

# Input and target are the same array (reconstruction); shuffle=False keeps
# the sample order fixed across epochs.
ae_result = autoencoder.fit(
    normalized_train, normalized_train,
    epochs=60,
    batch_size=16,
    shuffle=False,
    validation_data=(normalized_test, normalized_test),
    callbacks = [metrics, train_metrics]
)

# One row per epoch; the row position is exposed as an explicit 'epochs' column
# for plotting. NOTE(review): plot_metric expects the callback keys
# ('train_*' / 'val_*') to be present in this history — confirm.
results_df = pd.DataFrame.from_dict(ae_result.history)
results_df['epochs'] = results_df.index

In [None]:
# Persist the per-epoch training history; the target directory must already exist.
results_df.to_csv('autoencoders/history/raw_ae_history.csv')

# 3. PLOT METRICS AND LOSS

In [None]:
def plot_metric(df, metric):
    """Plot one history metric for train and test data and save the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Training history with an 'epochs' column, a 'val_<metric>' column and
        either 'loss' (when ``metric == 'loss'``) or 'train_<metric>'.
    metric : str
        Metric name without prefix, e.g. 'loss' or 'bray_curtis'.

    The figure is written to 'plots/raw_data/raw_data_<metric>.png'; the
    directory must already exist.
    """
    # The training loss is stored as plain 'loss'; every other train metric
    # carries the 'train_' prefix.
    train_column = metric if metric == 'loss' else 'train_' + metric
    val_column = 'val_' + metric

    plt.figure(figsize=(10, 6))

    # Same drawing order and colours as before (train in black, test in
    # orange); the line labels feed the legend so the curves can be told apart.
    for column, color, label in ((train_column, 'black', 'train'), (val_column, 'orange', 'test')):
        sns.lineplot(x = df['epochs'], y = df[column], color = color, label = label)
        sns.scatterplot(x = df['epochs'], y = df[column], color = color)

    # Without this the y label would be the name of the last plotted column.
    plt.ylabel(metric)
    plt.legend()
    plt.title('%s on train and test data' % (metric))
    plt.savefig('plots/raw_data/raw_data_{}.png'.format(metric))

In [None]:
# Draw and save one figure per tracked quantity, in the same order as before.
for metric_name in ('loss', 'bray_curtis', 'cosine_distance', 'weighted_unifrac'):
    plot_metric(results_df, metric_name)

In [None]:
# Print the layer-by-layer overview of the trained autoencoder.
autoencoder.summary()

In [None]:
# Save the trained autoencoder to disk; later cells reload it from this path.
autoencoder.save('autoencoders/raw_data_ae/raw_data_ae.hdf5')

# 4. Extract latent layer

In [None]:
# Reload the saved autoencoder from disk.
ae = tf.keras.models.load_model('autoencoders/raw_data_ae/raw_data_ae.hdf5')

# Encoder = model input up to the third-from-last layer, which is the
# 64-unit latent layer in the architecture built above.
encoder_model = tf.keras.Model(ae.input, ae.layers[-3].output)

# Encode the normalised train data; the frame gets default integer row and column labels.
encoded_array = encoder_model.predict(normalized_train)
encoded_df = pd.DataFrame(encoded_array)

# Written tab-separated (including the integer index) as input for the sVAR step.
encoded_df.to_csv('SVAR_DATA/encoded_data_for_prediction/raw_xtrain_encoded.csv', sep = '\t')

# 5. sVAR prediction

# 6. DECODE EMBEDDED PREDICTION

In [None]:
# IPython magic: show the current working directory (the relative paths used in this notebook resolve against it).
%pwd

In [None]:
# Load the sVAR forecast made in latent space.
# NOTE(review): read with the default comma separator and no index column,
# while the encoded input above was written tab-separated with an index —
# confirm the file layout matches the 64 latent dimensions.
encoded_prediction_svar2 = pd.read_csv('SVAR_DATA/svar_predictions/raw_xtrain_data_ae_predictions_sVAR2.csv')
#encoded_prediction_svar2 = pd.read_csv('svar_predictions/raw_data_ae_predictions_sVAR2.csv')

# Reload the saved autoencoder.
ae = tf.keras.models.load_model('autoencoders/raw_data_ae/raw_data_ae.hdf5')

# Rebuild the decoder: a fresh 64-wide input fed through the last two layers of the autoencoder.
encoding_dim = 64
encoded_input = tf.keras.Input(shape=(encoding_dim,))

decoder = ae.layers[-2](encoded_input)
decoder = ae.layers[-1](decoder)
decoder_model = tf.keras.Model(encoded_input, decoder)


# Decode the latent forecast.
#decoded_predicion_svar1 = pd.DataFrame(decoder_model.predict(encoded_prediction_svar1), columns = df.columns)
#decoded_predicion_svar1[decoded_predicion_svar1<0] = 0

# Decode, undo the min-max scaling, and clip negative values to 0.
decoded_predicion_svar2 = decoder_model.predict(encoded_prediction_svar2)
decoded_prediction_denorm = scaler.inverse_transform(decoded_predicion_svar2)
decoded_prediction_denorm[decoded_prediction_denorm<0] = 0

# Label the columns like the input table and reuse the index of the last 18 train rows.
# NOTE(review): this requires exactly 18 forecast rows, and x_train is a random
# sample, so its tail is not necessarily the last rows of `df` — confirm.
decoded_prediction_denorm_df = pd.DataFrame(decoded_prediction_denorm, columns = df.columns)
decoded_prediction_denorm_df.index = x_train.tail(18).index

#decoded_predicion_svar1.to_csv('decoded_svar_predictions/raw_svar1_prediction_decoded.csv')
#decoded_predicion_svar2.to_csv('decoded_svar_predictions/raw_svar2_prediction_decoded.csv')

In [None]:
# Display the decoded, de-normalised forecast.
decoded_prediction_denorm_df

In [None]:
col = 'TACGGAGGGTGCGAGCGTTAATCGGAATAACTGGGCGTAAAGGGCACGCAGGCGGTGACTTAAGTGAGGTGTGAAAGCCCCGGGCTTAACCTGGGAATTG'

# NOTE(review): the original `sns.lineplot(` call was left unterminated, which
# is a SyntaxError. Plotting the decoded forecast of the selected column is an
# assumption about the intent — confirm.
sns.lineplot(data = decoded_prediction_denorm_df[col])

# check prediction

### normalized rmse

In [None]:
from sklearn.metrics import mean_squared_error

def calculate_nrmse(predicion, model, true_values = None):
    """Compute the range-normalised RMSE of a forecast, column by column.

    Parameters
    ----------
    predicion : pandas.DataFrame
        Forecast values; every column is scored separately.
    model : str
        Label stored in the 'model' column of the result.
    true_values : pandas.DataFrame, optional
        Observed values with the same columns as ``predicion``. When omitted,
        the notebook-level ``history`` frame is used (previous behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns 'BACTERIA' (column name), 'NRMSE' and 'model'.

    Notes
    -----
    NRMSE is RMSE divided by ``max - min`` of the observed column, so a
    constant observed column yields ``inf`` or ``nan``.
    """
    if true_values is None:
        # Backward-compatible default: the global frame the function always read.
        true_values = history

    rows = []
    for col in predicion.columns.tolist():

        TRUE = true_values[col]
        FORECAST = predicion[col]

        divider = TRUE.max() - TRUE.min()
        # `squared=False` is no longer accepted by recent scikit-learn releases;
        # taking the square root of the MSE gives the same RMSE on every version.
        RMSE = np.sqrt(mean_squared_error(TRUE, FORECAST))

        rows.append((col, RMSE/divider))

    nRMSE_COL = pd.DataFrame(rows, columns = ['BACTERIA', 'NRMSE'])
    nRMSE_COL['model'] = model

    return nRMSE_COL

In [None]:
# NOTE(review): `decoded_predicion_svar1` is only assigned in commented-out code
# in the visible cells, and `decoded_predicion_svar2` is the raw decoder output
# (presumably a NumPy array without `.columns`) — confirm which frames were
# meant to be scored here (e.g. `decoded_prediction_denorm_df`).
nrmse_svar1 = calculate_nrmse(decoded_predicion_svar1, 'svar1')
nrmse_svar2 = calculate_nrmse(decoded_predicion_svar2, 'svar2')

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the two
# frames in the same order and keeps their original row labels.
NRMSE_DF = pd.concat([nrmse_svar1, nrmse_svar2])

In [None]:
plt.figure(figsize = [10, 5])

# Box plot of the NRMSE values of each model ...
sns.boxplot(data = NRMSE_DF, x = 'model', y = 'NRMSE', orient = 'v', width = .3, color = 'White')
# ... with the individual values drawn on top.
sns.swarmplot(data = NRMSE_DF, x = 'model', y = 'NRMSE', orient = 'v', s = 5, alpha = .6)

#plt.title('{}_nrmse'.format(model))
#plt.savefig('svar1_nrmse.png')

### spearman r

In [None]:
from scipy.stats import spearmanr

def calculate_rho(predicion, model, true_values = None):
    """Compute the absolute Spearman correlation of a forecast, column by column.

    Parameters
    ----------
    predicion : pandas.DataFrame
        Forecast values; every column is scored separately.
    model : str
        Label stored in the 'model' column of the result.
    true_values : pandas.DataFrame, optional
        Observed values with the same columns as ``predicion``. When omitted,
        the notebook-level ``history`` frame is used (previous behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns 'BACTERIA' (column name), 'RHO' (absolute correlation) and
        'model'. Columns whose correlation is NaN are dropped.
    """
    if true_values is None:
        # Backward-compatible default: the global frame the function always read.
        true_values = history

    rows = []
    for col in predicion.columns.tolist():

        TRUE = true_values[col]
        FORECAST = predicion[col]

        # Element 0 of the result is the correlation coefficient.
        rho = spearmanr(TRUE, FORECAST)

        rows.append((col, np.abs(rho[0])))

    RHO_DF = pd.DataFrame(rows, columns = ['BACTERIA', 'RHO']).dropna()
    RHO_DF['model'] = model

    return RHO_DF

In [None]:
# NOTE(review): `decoded_predicion_svar1` is only assigned in commented-out code
# in the visible cells, and `decoded_predicion_svar2` is the raw decoder output
# (presumably a NumPy array without `.columns`) — confirm which frames were
# meant to be scored here (e.g. `decoded_prediction_denorm_df`).
rho_svar1 = calculate_rho(decoded_predicion_svar1, 'svar1')
rho_svar2 = calculate_rho(decoded_predicion_svar2, 'svar2')

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the two
# frames in the same order and keeps their original row labels.
RHO_DF = pd.concat([rho_svar1, rho_svar2])

In [None]:
plt.figure(figsize = [10, 5])

# Box plot of the absolute Spearman correlations of each model ...
sns.boxplot(data = RHO_DF, x = 'model', y = 'RHO', orient = 'v', width = .3, color = 'White')
# ... with the individual values drawn on top.
sns.swarmplot(data = RHO_DF, x = 'model', y = 'RHO', orient = 'v', s = 5, alpha = .6)