In [1]:
#%matplotlib inline
#%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

import tensorflow as tf 

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from scipy import interpolate 
import random

import keras.backend as K
import skbio
from io import StringIO
from scipy.spatial import distance

2021-12-10 12:16:09.984236: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-10 12:16:09.984283: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


As this is implemented as a loss and we minimize the loss during training the sign is inverted, so that when you minimize the loss (towards -1), the actual similarity is maximized towards 1.

if we use it as metric it will converge to 1, if we use it as loss it will fo towards -1

In [2]:
df = pd.read_csv('rarefied_interpolated_male_feces copy.tsv',
                  sep='\t', header = [1], index_col =[0])
df = df.T
df = df.head(402)

X_train = df.sample(350, random_state = 42)
X_test = df[~df.index.isin (X_train.index)]

In [80]:
scaler = StandardScaler()

normalized_data = scaler.fit_transform(np.log(1+X_train))
normalized_test = scaler.transform(np.log(1+X_test))

In [69]:
x = normalized_data

In [None]:
#randomized model
def model_dropout(h_neurons, dropout, l_neurons):
    
    #SGD
    learning_rate = 0.1
    momentum = 0.8
    
    sgd = SGD(lr=learning_rate, momentum=momentum, nesterov=True)

    # create model
    input_size = x.shape[1]
    input_data = Input(shape=(input_size,))
    
    model = Sequential()
    
    #encoder layer #1
    model.add(Dense(h_neurons, activation='relu', input_dim=input_size))#, kernel_regularizer=regularizers.l2(l2_reg))) 

    ######## latent layer #########
    model.add(Dense(l_neurons, activation='relu'))
    
    #dropout layer
    model.add(Dropout(dropout))
    
    #batch norm layer
    model.add(BatchNormalization())
    
    #decoder layer #1
    model.add(Dense(h_neurons, activation='relu'))#, kernel_regularizer=regularizers.l2(l2=l2_reg))) 
    
    #dropout layer
    model.add(Dropout(dropout))
    
    #batch norm layer
    model.add(BatchNormalization())

    #output layer
    model.add(Dense(input_size, activation='sigmoid')) #output as we normalized data between 0 and 1
    
    # Compile model
    #model.compile(loss='mae', optimizer=sgd)
    cosine_loss = CosineSimilarity()
    log_loss = tf.keras.losses.MeanSquaredLogarithmicError() #best 0
    model.compile(loss=log_loss, optimizer='Adam')
    
    return model


seed = 7
np.random.seed(seed)

#build model
model = KerasRegressor(build_fn=model_dropout, verbose=0)

#params
batch_size = [8, 16, 32]
epochs = [15]
h_neurons = [126, 252, 504]
l_neurons = [32, 64, 126]
dropout = [0.2, 0.5, 0.7]
#l2_reg = [0, 0.01, 0.1]

#param grid
param_grid = dict(
    batch_size=batch_size,
    epochs=epochs,
    h_neurons = h_neurons,
    l_neurons = l_neurons,
    dropout = dropout
)

#validation
kfld = KFold(n_splits=3,
             shuffle=True,
             random_state=seed
            )

grid = RandomizedSearchCV(estimator=model,
                          cv=kfld,
                          param_distributions=param_grid, 
                          verbose=20,
                          n_iter=10,
                          n_jobs=1
                         )

#early_stopping = EarlyStopping(monitor='val_loss', patience=epochs_to_wait_for_improve)

grid_result = grid.fit(x, x)

In [None]:
grid_result.best_score_

In [None]:
grid_result.best_params_['l_neurons']

In [None]:
for metric_name, metric in self.metrics_dict.items():
    logs[f'{prefix}_{metric_name} = np.array(metric(...))


In [81]:
class Metrics(tf.keras.callbacks.Callback):
    
    def __init__(self, validation_data, scaler, prefix = 'val'):
        self.validation_data = validation_data #podpinamy validation data
        self.scaler = scaler
        self.prefix = prefix
        
        metrics_dict = {}

    def on_epoch_end(self, batch, logs={}):
        
        predict = np.asarray(self.model.predict(self.validation_data[0]))
        targ = self.validation_data[1]
        
        predict_denorm = np.exp(scaler.inverse_transform(predict)) - 1
        target_denrom = np.exp(scaler.inverse_transform(targ)) - 1
        
        logs[f'{self.prefix}_cosine_distance'] = np.array([distance.cosine(target_denrom[i], predict_denorm[i]) for i in range(target_denrom.shape[0])]).mean()
        logs[f'{self.prefix}_bray_curtis'] = np.array([distance.braycurtis(target_denrom[i], predict_denorm[i]) for i in range(target_denrom.shape[0])]).mean()
        
        
        return

### model on test data

In [82]:
# create model
input_size = normalized_data.shape[1]
input_data = tf.keras.Input(shape=(input_size,))

encoded = tf.keras.layers.Dense(504, activation='relu')(input_data)
encoded = tf.keras.layers.Dense(252, activation='relu')(input_data)

encoded = tf.keras.layers.Dense(64)(encoded) ##latent, linear act

#decoded = layers.Dropout(grid_result.best_params_['dropout'])(decoded)
decoded = tf.keras.layers.Dense(252, activation='relu')(encoded)
decoded = tf.keras.layers.Dense(504, activation='relu')(decoded)

#output layer
decoded = tf.keras.layers.Dense(input_size, activation='linear')(decoded)

# Compile model
autoencoder = tf.keras.Model(input_data, decoded)


autoencoder.compile(
    #loss=tf.keras.losses.MeanSquaredLogarithmicError(),
    loss = tf.keras.losses.MeanSquaredError(),
    
    optimizer='Adam'
)

metrics = Metrics(validation_data = (normalized_test, normalized_test), scaler=scaler, prefix='val')
train_metrics = Metrics(validation_data = (normalized_data, normalized_data), scaler=scaler, prefix = 'train')


test_result = autoencoder.fit(
    normalized_data, normalized_data,
    epochs=5,
    batch_size=16,
    shuffle=False,
    validation_data=(normalized_test, normalized_test),
    callbacks = [metrics, train_metrics]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [83]:
train_val_loss_df = pd.DataFrame.from_dict(test_result.history)
train_val_loss_df['epochs'] = train_val_loss_df.index

In [84]:
train_val_loss_df

Unnamed: 0,loss,val_loss,val_cosine_distance,val_bray_curtis,train_cosine_distance,train_bray_curtis,epochs
0,0.943173,0.800041,0.090179,0.297666,0.090887,0.300489,0
1,0.853669,0.746791,0.093016,0.257505,0.08581,0.252944,1
2,0.782791,0.718443,0.080393,0.234396,0.073379,0.226374,2
3,0.718305,0.702507,0.078807,0.244654,0.077657,0.234014,3
4,0.658464,0.68515,0.070747,0.238409,0.064608,0.211648,4


In [79]:
train_val_loss_df

Unnamed: 0,loss,val_loss,val_cosine_distance,val_bray_curtis,train_cosine_distance,train_bray_curtis,epochs
0,0.950177,1.296157,0.083852,0.28386,0.079819,0.28108,0
1,0.884891,1.246577,0.072751,0.24603,0.06297,0.236884,1
2,0.813834,1.210798,0.065849,0.238871,0.053123,0.220401,2
3,0.736258,1.179414,0.058096,0.224838,0.041988,0.201863,3
4,0.668937,1.166469,0.054968,0.219866,0.04073,0.198197,4


In [None]:
plt.figure(figsize=(12, 8))

sns.lineplot(x = train_val_loss_df['epochs'], y = train_val_loss_df['loss'], color = 'black')
sns.scatterplot(x = train_val_loss_df['epochs'], y = train_val_loss_df['loss'], color = 'black')

sns.lineplot(x = train_val_loss_df['epochs'], y = train_val_loss_df['val_loss'], color = 'orange')
sns.scatterplot(x = train_val_loss_df['epochs'], y = train_val_loss_df['val_loss'], color = 'orange')
#plt.title('cosine similarity on train and test data')
#plt.savefig('AE_cosine_sim/model1_loss.png')

In [None]:
plt.figure(figsize=(12, 8))

sns.lineplot(x = train_val_loss_df['epochs'], y = train_val_loss_df['cosine_similarity'], color = 'black')
sns.scatterplot(x = train_val_loss_df['epochs'], y = train_val_loss_df['cosine_similarity'], color = 'black')

sns.lineplot(x = train_val_loss_df['epochs'], y = train_val_loss_df['val_cosine_similarity'], color = 'orange')
sns.scatterplot(x = train_val_loss_df['epochs'], y = train_val_loss_df['val_cosine_similarity'], color = 'orange')
plt.title('cosine similarity on train and test data')
#plt.savefig('AE_cosine_sim/model1_loss.png')

In [None]:
plt.figure(figsize=(12, 8))

sns.lineplot(x = train_val_loss_df['epochs'], y = train_val_loss_df['root_mean_squared_error'], color = 'black')
sns.scatterplot(x = train_val_loss_df['epochs'], y = train_val_loss_df['root_mean_squared_error'], color = 'black')

sns.lineplot(x = train_val_loss_df['epochs'], y = train_val_loss_df['val_root_mean_squared_error'], color = 'orange')
sns.scatterplot(x = train_val_loss_df['epochs'], y = train_val_loss_df['val_root_mean_squared_error'], color = 'orange')
plt.title('root_mean_squared_error on train and test data')
#plt.savefig('AE_cosine_sim/model1_loss.png')

custom unifrac loss function

In [None]:
import keras.backend as K
import skbio
from io import StringIO

In [None]:
def unifrac_loss(y_true, y_reconstructed):
    
    tree_file = 'tree.nwk'
    tree = skbio.tree.TreeNode.read(tree_file)
    
    loss = skbio.diversity.beta.unweighted_unifrac(y_true,
                                                   y_reconstructed,
                                                   otu_ids = df.columns,
                                                   tree=tree
                                                  )
    loss = K.mean(loss)
    return loss

In [None]:
# create model
input_size = normalized_data.shape[1]
input_data = Input(shape=(input_size,))

encoded = layers.Dense(504, activation='relu')(input_data)

encoded = layers.Dense(64, activation = 'relu')(encoded) ##latent

decoded = layers.Dense(504, activation='relu')(encoded)
#output layer
decoded = layers.Dense(input_size, activation='sigmoid')(decoded)

# Compile model
autoencoder = Model(input_data, decoded)

loss_fn = CosineSimilarity()
autoencoder.compile(
    loss=unifrac_loss,
    optimizer='Adam'
)

test_result = autoencoder.fit(
    x, x,
    epochs=2,
    batch_size=16,
    shuffle=False,
    validation_data=(x, x)
)