### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time
import keras

C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K


class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

def build_encoder(latent_dim=20):
    encoder_inputs = layers.Input(shape=(100,)) #change here features*2
    x = layers.Dense(256, activation='relu')(encoder_inputs)
    x = layers.Dense(64, activation='relu')(x)
    z_mean = layers.Dense(latent_dim)(x)
    z_log_var = layers.Dense(latent_dim)(x)
    z = Sampling()([z_mean, z_log_var])
    encoder = models.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
    return encoder

def build_decoder(latent_dim=20):
    latent_inputs = layers.Input(shape=(latent_dim,))
    x = layers.Dense(64, activation='relu')(latent_inputs)
    x = layers.Dense(256, activation='relu')(x)
    decoder_outputs = layers.Dense(100, activation='linear')(x) #change here features*2
    decoder = models.Model(latent_inputs, decoder_outputs, name="decoder")
    return decoder

class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Initialize trackers for monitoring losses
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        # Return list of metrics to be updated during training
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def call(self, inputs, training=False):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Compute KL divergence loss even during inference to track loss correctly
        kl_loss = -0.5 * tf.reduce_mean(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
        # Only add KL loss during training
        if training:
            self.add_loss(kl_loss)
        return reconstructed

    def train_step(self, data):
    # Unpack the data
        x = data[0] if isinstance(data, tuple) else data

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(x, training=True)
            reconstruction = self.decoder(z, training=True)

            # If  data is flat (e.g., shape=(batch_size, features)), adjust axis accordingly
            reconstruction_loss = tf.reduce_mean(
                keras.losses.binary_crossentropy(x, reconstruction), axis=-1
            )
            reconstruction_loss = tf.reduce_sum(reconstruction_loss)  # Sum over all dimensions

            kl_loss = -0.5 * tf.reduce_mean(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1
            )
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        return {'loss': total_loss, 'reconstruction_loss': reconstruction_loss, 'kl_loss': kl_loss}

# Example usage:
latent_dim = 16  
encoder = build_encoder(latent_dim)
decoder = build_decoder(latent_dim)
vae = VAE(encoder, decoder)
#vae.compile(optimizer='adam')
vae.compile(optimizer=tf.keras.optimizers.Adam())


In [4]:
#relative paths. # Set directory paths for later use.
# Get the directory of the script file
base_dir = os.getcwd()
base_dir
ligants_type=['enzyme','GPCR','ion_channel','nuclear_receptor']
ltype=ligants_type[3]
file_name='final_new_par_NNMF_50.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
data_frame = pd.read_csv(file_path, header=None, skiprows=1)


In [5]:
# Filter to only include instances with label 1 (interactions)
# Separate features and labels
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]  # All rows, all columns except the last one
features_new = filtered_df.iloc[:, :-1]     # All rows, just the last column

# Convert features DataFrame to a NumPy array if necessary
x_train = features_new.to_numpy()

In [6]:
filtered_df.shape

(90, 101)

In [7]:
#para
epochs=4
batch_size=45

In [8]:
# Train the model
vae.fit(x_train, epochs=epochs, batch_size=batch_size)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x19f21af5c90>

In [9]:
# Filter to only include instances with label 1 (interactions)
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]

# Separate features
features_new = filtered_df.iloc[:, :-1]  # Assuming the last column is the label

In [10]:
x_new = features_new.to_numpy()

In [11]:
z_mean, z_log_var, z = vae.encoder.predict(x_new)
# Now, z contains the latent representations of  filtered data.



In [12]:
reconstructed_new = vae.predict(x_new)



In [13]:
# Convert the reconstructed data to a DataFrame
reconstructed_df = pd.DataFrame(reconstructed_new)

# Display the first few rows of the reconstructed DataFrame
print(pd.DataFrame(x_new))
print(reconstructed_df)

          0              1    2    3    4         5    6         7    8   \
0   0.000000  9.784057e-199  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0   
1   3.398607   0.000000e+00  0.0  0.0  0.0  0.813774  0.0  0.151195  0.0   
2   3.398607   0.000000e+00  0.0  0.0  0.0  0.813774  0.0  0.151195  0.0   
3   3.398607   0.000000e+00  0.0  0.0  0.0  0.813774  0.0  0.151195  0.0   
4   3.398607   0.000000e+00  0.0  0.0  0.0  0.813774  0.0  0.151195  0.0   
..       ...            ...  ...  ...  ...       ...  ...       ...  ...   
85  0.000000   0.000000e+00  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0   
86  0.000000   0.000000e+00  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0   
87  0.000000   0.000000e+00  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0   
88  0.000000   0.000000e+00  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0   
89  0.000000   0.000000e+00  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0   

          9   ...        90        91   92   93        94        95       96  \
0   0.0

In [14]:
# Add a new column 'Label' with all values set to 1
reconstructed_df[100] = 1

# Display the first few rows to verify the new column
print(reconstructed_df.head())

        0         1         2         3         4         5         6    \
0  0.728337  0.252341 -0.430401 -0.380237  0.064754  0.231725 -0.028846   
1  0.863965  0.368665 -0.806697 -0.330807 -0.635908  0.486659 -0.092815   
2  0.521806  0.147533 -0.132201 -0.377503 -0.094450 -0.053056 -0.299110   
3  0.572990  0.196850 -0.048263 -0.365108 -0.066044  0.106884 -0.169686   
4  0.226792  0.262180 -0.223541 -0.202549 -0.266578 -0.036398 -0.175415   

        7         8         9    ...       91        92        93        94   \
0 -0.223664 -0.035591 -0.253857  ...  0.475697 -0.108435 -0.152430  0.062531   
1 -0.029383 -0.048135  0.017781  ...  0.478629 -0.602392 -0.909419  0.089618   
2 -0.021320 -0.360872 -0.188976  ...  0.085065 -0.124812 -0.111010  0.068472   
3 -0.153908 -0.310950 -0.032121  ...  0.181952 -0.070727 -0.432513 -0.197764   
4  0.061892 -0.033599  0.156231  ...  0.311008 -0.421860 -0.462680 -0.137760   

        95        96        97        98        99   100  
0 -0.2013

In [15]:
num_samples_to_generate = 1224  # The number of new rows want to generate

# Generate random samples from the latent space
latent_dim = 16  # Ensure this matches the latent dimension size of  VAE
z_new_samples = np.random.normal(size=(num_samples_to_generate, latent_dim))

# Use the decoder to generate new data
new_data_generated = vae.decoder.predict(z_new_samples)





In [16]:
new_data_generated

array([[ 0.28658432,  0.2997013 , -0.17601691, ..., -0.10716633,
         0.11184111,  0.22770865],
       [ 0.35273823,  0.29303792, -0.09646215, ..., -0.06499067,
         0.11163116, -0.02385332],
       [ 0.56578934, -0.06928885, -0.4651704 , ...,  0.31439084,
        -0.1797366 , -0.2143017 ],
       ...,
       [ 0.38110012,  0.3035871 , -0.49261037, ..., -0.03274395,
        -0.21214347,  0.1518813 ],
       [ 0.41897762,  0.10663438, -0.17796825, ...,  0.23006982,
         0.12182347,  0.16352013],
       [ 0.60858274,  0.33629084, -0.37386525, ...,  0.08513309,
         0.0609857 , -0.14942801]], dtype=float32)

In [17]:
# Convert the generated data to a DataFrame
new_data_df = pd.DataFrame(new_data_generated)

# Add a column 'Label' with all values set to 1
new_data_df[100] = 1

In [18]:
enhanced_df = pd.concat([data_frame, new_data_df], axis=0).reset_index(drop=True)

In [19]:
enhanced_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0.000000,9.784057e-199,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.841355,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,0.000000,9.784057e-199,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.101491,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,0.000000,9.784057e-199,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.507284,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,0.000000,9.784057e-199,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,0.000000,9.784057e-199,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2623,0.405654,4.629274e-01,-0.366650,-0.508944,-0.194749,0.141172,-0.174666,-0.229475,-0.125181,0.148029,...,0.462409,0.007850,-0.214231,-0.196469,-0.050491,-0.159095,0.120817,-0.008852,-0.045440,1.0
2624,0.369171,1.758905e-01,0.006864,0.050550,-0.525651,-0.142062,-0.131028,-0.155611,-0.177161,0.230657,...,0.101704,-0.632435,-0.408802,-0.146775,-0.159656,-0.537760,-0.268562,0.232839,-0.015646,1.0
2625,0.381100,3.035871e-01,-0.492610,-0.131423,-0.505997,0.062401,-0.153792,0.152791,0.304824,0.161658,...,0.402255,-0.501319,-0.097919,-0.228281,-0.300880,-0.264588,-0.032744,-0.212143,0.151881,1.0
2626,0.418978,1.066344e-01,-0.177968,-0.139130,-0.334364,0.102424,-0.215029,0.211196,-0.015105,0.221532,...,0.313792,-0.478980,-0.595775,-0.112882,0.085187,-0.374138,0.230070,0.121823,0.163520,1.0


In [20]:
file_name='enhanced_VAE_final_new_par_50_NNMF_space_2.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
enhanced_df.to_csv(output_path, index=False)