### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time
import keras

C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K


class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

def build_encoder(latent_dim=20):
    encoder_inputs = layers.Input(shape=(100,)) #change here features*2
    x = layers.Dense(256, activation='relu')(encoder_inputs)
    x = layers.Dense(64, activation='relu')(x)
    z_mean = layers.Dense(latent_dim)(x)
    z_log_var = layers.Dense(latent_dim)(x)
    z = Sampling()([z_mean, z_log_var])
    encoder = models.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
    return encoder

def build_decoder(latent_dim=20):
    latent_inputs = layers.Input(shape=(latent_dim,))
    x = layers.Dense(64, activation='relu')(latent_inputs)
    x = layers.Dense(256, activation='relu')(x)
    decoder_outputs = layers.Dense(100, activation='linear')(x) #change here features*2
    decoder = models.Model(latent_inputs, decoder_outputs, name="decoder")
    return decoder

class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Initialize trackers for monitoring losses
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        # Return list of metrics to be updated during training
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def call(self, inputs, training=False):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Compute KL divergence loss even during inference to track loss correctly
        kl_loss = -0.5 * tf.reduce_mean(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
        # Only add KL loss during training
        if training:
            self.add_loss(kl_loss)
        return reconstructed

    def train_step(self, data):
    # Unpack the data
        x = data[0] if isinstance(data, tuple) else data

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(x, training=True)
            reconstruction = self.decoder(z, training=True)

            # If  data is flat (e.g., shape=(batch_size, features)), adjust axis accordingly
            reconstruction_loss = tf.reduce_mean(
                keras.losses.binary_crossentropy(x, reconstruction), axis=-1
            )
            reconstruction_loss = tf.reduce_sum(reconstruction_loss)  # Sum over all dimensions

            kl_loss = -0.5 * tf.reduce_mean(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1
            )
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        return {'loss': total_loss, 'reconstruction_loss': reconstruction_loss, 'kl_loss': kl_loss}

# Example usage:
latent_dim = 16  
encoder = build_encoder(latent_dim)
decoder = build_decoder(latent_dim)
vae = VAE(encoder, decoder)
#vae.compile(optimizer='adam')
vae.compile(optimizer=tf.keras.optimizers.Adam())


In [3]:
#relative paths. # Set directory paths for later use.
# Get the directory of the script file
base_dir = os.getcwd()
base_dir
ligants_type=['enzyme','GPCR','ion_channel','nuclear_receptor']
ltype=ligants_type[0]
file_name='final_new_par_NNMF_50.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
data_frame = pd.read_csv(file_path, header=None, skiprows=1)


In [4]:
# Filter to only include instances with label 1 (interactions)
# Separate features and labels
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]  # All rows, all columns except the last one
features_new = filtered_df.iloc[:, :-1]     # All rows, just the last column

# Convert features DataFrame to a NumPy array if necessary
x_train = features_new.to_numpy()

In [5]:
#para
epochs=4
batch_size=77

In [6]:
# Train the model
vae.fit(x_train, epochs=epochs, batch_size=batch_size)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x1c01f7edf60>

In [7]:
# Filter to only include instances with label 1 (interactions)
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]

# Separate features
features_new = filtered_df.iloc[:, :-1]  # Assuming the last column is the label

In [8]:
x_new = features_new.to_numpy()

In [9]:
z_mean, z_log_var, z = vae.encoder.predict(x_new)
# Now, z contains the latent representations of  filtered data.



In [10]:
reconstructed_new = vae.predict(x_new)



In [11]:
# Convert the reconstructed data to a DataFrame
reconstructed_df = pd.DataFrame(reconstructed_new)

# Display the first few rows of the reconstructed DataFrame
print(pd.DataFrame(x_new))
print(reconstructed_df)

       0    1    2    3    4    5    6    7    8         9   ...        90  \
0     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...  0.000000   
1     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...  5.110435   
2     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...  0.000000   
3     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...  0.000000   
4     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.014562  ...  0.000000   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...       ...  ...       ...   
2921  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...  0.000000   
2922  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...  0.000000   
2923  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...  0.000000   
2924  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.120670  ...  0.000000   
2925  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...  0.000000   

       91   92        93        94        95   96   97   98   9

In [12]:
# Add a new column 'Label' with all values set to 1
reconstructed_df[100] = 1

# Display the first few rows to verify the new column
print(reconstructed_df.head())

        0         1         2         3         4         5         6    \
0  0.695677  0.143632  0.117471 -0.020689  0.185663 -0.385531  0.213294   
1  0.988414  0.090808  0.054008  0.002988  0.229224 -0.401676  0.287895   
2  0.337309 -0.027988  0.121742 -0.104274  0.147242 -0.251802  0.198055   
3  0.471478 -0.304712  0.288216 -0.268586  0.236000 -0.488064  0.332048   
4  0.437947 -0.045382 -0.028887 -0.082802  0.237653 -0.336536  0.162202   

        7         8         9    ...       91        92        93        94   \
0 -0.448035 -0.022585 -0.429457  ... -0.343178 -0.423867 -0.250233 -0.036870   
1 -0.399467  0.009779 -0.375494  ... -0.434709 -0.358744 -0.236484 -0.084229   
2 -0.302448 -0.023633 -0.365933  ... -0.216432 -0.279205 -0.169452 -0.192790   
3 -0.364924  0.026968 -0.442159  ... -0.476634 -0.081738 -0.026328 -0.290690   
4 -0.423634  0.146821 -0.298812  ... -0.377484 -0.390980 -0.154023  0.029324   

        95        96        97        98        99   100  
0 -0.4006

In [None]:
num_samples_to_generate = 289628  

# Generate random samples from the latent space
latent_dim = 16  # Ensure this matches the latent dimension size of  VAE
z_new_samples = np.random.normal(size=(num_samples_to_generate, latent_dim))

# Use the decoder to generate new data
new_data_generated = vae.decoder.predict(z_new_samples)





In [14]:
new_data_generated

array([[ 0.9531993 ,  0.61610764,  0.00503654, ...,  0.1134003 ,
        -0.60187966, -1.0281173 ],
       [ 0.5137695 , -0.15900362,  0.14083761, ..., -0.17476079,
        -0.6424307 , -0.8401363 ],
       [ 0.60779315, -0.0601043 ,  0.08108652, ..., -0.0283568 ,
        -0.22836436, -0.4427022 ],
       ...,
       [ 0.78120357, -0.12632771,  0.16767126, ..., -0.07107446,
        -0.5062637 , -0.73861045],
       [ 0.82661325, -0.0308955 ,  0.05064399, ...,  0.12419126,
        -0.46481034, -0.61822397],
       [ 0.9942495 ,  0.4230418 , -0.0245189 , ...,  0.07805028,
        -0.748095  , -0.6321356 ]], dtype=float32)

In [15]:
# Convert the generated data to a DataFrame
new_data_df = pd.DataFrame(new_data_generated)

# Add a column 'Label' with all values set to 1
new_data_df[100] = 1

In [16]:
enhanced_df = pd.concat([data_frame, new_data_df], axis=0).reset_index(drop=True)

In [17]:
enhanced_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.947743,7.487351e-03,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,3.729715e-02,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.146746,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.080936e-215,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585103,0.706367,0.162674,-0.028344,-0.201966,0.289283,-0.404519,0.290795,-0.419304,0.144264,-0.432503,...,-0.471380,-3.131129e-01,-0.068906,-0.048637,-0.361918,-0.510515,-0.136898,-0.567991,-0.638304,1.0
585104,0.407153,0.130907,0.150620,-0.121162,0.186471,-0.317278,0.270852,-0.235071,0.012905,-0.402576,...,-0.161271,-2.821451e-01,-0.352899,-0.239710,-0.344966,-0.229304,0.084699,-0.364441,-0.625612,1.0
585105,0.781204,-0.126328,0.167671,-0.221109,0.339594,-0.211090,0.253513,-0.476060,-0.017334,-0.536464,...,-0.438001,-5.053270e-01,-0.061797,-0.620700,-0.496545,-0.689555,-0.071074,-0.506264,-0.738610,1.0
585106,0.826613,-0.030895,0.050644,-0.342844,0.250680,-0.398355,0.357057,-0.213522,0.134193,-0.406385,...,-0.443432,-3.193145e-01,-0.201415,-0.416608,-0.600299,-0.513583,0.124191,-0.464810,-0.618224,1.0


In [18]:
file_name='enhanced_VAE_final_new_par_50_NNFM_space_2.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
enhanced_df.to_csv(output_path, index=False)