### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time
import keras

C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K


class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

def build_encoder(latent_dim=20):
    encoder_inputs = layers.Input(shape=(100,)) #change here features*2
    x = layers.Dense(256, activation='relu')(encoder_inputs)
    x = layers.Dense(64, activation='relu')(x)
    z_mean = layers.Dense(latent_dim)(x)
    z_log_var = layers.Dense(latent_dim)(x)
    z = Sampling()([z_mean, z_log_var])
    encoder = models.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
    return encoder

def build_decoder(latent_dim=20):
    latent_inputs = layers.Input(shape=(latent_dim,))
    x = layers.Dense(64, activation='relu')(latent_inputs)
    x = layers.Dense(256, activation='relu')(x)
    decoder_outputs = layers.Dense(100, activation='linear')(x) #change here features*2
    decoder = models.Model(latent_inputs, decoder_outputs, name="decoder")
    return decoder

class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Initialize trackers for monitoring losses
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        # Return list of metrics to be updated during training
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def call(self, inputs, training=False):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Compute KL divergence loss even during inference to track loss correctly
        kl_loss = -0.5 * tf.reduce_mean(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
        # Only add KL loss during training
        if training:
            self.add_loss(kl_loss)
        return reconstructed

    def train_step(self, data):
    # Unpack the data
        x = data[0] if isinstance(data, tuple) else data

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(x, training=True)
            reconstruction = self.decoder(z, training=True)

            # If  data is flat (e.g., shape=(batch_size, features)), adjust axis accordingly
            reconstruction_loss = tf.reduce_mean(
                keras.losses.binary_crossentropy(x, reconstruction), axis=-1
            )
            reconstruction_loss = tf.reduce_sum(reconstruction_loss)  # Sum over all dimensions

            kl_loss = -0.5 * tf.reduce_mean(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1
            )
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        return {'loss': total_loss, 'reconstruction_loss': reconstruction_loss, 'kl_loss': kl_loss}

# Example usage:
latent_dim = 16  
encoder = build_encoder(latent_dim)
decoder = build_decoder(latent_dim)
vae = VAE(encoder, decoder)
#vae.compile(optimizer='adam')
vae.compile(optimizer=tf.keras.optimizers.Adam())


In [3]:
#relative paths. # Set directory paths for later use.
# Get the directory of the script file
base_dir = os.getcwd()
base_dir
ligants_type=['enzyme','GPCR','ion_channel','nuclear_receptor']
ltype=ligants_type[1]
file_name='final_new_par_50.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
data_frame = pd.read_csv(file_path, header=None, skiprows=1)


In [4]:
# Filter to only include instances with label 1 (interactions)
# Separate features and labels
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]  # All rows, all columns except the last one
features_new = filtered_df.iloc[:, :-1]     # All rows, just the last column

# Convert features DataFrame to a NumPy array if necessary
x_train = features_new.to_numpy()

In [5]:
filtered_df.shape

(635, 101)

In [6]:
#para
epochs=20
batch_size=127

In [7]:
# Train the model
vae.fit(x_train, epochs=epochs, batch_size=batch_size)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x17c9ed82020>

In [8]:
# Filter to only include instances with label 1 (interactions)
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]

# Separate features
features_new = filtered_df.iloc[:, :-1]  # Assuming the last column is the label

In [9]:
x_new = features_new.to_numpy()

In [10]:
z_mean, z_log_var, z = vae.encoder.predict(x_new)
# Now, z contains the latent representations of  filtered data.



In [11]:
reconstructed_new = vae.predict(x_new)



In [12]:
# Convert the reconstructed data to a DataFrame
reconstructed_df = pd.DataFrame(reconstructed_new)

# Display the first few rows of the reconstructed DataFrame
print(pd.DataFrame(x_new))
print(reconstructed_df)

               0             1             2             3             4   \
0   -3.394378e-02 -2.774489e-02  7.436590e-02 -2.090742e-03  5.429764e-02   
1    2.061547e-01 -3.876480e-02  9.635618e-02 -3.174892e-02  1.931787e-01   
2    2.061547e-01 -3.876480e-02  9.635618e-02 -3.174892e-02  1.931787e-01   
3    2.061547e-01 -3.876480e-02  9.635618e-02 -3.174892e-02  1.931787e-01   
4    8.515725e-02  6.800587e-03 -7.951888e-02 -1.056655e-01  3.806502e-02   
..            ...           ...           ...           ...           ...   
630 -8.027622e-02  1.060513e-01  1.119848e-01 -5.767181e-02 -5.158156e-02   
631 -8.027622e-02  1.060513e-01  1.119848e-01 -5.767181e-02 -5.158156e-02   
632 -5.106899e-07  9.601754e-07 -5.953880e-07 -8.921481e-07  8.955838e-07   
633 -1.766981e-07  3.091753e-07 -1.796200e-07 -2.279041e-07  3.049497e-07   
634 -3.394378e-02 -2.774489e-02  7.436590e-02 -2.090742e-03  5.429764e-02   

               5             6             7             8             9   

In [13]:
# Add a new column 'Label' with all values set to 1
reconstructed_df[100] = 1

# Display the first few rows to verify the new column
print(reconstructed_df.head())

        0         1         2         3         4         5         6    \
0 -0.492125 -0.336018 -0.559723 -0.117003 -0.528587 -0.422723 -0.238075   
1 -0.477891 -0.235203 -0.371923 -0.592861 -0.590469 -0.613857 -0.219182   
2 -0.435469 -0.260772 -0.145093 -0.541271 -0.148564 -0.380243 -0.062029   
3 -0.650563 -0.246841 -0.859602 -0.594107 -0.581299 -0.222427  0.174027   
4 -0.787603 -0.127268 -0.829895 -0.702727 -0.654649 -0.169831 -0.160371   

        7         8         9    ...       91        92        93        94   \
0 -0.303260 -0.457011 -0.079655  ... -0.505221 -0.772251  0.058573 -0.576534   
1 -0.321777 -0.351439 -0.264356  ... -0.746410 -0.501690  0.136681 -0.725792   
2 -0.205360 -0.470583 -0.060717  ... -0.341087 -0.440586  0.007748 -0.466442   
3  0.155203 -0.487916 -0.597907  ... -0.368406 -0.309678 -0.367306 -0.592962   
4 -0.171508 -0.537167 -0.415171  ... -0.350304 -0.649189 -0.176557 -0.523346   

        95        96        97        98        99   100  
0 -0.4406

In [14]:
num_samples_to_generate = 19915  # The number of new rows want to generate

# Generate random samples from the latent space
latent_dim = 16  # Ensure this matches the latent dimension size of  VAE
z_new_samples = np.random.normal(size=(num_samples_to_generate, latent_dim))

# Use the decoder to generate new data
new_data_generated = vae.decoder.predict(z_new_samples)





In [15]:
new_data_generated

array([[-1.0777221 , -1.0918008 , -0.7030367 , ..., -0.7296246 ,
        -0.5811352 , -0.8859711 ],
       [-0.3001925 , -0.4287057 , -0.2137387 , ..., -0.5918791 ,
        -0.5554475 , -0.8658088 ],
       [-0.6815854 , -0.6867297 , -0.5907994 , ..., -0.8811034 ,
        -0.44196633, -0.5763391 ],
       ...,
       [-0.7349929 , -0.82603246, -0.15990685, ..., -0.58413976,
        -0.21431829, -0.69971174],
       [-0.5471129 , -0.3064836 , -0.53453535, ..., -0.52871704,
        -0.28403103, -0.36786234],
       [-0.15381911, -0.6186482 , -0.4932994 , ..., -0.60181916,
        -0.21477833, -0.7888159 ]], dtype=float32)

In [16]:
# Convert the generated data to a DataFrame
new_data_df = pd.DataFrame(new_data_generated)

# Add a column 'Label' with all values set to 1
new_data_df[100] = 1

In [17]:
enhanced_df = pd.concat([data_frame, new_data_df], axis=0).reset_index(drop=True)

In [18]:
enhanced_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,-0.033944,-0.027745,0.074366,-0.002091,0.054298,0.063317,0.002203,-0.046714,-0.012891,-0.077753,...,-1.010880e-06,-0.000002,-0.000003,-6.166932e-07,-7.527073e-07,2.791215e-07,-8.883754e-07,0.000001,-7.427837e-07,0.0
1,-0.033944,-0.027745,0.074366,-0.002091,0.054298,0.063317,0.002203,-0.046714,-0.012891,-0.077753,...,-8.413894e-02,-0.082139,0.211017,6.760421e-03,-5.197733e-02,4.568656e-02,-1.280104e-01,-0.130348,-7.988450e-02,0.0
2,-0.033944,-0.027745,0.074366,-0.002091,0.054298,0.063317,0.002203,-0.046714,-0.012891,-0.077753,...,2.240346e-03,0.004919,-0.008754,-2.784838e-02,2.779317e-03,-1.460430e-01,1.236033e-01,0.071233,2.409235e-02,0.0
3,-0.033944,-0.027745,0.074366,-0.002091,0.054298,0.063317,0.002203,-0.046714,-0.012891,-0.077753,...,-8.452601e-07,-0.000002,-0.000002,-5.640821e-07,-6.983902e-07,1.942949e-07,-7.831622e-07,0.000001,-7.150177e-07,0.0
4,-0.033944,-0.027745,0.074366,-0.002091,0.054298,0.063317,0.002203,-0.046714,-0.012891,-0.077753,...,8.912425e-02,-0.075009,0.164325,-1.113106e-01,-8.748227e-02,-1.467504e-01,-4.673217e-02,-0.011832,3.213240e-02,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41095,-0.519155,-0.135045,-0.574430,-0.346728,-0.530469,-0.391611,0.086913,0.038543,-0.538496,-0.446105,...,-2.857168e-01,-0.476790,-0.055436,-3.985401e-01,-6.020269e-01,9.638464e-02,-2.804751e-01,-0.368702,-4.198478e-01,1.0
41096,-0.108699,-0.204299,-0.024243,-0.164217,-0.125962,-0.151479,-0.065604,0.021812,-0.202745,-0.123396,...,-1.359127e-01,-0.095078,-0.015124,-2.674699e-01,-1.799986e-01,1.614631e-03,-1.841585e-01,-0.068155,-1.348069e-01,1.0
41097,-0.734993,-0.826032,-0.159907,-0.886572,-0.457482,-0.812474,0.086447,-0.177162,-0.592399,-0.697786,...,-6.914510e-01,-0.488457,-0.223353,-1.252389e+00,-8.397382e-01,-3.781731e-01,-5.841398e-01,-0.214318,-6.997117e-01,1.0
41098,-0.547113,-0.306484,-0.534535,-0.447887,-0.536975,-0.420732,-0.312824,-0.084043,-0.633664,-0.250730,...,-3.812471e-01,-0.635850,-0.164249,-5.179812e-01,-4.505555e-01,1.035014e-01,-5.287170e-01,-0.284031,-3.678623e-01,1.0


In [20]:
file_name='enhanced_final_new_par_50_space_1.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
enhanced_df.to_csv(output_path, index=False)