### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time
import keras

C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K


class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

def build_encoder(latent_dim=20):
    encoder_inputs = layers.Input(shape=(100,)) #change here features*2
    x = layers.Dense(256, activation='relu')(encoder_inputs)
    x = layers.Dense(64, activation='relu')(x)
    z_mean = layers.Dense(latent_dim)(x)
    z_log_var = layers.Dense(latent_dim)(x)
    z = Sampling()([z_mean, z_log_var])
    encoder = models.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
    return encoder

def build_decoder(latent_dim=20):
    latent_inputs = layers.Input(shape=(latent_dim,))
    x = layers.Dense(64, activation='relu')(latent_inputs)
    x = layers.Dense(256, activation='relu')(x)
    decoder_outputs = layers.Dense(100, activation='linear')(x) #change here features*2
    decoder = models.Model(latent_inputs, decoder_outputs, name="decoder")
    return decoder

class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Initialize trackers for monitoring losses
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        # Return list of metrics to be updated during training
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def call(self, inputs, training=False):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Compute KL divergence loss even during inference to track loss correctly
        kl_loss = -0.5 * tf.reduce_mean(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
        # Only add KL loss during training
        if training:
            self.add_loss(kl_loss)
        return reconstructed

    def train_step(self, data):
    # Unpack the data
        x = data[0] if isinstance(data, tuple) else data

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(x, training=True)
            reconstruction = self.decoder(z, training=True)

            # If  data is flat (e.g., shape=(batch_size, features)), adjust axis accordingly
            reconstruction_loss = tf.reduce_mean(
                keras.losses.binary_crossentropy(x, reconstruction), axis=-1
            )
            reconstruction_loss = tf.reduce_sum(reconstruction_loss)  # Sum over all dimensions

            kl_loss = -0.5 * tf.reduce_mean(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1
            )
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        return {'loss': total_loss, 'reconstruction_loss': reconstruction_loss, 'kl_loss': kl_loss}

# Example usage:
latent_dim = 16  
encoder = build_encoder(latent_dim)
decoder = build_decoder(latent_dim)
vae = VAE(encoder, decoder)
#vae.compile(optimizer='adam')
vae.compile(optimizer=tf.keras.optimizers.Adam())


In [3]:
#relative paths. # Set directory paths for later use.
# Get the directory of the script file
base_dir = os.getcwd()
base_dir
ligants_type=['enzyme','GPCR','ion_channel','nuclear_receptor']
ltype=ligants_type[0]
file_name='final_new_par_50.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
data_frame = pd.read_csv(file_path, header=None, skiprows=1)


In [4]:
# Filter to only include instances with label 1 (interactions)
# Separate features and labels
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]  # All rows, all columns except the last one
features_new = filtered_df.iloc[:, :-1]     # All rows, just the last column

# Convert features DataFrame to a NumPy array if necessary
x_train = features_new.to_numpy()

In [5]:
#para
epochs=4
batch_size=77

In [6]:
# Train the model
vae.fit(x_train, epochs=epochs, batch_size=batch_size)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x1b59e139840>

In [7]:
# Filter to only include instances with label 1 (interactions)
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]

# Separate features
features_new = filtered_df.iloc[:, :-1]  # Assuming the last column is the label

In [8]:
x_new = features_new.to_numpy()

In [9]:
z_mean, z_log_var, z = vae.encoder.predict(x_new)
# Now, z contains the latent representations of  filtered data.



In [10]:
reconstructed_new = vae.predict(x_new)



In [11]:
# Convert the reconstructed data to a DataFrame
reconstructed_df = pd.DataFrame(reconstructed_new)

# Display the first few rows of the reconstructed DataFrame
print(pd.DataFrame(x_new))
print(reconstructed_df)

            0         1         2         3         4         5         6   \
0     0.080027  0.055418  0.017724  0.017277  0.056821  0.005954 -0.029917   
1     0.080027  0.055418  0.017724  0.017277  0.056821  0.005954 -0.029917   
2    -0.046547 -0.080549  0.052451  0.000061 -0.064179 -0.012908 -0.102061   
3    -0.046547 -0.080549  0.052451  0.000061 -0.064179 -0.012908 -0.102061   
4     0.038484  0.007434  0.102825  0.015764 -0.125684  0.015386  0.026150   
...        ...       ...       ...       ...       ...       ...       ...   
2921  0.003413  0.029073  0.025779 -0.019589 -0.060884 -0.042604  0.027152   
2922  0.003413  0.029073  0.025779 -0.019589 -0.060884 -0.042604  0.027152   
2923  0.068374 -0.106889  0.044540 -0.122091  0.018286 -0.004360 -0.022856   
2924  0.075645 -0.062817  0.059782  0.100770  0.028446  0.059260  0.046615   
2925 -0.039468 -0.070520  0.049080 -0.000448 -0.058769 -0.011082 -0.091713   

            7         8         9   ...        90        91    

In [12]:
# Add a new column 'Label' with all values set to 1
reconstructed_df[100] = 1

# Display the first few rows to verify the new column
print(reconstructed_df.head())

        0         1         2         3         4         5         6    \
0 -0.861661 -0.585915 -0.591043 -0.842195 -0.745447  0.216452 -1.074485   
1 -0.949222 -0.500993 -0.880098 -0.553721 -0.883937  0.266532 -0.630056   
2 -0.413318 -0.222935 -0.334005 -0.224209 -0.315733  0.181605 -0.325502   
3 -0.635586 -0.641435 -0.709967 -0.775536 -0.257161  0.036724 -0.398677   
4 -0.772618 -0.453097 -0.257382 -0.739482 -0.630653  0.008736 -0.606879   

        7         8         9    ...       91        92        93        94   \
0  0.154959 -0.548225 -0.604495  ... -0.806050 -0.734828 -0.444605 -0.901690   
1 -0.394999 -0.661617 -0.385762  ... -0.442364 -0.664430 -0.690633 -1.119779   
2 -0.126942 -0.518338 -0.373655  ... -0.551292 -0.726652 -0.494389 -0.682730   
3 -0.233421 -0.926981 -0.387345  ... -0.648746 -0.915288 -0.749956 -0.948687   
4  0.067446 -0.679020 -0.501460  ... -0.604135 -0.721946 -0.522817 -0.616394   

        95        96        97        98        99   100  
0 -0.7730

In [13]:
num_samples_to_generate = 289628  # The number of new rows want to generate

# Generate random samples from the latent space
latent_dim = 16  # Ensure this matches the latent dimension size of  VAE
z_new_samples = np.random.normal(size=(num_samples_to_generate, latent_dim))

# Use the decoder to generate new data
new_data_generated = vae.decoder.predict(z_new_samples)





In [14]:
new_data_generated

array([[-0.63620895, -0.7194237 , -0.6346098 , ..., -0.57374126,
        -0.5961747 , -0.2661002 ],
       [-0.787896  , -0.63543403, -0.99782455, ..., -0.7914736 ,
        -0.688481  , -0.33501157],
       [-0.52725196, -0.48129678, -0.35705304, ..., -0.5709403 ,
        -0.6445925 , -0.21847096],
       ...,
       [-0.39053866, -0.29287115, -0.42058325, ..., -0.47583935,
        -0.33196518,  0.01297571],
       [-0.51104516, -0.3512726 , -0.5907528 , ..., -0.6765822 ,
        -0.6665807 , -0.4243497 ],
       [-0.7252508 , -0.65709627, -0.66352063, ..., -0.53400946,
        -0.9022702 , -0.34600395]], dtype=float32)

In [15]:
# Convert the generated data to a DataFrame
new_data_df = pd.DataFrame(new_data_generated)

# Add a column 'Label' with all values set to 1
new_data_df[100] = 1

In [16]:
enhanced_df = pd.concat([data_frame, new_data_df], axis=0).reset_index(drop=True)

In [17]:
enhanced_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0.080027,0.055418,0.017724,0.017277,0.056821,0.005954,-0.029917,0.008259,0.019316,0.066974,...,-4.035356e-01,-1.911631e-01,-3.826382e-01,-7.266647e-02,-2.352797e-01,6.740164e-01,-2.308341e-01,5.768036e-01,1.097545e-01,1.0
1,0.080027,0.055418,0.017724,0.017277,0.056821,0.005954,-0.029917,0.008259,0.019316,0.066974,...,3.967650e-01,-1.127281e-01,-1.678870e-01,-2.890346e-01,3.202525e-01,-8.532839e-02,4.050606e-01,-5.754756e-01,-1.279708e-01,0.0
2,0.080027,0.055418,0.017724,0.017277,0.056821,0.005954,-0.029917,0.008259,0.019316,0.066974,...,7.442881e-02,-6.804657e-02,-3.267399e-02,-1.205570e-02,6.426763e-02,1.016071e-01,-8.468351e-02,4.665966e-02,-3.235123e-02,0.0
3,0.080027,0.055418,0.017724,0.017277,0.056821,0.005954,-0.029917,0.008259,0.019316,0.066974,...,2.713670e-02,-5.593964e-03,-1.520310e-02,-4.307803e-02,2.164530e-02,-5.633650e-03,2.971487e-02,-3.464674e-02,-6.370794e-03,0.0
4,0.080027,0.055418,0.017724,0.017277,0.056821,0.005954,-0.029917,0.008259,0.019316,0.066974,...,5.513798e-92,2.232982e-92,-3.225556e-92,-6.545422e-92,-4.187428e-93,5.274171e-92,-2.057419e-92,3.269600e-92,6.465156e-92,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585103,-0.719688,-0.680150,-0.353983,-0.179601,-0.446141,0.480896,-0.623835,-0.014250,-0.568623,-0.413539,...,-4.082448e-01,-4.042336e-01,-4.044191e-01,-5.423626e-01,-5.827414e-01,6.793197e-01,-6.844943e-01,-6.408510e-01,-2.931975e-01,1.0
585104,-0.666006,-0.660190,-0.294481,-0.242714,-0.525151,0.003201,-0.788085,-0.022284,-0.518921,-0.470952,...,-4.003229e-01,-4.233132e-01,-1.515131e-01,-3.987822e-01,-3.624037e-01,4.606413e-01,-7.625961e-01,-4.748498e-01,-1.639115e-01,1.0
585105,-0.390539,-0.292871,-0.420583,-0.412239,-0.264540,-0.175043,-0.367739,-0.039407,-0.468592,-0.208790,...,-4.240860e-01,-9.338436e-01,-3.865623e-01,-5.520256e-01,-2.412660e-01,1.375760e-01,-4.758393e-01,-3.319652e-01,1.297571e-02,1.0
585106,-0.511045,-0.351273,-0.590753,-0.173930,-0.420678,0.511116,-0.498013,-0.296720,-0.558568,-0.506717,...,-5.115302e-01,-8.027560e-01,-5.947903e-01,-9.388174e-01,-3.517639e-01,3.777383e-01,-6.765822e-01,-6.665807e-01,-4.243497e-01,1.0


In [18]:
file_name='enhanced_final_new_par_50_space_1.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
enhanced_df.to_csv(output_path, index=False)