### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time
import keras

C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K


class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

def build_encoder(latent_dim=20):
    encoder_inputs = layers.Input(shape=(100,)) #change here features*2
    x = layers.Dense(256, activation='relu')(encoder_inputs)
    x = layers.Dense(64, activation='relu')(x)
    z_mean = layers.Dense(latent_dim)(x)
    z_log_var = layers.Dense(latent_dim)(x)
    z = Sampling()([z_mean, z_log_var])
    encoder = models.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
    return encoder

def build_decoder(latent_dim=20):
    latent_inputs = layers.Input(shape=(latent_dim,))
    x = layers.Dense(64, activation='relu')(latent_inputs)
    x = layers.Dense(256, activation='relu')(x)
    decoder_outputs = layers.Dense(100, activation='linear')(x) #change here features*2
    decoder = models.Model(latent_inputs, decoder_outputs, name="decoder")
    return decoder

class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Initialize trackers for monitoring losses
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        # Return list of metrics to be updated during training
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def call(self, inputs, training=False):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Compute KL divergence loss even during inference to track loss correctly
        kl_loss = -0.5 * tf.reduce_mean(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
        # Only add KL loss during training
        if training:
            self.add_loss(kl_loss)
        return reconstructed

    def train_step(self, data):
    # Unpack the data
        x = data[0] if isinstance(data, tuple) else data

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(x, training=True)
            reconstruction = self.decoder(z, training=True)

            # If  data is flat (e.g., shape=(batch_size, features)), adjust axis accordingly
            reconstruction_loss = tf.reduce_mean(
                keras.losses.binary_crossentropy(x, reconstruction), axis=-1
            )
            reconstruction_loss = tf.reduce_sum(reconstruction_loss)  # Sum over all dimensions

            kl_loss = -0.5 * tf.reduce_mean(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1
            )
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        return {'loss': total_loss, 'reconstruction_loss': reconstruction_loss, 'kl_loss': kl_loss}

# Example usage:
latent_dim = 16  
encoder = build_encoder(latent_dim)
decoder = build_decoder(latent_dim)
vae = VAE(encoder, decoder)
#vae.compile(optimizer='adam')
vae.compile(optimizer=tf.keras.optimizers.Adam())


In [3]:
#relative paths. # Set directory paths for later use.
# Get the directory of the script file
base_dir = os.getcwd()
base_dir
ligants_type=['enzyme','GPCR','ion_channel','nuclear_receptor']
ltype=ligants_type[2]
file_name='final_new_par_50.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
data_frame = pd.read_csv(file_path, header=None, skiprows=1)


In [4]:
# Filter to only include instances with label 1 (interactions)
# Separate features and labels
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]  # All rows, all columns except the last one
features_new = filtered_df.iloc[:, :-1]     # All rows, just the last column

# Convert features DataFrame to a NumPy array if necessary
x_train = features_new.to_numpy()

In [5]:
filtered_df.shape

(1476, 101)

In [6]:
#para
epochs=4
batch_size=82

In [7]:
# Train the model
vae.fit(x_train, epochs=epochs, batch_size=batch_size)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x24a1e1421a0>

In [8]:
# Filter to only include instances with label 1 (interactions)
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]

# Separate features
features_new = filtered_df.iloc[:, :-1]  # Assuming the last column is the label

In [9]:
x_new = features_new.to_numpy()

In [None]:
z_mean, z_log_var, z = vae.encoder.predict(x_new)
# z contains the latent representations of  filtered data.



In [11]:
reconstructed_new = vae.predict(x_new)



In [12]:
# Convert the reconstructed data to a DataFrame
reconstructed_df = pd.DataFrame(reconstructed_new)

# Display the first few rows of the reconstructed DataFrame
print(pd.DataFrame(x_new))
print(reconstructed_df)

            0         1         2         3         4         5         6   \
0    -0.046412 -0.058844  0.030492 -0.045302 -0.047853 -0.017467  0.049574   
1    -0.046412 -0.058844  0.030492 -0.045302 -0.047853 -0.017467  0.049574   
2    -0.092292  0.050318  0.007350 -0.025365 -0.007617  0.043665  0.030111   
3     0.001631 -0.043295 -0.090071 -0.045389 -0.108350 -0.136271  0.069069   
4     0.001631 -0.043295 -0.090071 -0.045389 -0.108350 -0.136271  0.069069   
...        ...       ...       ...       ...       ...       ...       ...   
1471 -0.040237 -0.150078  0.016174 -0.207401  0.026399 -0.030389 -0.210890   
1472 -0.040237 -0.150078  0.016174 -0.207401  0.026399 -0.030389 -0.210890   
1473 -0.040237 -0.150078  0.016174 -0.207401  0.026399 -0.030389 -0.210890   
1474 -0.046412 -0.058844  0.030492 -0.045302 -0.047853 -0.017467  0.049574   
1475 -0.046412 -0.058844  0.030492 -0.045302 -0.047853 -0.017467  0.049574   

            7         8         9   ...        90        91    

In [13]:
# Add a new column 'Label' with all values set to 1
reconstructed_df[100] = 1

# Display the first few rows to verify the new column
print(reconstructed_df.head())

        0         1         2         3         4         5         6    \
0 -0.498210 -0.511602 -0.517240 -0.592945 -0.387479 -0.457874 -0.698511   
1 -0.324382 -0.486380 -0.613683 -0.643124 -0.569715 -0.234215 -0.482178   
2 -0.458869 -0.613392 -0.392305 -0.301704 -0.636402 -0.494370 -0.724505   
3 -0.268335 -0.777448 -0.311483 -0.349589 -0.322678 -0.632217 -0.496246   
4 -0.503912 -0.486360 -0.548580 -0.051595 -0.381177 -0.451978 -0.514669   

        7         8         9    ...       91        92        93        94   \
0 -0.643023 -0.392554 -0.884905  ... -0.379539 -1.200930 -0.740842 -0.791174   
1 -0.222053 -0.234737 -0.882680  ... -0.545987 -0.172602 -0.825639 -0.479157   
2 -0.395026 -0.377087 -0.682469  ... -0.587894 -0.409835 -0.576252 -0.385306   
3 -0.328431 -0.314897 -0.831167  ... -0.534184 -0.852437 -0.657167 -0.558626   
4 -0.673165 -0.339315 -0.555230  ... -0.472965 -0.373595 -0.384282 -0.517885   

        95        96        97        98        99   100  
0 -0.7205

In [14]:
num_samples_to_generate = 39888  # The number of new rows want to generate

# Generate random samples from the latent space
latent_dim = 16  # Ensure this matches the latent dimension size of  VAE
z_new_samples = np.random.normal(size=(num_samples_to_generate, latent_dim))

# Use the decoder to generate new data
new_data_generated = vae.decoder.predict(z_new_samples)





In [15]:
new_data_generated

array([[-0.4450297 , -0.22294852, -0.3034466 , ..., -0.35437968,
        -0.47133923, -0.2836992 ],
       [-0.72759336, -0.6453137 , -0.45204452, ..., -0.3087919 ,
        -0.46016502,  0.06936888],
       [-0.40785375, -0.32768172, -0.33342355, ..., -0.5748681 ,
        -0.2446973 , -0.27959502],
       ...,
       [-0.49384815, -0.77441335, -0.50638896, ..., -0.53581977,
        -1.0080055 , -0.37307867],
       [-0.410911  , -0.4628031 , -0.5406908 , ..., -0.5029852 ,
        -0.60248905, -0.03458967],
       [-0.30690894, -0.5111382 , -0.4811407 , ..., -0.7031956 ,
        -0.5389546 , -0.45367998]], dtype=float32)

In [16]:
# Convert the generated data to a DataFrame
new_data_df = pd.DataFrame(new_data_generated)

# Add a column 'Label' with all values set to 1
new_data_df[100] = 1

In [17]:
enhanced_df = pd.concat([data_frame, new_data_df], axis=0).reset_index(drop=True)

In [18]:
enhanced_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,-0.046412,-0.058844,0.030492,-0.045302,-0.047853,-0.017467,0.049574,-0.033913,-0.039866,-0.073236,...,0.117293,0.031330,-0.265421,-0.020640,-0.166901,0.272434,0.110138,0.171606,0.299765,0.0
1,-0.046412,-0.058844,0.030492,-0.045302,-0.047853,-0.017467,0.049574,-0.033913,-0.039866,-0.073236,...,0.089552,-0.358520,0.476823,-0.439907,0.063418,0.117955,0.029408,-0.054011,-0.026943,0.0
2,-0.046412,-0.058844,0.030492,-0.045302,-0.047853,-0.017467,0.049574,-0.033913,-0.039866,-0.073236,...,-0.236721,-0.386960,0.214320,0.207976,0.147840,0.355381,-0.277013,0.094609,0.033371,0.0
3,-0.046412,-0.058844,0.030492,-0.045302,-0.047853,-0.017467,0.049574,-0.033913,-0.039866,-0.073236,...,0.284572,0.157956,-0.099907,0.066470,0.001940,0.125655,-0.291628,-0.342115,-0.107380,0.0
4,-0.046412,-0.058844,0.030492,-0.045302,-0.047853,-0.017467,0.049574,-0.033913,-0.039866,-0.073236,...,-0.172661,0.090018,0.142202,0.089949,0.180742,-0.046197,0.052328,-0.031697,-0.077126,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82723,-0.681225,-0.631664,-0.586629,-0.279244,-0.570045,-0.778755,-0.579645,-0.531065,-0.553003,-0.606099,...,-0.263636,-0.434946,-0.607003,-0.691970,-0.211970,-0.240839,-0.734272,-0.311152,-0.195251,1.0
82724,-0.752199,-0.457726,-0.231432,-0.328189,-0.429773,-0.638683,-0.743695,-0.711249,-0.358762,-0.610126,...,-0.475021,-0.280397,-0.720837,-0.410277,-0.291110,-0.442867,-0.514432,-0.602070,-0.171334,1.0
82725,-0.493848,-0.774413,-0.506389,-0.262213,-0.488300,-0.342439,-0.640493,-0.601936,-0.553273,-1.082474,...,-0.492404,-0.310275,-0.677233,-0.554982,-0.310899,-0.708503,-0.535820,-1.008005,-0.373079,1.0
82726,-0.410911,-0.462803,-0.540691,-0.747069,-0.337355,-0.321866,-0.348446,-0.647528,-0.268311,-0.709107,...,-0.602498,-0.246532,-0.979358,-0.204187,-0.526367,-0.827878,-0.502985,-0.602489,-0.034590,1.0


In [19]:
file_name='enhanced_final_new_par_50_space_1.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
enhanced_df.to_csv(output_path, index=False)