### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time
import keras

C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K


class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

def build_encoder(latent_dim=20):
    encoder_inputs = layers.Input(shape=(100,)) #change here features*2
    x = layers.Dense(256, activation='relu')(encoder_inputs)
    x = layers.Dense(64, activation='relu')(x)
    z_mean = layers.Dense(latent_dim)(x)
    z_log_var = layers.Dense(latent_dim)(x)
    z = Sampling()([z_mean, z_log_var])
    encoder = models.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
    return encoder

def build_decoder(latent_dim=20):
    latent_inputs = layers.Input(shape=(latent_dim,))
    x = layers.Dense(64, activation='relu')(latent_inputs)
    x = layers.Dense(256, activation='relu')(x)
    decoder_outputs = layers.Dense(100, activation='linear')(x) #change here features*2
    decoder = models.Model(latent_inputs, decoder_outputs, name="decoder")
    return decoder

class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Initialize trackers for monitoring losses
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        # Return list of metrics to be updated during training
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def call(self, inputs, training=False):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Compute KL divergence loss even during inference to track loss correctly
        kl_loss = -0.5 * tf.reduce_mean(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
        # Only add KL loss during training
        if training:
            self.add_loss(kl_loss)
        return reconstructed

    def train_step(self, data):
    # Unpack the data
        x = data[0] if isinstance(data, tuple) else data

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(x, training=True)
            reconstruction = self.decoder(z, training=True)

            # If  data is flat (e.g., shape=(batch_size, features)), adjust axis accordingly
            reconstruction_loss = tf.reduce_mean(
                keras.losses.binary_crossentropy(x, reconstruction), axis=-1
            )
            reconstruction_loss = tf.reduce_sum(reconstruction_loss)  # Sum over all dimensions

            kl_loss = -0.5 * tf.reduce_mean(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1
            )
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        return {'loss': total_loss, 'reconstruction_loss': reconstruction_loss, 'kl_loss': kl_loss}

# Example usage:
latent_dim = 16  
encoder = build_encoder(latent_dim)
decoder = build_decoder(latent_dim)
vae = VAE(encoder, decoder)
#vae.compile(optimizer='adam')
vae.compile(optimizer=tf.keras.optimizers.Adam())


In [3]:
#relative paths. # Set directory paths for later use.
# Get the directory of the script file
base_dir = os.getcwd()
base_dir
ligants_type=['enzyme','GPCR','ion_channel','nuclear_receptor']
ltype=ligants_type[3]
file_name='final_new_par_50.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
data_frame = pd.read_csv(file_path, header=None, skiprows=1)


In [4]:
# Filter to only include instances with label 1 (interactions)
# Separate features and labels
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]  # All rows, all columns except the last one
features_new = filtered_df.iloc[:, :-1]     # All rows, just the last column

# Convert features DataFrame to a NumPy array if necessary
x_train = features_new.to_numpy()

In [5]:
filtered_df.shape

(90, 101)

In [6]:
#para
epochs=4
batch_size=45

In [7]:
# Train the model
vae.fit(x_train, epochs=epochs, batch_size=batch_size)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x1c61e125630>

In [8]:
# Filter to only include instances with label 1 (interactions)
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]

# Separate features
features_new = filtered_df.iloc[:, :-1]  # Assuming the last column is the label

In [9]:
x_new = features_new.to_numpy()

In [10]:
z_mean, z_log_var, z = vae.encoder.predict(x_new)
# Now, z contains the latent representations of  filtered data.



In [11]:
reconstructed_new = vae.predict(x_new)



In [12]:
# Convert the reconstructed data to a DataFrame
reconstructed_df = pd.DataFrame(reconstructed_new)

# Display the first few rows of the reconstructed DataFrame
print(pd.DataFrame(x_new))
print(reconstructed_df)

          0         1         2         3         4         5         6   \
0  -0.111270  0.008626 -0.112308 -0.022436  0.084521  0.016578 -0.158669   
1   0.404820  0.118141  0.452118 -0.074957 -0.062593  0.706790 -0.101757   
2   0.404820  0.118141  0.452118 -0.074957 -0.062593  0.706790 -0.101757   
3   0.404820  0.118141  0.452118 -0.074957 -0.062593  0.706790 -0.101757   
4   0.404820  0.118141  0.452118 -0.074957 -0.062593  0.706790 -0.101757   
..       ...       ...       ...       ...       ...       ...       ...   
85 -0.197877  0.131798  0.284894  0.033694 -0.090001 -0.126484  0.254990   
86 -0.197877  0.131798  0.284894  0.033694 -0.090001 -0.126484  0.254990   
87 -0.197877  0.131798  0.284894  0.033694 -0.090001 -0.126484  0.254990   
88  0.075678 -0.031658  0.055676 -0.030348 -0.079173 -0.121744  0.058323   
89 -0.011902 -0.035561 -0.130223 -0.044128  0.034009 -0.115561 -0.062005   

          7         8         9   ...        90        91        92        93  \
0  -0.

In [13]:
# Add a new column 'Label' with all values set to 1
reconstructed_df[100] = 1

# Display the first few rows to verify the new column
print(reconstructed_df.head())

        0         1         2         3         4         5         6    \
0 -0.292419  0.100761 -0.214540 -0.128120 -0.137349 -0.083648 -0.214622   
1 -0.506139  0.182735  0.253912 -0.123106 -0.391820  0.032372 -0.244451   
2 -0.194849  0.033693  0.098197 -0.049577 -0.142657 -0.002021  0.179296   
3 -0.011516  0.053434  0.277638 -0.008508 -0.308292  0.111951 -0.076099   
4 -0.084857 -0.038653  0.220638  0.118545 -0.233726  0.280457  0.009732   

        7         8         9    ...       91        92        93        94   \
0  0.227798 -0.326102  0.353781  ... -0.225842 -0.064208  0.219204  0.185292   
1  0.696048 -0.802705  0.280618  ... -0.425340 -0.179115  0.410294  0.269959   
2  0.272113 -0.335388  0.105428  ... -0.134091 -0.149261  0.085183 -0.097674   
3  0.197168 -0.169586  0.176682  ... -0.353359 -0.398614  0.051616  0.004061   
4  0.372805 -0.400499  0.080946  ... -0.512369 -0.484125  0.040205  0.159244   

        95        96        97        98        99   100  
0 -0.3100

In [14]:
num_samples_to_generate = 1224  # The number of new rows want to generate

# Generate random samples from the latent space
latent_dim = 16  # Ensure this matches the latent dimension size of  VAE
z_new_samples = np.random.normal(size=(num_samples_to_generate, latent_dim))

# Use the decoder to generate new data
new_data_generated = vae.decoder.predict(z_new_samples)





In [15]:
new_data_generated

array([[-0.09123705,  0.06339117, -0.17144644, ..., -0.0630437 ,
        -0.50806636, -0.13034306],
       [ 0.08614964,  0.2854382 ,  0.1268754 , ..., -0.12161686,
        -0.05624686, -0.4340832 ],
       [-0.19427133, -0.22305417,  0.36385834, ..., -0.34575555,
        -0.55573475, -0.41676787],
       ...,
       [-0.3581043 ,  0.30615392,  0.30374143, ...,  0.25461277,
        -0.13223541, -0.47395644],
       [-0.07534185,  0.19252378, -0.09778434, ...,  0.02933992,
        -0.35277668, -0.06226281],
       [-0.03117191, -0.03090625,  0.00302742, ..., -0.00437047,
        -0.11266375, -0.26539537]], dtype=float32)

In [16]:
# Convert the generated data to a DataFrame
new_data_df = pd.DataFrame(new_data_generated)

# Add a column 'Label' with all values set to 1
new_data_df[100] = 1

In [17]:
enhanced_df = pd.concat([data_frame, new_data_df], axis=0).reset_index(drop=True)

In [18]:
enhanced_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,-0.111270,0.008626,-0.112308,-0.022436,0.084521,0.016578,-0.158669,-0.079974,0.223279,0.033481,...,0.283005,0.085423,-0.062640,0.161380,-0.122537,0.115114,-0.053795,-0.214284,0.233027,0.0
1,-0.111270,0.008626,-0.112308,-0.022436,0.084521,0.016578,-0.158669,-0.079974,0.223279,0.033481,...,-0.018210,-0.094529,-0.008272,-0.045378,-0.252298,0.047064,-0.177535,0.144204,0.012367,0.0
2,-0.111270,0.008626,-0.112308,-0.022436,0.084521,0.016578,-0.158669,-0.079974,0.223279,0.033481,...,0.042791,-0.079722,0.102271,0.001523,-0.077309,-0.041517,-0.067451,0.027401,-0.037864,0.0
3,-0.111270,0.008626,-0.112308,-0.022436,0.084521,0.016578,-0.158669,-0.079974,0.223279,0.033481,...,-0.120547,0.099485,-0.063370,0.007632,0.123424,0.068034,-0.028188,-0.256661,0.122674,0.0
4,-0.111270,0.008626,-0.112308,-0.022436,0.084521,0.016578,-0.158669,-0.079974,0.223279,0.033481,...,-0.080258,-0.229467,0.045752,-0.064296,0.134913,0.000163,0.054669,-0.196582,-0.041674,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2623,-0.346970,0.252749,0.043169,0.038911,-0.546259,0.029881,-0.065755,0.435748,-0.454515,0.079702,...,-0.581142,-0.323180,0.058032,0.143102,0.117994,-0.191738,0.060826,-0.030575,-0.436335,1.0
2624,-0.366686,-0.029260,0.026956,0.075629,-0.184772,-0.112145,-0.129582,0.202685,-0.137362,0.136930,...,-0.192703,-0.278295,0.225270,-0.002098,0.019151,-0.165369,0.046116,-0.168530,-0.170307,1.0
2625,-0.358104,0.306154,0.303741,-0.078911,-0.223747,-0.293677,-0.188593,0.754108,-0.529041,0.165997,...,-0.438769,-0.420876,0.244431,-0.054379,-0.016737,-0.254082,0.254613,-0.132235,-0.473956,1.0
2626,-0.075342,0.192524,-0.097784,-0.149718,-0.100732,0.233271,-0.234603,0.288299,-0.368860,0.305713,...,-0.268126,0.007388,0.092949,0.402296,-0.275853,-0.165124,0.029340,-0.352777,-0.062263,1.0


In [19]:
file_name='enhanced_final_new_par_50_space_1.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
enhanced_df.to_csv(output_path, index=False)