### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time
import keras

C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K


class Sampling(layers.Layer):
    """Reparameterization layer: samples z from (z_mean, z_log_var).

    Given the mean and log-variance produced by the encoder, returns
    ``z_mean + exp(0.5 * z_log_var) * epsilon`` where ``epsilon`` is standard
    normal noise with the same shape as ``z_mean``.
    """
    def call(self, inputs):
        """Draw one latent sample per row.

        Args:
            inputs: pair ``(z_mean, z_log_var)`` of equally shaped tensors.

        Returns:
            Tensor with the same shape as ``z_mean`` holding the sampled z.
        """
        z_mean, z_log_var = inputs
        # tf.random.normal replaces the legacy backend helper K.random_normal,
        # which is not available on newer Keras backends. The noise follows
        # the dynamic shape and dtype of z_mean.
        epsilon = tf.random.normal(shape=tf.shape(z_mean), dtype=z_mean.dtype)
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

def build_encoder(latent_dim=20, input_dim=100):
    """Build the encoder network of the VAE.

    Args:
        latent_dim: size of the latent vector z.
        input_dim: number of input features per row. Defaults to 100, the
            width that was previously hard-coded here.

    Returns:
        A Keras model named "encoder" mapping an ``(input_dim,)`` input to the
        list ``[z_mean, z_log_var, z]``.
    """
    encoder_inputs = layers.Input(shape=(input_dim,))
    hidden = layers.Dense(256, activation='relu')(encoder_inputs)
    hidden = layers.Dense(64, activation='relu')(hidden)
    # Two parallel linear heads parameterize the latent distribution
    z_mean = layers.Dense(latent_dim)(hidden)
    z_log_var = layers.Dense(latent_dim)(hidden)
    z = Sampling()([z_mean, z_log_var])
    encoder = models.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
    return encoder

def build_decoder(latent_dim=20, output_dim=100):
    """Build the decoder network of the VAE.

    Args:
        latent_dim: size of the latent vector fed into the decoder.
        output_dim: number of features to reconstruct per row. Defaults to
            100, the width that was previously hard-coded here.

    Returns:
        A Keras model named "decoder" mapping a ``(latent_dim,)`` input to an
        ``(output_dim,)`` output with a linear activation.
    """
    latent_inputs = layers.Input(shape=(latent_dim,))
    hidden = layers.Dense(64, activation='relu')(latent_inputs)
    hidden = layers.Dense(256, activation='relu')(hidden)
    # Linear output: the reconstructed features are unbounded real values
    decoder_outputs = layers.Dense(output_dim, activation='linear')(hidden)
    decoder = models.Model(latent_inputs, decoder_outputs, name="decoder")
    return decoder

class VAE(keras.Model):
    """Variational autoencoder combining an encoder and a decoder model.

    The encoder must return ``(z_mean, z_log_var, z)`` and the decoder must map
    ``z`` back to the input feature space. Training uses a custom ``train_step``
    whose loss is the sum of

    * a reconstruction term: squared error summed over features and averaged
      over the batch (the decoder output is linear and the features are
      real-valued, so binary cross-entropy is not applicable), and
    * the KL divergence between the encoded distribution and a standard
      normal: summed over latent dimensions and averaged over the batch.

    Both terms are scalars, so the total loss is a scalar as well.
    """

    def __init__(self, encoder, decoder, **kwargs):
        """Store the sub-models and create the running-mean loss trackers."""
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Running means reported during fit
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """Trackers updated in ``train_step``; listing them here exposes them to Keras."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    @staticmethod
    def _kl_divergence(z_mean, z_log_var):
        """Return the scalar KL term: sum over latent dims, mean over the batch."""
        per_sample = -0.5 * tf.reduce_sum(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1
        )
        return tf.reduce_mean(per_sample)

    def call(self, inputs, training=False):
        """Encode, sample and decode ``inputs``; returns the reconstruction.

        The returned reconstruction is decoded from the sampled ``z`` (not from
        ``z_mean``), so repeated calls on the same input can differ.
        """
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Register the KL term only in training mode
        if training:
            self.add_loss(self._kl_divergence(z_mean, z_log_var))
        return reconstructed

    def train_step(self, data):
        """Run one optimization step and return the running loss means.

        Args:
            data: a batch of inputs, or a tuple/list whose first element is the
                batch of inputs (any further elements are ignored).

        Returns:
            Dict with keys 'loss', 'reconstruction_loss' and 'kl_loss'.
        """
        x = data[0] if isinstance(data, (tuple, list)) else data

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(x, training=True)
            reconstruction = self.decoder(z, training=True)

            # Match dtypes before subtracting (inputs may arrive as float64)
            x = tf.cast(x, reconstruction.dtype)
            # Squared error: sum over the feature axis, then mean over the batch
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(x - reconstruction), axis=-1)
            )
            kl_loss = self._kl_divergence(z_mean, z_log_var)
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        # Feed the trackers so the reported values are running means
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)

        return {
            'loss': self.total_loss_tracker.result(),
            'reconstruction_loss': self.reconstruction_loss_tracker.result(),
            'kl_loss': self.kl_loss_tracker.result(),
        }

# Build the encoder/decoder pair for a 16-dimensional latent space, wrap them
# in the VAE and compile with a default-configured Adam optimizer. No loss is
# passed to compile because VAE.train_step computes it.
latent_dim = 16
encoder, decoder = build_encoder(latent_dim), build_decoder(latent_dim)
vae = VAE(encoder, decoder)
vae.compile(optimizer=tf.keras.optimizers.Adam())


In [3]:
# Paths are resolved relative to the current working directory:
#   <cwd>/data/split/<target type>/<file name>
base_dir = os.getcwd()
ligants_type = ['enzyme', 'GPCR', 'ion_channel', 'nuclear_receptor']
ltype = ligants_type[2]  # selects 'ion_channel'
file_name = 'final_new_par_LMF_50.csv'
file_path = os.path.join(base_dir, 'data', 'split', ltype, file_name)
output_path = file_path
# Skip the first row of the file and let pandas assign integer column labels
data_frame = pd.read_csv(file_path, skiprows=1, header=None)


In [4]:
# Keep only the rows whose last column (the label) equals 1 (interactions)
filtered_df = data_frame.loc[data_frame.iloc[:, -1].eq(1)]

# Feature matrix: every column except the trailing label column
features_new = filtered_df.iloc[:, :-1]

# Training input for the VAE as a NumPy array
x_train = features_new.to_numpy()

In [5]:
# (rows, columns) of the label-1 subset; the column count still includes the label column
filtered_df.shape

(1476, 101)

In [6]:
# Training hyperparameters consumed by vae.fit
epochs, batch_size = 4, 82

In [7]:
# Train the model
# Only inputs are passed (no targets): VAE.train_step derives its loss from the inputs.
vae.fit(x_train, epochs=epochs, batch_size=batch_size)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x1f29f6a9d50>

In [8]:
# Re-select the label-1 rows (interactions); the label is the last column
filtered_df = data_frame.loc[data_frame.iloc[:, -1].eq(1)]

# Drop the trailing label column, leaving only the feature columns
features_new = filtered_df.iloc[:, :-1]

In [9]:
# NumPy array of the label-1 feature columns (label column already removed)
x_new = features_new.to_numpy()

In [10]:
# The encoder model has three outputs: z_mean, z_log_var and the sampled z.
z_mean, z_log_var, z = vae.encoder.predict(x_new)
# z now holds the latent representations of the filtered data.



In [11]:
# Run the full model (encode -> sample z -> decode) on the label-1 feature rows.
reconstructed_new = vae.predict(x_new)



In [12]:
# Wrap the reconstructions in a DataFrame
reconstructed_df = pd.DataFrame(reconstructed_new)

# Print the original inputs followed by their reconstructions for comparison
# (whole frames are printed; pandas abbreviates long output itself)
print(pd.DataFrame(x_new), reconstructed_df, sep="\n")

            0         1         2         3         4         5         6   \
0     0.436501  0.250856  0.356307 -0.226115 -0.308184 -0.403182  0.140159   
1     0.436501  0.250856  0.356307 -0.226115 -0.308184 -0.403182  0.140159   
2     0.514852  0.162007  0.227619 -0.040816 -0.410620 -0.276363  0.228057   
3     0.676194  0.008535  0.514052  0.701031 -0.009285 -0.074659  0.460639   
4     0.676194  0.008535  0.514052  0.701031 -0.009285 -0.074659  0.460639   
...        ...       ...       ...       ...       ...       ...       ...   
1471  0.288869  0.503765  0.441279 -0.534502 -0.192024 -0.500892  0.039972   
1472  0.288869  0.503765  0.441279 -0.534502 -0.192024 -0.500892  0.039972   
1473  0.288869  0.503765  0.441279 -0.534502 -0.192024 -0.500892  0.039972   
1474  0.434229  0.250592  0.354392 -0.229408 -0.308570 -0.406938  0.132189   
1475  0.434229  0.250592  0.354392 -0.229408 -0.308570 -0.406938  0.132189   

            7         8         9   ...        90        91    

In [13]:
# Add a label column with all values set to 1.
# The column key is the integer 100 (not the string 'Label').
reconstructed_df[100] = 1

# Print the first rows (head() defaults to 5) to verify the new column
print(reconstructed_df.head())

        0         1         2         3         4         5         6    \
0  0.260056  0.032919  0.220767 -0.174872 -0.577334 -0.354179 -0.159857   
1  0.311456  0.311386  0.049271 -0.164750 -0.285451 -0.241821  0.082974   
2  0.427149  0.202444  0.270143 -0.160114 -0.400416 -0.312203  0.115293   
3  0.225288  0.507207  0.354201 -0.446345 -0.689936 -0.443923  0.243075   
4  0.154052  0.104310  0.162548 -0.200826 -0.381189 -0.248847 -0.006547   

        7         8         9    ...       91        92        93        94   \
0  0.201504 -0.423579  0.167946  ...  0.409579  0.304660 -0.380151  0.254628   
1  0.096748 -0.195403  0.212971  ...  0.132413  0.180624 -0.363533  0.485360   
2  0.239834 -0.248102  0.285630  ...  0.345915  0.262240 -0.496873  0.515612   
3  0.383184 -0.207545  0.416541  ...  0.556569  0.396708 -0.447489  0.471658   
4  0.273231 -0.205433  0.420272  ...  0.390809  0.268016 -0.438787  0.271697   

        95        96        97        98        99   100  
0 -0.0256

In [14]:
num_samples_to_generate = 39888  # The number of new rows to generate

# Read the latent size from the decoder's own input shape instead of re-typing
# the number, so it cannot drift from the model that was actually built.
latent_dim = vae.decoder.input_shape[-1]

# Draw standard-normal latent vectors from a seeded generator so the latent
# draws are repeatable. Note: only this sampling step is seeded here; the
# decoder weights still depend on the (unseeded) training run.
GENERATION_SEED = 42
rng = np.random.default_rng(GENERATION_SEED)
z_new_samples = rng.standard_normal(
    size=(num_samples_to_generate, latent_dim), dtype=np.float32
)

# Use the decoder to turn the latent vectors into new feature rows
new_data_generated = vae.decoder.predict(z_new_samples)





In [15]:
# Preview the array returned by the decoder
new_data_generated

array([[ 0.16624707,  0.29759887,  0.53056234, ..., -0.49889925,
         0.2630684 , -0.64932287],
       [ 0.20755067,  0.13869058,  0.25071073, ..., -0.04014296,
         0.24108227, -0.3225715 ],
       [ 0.2546986 ,  0.29284146,  0.14377859, ..., -0.09342064,
         0.10958147, -0.33785632],
       ...,
       [ 0.27393532,  0.4019966 ,  0.3602587 , ..., -0.43873608,
         0.42941418, -0.77376765],
       [ 0.29083395,  0.3167049 , -0.25092044, ..., -0.3186695 ,
         0.14984392, -0.18940362],
       [ 0.30585042,  0.36540723,  0.42428488, ..., -0.567862  ,
         0.4792341 , -0.8947423 ]], dtype=float32)

In [16]:
# Wrap the generated feature matrix in a DataFrame; columns get integer
# labels 0 .. n_features-1
new_data_df = pd.DataFrame(new_data_generated)

# Append the label column right after the last feature column, with every
# value set to 1. The column index comes from the array width rather than a
# hard-coded 100, so it follows the feature count if that changes.
label_column = new_data_generated.shape[1]
new_data_df[label_column] = 1

In [17]:
# Stack the original rows on top of the generated rows and renumber the index.
# NOTE(review): concat aligns on column labels, so this presumably relies on both
# frames using the same integer column labels — confirm if the feature count changes.
enhanced_df = pd.concat([data_frame, new_data_df], axis=0).reset_index(drop=True)

In [18]:
# Preview the combined (original + generated) frame
enhanced_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0.436501,0.250856,0.356307,-0.226115,-0.308184,-0.403182,0.140159,0.360367,-0.081909,0.415986,...,0.335263,0.225140,-0.156722,0.190285,0.008524,0.009117,-0.155599,0.202474,-0.597380,0.0
1,0.436501,0.250856,0.356307,-0.226115,-0.308184,-0.403182,0.140159,0.360367,-0.081909,0.415986,...,0.168198,0.166573,0.020867,0.135056,0.069828,0.127377,-0.028108,0.284480,-0.377469,0.0
2,0.436501,0.250856,0.356307,-0.226115,-0.308184,-0.403182,0.140159,0.360367,-0.081909,0.415986,...,0.087902,-0.010726,-0.042239,0.053999,0.127199,0.019528,-0.133650,0.263689,-0.532280,0.0
3,0.436501,0.250856,0.356307,-0.226115,-0.308184,-0.403182,0.140159,0.360367,-0.081909,0.415986,...,0.419669,0.350892,-0.941822,0.227727,-0.656873,-0.281764,-0.346049,0.647751,-1.395413,0.0
4,0.436501,0.250856,0.356307,-0.226115,-0.308184,-0.403182,0.140159,0.360367,-0.081909,0.415986,...,0.623031,0.373050,-0.321666,0.190634,-0.144257,0.005621,-0.188413,0.209063,-0.761995,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82723,0.214274,0.253449,0.056102,-0.412751,-0.546540,-0.769365,0.444553,0.379777,-0.349331,0.259023,...,0.448116,0.394092,-0.273721,0.142124,-0.144411,-0.000833,-0.397974,0.332621,-0.672570,1.0
82724,0.103813,0.272143,0.062918,-0.098133,-0.253508,-0.419124,0.084841,0.123391,-0.405554,0.260061,...,0.414758,0.189580,-0.471439,0.007842,0.297609,-0.014151,-0.307918,0.186485,-0.439076,1.0
82725,0.273935,0.401997,0.360259,-0.291193,-0.478230,-0.555575,0.347580,0.509088,-0.487198,0.324044,...,0.662839,0.347305,-0.633611,0.138337,0.228748,0.017370,-0.438736,0.429414,-0.773768,1.0
82726,0.290834,0.316705,-0.250920,-0.361791,-0.598988,-0.352805,-0.114390,0.197087,-0.033816,0.607952,...,0.438913,0.329630,-0.676932,0.292915,-0.130853,0.281016,-0.318669,0.149844,-0.189404,1.0


In [19]:
# Write the combined frame next to the input file, under a new file name.
# index=False keeps the row index out of the CSV.
file_name = 'enhanced_VAE_final_new_par_50_LMF_space_3.csv'
output_path = os.path.join(base_dir, 'data', 'split', ltype, file_name)
file_path = output_path
enhanced_df.to_csv(output_path, index=False)