### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time
import keras

C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K


class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

def build_encoder(latent_dim=20):
    encoder_inputs = layers.Input(shape=(100,)) #change here features*2
    x = layers.Dense(256, activation='relu')(encoder_inputs)
    x = layers.Dense(64, activation='relu')(x)
    z_mean = layers.Dense(latent_dim)(x)
    z_log_var = layers.Dense(latent_dim)(x)
    z = Sampling()([z_mean, z_log_var])
    encoder = models.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
    return encoder

def build_decoder(latent_dim=20):
    latent_inputs = layers.Input(shape=(latent_dim,))
    x = layers.Dense(64, activation='relu')(latent_inputs)
    x = layers.Dense(256, activation='relu')(x)
    decoder_outputs = layers.Dense(100, activation='linear')(x) #change here features*2
    decoder = models.Model(latent_inputs, decoder_outputs, name="decoder")
    return decoder

class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Initialize trackers for monitoring losses
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        # Return list of metrics to be updated during training
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def call(self, inputs, training=False):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Compute KL divergence loss even during inference to track loss correctly
        kl_loss = -0.5 * tf.reduce_mean(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
        # Only add KL loss during training
        if training:
            self.add_loss(kl_loss)
        return reconstructed

    def train_step(self, data):
    # Unpack the data
        x = data[0] if isinstance(data, tuple) else data

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(x, training=True)
            reconstruction = self.decoder(z, training=True)

            # If  data is flat (e.g., shape=(batch_size, features)), adjust axis accordingly
            reconstruction_loss = tf.reduce_mean(
                keras.losses.binary_crossentropy(x, reconstruction), axis=-1
            )
            reconstruction_loss = tf.reduce_sum(reconstruction_loss)  # Sum over all dimensions

            kl_loss = -0.5 * tf.reduce_mean(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1
            )
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        return {'loss': total_loss, 'reconstruction_loss': reconstruction_loss, 'kl_loss': kl_loss}

# Example usage:
latent_dim = 16  
encoder = build_encoder(latent_dim)
decoder = build_decoder(latent_dim)
vae = VAE(encoder, decoder)
#vae.compile(optimizer='adam')
vae.compile(optimizer=tf.keras.optimizers.Adam())


In [3]:
#relative paths. # Set directory paths for later use.
# Get the directory of the script file
base_dir = os.getcwd()
base_dir
ligants_type=['enzyme','GPCR','ion_channel','nuclear_receptor']
ltype=ligants_type[1]
file_name='final_new_par_LMF_50.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
data_frame = pd.read_csv(file_path, header=None, skiprows=1)


In [4]:
# Filter to only include instances with label 1 (interactions)
# Separate features and labels
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]  # All rows, all columns except the last one
features_new = filtered_df.iloc[:, :-1]     # All rows, just the last column

# Convert features DataFrame to a NumPy array if necessary
x_train = features_new.to_numpy()

In [5]:
filtered_df.shape

(635, 101)

In [6]:
#para
epochs=20
batch_size=127

In [7]:
# Train the model
vae.fit(x_train, epochs=epochs, batch_size=batch_size)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x1ea9e19d8a0>

In [8]:
# Filter to only include instances with label 1 (interactions)
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]

# Separate features
features_new = filtered_df.iloc[:, :-1]  # Assuming the last column is the label

In [9]:
x_new = features_new.to_numpy()

In [10]:
z_mean, z_log_var, z = vae.encoder.predict(x_new)
# Now, z contains the latent representations of  filtered data.



In [11]:
reconstructed_new = vae.predict(x_new)



In [12]:
# Convert the reconstructed data to a DataFrame
reconstructed_df = pd.DataFrame(reconstructed_new)

# Display the first few rows of the reconstructed DataFrame
print(pd.DataFrame(x_new))
print(reconstructed_df)

           0         1         2         3         4         5         6   \
0   -0.622757 -0.427074 -0.546902 -0.681086 -0.616897 -0.399852 -0.646469   
1   -0.432677 -0.327581 -0.411204 -0.477012 -0.441872 -0.262467 -0.407401   
2   -0.432677 -0.327581 -0.411204 -0.477012 -0.441872 -0.262467 -0.407401   
3   -0.432677 -0.327581 -0.411204 -0.477012 -0.441872 -0.262467 -0.407401   
4   -0.420101 -0.217427 -0.451843 -0.451589 -0.261617 -0.356622 -0.460438   
..        ...       ...       ...       ...       ...       ...       ...   
630 -0.549045 -0.404213 -0.456180 -0.632895 -0.563047 -0.400921 -0.567501   
631 -0.553836 -0.410753 -0.452271 -0.635185 -0.561335 -0.401089 -0.567444   
632 -0.528812 -0.379926 -0.418093 -0.607398 -0.547064 -0.384577 -0.551720   
633 -0.531418 -0.384527 -0.433663 -0.608771 -0.546250 -0.395371 -0.549275   
634 -0.622019 -0.426315 -0.545463 -0.684833 -0.615999 -0.395611 -0.650696   

           7         8         9   ...        90        91        92  \
0  

In [13]:
# Add a new column 'Label' with all values set to 1
reconstructed_df[100] = 1

# Display the first few rows to verify the new column
print(reconstructed_df.head())

        0         1         2         3         4         5         6    \
0 -0.767552 -1.027751 -0.528422 -1.121074 -0.360472 -0.633081 -0.486807   
1 -0.523895 -0.431203 -0.421358 -1.115925 -0.642327 -0.796661 -0.634993   
2 -0.775185 -1.118137 -0.672486 -0.925044 -0.772080 -0.521150 -0.291339   
3 -0.554072 -0.115190 -0.502083 -0.271370 -0.569772 -0.519935 -0.550338   
4 -0.509177 -0.288525 -0.163922 -0.259877 -0.334433 -0.407594 -0.160249   

        7         8         9    ...       91        92        93        94   \
0 -0.843536 -0.820084 -0.759582  ...  0.291385  0.293064  0.271236  0.484003   
1 -0.422538 -0.314773 -0.629746  ...  0.473200  0.495974  0.267658  0.276563   
2 -0.893955 -1.031708 -0.669688  ...  0.484859  0.204295  0.187359  0.682945   
3 -0.619349 -0.445922 -0.328220  ...  0.519407  0.311631  0.595487  0.394250   
4 -0.461715 -0.379078 -0.196680  ...  0.184376  0.249424  0.348595  0.217766   

        95        96        97        98        99   100  
0  0.1887

In [14]:
num_samples_to_generate = 19915  # The number of new rows want to generate

# Generate random samples from the latent space
latent_dim = 16  # Ensure this matches the latent dimension size of  VAE
z_new_samples = np.random.normal(size=(num_samples_to_generate, latent_dim))

# Use the decoder to generate new data
new_data_generated = vae.decoder.predict(z_new_samples)





In [15]:
new_data_generated

array([[-0.4639421 , -0.41055927, -0.1820075 , ...,  0.25906622,
         0.29884842,  0.28503722],
       [-0.54088026, -0.2505464 , -0.43269134, ...,  0.2952692 ,
         0.5592264 ,  0.36449322],
       [-0.3724004 , -0.3867712 , -0.34770685, ...,  0.23460422,
         0.43924147,  0.21839231],
       ...,
       [-0.45168483, -0.5824973 , -0.28928208, ...,  0.25632712,
         0.29967922,  0.28209397],
       [-0.377553  , -0.4255461 , -0.15130213, ...,  0.28396446,
         0.27018338,  0.3044331 ],
       [-0.7326226 , -0.8165902 , -0.1666347 , ...,  0.12669772,
         0.32729053,  0.3468352 ]], dtype=float32)

In [16]:
# Convert the generated data to a DataFrame
new_data_df = pd.DataFrame(new_data_generated)

# Add a column 'Label' with all values set to 1
new_data_df[100] = 1

In [17]:
enhanced_df = pd.concat([data_frame, new_data_df], axis=0).reset_index(drop=True)

In [18]:
enhanced_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,-0.622757,-0.427074,-0.546902,-0.681086,-0.616897,-0.399852,-0.646469,-0.473096,-0.398657,-0.460780,...,0.153380,0.292761,0.272396,0.308351,0.318655,0.254048,0.237134,0.212641,0.222496,0.0
1,-0.622757,-0.427074,-0.546902,-0.681086,-0.616897,-0.399852,-0.646469,-0.473096,-0.398657,-0.460780,...,0.026970,0.387047,0.189493,0.333211,0.316694,0.095216,0.277618,0.255762,0.634482,0.0
2,-0.622757,-0.427074,-0.546902,-0.681086,-0.616897,-0.399852,-0.646469,-0.473096,-0.398657,-0.460780,...,0.175600,0.311160,0.309869,0.324986,0.349944,0.265524,0.263268,0.237989,0.223606,0.0
3,-0.622757,-0.427074,-0.546902,-0.681086,-0.616897,-0.399852,-0.646469,-0.473096,-0.398657,-0.460780,...,0.148754,0.291764,0.288028,0.304497,0.324409,0.249166,0.236031,0.222158,0.211160,0.0
4,-0.622757,-0.427074,-0.546902,-0.681086,-0.616897,-0.399852,-0.646469,-0.473096,-0.398657,-0.460780,...,0.532137,0.032426,0.061560,0.536695,0.296054,0.281888,0.210928,0.126561,0.074251,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41095,-1.167146,-0.773598,-0.408801,-0.361071,-0.592888,-0.712328,-0.773887,-0.839873,-0.909286,-0.876976,...,0.343708,0.453005,0.353923,0.161486,0.137816,0.452746,0.455927,0.430444,0.604609,1.0
41096,-0.730467,-1.271471,-0.432713,-0.866111,-0.806027,-0.912486,-0.260361,-0.866017,-1.079413,-0.887795,...,0.280170,0.208248,-0.020355,0.457036,-0.107935,0.394542,0.382657,0.546899,0.446768,1.0
41097,-0.451685,-0.582497,-0.289282,-0.636545,-0.051089,-0.379605,-0.474093,-0.426053,-0.249029,-0.678915,...,0.162749,0.173339,0.109635,0.151972,0.187001,0.224019,0.256327,0.299679,0.282094,1.0
41098,-0.377553,-0.425546,-0.151302,-0.393559,-0.184409,-0.360608,-0.299241,-0.486215,-0.316570,-0.166165,...,0.119539,0.105926,0.169582,0.327186,0.094141,0.263206,0.283964,0.270183,0.304433,1.0


In [19]:
file_name='enhanced_VAE_final_new_par_50_LFM_space_3.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
enhanced_df.to_csv(output_path, index=False)