### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time
import keras

C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K


class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

def build_encoder(latent_dim=20):
    encoder_inputs = layers.Input(shape=(100,)) #change here features*2
    x = layers.Dense(256, activation='relu')(encoder_inputs)
    x = layers.Dense(64, activation='relu')(x)
    z_mean = layers.Dense(latent_dim)(x)
    z_log_var = layers.Dense(latent_dim)(x)
    z = Sampling()([z_mean, z_log_var])
    encoder = models.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
    return encoder

def build_decoder(latent_dim=20):
    latent_inputs = layers.Input(shape=(latent_dim,))
    x = layers.Dense(64, activation='relu')(latent_inputs)
    x = layers.Dense(256, activation='relu')(x)
    decoder_outputs = layers.Dense(100, activation='linear')(x) #change here features*2
    decoder = models.Model(latent_inputs, decoder_outputs, name="decoder")
    return decoder

class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Initialize trackers for monitoring losses
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        # Return list of metrics to be updated during training
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def call(self, inputs, training=False):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Compute KL divergence loss even during inference to track loss correctly
        kl_loss = -0.5 * tf.reduce_mean(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
        # Only add KL loss during training
        if training:
            self.add_loss(kl_loss)
        return reconstructed

    def train_step(self, data):
    # Unpack the data
        x = data[0] if isinstance(data, tuple) else data

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(x, training=True)
            reconstruction = self.decoder(z, training=True)

            # If  data is flat (e.g., shape=(batch_size, features)), adjust axis accordingly
            reconstruction_loss = tf.reduce_mean(
                keras.losses.binary_crossentropy(x, reconstruction), axis=-1
            )
            reconstruction_loss = tf.reduce_sum(reconstruction_loss)  # Sum over all dimensions

            kl_loss = -0.5 * tf.reduce_mean(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1
            )
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        return {'loss': total_loss, 'reconstruction_loss': reconstruction_loss, 'kl_loss': kl_loss}

# Example usage:
latent_dim = 16  
encoder = build_encoder(latent_dim)
decoder = build_decoder(latent_dim)
vae = VAE(encoder, decoder)
#vae.compile(optimizer='adam')
vae.compile(optimizer=tf.keras.optimizers.Adam())


In [3]:
#relative paths. # Set directory paths for later use.
# Get the directory of the script file
base_dir = os.getcwd()
base_dir
ligants_type=['enzyme','GPCR','ion_channel','nuclear_receptor']
ltype=ligants_type[3]
file_name='final_new_par_LMF_50.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
data_frame = pd.read_csv(file_path, header=None, skiprows=1)


In [4]:
# Filter to only include instances with label 1 (interactions)
# Separate features and labels
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]  # All rows, all columns except the last one
features_new = filtered_df.iloc[:, :-1]     # All rows, just the last column

# Convert features DataFrame to a NumPy array if necessary
x_train = features_new.to_numpy()

In [5]:
filtered_df.shape

(90, 101)

In [6]:
#para
epochs=4
batch_size=45

In [7]:
# Train the model
vae.fit(x_train, epochs=epochs, batch_size=batch_size)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x1bc9e149870>

In [8]:
# Filter to only include instances with label 1 (interactions)
filtered_df = data_frame[data_frame.iloc[:, -1] == 1]

# Separate features
features_new = filtered_df.iloc[:, :-1]  # Assuming the last column is the label

In [9]:
x_new = features_new.to_numpy()

In [10]:
z_mean, z_log_var, z = vae.encoder.predict(x_new)
# Now, z contains the latent representations of  filtered data.



In [11]:
reconstructed_new = vae.predict(x_new)



In [12]:
# Convert the reconstructed data to a DataFrame
reconstructed_df = pd.DataFrame(reconstructed_new)

# Display the first few rows of the reconstructed DataFrame
print(pd.DataFrame(x_new))
print(reconstructed_df)

          0         1         2         3         4         5         6   \
0  -0.565347 -0.426805 -0.278643 -0.341101 -0.444826 -0.245312 -0.517644   
1  -0.138126 -0.144002 -0.076164 -0.028113 -0.119450 -0.135161 -0.008757   
2  -0.138126 -0.144002 -0.076164 -0.028113 -0.119450 -0.135161 -0.008757   
3  -0.138126 -0.144002 -0.076164 -0.028113 -0.119450 -0.135161 -0.008757   
4  -0.138126 -0.144002 -0.076164 -0.028113 -0.119450 -0.135161 -0.008757   
..       ...       ...       ...       ...       ...       ...       ...   
85 -0.324121 -0.261553 -0.193835 -0.238345 -0.279753 -0.168059 -0.311657   
86 -0.324121 -0.261553 -0.193835 -0.238345 -0.279753 -0.168059 -0.311657   
87 -0.324121 -0.261553 -0.193835 -0.238345 -0.279753 -0.168059 -0.311657   
88 -0.476878 -0.382709 -0.265242 -0.321077 -0.393143 -0.221384 -0.442175   
89 -0.458062 -0.391875 -0.250347 -0.337288 -0.403554 -0.219870 -0.441276   

          7         8         9   ...        90        91        92        93  \
0  -0.

In [13]:
# Add a new column 'Label' with all values set to 1
reconstructed_df[100] = 1

# Display the first few rows to verify the new column
print(reconstructed_df.head())

        0         1         2         3         4         5         6    \
0 -0.064523 -0.134328  0.080037 -0.245222 -0.459697 -0.651397 -0.347039   
1  0.037169  0.015262 -0.042780 -0.370491 -0.212642 -0.359512 -0.283710   
2 -0.108611  0.055054 -0.159019 -0.220739 -0.044753 -0.554277 -0.267824   
3 -0.078665  0.016328 -0.127069 -0.263205 -0.208562 -0.059981 -0.079525   
4 -0.130004  0.042647  0.045475  0.086974 -0.212211  0.124418 -0.105467   

        7         8         9    ...       91        92        93        94   \
0 -0.406750 -0.430594 -0.277396  ...  0.101460 -0.143896  0.244717  0.292520   
1 -0.193819  0.022242 -0.040625  ...  0.141334  0.042713  0.301073  0.215140   
2 -0.264545 -0.263012 -0.139345  ...  0.126966 -0.230981  0.195033  0.317356   
3 -0.177945  0.077954  0.000507  ...  0.060492  0.030883  0.168405  0.130464   
4 -0.155988 -0.299326 -0.002633  ...  0.427540 -0.073234  0.146324  0.052302   

        95        96        97        98        99   100  
0  0.1198

In [14]:
num_samples_to_generate = 1224  # The number of new rows want to generate

# Generate random samples from the latent space
latent_dim = 16  # Ensure this matches the latent dimension size of  VAE
z_new_samples = np.random.normal(size=(num_samples_to_generate, latent_dim))

# Use the decoder to generate new data
new_data_generated = vae.decoder.predict(z_new_samples)





In [15]:
new_data_generated

array([[-4.86626863e-01,  2.46674232e-02, -1.14546113e-01, ...,
         9.68806386e-01,  2.79888690e-01, -6.48455203e-01],
       [-1.92022786e-01, -2.71976352e-01, -1.47042230e-01, ...,
         2.55978763e-01,  2.49570280e-01, -4.56500947e-01],
       [-4.49671417e-01,  1.29464447e-01, -2.66862482e-01, ...,
         3.59625816e-01,  3.57754618e-01, -4.47723448e-01],
       ...,
       [-5.75167477e-01, -1.06650010e-01, -1.20694518e-01, ...,
         2.30109289e-01,  2.72606969e-01, -3.56229037e-01],
       [ 9.45773441e-04,  2.68804491e-01,  7.34529868e-02, ...,
         6.94784224e-01, -3.34284492e-02, -2.85421431e-01],
       [-9.20171440e-02, -1.12558134e-01,  9.25085470e-02, ...,
         7.56215215e-01,  1.21875912e-01, -4.83328044e-01]], dtype=float32)

In [16]:
# Convert the generated data to a DataFrame
new_data_df = pd.DataFrame(new_data_generated)

# Add a column 'Label' with all values set to 1
new_data_df[100] = 1

In [17]:
enhanced_df = pd.concat([data_frame, new_data_df], axis=0).reset_index(drop=True)

In [18]:
enhanced_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,-0.565347,-0.426805,-0.278643,-0.341101,-0.444826,-0.245312,-0.517644,-0.259974,-0.276345,-0.516024,...,0.243418,0.006033,0.263965,0.164053,0.323660,0.205177,0.183918,0.174048,0.147949,0
1,-0.565347,-0.426805,-0.278643,-0.341101,-0.444826,-0.245312,-0.517644,-0.259974,-0.276345,-0.516024,...,0.267232,-0.005838,0.239797,0.170993,0.350185,0.209856,0.197427,0.259254,0.131175,0
2,-0.565347,-0.426805,-0.278643,-0.341101,-0.444826,-0.245312,-0.517644,-0.259974,-0.276345,-0.516024,...,0.324669,0.001468,0.305311,0.217168,0.407196,0.257649,0.242267,0.282082,0.162188,0
3,-0.565347,-0.426805,-0.278643,-0.341101,-0.444826,-0.245312,-0.517644,-0.259974,-0.276345,-0.516024,...,0.286988,0.032042,0.299079,0.199626,0.369988,0.260502,0.221786,0.214709,0.187751,0
4,-0.565347,-0.426805,-0.278643,-0.341101,-0.444826,-0.245312,-0.517644,-0.259974,-0.276345,-0.516024,...,0.262091,0.029252,0.277777,0.183678,0.360393,0.226148,0.206291,0.197065,0.162283,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2623,-0.243129,-0.122140,-0.053377,-0.231776,-0.374284,-0.378557,0.077639,-0.592342,-0.456843,-0.242238,...,0.556469,-0.178810,0.290289,0.407412,0.339677,0.191231,0.469876,0.341804,-0.536705,1
2624,-0.002513,0.156427,-0.053416,-0.243952,-0.528682,-0.177617,-0.454571,-0.386666,-0.018120,0.011200,...,0.446218,0.106677,0.457910,-0.003070,0.426516,0.250798,0.433788,-0.024218,-0.281797,1
2625,-0.575167,-0.106650,-0.120695,-0.454638,-0.120757,-1.037028,-0.209787,-0.478722,-0.273956,0.151954,...,0.224131,0.094253,0.424761,0.391546,0.420641,0.136863,0.230109,0.272607,-0.356229,1
2626,0.000946,0.268804,0.073453,-0.121740,-0.490353,-0.250157,-0.165540,0.066354,-0.518123,-0.103939,...,0.461725,-0.312239,0.418163,0.314173,0.035887,0.520013,0.694784,-0.033428,-0.285421,1


In [19]:
file_name='enhanced_VAE_final_new_par_50_LMF_space_3.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
enhanced_df.to_csv(output_path, index=False)