### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time
import keras


import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers, models, backend as K

C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
# WGAN-GP Architecture
def make_generator_model(input_dim, output_dim):
    model = tf.keras.Sequential()
    model.add(layers.Dense(256, activation='relu', input_dim=input_dim))
    model.add(layers.Dense(512, activation='relu'))
    model.add(layers.Dense(1024, activation='relu'))
    model.add(layers.Dense(output_dim, activation='linear'))  # Linear activation for WGAN
    return model

def make_critic_model(input_dim):
    model = tf.keras.Sequential()
    model.add(layers.Dense(512, activation='relu', input_dim=input_dim))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dense(1))  # No activation, linear output
    return model

generator = make_generator_model(100, 101)
critic = make_critic_model(101)

# Losses and training
def critic_loss(real_output, fake_output):
    return tf.reduce_mean(fake_output) - tf.reduce_mean(real_output)

def generator_loss(fake_output):
    return -tf.reduce_mean(fake_output)

def gradient_penalty(batch_size, real_images, fake_images, critic):
    epsilon = tf.random.normal([batch_size, 1], 0.0, 1.0)
    interpolated = epsilon * real_images + (1 - epsilon) * fake_images
    with tf.GradientTape() as tape:
        tape.watch(interpolated)
        pred = critic(interpolated, training=True)
    grads = tape.gradient(pred, [interpolated])[0]
    norm = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=[1]))
    gp = tf.reduce_mean((norm - 1.0) ** 2)
    return gp

def train_step(generator, critic, batch_size, generator_optimizer, critic_optimizer, real_features):
    # Append a label column to real_features to match the critic's input expectations
    labels = tf.ones((batch_size, 1))  # Assume label 1 for all positive samples
    real_data = tf.concat([real_features, labels], axis=1)
    
    noise = tf.random.normal([batch_size, generator.input_shape[1]])
    with tf.GradientTape() as gen_tape, tf.GradientTape() as crit_tape:
        generated_data = generator(noise, training=True)

        real_output = critic(real_data, training=True)
        fake_output = critic(generated_data, training=True)

        crit_loss = critic_loss(real_output, fake_output)
        gen_loss = generator_loss(fake_output)
        penalty = gradient_penalty(batch_size, real_data, generated_data, critic)
        crit_loss += 10 * penalty  # lambda for gradient penalty

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_critic = crit_tape.gradient(crit_loss, critic.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    critic_optimizer.apply_gradients(zip(gradients_of_critic, critic.trainable_variables))

    return crit_loss, gen_loss




In [3]:
#relative paths. # Set directory paths for later use.
# Get the directory of the script file
base_dir = os.getcwd()
ligants_type = ['enzyme', 'GPCR', 'ion_channel', 'nuclear_receptor']
ltype = ligants_type[1]
file_name = 'final_new_par_50.csv'
file_path = os.path.join(base_dir, 'data', 'split', ltype, file_name)
data_frame = pd.read_csv(file_path, header=None, skiprows=1)
features = data_frame.iloc[:, :-1].values
labels = data_frame.iloc[:, -1].values
# Filter to get only the positive samples
positive_features = features[labels == 1]

In [4]:
print(positive_features.shape)
print(features.shape)
print(labels.shape)
len(positive_features)

(635, 100)
(21185, 100)
(21185,)


635

In [5]:
# Training parameters
num_samples_to_generate = 19915
epochs = 100
batch_size = 256
learning_rate=0.0001
beta_1=0.5
generator_optimizer = Adam(learning_rate=learning_rate, beta_1=beta_1)
critic_optimizer = Adam(learning_rate=learning_rate, beta_1=beta_1)

# Training loop
for epoch in range(epochs):
    for batch in range(0, len(positive_features), batch_size):
        real_data_batch = positive_features[batch:batch + batch_size]
        if real_data_batch.shape[0] != batch_size:  # Handle last batch which may be smaller
            continue  # Skip if the batch isn't full size
        crit_loss, gen_loss = train_step(generator, critic, batch_size, generator_optimizer, critic_optimizer, real_data_batch)
        print(f'Epoch {epoch}, Batch {batch // batch_size}, Critic Loss: {crit_loss.numpy()}, Generator Loss: {gen_loss.numpy()}')

Epoch 0, Batch 0, Critic Loss: 2.5564253330230713, Generator Loss: 0.03385821357369423
Epoch 0, Batch 1, Critic Loss: 2.5642354488372803, Generator Loss: -0.05705900490283966
Epoch 1, Batch 0, Critic Loss: 2.58237886428833, Generator Loss: -0.15056344866752625
Epoch 1, Batch 1, Critic Loss: 2.5908639430999756, Generator Loss: -0.26303568482398987
Epoch 2, Batch 0, Critic Loss: 2.5840158462524414, Generator Loss: -0.3745231628417969
Epoch 2, Batch 1, Critic Loss: 2.562553882598877, Generator Loss: -0.507118284702301
Epoch 3, Batch 0, Critic Loss: 2.63327693939209, Generator Loss: -0.6742141246795654
Epoch 3, Batch 1, Critic Loss: 2.576385021209717, Generator Loss: -0.8392695784568787
Epoch 4, Batch 0, Critic Loss: 2.6579508781433105, Generator Loss: -1.045945167541504
Epoch 4, Batch 1, Critic Loss: 2.6984939575195312, Generator Loss: -1.27028226852417
Epoch 5, Batch 0, Critic Loss: 2.856401205062866, Generator Loss: -1.5610909461975098
Epoch 5, Batch 1, Critic Loss: 3.015629291534424, G

Epoch 48, Batch 1, Critic Loss: 0.620330274105072, Generator Loss: 1.1947033405303955
Epoch 49, Batch 0, Critic Loss: 0.6304507851600647, Generator Loss: 1.19431471824646
Epoch 49, Batch 1, Critic Loss: 0.6377188563346863, Generator Loss: 1.2334716320037842
Epoch 50, Batch 0, Critic Loss: 0.6416301131248474, Generator Loss: 1.2162528038024902
Epoch 50, Batch 1, Critic Loss: 0.6381792426109314, Generator Loss: 1.2550854682922363
Epoch 51, Batch 0, Critic Loss: 0.6634122133255005, Generator Loss: 1.279773235321045
Epoch 51, Batch 1, Critic Loss: 0.5445777177810669, Generator Loss: 1.2874541282653809
Epoch 52, Batch 0, Critic Loss: 0.5827305912971497, Generator Loss: 1.2543034553527832
Epoch 52, Batch 1, Critic Loss: 0.5703232288360596, Generator Loss: 1.2687335014343262
Epoch 53, Batch 0, Critic Loss: 0.5234785079956055, Generator Loss: 1.2556546926498413
Epoch 53, Batch 1, Critic Loss: 0.5060654282569885, Generator Loss: 1.217976450920105
Epoch 54, Batch 0, Critic Loss: 0.59979617595672

Epoch 96, Batch 1, Critic Loss: 0.6491603851318359, Generator Loss: 0.6258448362350464
Epoch 97, Batch 0, Critic Loss: 0.5969604253768921, Generator Loss: 0.6202605962753296
Epoch 97, Batch 1, Critic Loss: 0.6386134624481201, Generator Loss: 0.6194424629211426
Epoch 98, Batch 0, Critic Loss: 0.5793331861495972, Generator Loss: 0.6172288060188293
Epoch 98, Batch 1, Critic Loss: 0.6307359933853149, Generator Loss: 0.6057853102684021
Epoch 99, Batch 0, Critic Loss: 0.5985867977142334, Generator Loss: 0.6053241491317749
Epoch 99, Batch 1, Critic Loss: 0.6174975633621216, Generator Loss: 0.6088495254516602


In [6]:
# Assuming the last column name in  original dataframe represents the label
all_column_names = data_frame.columns.tolist()  # This should have 101 names if the label is included in data_frame

# Generate synthetic data
noise = tf.random.normal([num_samples_to_generate, 100])
synthetic_data = generator(noise, training=False)
synthetic_data_df = pd.DataFrame(synthetic_data.numpy(), columns=all_column_names)
# Set the label for all generated data to 1
synthetic_data_df[all_column_names[-1]] = 1

In [None]:
#synthetic_data

In [7]:
# Combine original and synthetic data
enhanced_df = pd.concat([data_frame, synthetic_data_df], axis=0).reset_index(drop=True)
file_name='enhanced_GAN_final_new_par_50_space_1.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
enhanced_df.to_csv(output_path, index=False)