### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import os
import json
import requests
from tqdm import tqdm
import time
import keras


import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers, models, backend as K

C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\riskf\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
# WGAN-GP Architecture
def make_generator_model(input_dim, output_dim):
    model = tf.keras.Sequential()
    model.add(layers.Dense(256, activation='relu', input_dim=input_dim))
    model.add(layers.Dense(512, activation='relu'))
    model.add(layers.Dense(1024, activation='relu'))
    model.add(layers.Dense(output_dim, activation='linear'))  # Linear activation for WGAN
    return model

def make_critic_model(input_dim):
    model = tf.keras.Sequential()
    model.add(layers.Dense(512, activation='relu', input_dim=input_dim))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dense(1))  # No activation, linear output
    return model

generator = make_generator_model(100, 101)
critic = make_critic_model(101)

# Losses and training
def critic_loss(real_output, fake_output):
    return tf.reduce_mean(fake_output) - tf.reduce_mean(real_output)

def generator_loss(fake_output):
    return -tf.reduce_mean(fake_output)

def gradient_penalty(batch_size, real_images, fake_images, critic):
    epsilon = tf.random.normal([batch_size, 1], 0.0, 1.0)
    interpolated = epsilon * real_images + (1 - epsilon) * fake_images
    with tf.GradientTape() as tape:
        tape.watch(interpolated)
        pred = critic(interpolated, training=True)
    grads = tape.gradient(pred, [interpolated])[0]
    norm = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=[1]))
    gp = tf.reduce_mean((norm - 1.0) ** 2)
    return gp

def train_step(generator, critic, batch_size, generator_optimizer, critic_optimizer, real_features):
    # Append a label column to real_features to match the critic's input expectations
    labels = tf.ones((batch_size, 1))  # Assume label 1 for all positive samples
    real_data = tf.concat([real_features, labels], axis=1)
    
    noise = tf.random.normal([batch_size, generator.input_shape[1]])
    with tf.GradientTape() as gen_tape, tf.GradientTape() as crit_tape:
        generated_data = generator(noise, training=True)

        real_output = critic(real_data, training=True)
        fake_output = critic(generated_data, training=True)

        crit_loss = critic_loss(real_output, fake_output)
        gen_loss = generator_loss(fake_output)
        penalty = gradient_penalty(batch_size, real_data, generated_data, critic)
        crit_loss += 10 * penalty  # lambda for gradient penalty

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_critic = crit_tape.gradient(crit_loss, critic.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    critic_optimizer.apply_gradients(zip(gradients_of_critic, critic.trainable_variables))

    return crit_loss, gen_loss




In [3]:
#relative paths. # Set directory paths for later use.
# Get the directory of the script file
base_dir = os.getcwd()
ligants_type = ['enzyme', 'GPCR', 'ion_channel', 'nuclear_receptor']
ltype = ligants_type[1]
file_name = 'final_new_par_NNMF_50.csv'
file_path = os.path.join(base_dir, 'data', 'split', ltype, file_name)
data_frame = pd.read_csv(file_path, header=None, skiprows=1)
features = data_frame.iloc[:, :-1].values
labels = data_frame.iloc[:, -1].values
# Filter to get only the positive samples
positive_features = features[labels == 1]

In [4]:
print(positive_features.shape)
print(features.shape)
print(labels.shape)
len(positive_features)

(635, 100)
(21185, 100)
(21185,)


635

In [5]:
# Training parameters
num_samples_to_generate = 19915
epochs = 100
batch_size = 256
learning_rate=0.0001
beta_1=0.5
generator_optimizer = Adam(learning_rate=learning_rate, beta_1=beta_1)
critic_optimizer = Adam(learning_rate=learning_rate, beta_1=beta_1)

# Training loop
for epoch in range(epochs):
    for batch in range(0, len(positive_features), batch_size):
        real_data_batch = positive_features[batch:batch + batch_size]
        if real_data_batch.shape[0] != batch_size:  # Handle last batch which may be smaller
            continue  # Skip if the batch isn't full size
        crit_loss, gen_loss = train_step(generator, critic, batch_size, generator_optimizer, critic_optimizer, real_data_batch)
        print(f'Epoch {epoch}, Batch {batch // batch_size}, Critic Loss: {crit_loss.numpy()}, Generator Loss: {gen_loss.numpy()}')

Epoch 0, Batch 0, Critic Loss: 2.481267213821411, Generator Loss: -0.05738067999482155
Epoch 0, Batch 1, Critic Loss: 2.5392813682556152, Generator Loss: -0.14112913608551025
Epoch 1, Batch 0, Critic Loss: 2.51015043258667, Generator Loss: -0.22798646986484528
Epoch 1, Batch 1, Critic Loss: 2.56207275390625, Generator Loss: -0.33638182282447815
Epoch 2, Batch 0, Critic Loss: 2.424968719482422, Generator Loss: -0.45749056339263916
Epoch 2, Batch 1, Critic Loss: 2.5199995040893555, Generator Loss: -0.5905004739761353
Epoch 3, Batch 0, Critic Loss: 2.489124298095703, Generator Loss: -0.7844465374946594
Epoch 3, Batch 1, Critic Loss: 2.412771701812744, Generator Loss: -0.9533607363700867
Epoch 4, Batch 0, Critic Loss: 2.459195852279663, Generator Loss: -1.2034063339233398
Epoch 4, Batch 1, Critic Loss: 2.560898780822754, Generator Loss: -1.4659512042999268
Epoch 5, Batch 0, Critic Loss: 2.686497688293457, Generator Loss: -1.7751145362854004
Epoch 5, Batch 1, Critic Loss: 2.8547589778900146

Epoch 47, Batch 1, Critic Loss: -0.6892296075820923, Generator Loss: 2.0204238891601562
Epoch 48, Batch 0, Critic Loss: -0.7755508422851562, Generator Loss: 2.044858455657959
Epoch 48, Batch 1, Critic Loss: -0.8713946342468262, Generator Loss: 2.1144802570343018
Epoch 49, Batch 0, Critic Loss: -0.8298988342285156, Generator Loss: 2.0514156818389893
Epoch 49, Batch 1, Critic Loss: -0.7751446962356567, Generator Loss: 2.0109262466430664
Epoch 50, Batch 0, Critic Loss: -0.6986001133918762, Generator Loss: 1.952492117881775
Epoch 50, Batch 1, Critic Loss: -0.6713316440582275, Generator Loss: 1.9034805297851562
Epoch 51, Batch 0, Critic Loss: -0.6979022026062012, Generator Loss: 1.9174574613571167
Epoch 51, Batch 1, Critic Loss: -0.6015505194664001, Generator Loss: 1.8561232089996338
Epoch 52, Batch 0, Critic Loss: -0.5759337544441223, Generator Loss: 1.8323843479156494
Epoch 52, Batch 1, Critic Loss: -0.5413241386413574, Generator Loss: 1.7969884872436523
Epoch 53, Batch 0, Critic Loss: -0

Epoch 96, Batch 0, Critic Loss: 1.323354959487915, Generator Loss: 0.4919777512550354
Epoch 96, Batch 1, Critic Loss: 1.28952157497406, Generator Loss: 0.4756891131401062
Epoch 97, Batch 0, Critic Loss: 1.2677699327468872, Generator Loss: 0.4839947521686554
Epoch 97, Batch 1, Critic Loss: 1.3290973901748657, Generator Loss: 0.46788424253463745
Epoch 98, Batch 0, Critic Loss: 1.3106372356414795, Generator Loss: 0.46462756395339966
Epoch 98, Batch 1, Critic Loss: 1.329371452331543, Generator Loss: 0.4877927601337433
Epoch 99, Batch 0, Critic Loss: 1.331176519393921, Generator Loss: 0.47941726446151733
Epoch 99, Batch 1, Critic Loss: 1.2674808502197266, Generator Loss: 0.48664385080337524


In [6]:
# Assuming the last column name in  original dataframe represents the label
all_column_names = data_frame.columns.tolist()  # This should have 101 names if the label is included in data_frame

# Generate synthetic data
noise = tf.random.normal([num_samples_to_generate, 100])
synthetic_data = generator(noise, training=False)
synthetic_data_df = pd.DataFrame(synthetic_data.numpy(), columns=all_column_names)
# Set the label for all generated data to 1
synthetic_data_df[all_column_names[-1]] = 1

In [7]:
#synthetic_data

In [8]:
# Combine original and synthetic data
enhanced_df = pd.concat([data_frame, synthetic_data_df], axis=0).reset_index(drop=True)
file_name='enhanced_GAN_final_new_par_50_NNFM_space_2.csv'
file_path = os.path.join(base_dir,'data','split',ltype, file_name)
output_path = file_path
enhanced_df.to_csv(output_path, index=False)