<a href="https://colab.research.google.com/github/yutaro-tanaka-yt2705/ag-cgan/blob/main/3_build_own_cgan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Import our common libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import gzip
import sys
import warnings

In [2]:
#Import our DL libraries
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import regularizers, layers
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense, Activation, LeakyReLU, BatchNormalization, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential, load_model, save_model, model_from_json
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from torch.optim import Adam
import torch.cuda as cuda
import torch.nn as nn
import torch
from torch.distributions import Normal
######################################
# Ask TensorFlow for the name of the GPU device it can see; print a reminder
# when it is anything other than the first GPU.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print('Check GPU')

In [3]:
#Saving model
def save_mod(g, d, epo,folder='/content/drive/MyDrive/artificial_genome_project/cgan_models/'):
    """Save a generator/discriminator pair under `folder`.

    Args:
        g: generator model, written to ``<folder><epo>_generator``.
        d: discriminator model, written to ``<folder><epo>_discriminator``.
        epo: checkpoint tag (converted with ``str``) used as the file-name prefix.
        folder: destination prefix; it is concatenated as-is, so it should end
            with a path separator.

    Side effects:
        Sets ``d.trainable = True`` before saving and leaves it that way.
    """
    # Operate on the model that was passed in, not on a global named
    # `discriminator`, so the function also works for other model objects.
    # NOTE(review): presumably done so the discriminator is not stored frozen.
    d.trainable = True
    save_model(g, folder+str(epo)+"_generator")
    save_model(d, folder+str(epo)+"_discriminator")

In [4]:
#Import our data
# Location of the genotype table on the mounted Drive.
dataset_csv_path = '/content/drive/MyDrive/artificial_genome_project/test_dataset.csv'
data = pd.read_csv(dataset_csv_path)
# Show the first rows as a quick sanity check of the columns.
data.head()

Unnamed: 0,ID,pair,X_1343510_CA_C,X_2464340_G_A,X_844565_T_G,X_1443587_C_T,X_846946_A_G,X_155783945_C_T,X_2348441_C_T,X_1495791_C_G,...,X_1494970_G_A,X_2272724_A_C,X_2441570_T_C,X_1420810_T_C,X_719380_A_G,X_1115457_A_G,X_2449045_T_C,X_1165297_A_G,population_code,suppop_code
0,HG00096,A,0,0,0,1,0,1,1,1,...,0,1,0,0,0,0,0,1,10.0,3.0
1,HG00096,B,0,0,1,1,1,1,0,1,...,0,1,0,0,0,1,0,0,10.0,3.0
2,HG00097,A,0,0,1,1,1,1,0,1,...,0,0,0,1,0,0,0,0,10.0,3.0
3,HG00097,B,0,0,1,1,0,1,0,0,...,0,1,0,0,0,0,1,0,10.0,3.0
4,HG00099,A,0,0,1,1,1,1,0,0,...,0,1,1,1,0,0,1,0,10.0,3.0


In [5]:
# Training matrix: every column from position 2 up to (not including) the last two.
# NOTE(review): confirm that offset 2 skips all non-genotype columns (ID, pair, ...).
var_data = data.iloc[:, 2:-2]

# Alternative kept for reference: one-hot labels from the population code.
#labels = keras.utils.to_categorical(data.population_code.tolist(), len(data.population_code.unique()))

# One-hot superpopulation labels; the width is the number of distinct codes.
suppop_codes = data.suppop_code.tolist()
suppop_count = len(data.suppop_code.unique())
labels = keras.utils.to_categorical(suppop_codes, suppop_count)

print(var_data.shape, labels.shape)

(5096, 1000) (5096, 5)


In [6]:
#Implement hyperparameters
class Hyperparameter:
    """Plain namespace of cGAN settings, stored as class attributes.

    `num_classes` is read from the global `labels` array when the class body
    is executed, so `labels` must exist before this cell runs.
    """
    num_classes: int        = labels.shape[1]  # width of the one-hot label matrix
    num_channels: int       = 1                # value channels per site in the discriminator input
    batchsize: int          = 16               # samples per training batch
    num_epochs: int         = 20               # NOTE(review): not referenced by the cells shown here
    latent_size: int        = 64               # length of the noise vector fed to the generator
    n_critic: int           = 5                # NOTE(review): not referenced by the cells shown here
    critic_size: int        = 1024             # NOTE(review): not referenced by the cells shown here
    generator_size: int     = 1024             # NOTE(review): not referenced by the cells shown here
    critic_hidden_size: int = 1024             # NOTE(review): not referenced by the cells shown here
    gp_lambda: float        = 10.              # NOTE(review): not referenced by the cells shown here
    alpha: float            = 0.01             # LeakyReLU negative slope (was annotated as int)
    d_lr: float             = 0.008            # NOTE(review): not referenced by the cells shown here
    g_lr: float             = 0.008            # learning rate of the optimizer compiled into `cgan`

hp = Hyperparameter()

In [7]:
#Implement dataset
# Pair every row of var_data with its one-hot label row as a tf.data pipeline.
# Shuffling/batching is left disabled below, so iteration yields single examples.
dataset = tf.data.Dataset.from_tensor_slices((var_data, labels))
#dataset = dataset.shuffle(buffer_size=10).batch(hp.batchsize)

In [None]:
# Peek at the first (variants, label) element only, to confirm per-example shapes.
for e in dataset:
  print(e[0].shape, e[1].shape)
  break

(1000,) (5,)


In [None]:
#Implement Generator
# Generator: maps [latent noise | one-hot class label] to one value per variant column.
generator = Sequential(name='generator')
# The input shape must be a tuple; `(n)` without a trailing comma is just the integer n.
generator.add(keras.layers.InputLayer((hp.latent_size + hp.num_classes,)))
# Two hidden layers whose widths grow towards the number of variant columns.
generator.add(Dense(int(var_data.shape[1]//1.2), kernel_regularizer=regularizers.l2(0.0001)))
generator.add(LeakyReLU(alpha=hp.alpha))
generator.add(Dense(int(var_data.shape[1]//1.1), kernel_regularizer=regularizers.l2(0.0001)))
generator.add(LeakyReLU(alpha=hp.alpha))
# NOTE(review): tanh emits values in [-1, 1] while the genotype columns in the
# data preview are 0/1 -- confirm whether sigmoid or a later threshold is intended.
generator.add(Dense(var_data.shape[1], activation = 'tanh'))
# g_optimizer is the optimizer the custom training loop applies to the generator.
g_optimizer = keras.optimizers.Adam(learning_rate=0.0003)
generator.compile(optimizer=g_optimizer, loss='binary_crossentropy')

In [None]:
# Print the generator's layers with their output shapes and parameter counts.
generator.summary()

Model: "generator"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 833)               58310     
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 833)               0         
                                                                 
 dense_1 (Dense)             (None, 909)               758106    
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 909)               0         
                                                                 
 dense_2 (Dense)             (None, 1000)              910000    
                                                                 
Total params: 1,726,416
Trainable params: 1,726,416
Non-trainable params: 0
_________________________________________________________________


In [None]:
#Implement Discriminator
# Discriminator: scores a (sites, value + one-hot label) tensor as real or artificial.
discriminator = Sequential(name='discriminator')
# One row per variant column (taken from the data instead of a hard-coded 1000);
# each row holds the value channel(s) followed by the one-hot class label.
discriminator.add(keras.layers.InputLayer((var_data.shape[1], hp.num_channels + hp.num_classes)))
# Dense acts on the last axis, so these layers are applied to every site independently.
discriminator.add(Dense(var_data.shape[1]//2, kernel_regularizer=regularizers.l2(0.0001)))
discriminator.add(LeakyReLU(alpha=hp.alpha))
discriminator.add(Dense(var_data.shape[1]//3, kernel_regularizer=regularizers.l2(0.0001)))
discriminator.add(LeakyReLU(alpha=hp.alpha))
# Collapse the site axis by keeping the maximum of every feature.
discriminator.add(GlobalMaxPooling1D())
# Sigmoid output: the model emits a probability, not a logit.
discriminator.add(Dense(1, activation = 'sigmoid'))
# d_optimizer is the optimizer the custom training loop applies to the discriminator.
d_optimizer = keras.optimizers.Adam(learning_rate=0.0003)
discriminator.compile(optimizer=d_optimizer, loss='binary_crossentropy')
# Frozen by default; the training cell re-enables it before each discriminator step.
discriminator.trainable = False

In [None]:
# Print the discriminator's layers with their output shapes and parameter counts.
discriminator.summary()

Model: "discriminator"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 1000, 500)         3500      
                                                                 
 leaky_re_lu_2 (LeakyReLU)   (None, 1000, 500)         0         
                                                                 
 dense_4 (Dense)             (None, 1000, 333)         166833    
                                                                 
 leaky_re_lu_3 (LeakyReLU)   (None, 1000, 333)         0         
                                                                 
 global_max_pooling1d (Globa  (None, 333)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense_5 (Dense)             (None, 1)                 334       
                                                     

In [None]:
#Implement Conditional GAN
# Combined model: [latent | one-hot label] -> generator -> discriminator score.
# The generator emits (batch, sites) while the discriminator expects
# (batch, sites, value + label channels), so the two cannot simply be stacked in
# a Sequential; the output is reshaped and the label tiled along the site axis.
n_sites = var_data.shape[1]
cgan_input = Input(shape=(hp.latent_size + hp.num_classes,), name='latent_and_label')

# The label occupies the trailing num_classes entries of the generator input.
cgan_label = layers.Lambda(lambda t: t[:, hp.latent_size:], name='class_one_hot')(cgan_input)
cgan_label_per_site = layers.RepeatVector(n_sites)(cgan_label)  # (batch, sites, num_classes)

# Same trailing channel axis that the training loop adds with expand_dims.
cgan_generated = layers.Reshape((n_sites, 1))(generator(cgan_input))  # (batch, sites, 1)

cgan_score = discriminator(layers.Concatenate(axis=-1)([cgan_generated, cgan_label_per_site]))
cgan = keras.Model(cgan_input, cgan_score, name='cgan')

g_opt = keras.optimizers.Adam(learning_rate=hp.g_lr)
cgan.compile(optimizer=g_opt, loss='binary_crossentropy')

In [None]:
# Custom cGAN training loop: alternates one discriminator step and one generator
# step per batch, then checkpoints both models.
epoch = 200
batch_size = hp.batchsize
batch = var_data.shape[0]//batch_size #full batches per epoch; leftover rows are not visited
n_sites = var_data.shape[1] #variant columns per sample

# The discriminator ends in a sigmoid, so it emits probabilities, not logits.
loss_func = keras.losses.BinaryCrossentropy(from_logits=False)

losses = [] #one [d_loss, g_loss] pair per batch
with tf.device('/device:GPU:0'):

  for e in range(epoch):
      for b in tqdm(range(batch)):

        #1. variant data
        var_data_real = var_data[b*batch_size:(b+1)*batch_size] #batch data of variant data
        var_data_real = tf.cast(var_data_real, dtype=tf.float32) #make into float32 data (initially int64)
        var_data_real = tf.expand_dims(var_data_real, axis = -1) #(batch, n_sites, 1)

        #2. label data
        label_real = tf.cast(labels[b*batch_size:(b+1)*batch_size], dtype=tf.float32) #(batch, num_classes)
        # Copy each sample's one-hot row to every site -> (batch, n_sites, num_classes).
        # The axis must be explicit: without it tf.repeat flattens its input, and a
        # later reshape no longer lines the class entries up with the last axis.
        real_one_hot_labels = tf.repeat(label_real[:, None, :], repeats=n_sites, axis=1)

        #3. concat variant data with labels
        real_labelled_data = tf.concat([var_data_real, real_one_hot_labels], -1)

        #4. prepare generator input: noise followed by the real batch's labels
        latent_data = tf.random.normal(shape=(batch_size, hp.latent_size))
        latent_labels = tf.concat([latent_data, label_real], axis=1)

        #5. generate artificial data (no gradient needed for the discriminator step)
        artificial_trained_data = generator.predict_on_batch(latent_labels)
        artificial_trained_data = tf.expand_dims(artificial_trained_data, axis = -1)

        #6. prepare real+fake training set for discriminator
        artificial_labelled_data = tf.concat([artificial_trained_data, real_one_hot_labels], -1)
        combined_labelled_data = tf.concat(
            [artificial_labelled_data, real_labelled_data], axis=0
        )

        #7. make labels for real+fake data (artificial = 1, real = 0)
        rf_labels = tf.concat(
                [tf.ones((batch_size, 1)), tf.zeros((batch_size, 1))], axis=0
            )

        #8. train discriminator on combined dataset
        discriminator.trainable = True
        with tf.GradientTape() as tape:
            predicted_labels = discriminator(combined_labelled_data)
            d_loss = loss_func(rf_labels, predicted_labels)
        d_grads = tape.gradient(d_loss, discriminator.trainable_weights)
        d_optimizer.apply_gradients(zip(d_grads, discriminator.trainable_weights))

        #9. make misleading labels (the generator wants its samples scored as real = 0)
        misleading_labels = tf.zeros((batch_size, 1))

        #10. train generator through the frozen discriminator
        discriminator.trainable = False
        with tf.GradientTape() as tape:
            artificial_trained_data = generator(latent_labels)
            artificial_trained_data = tf.expand_dims(artificial_trained_data, axis = -1)
            artificial_labelled_data = tf.concat([artificial_trained_data, real_one_hot_labels], -1)
            # Score the samples, then compare with the misleading labels. Passing the
            # labels as a second positional argument to the model would be taken as
            # its `training` flag and no loss would be computed.
            predicted_labels = discriminator(artificial_labelled_data)
            g_loss = loss_func(misleading_labels, predicted_labels)
        g_grads = tape.gradient(g_loss, generator.trainable_weights)
        g_optimizer.apply_gradients(zip(g_grads, generator.trainable_weights))

        losses.append([d_loss, g_loss])

      #save models every 100 epochs and after the final epoch
      if (e % 100 == 0 and e != 0) or e == epoch - 1:
        save_mod(generator, discriminator, str(e))

  return dispatch_target(*args, **kwargs)
100%|██████████| 318/318 [00:10<00:00, 29.92it/s]
100%|██████████| 318/318 [00:10<00:00, 31.16it/s]
100%|██████████| 318/318 [00:10<00:00, 31.48it/s]
100%|██████████| 318/318 [00:10<00:00, 30.36it/s]
100%|██████████| 318/318 [00:10<00:00, 31.48it/s]
100%|██████████| 318/318 [00:10<00:00, 31.05it/s]
100%|██████████| 318/318 [00:10<00:00, 30.92it/s]
100%|██████████| 318/318 [00:09<00:00, 32.06it/s]
100%|██████████| 318/318 [00:10<00:00, 31.33it/s]
100%|██████████| 318/318 [00:10<00:00, 30.84it/s]
100%|██████████| 318/318 [00:10<00:00, 29.88it/s]
100%|██████████| 318/318 [00:10<00:00, 30.99it/s]
100%|██████████| 318/318 [00:09<00:00, 31.92it/s]
100%|██████████| 318/318 [00:10<00:00, 29.72it/s]
100%|██████████| 318/318 [00:10<00:00, 31.43it/s]
100%|██████████| 318/318 [00:10<00:00, 31.15it/s]
100%|██████████| 318/318 [00:10<00:00, 31.06it/s]
100%|██████████| 318/318 [00:10<00:00, 31.78it/s]
100%|██████████| 318/318 [00:10<00:00, 31.02it/s]
100%|███






INFO:tensorflow:Assets written to: /content/drive/MyDrive/artificial_genome_project/cgan_models/100_generator/assets
INFO:tensorflow:Assets written to: /content/drive/MyDrive/artificial_genome_project/cgan_models/100_discriminator/assets


  return dispatch_target(*args, **kwargs)
100%|██████████| 318/318 [00:09<00:00, 31.98it/s]
100%|██████████| 318/318 [00:09<00:00, 32.42it/s]
100%|██████████| 318/318 [00:09<00:00, 32.64it/s]
100%|██████████| 318/318 [00:09<00:00, 32.24it/s]
100%|██████████| 318/318 [00:09<00:00, 32.22it/s]
100%|██████████| 318/318 [00:09<00:00, 32.35it/s]
100%|██████████| 318/318 [00:09<00:00, 32.20it/s]
100%|██████████| 318/318 [00:09<00:00, 32.79it/s]
100%|██████████| 318/318 [00:09<00:00, 32.49it/s]
100%|██████████| 318/318 [00:09<00:00, 32.10it/s]
100%|██████████| 318/318 [00:09<00:00, 32.28it/s]
100%|██████████| 318/318 [00:09<00:00, 32.15it/s]
100%|██████████| 318/318 [00:09<00:00, 32.29it/s]
100%|██████████| 318/318 [00:09<00:00, 32.86it/s]
100%|██████████| 318/318 [00:09<00:00, 32.42it/s]
100%|██████████| 318/318 [00:09<00:00, 32.47it/s]
100%|██████████| 318/318 [00:09<00:00, 32.28it/s]
100%|██████████| 318/318 [00:09<00:00, 32.38it/s]
100%|██████████| 318/318 [00:09<00:00, 32.91it/s]
100%|███

In [None]:
# Manual checkpoint: tag the files with the current value of the loop variable `e`.
save_mod(g=generator, d=discriminator, epo=str(e))

INFO:tensorflow:Assets written to: /content/drive/MyDrive/artificial_genome_project/cgan_models/10_generator/assets
INFO:tensorflow:Assets written to: /content/drive/MyDrive/artificial_genome_project/cgan_models/10_discriminator/assets


---

In [None]:
# create artificial genome dataset
# Reload a saved generator and write one CSV of artificial samples per class.
generator_model_filepath = '/content/drive/MyDrive/artificial_genome_project/cgan_models/100_generator/'
generator = tf.keras.models.load_model(generator_model_filepath)

num_classes = hp.num_classes #num classes

# latent data
ag_num = 100 #samples generated per class

# Unconditioned draw kept for backward compatibility: here the label slots are
# filled with Gaussian noise instead of a one-hot vector, and the result is not
# used by the per-class loop below.
latent_samples = np.random.normal(loc=0, scale=1, size=(ag_num, hp.latent_size+hp.num_classes,))
generated_genomes = generator.predict(latent_samples)

for class_idx in range(num_classes):
    # One one-hot row for this class, copied ag_num times -> (ag_num, num_classes).
    # The axis must be explicit: without it tf.repeat flattens its input, and a
    # later reshape gives all-ones / all-zeros rows instead of one-hot rows.
    label = keras.utils.to_categorical([class_idx], num_classes)
    label = tf.repeat(tf.cast(label, tf.float32), repeats=ag_num, axis=0)

    # Generator input = noise followed by the class label, as in training.
    latent_data = tf.random.normal(shape=(ag_num, hp.latent_size))
    latent_labels = tf.concat([latent_data, label], axis=1)

    artificial_data = generator.predict(latent_labels)
    artificial_data_df = pd.DataFrame(artificial_data)

    artificial_data_df.to_csv('/content/drive/MyDrive/artificial_genome_project/artificial_data/artificial_genome_'+str(class_idx)+'.csv', index=False)



In [None]:
# Render both architectures; the cell's value is a tuple of two image objects.
# `generator` here is whatever the name is bound to when this cell runs (the
# preceding cell rebinds it to the model loaded from disk).
tf.keras.utils.plot_model(discriminator), tf.keras.utils.plot_model(generator)

(<IPython.core.display.Image object>, <IPython.core.display.Image object>)

---