This notebook implements a state-of-the-art Generative Adversarial Imputation Network (GAIN). The paper describing this approach can be found here: [GAIN](http://proceedings.mlr.press/v80/yoon18a.html?ref=https://githubhelp.com). Please leave a comment with your feedback.

# Installing and Importing

In [None]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras import backend as K

# A Brief EDA

In [None]:
# Load the competition table into a DataFrame; the relative path follows the
# Kaggle `../input/<dataset>/` layout, so it only resolves in that environment.
df = pd.read_csv('../input/tabular-playground-series-jun-2022/data.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for each column.
df.describe()

In [None]:
# Restrict the frame to the columns that contain at least one NaN.
df_nans = df.loc[:, np.isnan(df).any()]
# Fraction of missing entries per retained column (shown as the cell output).
np.isnan(df_nans).sum() / len(df_nans)

In [None]:
# Bar chart of the absolute number of missing values per affected column.
columns = df_nans.columns
fig = plt.figure(figsize=(30, 10))
fig.gca().bar(columns, np.isnan(df_nans).sum())
plt.show()

In [None]:
# Overlay the value distributions of all columns with missing data on one axis.
df_nans.plot(kind='hist', figsize=(30, 10), bins=50)
plt.show()

In [None]:
# Draw one histogram per column of df_nans (the columns that contain NaNs).
df_nans.hist(figsize=(30, 20))
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scale every feature into [0, 1]. The generator ends in a sigmoid layer, so
# its outputs are confined to (0, 1); standardised (zero-mean, unit-variance)
# targets would be unreachable for any value below 0 or above 1.
# MinMaxScaler ignores NaNs when fitting and leaves them in place on transform,
# so the missing-value pattern is preserved for the masking done later.
scaler = MinMaxScaler()
train_data = scaler.fit_transform(df.drop('row_id', axis = 1))

In [None]:
BUFFER_SIZE = 900000
BATCH_SIZE = 128
# Build the input pipeline step by step: one element per row, shuffled through
# a BUFFER_SIZE-element buffer, then grouped into fixed-size batches (the final
# partial batch is dropped so every batch has exactly BATCH_SIZE rows).
train_dataset = tf.data.Dataset.from_tensor_slices(train_data)
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

# Building the GAIN

## Define the Generator

In [None]:
def generator_builder(num_fet = 80):
    """Build the GAIN generator network.

    The network takes the data row concatenated with its observation mask
    (``num_fet * 2`` inputs, e.g. 80 * 2 = 160) and emits one value in (0, 1)
    per feature.

    Args:
        num_fet: Number of data features (columns) per row.

    Returns:
        An uncompiled ``keras.models.Sequential`` model mapping
        ``(batch, num_fet * 2)`` to ``(batch, num_fet)``.
    """
    generator = keras.models.Sequential([
        # Shape is given as a tuple: a bare int is not accepted by every Keras version.
        keras.layers.Input(shape=(num_fet * 2,)),
        keras.layers.Dense(150, activation='relu'),
        keras.layers.Dense(100, activation='relu'),
        # One output per feature, so the width must follow num_fet rather than a fixed 80.
        keras.layers.Dense(num_fet, activation='sigmoid'),
    ])
    return generator

## Define the Discriminator

In [None]:
def discriminator_builder(num_fet = 80):
    """Build the GAIN discriminator network.

    The network takes the imputed data row concatenated with the hint vector
    (``num_fet * 2`` inputs, e.g. 80 * 2 = 160) and emits, per feature, a
    value in (0, 1) that is trained against the observation mask.

    Args:
        num_fet: Number of data features (columns) per row.

    Returns:
        An uncompiled ``keras.models.Sequential`` model mapping
        ``(batch, num_fet * 2)`` to ``(batch, num_fet)``.
    """
    discriminator = keras.models.Sequential([
        # Shape is given as a tuple: a bare int is not accepted by every Keras version.
        keras.layers.Input(shape=(num_fet * 2,)),
        keras.layers.Dense(150, activation='relu'),
        keras.layers.Dense(100, activation='relu'),
        # One mask estimate per feature, so the width must follow num_fet rather than a fixed 80.
        keras.layers.Dense(num_fet, activation='sigmoid'),
    ])

    return discriminator

Discriminator Loss

In [None]:
def discriminator_loss(m, m_hat):
    """Binary cross-entropy between the true mask and the discriminator output.

    Args:
        m: Observation mask (1 where a value is present, 0 where missing).
        m_hat: Discriminator estimate of the mask, same shape as ``m``.

    Returns:
        Scalar tensor: the mean negative log-likelihood over all entries.
    """
    eps = 1e-8  # keeps log() finite when a probability reaches 0 or 1
    observed_term = m * tf.math.log(m_hat + eps)
    missing_term = (1 - m) * tf.math.log(1 - m_hat + eps)
    return -tf.reduce_mean(observed_term + missing_term)

Generator Loss

In [None]:
def generator_loss(m, m_hat, x, x_hat, alpha = 10):
    """Adversarial plus weighted reconstruction loss for the generator.

    Args:
        m: Observation mask (1 where a value is present, 0 where missing).
        m_hat: Discriminator estimate of the mask, same shape as ``m``.
        x: Reference data values.
        x_hat: Values compared against ``x`` on the observed entries. If
            ``x_hat`` already equals ``x`` wherever ``m == 1``, the
            reconstruction term is identically zero.
        alpha: Weight of the squared-error reconstruction term.

    Returns:
        Scalar tensor: the mean over all entries of the combined loss.
    """
    # Adversarial part: on missing entries, push the discriminator output towards 1.
    adversarial = tf.negative((1 - m) * tf.math.log(m_hat + 1e-8))
    # Reconstruction part: squared error, counted on observed entries only.
    reconstruction = alpha * (m * (x - x_hat) ** 2)
    return tf.reduce_mean(adversarial + reconstruction)

In [None]:
def hint_smapler(batch_size = 128, num_fet = 80, hint_rate = 0.9):
    """Sample a random binary matrix used to build the discriminator hint.

    Args:
        batch_size: Number of rows of the sampled matrix.
        num_fet: Number of columns of the sampled matrix.
        hint_rate: Probability that any given entry is 1.

    Returns:
        A ``(batch_size, num_fet)`` float array of 0.0 / 1.0 values.
    """
    draws = np.random.uniform(low=0., high=1., size=(batch_size, num_fet))
    # An entry is switched on when its uniform draw falls below hint_rate.
    return (draws < hint_rate).astype(np.float64)

Define the Optimizers and build the models

In [None]:
# Each network gets its own plain-SGD optimizer, both with the same step size.
generator_optimizer = keras.optimizers.SGD(learning_rate=1e-4)
discriminator_optimizer = keras.optimizers.SGD(learning_rate=1e-4)

In [None]:
# Instantiate both networks for the default feature count, spelled out explicitly.
generator = generator_builder(num_fet=80)
discriminator = discriminator_builder(num_fet=80)

In [None]:
@tf.function
def train_step(batch, epoch_num):
    """Run one simultaneous generator/discriminator update on a batch.

    Args:
        batch: 2-D float tensor of scaled rows in which missing values are NaN.
        epoch_num: Unused; kept so existing callers keep working. Passing a
            Python int makes ``tf.function`` retrace once per distinct value.

    Returns:
        Tuple ``(gen_loss, disc_loss)`` of scalar tensors for this step.
    """
    dtype = batch.dtype
    hint_rate = 0.9  # same rate as hint_smapler's default

    # Observation mask: 1 where a value is present, 0 where it is NaN.
    m = tf.where(tf.math.is_nan(batch), tf.zeros_like(batch), tf.ones_like(batch))
    # Zero the NaNs first so the arithmetic below cannot propagate them.
    batch_C = tf.where(tf.math.is_nan(batch), tf.zeros_like(batch), batch)

    # Random draws must come from tf.random: NumPy calls inside a tf.function
    # execute only while tracing, which would freeze a single noise matrix and
    # a single hint mask into the graph and reuse them for every step.
    z = tf.random.uniform(tf.shape(batch), minval=0.0, maxval=0.01, dtype=dtype)
    batch = batch_C * m + (1 - m) * z

    # Hint: reveal each observed mask entry with probability hint_rate.
    # Sized from the batch itself rather than a fixed 128 x 80.
    hint_draw = tf.random.uniform(tf.shape(m), dtype=dtype)
    H = tf.cast(hint_draw < hint_rate, dtype) * m

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:

        X_temp = tf.cast(generator(tf.concat([batch, m], 1), training = True), dtype)
        # Keep observed values, take generator output only on missing entries.
        X_hat = m * batch + (1 - m) * X_temp

        M_hat = tf.cast(discriminator(tf.concat([X_hat, H], 1), training = True), dtype)

        disc_loss = discriminator_loss(m, M_hat)
        # The reconstruction term needs the raw generator output: X_hat equals
        # `batch` on every observed entry, so passing it would make that term
        # exactly zero and remove the reconstruction signal entirely.
        gen_loss = generator_loss(m, M_hat, batch, X_temp, alpha = 10)

    gen_grad = gen_tape.gradient(gen_loss, generator.trainable_variables)
    disc_grad = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    generator_optimizer.apply_gradients(zip(gen_grad, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(disc_grad, discriminator.trainable_variables))
    return gen_loss, disc_loss

In [None]:
def train(dataset, EPOCHS):
    """Run the training loop and collect the loss of every step.

    Args:
        dataset: Iterable of batches; each batch is passed to ``train_step``.
        EPOCHS: Number of full passes over ``dataset``.

    Returns:
        Tuple ``(losses_gen, losses_disc)`` of lists holding the generator and
        discriminator loss returned by each ``train_step`` call, in order.
    """
    losses_gen = []
    losses_disc = []
    for epoch in range(1, EPOCHS + 1):
        print(f'Epoch {epoch} / {EPOCHS}:', end=' ')
        # Bound up front so the summary line still works when the dataset
        # yields no batches (previously an UnboundLocalError).
        gen_loss = disc_loss = float('nan')
        t0 = time.time()
        for batch in dataset:
            gen_loss, disc_loss = train_step(batch, epoch)
            losses_gen.append(gen_loss)
            losses_disc.append(disc_loss)
        t1 = time.time()
        # float() prints the plain number instead of the tensor repr.
        print(f'gen_loss = {float(gen_loss)}, disc_loss = {float(disc_loss)}, time = {t1 - t0}')
    return losses_gen, losses_disc

In [None]:
# Train for 10 epochs; gen_loss and disc_loss are bound to the two per-step
# loss lists returned by train().
gen_loss, disc_loss = train(train_dataset, EPOCHS = 10)

In [None]:
# Generator loss recorded at every training step.
plt.figure(figsize=(20, 10))
plt.plot(gen_loss, color='orange')
plt.gca().set(title='Generator Loss', xlabel='Iter', ylabel='Loss')
plt.show()

In [None]:
# Discriminator loss recorded at every training step.
plt.figure(figsize = (20, 10))
plt.plot(disc_loss, color = 'orange')
plt.title('Discriminator Loss')  # title spelling corrected
plt.xlabel('Iter')
plt.ylabel('Loss')
plt.show()

In [None]:
# Observation mask over the whole scaled dataset: 1 where present, 0 where NaN.
m = tf.cast(tf.math.logical_not(tf.math.is_nan(train_data)), train_data.dtype)
# Zero the NaNs first so the arithmetic below cannot propagate them.
train_data_C = tf.where(tf.math.is_nan(train_data), tf.zeros_like(train_data), train_data)
# Small uniform noise fills the missing positions.
z = np.random.uniform(0, 0.01, size=train_data.shape)
train_data = train_data_C * m + (1 - m) * z

In [None]:
# Inference pass: run the generator with training=False so that any layer with
# train-time-only behaviour acts in inference mode.
X_temp = tf.cast(generator(tf.concat([train_data, m], 1), training = False), tf.float64)
# Keep the observed values and use the generator output only where data was missing.
X_hat = m * train_data + (1 - m) * X_temp

In [None]:
# Map the imputed matrix back to the original feature scale.
pred = scaler.inverse_transform(X_hat)
df_ = df.drop(['row_id'], axis = 1)

# Locate every missing cell in one vectorised pass instead of two Python loops
# over all cells. Transposing the mask makes np.nonzero return the positions in
# column-major order (all rows of the first feature, then the next, ...), which
# is the order the ids and values were emitted in before.
col_idx, row_idx = np.nonzero(df_.isna().to_numpy().T)
feature_names = df_.columns

# Ids and values are both derived from df_, so they always line up one-to-one.
test = [
    f'{r}-{feature_names[c]}'
    for r, c in zip(row_idx.tolist(), col_idx.tolist())
]
res = pred[row_idx, col_idx].tolist()

In [None]:
# Build the submission frame directly from the two columns. Stacking the ids
# and values into a single NumPy array would coerce every float to a
# fixed-width string before it reached the CSV writer.
out = pd.DataFrame({'row-col': test, 'value': res})
out.to_csv('sub3_temp.csv', index = False)

In [None]:
#!kaggle competitions submit -c tabular-playground-series-jun-2022 -f sub3_temp.csv -m "Message"