<a href="https://colab.research.google.com/github/veiro/Extentend_Data_imputation_with_CTGAN/blob/main/Conditional_GANs_for_Synthetic_Ranking_Data_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Nov 1, 2019

Conditional Imputation GAN on synthetic ranking data. Please do not distribute.



### Reading Data

In [18]:
import os
import sys

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from tqdm import tqdm
import pandas
from sklearn.model_selection import train_test_split

In [19]:
%%time
train_filepath = 'https://raw.githubusercontent.com/gdeng96/cond-imp-gan-ranking/main/synthetic_ranking_data.csv'#'~/synthetic_ranking_data_train.csv'
test_filepath = 'https://raw.githubusercontent.com/gdeng96/cond-imp-gan-ranking/main/synthetic_ranking_data.csv'#'~/synthetic_ranking_data_test.csv'
train_df = pandas.read_csv(train_filepath)
test_df = pandas.read_csv(test_filepath) 

CPU times: user 2.13 s, sys: 207 ms, total: 2.34 s
Wall time: 2.74 s


In [20]:
%%time
data_filepath = 'https://raw.githubusercontent.com/gdeng96/cond-imp-gan-ranking/main/synthetic_ranking_data.csv'
data_full = pandas.read_csv(data_filepath)
data =  data_full.sample(n=100000, random_state=1)
train_df, test_df = train_test_split(data, test_size=0.10, random_state=42)


CPU times: user 1.3 s, sys: 119 ms, total: 1.42 s
Wall time: 1.55 s


### Preparing Data

In [21]:
INFO_COLS = ['Category', "QID", "ProductID", "QProductID"]
RANKING_COLS = ["V5", "V6", "V7", "V8", "V9"]

In [22]:
%%time
trainX = train_df.loc[:, RANKING_COLS]
testX = test_df.loc[:, RANKING_COLS]

#Normalize features
trainX_mean = trainX.mean()
trainX_sd = trainX.std()
testX_mean = testX.mean()
testX_sd = testX.std()

trainX =(trainX-trainX.mean())/trainX.std()
testX =(testX-testX.mean())/testX.std()


CPU times: user 33.5 ms, sys: 1.06 ms, total: 34.6 ms
Wall time: 37.3 ms


In [23]:
trainX = trainX.values
testX = testX.values

print(trainX.shape)
print(testX.shape)
print(type(trainX))
print(type(testX))

Train_No = train_df.shape[0] 
Test_No = test_df.shape[0]

(90000, 5)
(10000, 5)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [24]:
##Create one-hot conditional matrix - numpy array
trainlabelsX = pandas.get_dummies(train_df.Category, prefix='Category')
testlabelsX = pandas.get_dummies(test_df.Category, prefix='Category')
trainlabelsX = trainlabelsX.values
testlabelsX = testlabelsX.values

In [25]:
# Number of classes for conditional one-hot tensor
nClass = 5

In [26]:
# Number of ranking features that needs imputing
Dim = 5

## Train GAN

In [27]:
# Set Hyperparameters
# Mini-batch
mb_size = 256
# Iterations
n_epoch = 50
# Missing rate
p_miss = 0.05
# Fully-connected layer sizes (change depending on Dim)
col1 = 64 
col2 = 64 

In [28]:
# Other functions
def sample_M(m, n, p):
    A = np.random.uniform(0., 1., size = [m, n])
    B = A > p
    C = 1.*B
    return C
  
trainM = sample_M(Train_No, Dim, p_miss)
testM = sample_M(Test_No, Dim, p_miss)

def xavier_init(size):
    in_dim = size[0]
    xavier_stddev = 1. / tf.sqrt(in_dim / 2.)
    return tf.random_normal(shape = size, stddev = xavier_stddev)

def sample_Z(m, n):
    return np.random.uniform(0., 0.1, size = [m, n])        

def sample_idx(m, n):
    A = np.random.permutation(m)
    idx = A[:n]
    return idx


In [29]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
#Tensors
X = tf.placeholder(tf.float32, shape = [None, Dim])
M = tf.placeholder(tf.float32, shape = [None, Dim])
H = tf.placeholder(tf.float32, shape = [None, Dim])
Z = tf.placeholder(tf.float32, shape = [None, Dim])
C = tf.placeholder(tf.float32, shape = [None, nClass])

D_W1 = tf.Variable(xavier_init([Dim*2 + nClass, col1]))    
D_b1 = tf.Variable(tf.zeros(shape = [col1]))
D_W2 = tf.Variable(xavier_init([col1, col2]))
D_b2 = tf.Variable(tf.zeros(shape = [col2]))
D_W3 = tf.Variable(xavier_init([col2, Dim]))
D_b3 = tf.Variable(tf.zeros(shape = [Dim]))   

G_W1 = tf.Variable(xavier_init([Dim*2 + nClass, col1]))     
G_b1 = tf.Variable(tf.zeros(shape = [col1]))
G_W2 = tf.Variable(xavier_init([col1, col2]))
G_b2 = tf.Variable(tf.zeros(shape = [col2]))
G_W3 = tf.Variable(xavier_init([col2, Dim]))
G_b3 = tf.Variable(tf.zeros(shape = [Dim]))

In [30]:
%%time 
#Setting up conditional generator
def generator_conditional(x,z,m,c):
    inp = m * x + (1-m) * z  
    inputs = tf.concat(axis = 1, values = [inp,c,m])  
    G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
    G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
    G_est = tf.matmul(G_h2, G_W3) + G_b3
    return G_est

    
#Setting up conditional discriminator
def discriminator_conditional(x, m, g, h, c):
    inp = m * x + (1-m) * g  
    inputs = tf.concat(axis = 1, values = [inp,c,h])  
    D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
    D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
    D_logit = tf.matmul(D_h2, D_W3) + D_b3
    D_prob = tf.nn.sigmoid(D_logit)  
    return D_prob

#Generate fake copies
G_sample = generator_conditional(X,Z,M,C)
D_prob = discriminator_conditional(X, M, G_sample, H,C)

D_loss1 = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) + (1-M) * tf.log(1. - D_prob + 1e-8)) * 2
G_loss1 = -tf.reduce_mean((1-M) * tf.log(D_prob + 1e-8)) / tf.reduce_mean(1-M)
MSE_train_loss = tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)
MSE_test_loss = tf.reduce_mean(((1-M) * X - (1-M)*G_sample)**2) / tf.reduce_mean(1-M)

D_loss = D_loss1 + MSE_test_loss
G_loss = G_loss1 + 10 * MSE_train_loss 


#Imputed Copy
Imputed_copy = M * X + (1-M) * G_sample

#Adam Optimizer
D_params = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]
G_params = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]
D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=D_params)
G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=G_params)

CPU times: user 496 ms, sys: 6.55 ms, total: 502 ms
Wall time: 508 ms


In [31]:
%%time
sess = tf.Session()
sess.run(tf.global_variables_initializer())


i = 1
DiscriminatorLoss = []
GeneratorLoss = []
IterationsRecord = []

for it in tqdm(range(n_epoch)):    
    
    mb_idx = sample_idx(Train_No, mb_size)
    X_mb = trainX[mb_idx,:]  
    Z_mb = sample_Z(mb_size, Dim) 
    M_mb = trainM[mb_idx,:]  
    H_mb1 = sample_M(mb_size, Dim, 1-0.5)
    H_mb = M_mb * H_mb1
    C_mb = trainlabelsX[mb_idx, :]
    
    New_X_mb = M_mb * X_mb + (1-M_mb) * Z_mb  # Missing Data Introduce
    
    _, D_loss = sess.run([D_solver, D_loss1], feed_dict = {X: X_mb, M: M_mb, Z: New_X_mb, H: H_mb, C: C_mb})
    _, G_loss, MSE_train_loss_curr, MSE_test_loss_curr = sess.run([G_solver, G_loss1, MSE_train_loss, MSE_test_loss],
                                                                       feed_dict = {X: X_mb, M: M_mb, Z: New_X_mb, H: H_mb, C: C_mb})
         
        
    #%% Intermediate Losses
    if it % 50 == 0:
        IterationsRecord.append(it)
        DiscriminatorLoss.append(D_loss)
        GeneratorLoss.append(G_loss)
        print('Iter: {}'.format(it))
        print('Dloss: {:.4}'.format(D_loss))
        print('Gloss: {:.4}'.format(G_loss))
        print()



 68%|██████▊   | 34/50 [00:00<00:00, 113.21it/s]

Iter: 0
Dloss: 1.385
Gloss: 0.6418



100%|██████████| 50/50 [00:00<00:00, 101.18it/s]

CPU times: user 626 ms, sys: 23.1 ms, total: 649 ms
Wall time: 607 ms





### Record some summary statistics


In [32]:
#Getting imputations for test ranking dataset + RMSE.
N_imputations = 10
RMSEoutput = []

for j in range(N_imputations):
    Z_mb = sample_Z(Test_No, Dim) 
    M_mb = testM
    X_mb = testX
        
    New_X_mb = M_mb * X_mb + (1-M_mb) * Z_mb       
    New_C_mb = testlabelsX
    
    MSE_final, Sample, Imputed_copy_v2, M_v2 = sess.run([MSE_test_loss, G_sample, Imputed_copy, M], feed_dict = {X: testX, M: testM, Z: New_X_mb, C: New_C_mb})
  
    RMSEoutput.append(np.sqrt(MSE_final))
    print("Done with imputation", j)
    


Done with imputation 0
Done with imputation 1
Done with imputation 2
Done with imputation 3
Done with imputation 4
Done with imputation 5
Done with imputation 6
Done with imputation 7
Done with imputation 8
Done with imputation 9


In [33]:
RMSEDF = pandas.DataFrame({'RMSE': RMSEoutput})
RMSEDF.to_csv('rmse.csv', index=False, header=True)

In [34]:
RMSEDF.mean(axis=0)

RMSE    0.969347
dtype: float32

Code adapted based on Yoon et al. (2018)



