<a href="https://colab.research.google.com/github/veiro/Extentend_Data_imputation_with_CTGAN/blob/main/GANS_tarea_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Basado en:

Grace Deng, Cuize Han, David S. Matteson. Extended Missing Data Imputation via GANs for Ranking Applications


@article{deng2020extended, title={Extended Missing Data Imputation via GANs for Ranking Applications}, author={Deng, Grace and Han, Cuize and Matteson, David S}, journal={arXiv preprint arXiv:2011.02089}, year={2020} }

Original GAIN code can be found here: https://github.com/jsyoon0823/GAIN


## Agregando la normalización y el muestreo de CTGAN:

Lei Xu, Maria Skoularidou, Alfredo Cuesta-Infante, Kalyan Veeramachaneni. Modeling Tabular data using Conditional GAN. NeurIPS, 2019.

@inproceedings{ctgan,
  title={Modeling Tabular data using Conditional GAN},
  author={Xu, Lei and Skoularidou, Maria and Cuesta-Infante, Alfredo and Veeramachaneni, Kalyan},
  booktitle={Advances in Neural Information Processing Systems},
  year={2019}
}

Original CTGAN code can be found here: https://github.com/sdv-dev/CTGAN 



### Reading Data

In [1]:
!pip install sdv
# la primera vez da error, reiniciar ambiente y ejecutar todo de nuevo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from sdv.tabular import CTGAN
from ctgan.data_transformer import DataTransformer
from ctgan.data_sampler import DataSampler

In [3]:
import os
import sys

import tensorflow.compat.v1 as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from tqdm import tqdm
import pandas
from sklearn.model_selection import train_test_split
import pandas as pd

In [4]:
%%time
# Load the synthetic ranking dataset straight from the project's GitHub repository.
data_filepath = 'https://raw.githubusercontent.com/veiro/Extentend_Data_imputation_with_CTGAN/main/datos/synthetic_ranking_data.csv'
data_full = pandas.read_csv(data_filepath)

print("Cantidad de datos reales:")
print(data_full.shape)

# Work on a fixed-seed random subsample of 100,000 rows so that runs are repeatable.
data =  data_full.sample(n=100000, random_state=1)
#data =  data_full[0:100000] #.sample(n=100000, random_state=1)

print("Cantidad de datos sampleados:")
print(data.shape)

Cantidad de datos reales:
(640000, 9)
Cantidad de datos sampleados:
(100000, 9)
CPU times: user 842 ms, sys: 193 ms, total: 1.03 s
Wall time: 1.08 s


In [5]:
data

Unnamed: 0,Category,QID,ProductID,QProductID,V5,V6,V7,V8,V9
40541,Books,Books634,30,Books634-30,0.575460,2.181218,0.361658,2,2.017008
314964,Beauty,Beauty922,21,Beauty922-21,2.826844,10.096474,0.082438,3,19.432840
486237,Clothes,Clothes1598,30,Clothes1598-30,5.150209,0.739852,0.028061,14,30.689661
620211,Electronics,Electronics1691,52,Electronics1691-52,3.108515,5.493077,0.773000,12,12.173721
372657,Beauty,Beauty1823,50,Beauty1823-50,3.272858,1.138467,1.360502,4,11.082728
...,...,...,...,...,...,...,...,...,...
84135,Books,Books1315,40,Books1315-40,-1.760588,4.236946,4.033930,2,0.654007
472993,Clothes,Clothes1391,34,Clothes1391-34,2.981975,1.346800,0.429989,5,31.436584
585718,Electronics,Electronics1152,55,Electronics1152-55,4.086355,4.320428,1.167587,13,15.418107
490553,Clothes,Clothes1665,58,Clothes1665-58,2.674404,1.387580,1.151788,9,0.226623


### Preparing Data

In [6]:
# Columns handed to the CTGAN DataTransformer: the V5-V9 features plus Category.
# The identifier columns (QID, ProductID, QProductID) are left out.
RANKING_COLS = ["V5", "V6", "V7", "V8", "V9", "Category"]
dataToTransform = data.loc[:, RANKING_COLS]

In [7]:
dataToTransform

Unnamed: 0,V5,V6,V7,V8,V9,Category
40541,0.575460,2.181218,0.361658,2,2.017008,Books
314964,2.826844,10.096474,0.082438,3,19.432840,Beauty
486237,5.150209,0.739852,0.028061,14,30.689661,Clothes
620211,3.108515,5.493077,0.773000,12,12.173721,Electronics
372657,3.272858,1.138467,1.360502,4,11.082728,Beauty
...,...,...,...,...,...,...
84135,-1.760588,4.236946,4.033930,2,0.654007,Books
472993,2.981975,1.346800,0.429989,5,31.436584,Clothes
585718,4.086355,4.320428,1.167587,13,15.418107,Electronics
490553,2.674404,1.387580,1.151788,9,0.226623,Clothes


In [8]:
%%time
# Fit the CTGAN DataTransformer on the selected columns and encode them.
dt = DataTransformer()
# V8 and Category are declared discrete; the other columns are presumably treated as
# continuous by the transformer (confirm against the ctgan version in use).
discrete_columns = ['V8', "Category"]

dt.fit(raw_data=dataToTransform, discrete_columns=discrete_columns)
dataTransformed = dt.transform(raw_data=dataToTransform)
# In this notebook the sampler is only used to draw conditional vectors (sample_condvec).
sampler = DataSampler(data=dataTransformed, output_info=dt.output_info_list, log_frequency=True)



CPU times: user 1min 3s, sys: 45.3 s, total: 1min 48s
Wall time: 1min 28s


In [9]:
# Wrap the transformed matrix in a DataFrame. Assuming dataTransformed is a NumPy
# array, the resulting columns are labelled with positional integers.
df2 = pd.DataFrame(dataTransformed)

# pd.concat(axis=1) aligns rows on the index, so both frames are given the same
# 0..N-1 index first. Note that this resets the index of `data` in place.
df2.reset_index(drop=True, inplace=True)
data.reset_index(drop=True, inplace=True)

# Raw columns followed by the transformed representation, side by side per row.
dataTotal= pd.concat([data, df2], axis=1)



In [10]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
trainData, testData = train_test_split(dataTotal, test_size=0.10, random_state=41)

In [11]:
# Names of the raw (untransformed) columns; everything else in dataTotal is part of
# the transformed representation that the GAN consumes.
DATASET_COLS = ["V5", "V6", "V7", "V8", "V9", "Category", "QID", "ProductID", "QProductID", 'Category_Beauty', 'Category_Books', 'Category_Clothes', 'Category_Electronics', 'Category_Furniture']
cols = [name for name in dataTotal.columns if name not in DATASET_COLS]

# Plain NumPy matrices of the transformed columns for the train and test splits.
trainX = trainData.loc[:, cols].values
testX = testData.loc[:, cols].values

In [12]:
# Row counts of the two splits, reused when sampling masks and mini-batches.
Train_No, Test_No = trainX.shape[0], testX.shape[0]

print(trainX.shape)
print(testX.shape)

(90000, 72)
(10000, 72)


In [13]:
# Width of the transformed representation, i.e. the number of encoded columns the GAN
# reads and imputes. This is not the number of original ranking features.
Dim = trainX.shape[1] 
print("Dim :" + str(Dim))

Dim :72


In [14]:
#nClass = trainData['Category'].unique().size 

In [15]:
# Draw one conditional vector per train/test row from the CTGAN sampler (NumPy arrays).
# NOTE(review): sample_condvec receives only a batch size, not the rows themselves, so
# row i of these matrices is presumably not derived from row i of trainX/testX.
# Confirm that this pairing is intended.
c_train, m1, col, opt = sampler.sample_condvec(batch=trainData.shape[0])
trainlabelsX = c_train 
# m1, col and opt are overwritten here and are not read again in the visible cells.
c_test, m1, col, opt = sampler.sample_condvec(batch=testData.shape[0])
testlabelsX = c_test 

In [16]:
testlabelsX.shape

(10000, 31)

In [17]:
trainlabelsX.shape

(90000, 31)

## Train GAN

In [18]:
# Set Hyperparameters
# Mini-batch size (rows drawn per training step)
mb_size = 256
# Number of training iterations: the training loop draws one mini-batch per iteration,
# so despite the name this is not a count of full passes over the training data.
n_epoch = 50
# Missing rate: probability that an original column is masked out for a given row
p_miss = 0.30
# Fully-connected layer sizes (change depending on Dim)
col1 = 64 
col2 = 64 

In [19]:
def crearMatrizDimensionDeRepresentacion(C, m, n, dimensions=None):
    """Expand a per-original-column mask into the transformed column layout.

    Each original column occupies a contiguous run of transformed columns. A zero
    in ``C[i, j]`` zeroes the whole run belonging to column ``j`` in row ``i``;
    any other value leaves the run at 1.

    Args:
        C: 2-D array-like of shape (rows, original_columns); 0 means "missing".
        m: Number of rows of the returned matrix.
        n: Number of columns of the returned matrix (transformed width).
        dimensions: Optional sequence with the transformed width of every original
            column, in order. When omitted, the widths are read from the
            module-level fitted transformer ``dt``.

    Returns:
        A float array of shape (m, n). Rows beyond ``C.shape[0]`` and columns beyond
        the expanded width of ``C`` stay at 1.

    Raises:
        ValueError: If ``C`` has more columns than there are known widths, or if the
            expanded mask does not fit inside an (m, n) matrix.
    """
    C = np.asarray(C)
    if dimensions is None:
        # Default to the layout of the fitted CTGAN transformer.
        dimensions = [info.output_dimensions for info in dt._column_transform_info_list]

    n_rows, n_cols = C.shape
    if n_cols > len(dimensions):
        raise ValueError(
            "C has {} columns but only {} column widths are known".format(n_cols, len(dimensions)))

    # Only the first n_cols widths are used: C may describe a prefix of the columns.
    widths = np.asarray(list(dimensions)[:n_cols], dtype=int)
    total_width = int(widths.sum())
    if n_rows > m or total_width > n:
        raise ValueError(
            "expanded mask of shape ({}, {}) does not fit in ({}, {})".format(
                n_rows, total_width, m, n))

    salida = np.ones((m, n))
    # Repeat every mask entry across the transformed columns of its original column;
    # this replaces the per-row / per-column / per-dimension Python loops.
    salida[:n_rows, :total_width] = np.repeat(C != 0, widths, axis=1)
    return salida

# Manual sanity check of the mask expansion: the result is inspected by eye.
# The second row marks the third original column as missing, so its run of
# transformed columns must come out as zeros.
C_prueba = np.array([[1, 1, 1, 1, 1], [1, 1, 0, 1, 1]])
print("Entrada")
print(C_prueba)
m_prueba = 2
n_prueba = Dim
s = crearMatrizDimensionDeRepresentacion(C_prueba, m_prueba, n_prueba)
print("---------------")
print("Salida")
print(s)

print("---------------")
print("Info")
# Iterate over the entries directly instead of by index, so the builtin `id`
# is no longer shadowed for the rest of the kernel session.
for column_info in dt._column_transform_info_list:
    print("Clase: " + column_info.column_name + ", Dimesion: "+ str(column_info.output_dimensions))

Entrada
[[1 1 1 1 1]
 [1 1 0 1 1]]
---------------
Salida
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
---------------
Info
Clase: V5, Dimesion: 11
Clase: V6, Dimesion: 9
Clase: V7, Dimesion: 10
Clase: V8, Dimesion: 26
Clase: V9, Dimesion: 11
Clase: Category, Dimesion: 5


In [28]:
# Other functions
def sample_M(m, realDim, n, p):
    """Draw a random mask over original columns and expand it to transformed width.

    One uniform value is drawn per (row, original column); entries whose draw exceeds
    ``p`` are marked 1 and the rest 0. The (m, realDim) mask is then widened to
    (m, n) by crearMatrizDimensionDeRepresentacion.
    """
    draws = np.random.uniform(0., 1., size = [m, realDim])
    column_mask = (draws > p) * 1.
    return crearMatrizDimensionDeRepresentacion(column_mask, m, n)

# Number of original (pre-transform) columns; masks are drawn per original column.
realDimesion = dataToTransform.shape[1]
# Fixed masks for the train and test matrices, drawn once with rate p_miss.
# A 1 keeps the real value and a 0 marks the entry as missing (to be imputed).
trainM = sample_M(Train_No, realDimesion, Dim, p_miss)
testM = sample_M(Test_No, realDimesion, Dim, p_miss)

def xavier_init(size):
    """Return a random-normal initial value tensor for a weight matrix of shape ``size``.

    The standard deviation is 1 / sqrt(size[0] / 2), i.e. it depends only on the
    input dimension of the layer.
    """
    fan_in = size[0]
    stddev = 1. / tf.sqrt(fan_in / 2.)
    return tf.random_normal(shape = size, stddev = stddev)

def sample_Z(m, n):
    """Draw an (m, n) array of uniform noise in [0, 0.1)."""
    noise_shape = [m, n]
    return np.random.uniform(low = 0., high = 0.1, size = noise_shape)

def sample_idx(m, n):
    """Return ``n`` distinct random indices drawn from ``range(m)``.

    The indices are the first ``n`` entries of a random permutation of ``range(m)``,
    so when ``n`` exceeds ``m`` only ``m`` indices are returned.
    """
    return np.random.permutation(m)[:n]


In [29]:
trainM

array([[1., 1., 1., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [30]:

# The graph below uses the TF1 placeholder/session API, so TF2 behaviour is switched off.
tf.disable_v2_behavior()
#Tensors
# X: data matrix, M: mask (1 = observed), H: hint matrix fed to the discriminator,
# Z: noise-filled input for the generator, C: conditional vectors.
X = tf.placeholder(tf.float32, shape = [None, Dim])
M = tf.placeholder(tf.float32, shape = [None, Dim])
H = tf.placeholder(tf.float32, shape = [None, Dim])
Z = tf.placeholder(tf.float32, shape = [None, Dim])
C = tf.placeholder(tf.float32, shape = [None, trainlabelsX.shape[1]])

# Discriminator: input is [data, conditional vector, hint] -> col1 -> col2 -> Dim.
D_W1 = tf.Variable(xavier_init([Dim*2 + trainlabelsX.shape[1], col1]))    
D_b1 = tf.Variable(tf.zeros(shape = [col1]))
D_W2 = tf.Variable(xavier_init([col1, col2]))
D_b2 = tf.Variable(tf.zeros(shape = [col2]))
D_W3 = tf.Variable(xavier_init([col2, Dim]))
D_b3 = tf.Variable(tf.zeros(shape = [Dim]))   

# Generator: input is [data, conditional vector, mask] -> col1 -> col2 -> Dim.
G_W1 = tf.Variable(xavier_init([Dim*2 + trainlabelsX.shape[1], col1]))     
G_b1 = tf.Variable(tf.zeros(shape = [col1]))
G_W2 = tf.Variable(xavier_init([col1, col2]))
G_b2 = tf.Variable(tf.zeros(shape = [col2]))
G_W3 = tf.Variable(xavier_init([col2, Dim]))
G_b3 = tf.Variable(tf.zeros(shape = [Dim]))

In [31]:
%%time 
#Setting up conditional generator
def generator_conditional(x,z,m,c):
    """Build the generator ops and return its unactivated output tensor.

    Observed entries (m == 1) are taken from ``x`` and the remaining ones from ``z``.
    That matrix is concatenated with the conditional vector ``c`` and the mask ``m``
    and passed through two ReLU layers and a linear output layer (G_W1..G_b3).
    Prints the static shape of the concatenated input when the graph is built.
    """
    filled = m * x + (1-m) * z
    inputs = tf.concat(axis = 1, values = [filled,c,m])
    print("G input: " + str(inputs.shape))
    hidden = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
    hidden = tf.nn.relu(tf.matmul(hidden, G_W2) + G_b2)
    return tf.matmul(hidden, G_W3) + G_b3

    
#Setting up conditional discriminator
def discriminator_conditional(x, m, g, h, c):
    """Build the discriminator ops and return per-entry probabilities.

    Observed entries (m == 1) are taken from ``x`` and the remaining ones from the
    generator output ``g``. That matrix is concatenated with the conditional vector
    ``c`` and the hint ``h``, passed through two ReLU layers (D_W1..D_b2) and a
    linear layer, and squashed with a sigmoid.
    """
    combined = m * x + (1-m) * g
    features = tf.concat(axis = 1, values = [combined,c,h])
    hidden = tf.nn.relu(tf.matmul(features, D_W1) + D_b1)
    hidden = tf.nn.relu(tf.matmul(hidden, D_W2) + D_b2)
    logits = tf.matmul(hidden, D_W3) + D_b3
    return tf.nn.sigmoid(logits)

#Generate fake copies
#print("X: " + str(X.shape))
#print("Z: " + str(Z.shape))
#print("M: " + str(M.shape))
#print("C: " + str(C.shape))

# Wire the generator and discriminator together on the placeholders.
G_sample = generator_conditional(X,Z,M,C)
D_prob = discriminator_conditional(X, M, G_sample, H,C)

# Discriminator: cross-entropy between its per-entry prediction and the mask M.
D_loss1 = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) + (1-M) * tf.log(1. - D_prob + 1e-8)) * 2
# Generator adversarial term over the missing entries, normalised by the missing fraction.
G_loss1 = -tf.reduce_mean((1-M) * tf.log(D_prob + 1e-8)) / tf.reduce_mean(1-M)
# Squared error on observed entries (M == 1) and on masked entries (M == 0).
# The latter compares against the true values held in X.
MSE_train_loss = tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)
MSE_test_loss = tf.reduce_mean(((1-M) * X - (1-M)*G_sample)**2) / tf.reduce_mean(1-M)

# NOTE(review): MSE_test_loss is built from G_sample, X and M only, so with
# var_list=D_params it presumably adds no gradient to the discriminator update.
# Confirm that including it here is intended.
D_loss = D_loss1 + MSE_test_loss
# The reconstruction error on observed entries is weighted 10x against the adversarial term.
G_loss = G_loss1 + 10 * MSE_train_loss 


#Imputed Copy
# Observed entries keep their real value; missing entries take the generator output.
Imputed_copy = M * X + (1-M) * G_sample

#Adam Optimizer
# Each optimizer (default Adam settings) updates only its own network's variables.
D_params = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]
G_params = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]
D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=D_params)
G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=G_params)

G input: (?, 175)
CPU times: user 405 ms, sys: 10.9 ms, total: 415 ms
Wall time: 416 ms


In [32]:
%%time
# Start a fresh session and initialise all generator/discriminator variables.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Loss history, recorded every `log_every` iterations.
DiscriminatorLoss = []
GeneratorLoss = []
IterationsRecord = []
log_every = 50

realDimesion = dataToTransform.shape[1]

for it in tqdm(range(n_epoch)):
    # Draw a random mini-batch together with its fixed mask and conditional vectors.
    mb_idx = sample_idx(Train_No, mb_size)
    X_mb = trainX[mb_idx,:]
    Z_mb = sample_Z(mb_size, Dim)
    M_mb = trainM[mb_idx,:]
    # Hint: the mask with a further random subset of columns zeroed out.
    H_mb1 = sample_M(mb_size, realDimesion, Dim, p_miss)
    H_mb = M_mb * H_mb1
    C_mb = trainlabelsX[mb_idx, :]

    # Missing data introduced: masked entries are replaced by noise.
    New_X_mb = M_mb * X_mb + (1-M_mb) * Z_mb
    feed = {X: X_mb, M: M_mb, Z: New_X_mb, H: H_mb, C: C_mb}

    # One discriminator step followed by one generator step on the same batch.
    # The fetched values get their own names so the D_loss / G_loss graph tensors
    # defined earlier are not overwritten with Python floats.
    _, D_loss_curr = sess.run([D_solver, D_loss1], feed_dict = feed)
    _, G_loss_curr, MSE_train_loss_curr, MSE_test_loss_curr = sess.run(
        [G_solver, G_loss1, MSE_train_loss, MSE_test_loss], feed_dict = feed)

    #%% Intermediate Losses
    if it % log_every == 0:
        IterationsRecord.append(it)
        DiscriminatorLoss.append(D_loss_curr)
        GeneratorLoss.append(G_loss_curr)
        print('Iter: {}'.format(it))
        print('Dloss: {:.4}'.format(D_loss_curr))
        print('Gloss: {:.4}'.format(G_loss_curr))
        print()



  0%|          | 0/50 [00:00<?, ?it/s]
 24%|██▍       | 12/50 [00:00<00:00, 51.83it/s]

Iter: 0
Dloss: 1.528
Gloss: 0.7587



100%|██████████| 50/50 [00:00<00:00, 75.25it/s]

CPU times: user 762 ms, sys: 46 ms, total: 808 ms
Wall time: 781 ms





In [33]:
def mostrarDiferencias(X_mb,Imputed_copy_v2 ):
    """Print the first 20 rows of a comparison between original and imputed data.

    Both matrices are mapped back to the original column space with the fitted
    transformer ``dt`` and compared with ``DataFrame.compare``, keeping equal values
    and the full shape, with the two versions stacked along the row axis.
    """
    original_df = dt.inverse_transform(X_mb)
    imputed_df = dt.inverse_transform(Imputed_copy_v2)
    comparison = original_df.compare(imputed_df, keep_equal=True, align_axis=0, keep_shape=True)
    print(comparison.head(20))

    

### Record some summary statistics


In [34]:
# Impute the test split several times with fresh noise and record the error of each run.
# The error is measured on the transformed representation (MSE_test_loss), not on the
# original column scale.
N_imputations = 10
RMSEoutput = []

# Inputs that stay the same across repetitions.
M_mb = testM
X_mb = testX
New_C_mb = testlabelsX

for j in range(N_imputations):
    # Only the noise placed in the masked entries changes between repetitions.
    Z_mb = sample_Z(Test_No, Dim)
    New_X_mb = M_mb * X_mb + (1-M_mb) * Z_mb

    MSE_final, Sample, Imputed_copy_v2, M_v2 = sess.run(
        [MSE_test_loss, G_sample, Imputed_copy, M],
        feed_dict = {X: testX, M: testM, Z: New_X_mb, C: New_C_mb})

    #mostrarDiferencias(X_mb, Imputed_copy_v2)

    print("MSE_final: " + str(MSE_final))
    RMSEoutput.append(np.sqrt(MSE_final))
    print("Done with imputation", j)


MSE_final: 0.09064088
Done with imputation 0
MSE_final: 0.090651475
Done with imputation 1
MSE_final: 0.090663455
Done with imputation 2
MSE_final: 0.09063725
Done with imputation 3
MSE_final: 0.09064742
Done with imputation 4
MSE_final: 0.090673186
Done with imputation 5
MSE_final: 0.09066788
Done with imputation 6
MSE_final: 0.09064206
Done with imputation 7
MSE_final: 0.09064127
Done with imputation 8
MSE_final: 0.09063572
Done with imputation 9


In [35]:
# Summarise the per-imputation RMSE values and save them to a CSV named after p_miss.
# The table is built under its own name so the `data` DataFrame loaded at the top of
# the notebook is not replaced by a dict, and the labels follow the number of
# recorded runs instead of a hard-coded 10.
n_runs = len(RMSEoutput)
rmse_records = {'Imputation': [str(k).zfill(2) for k in range(1, n_runs + 1)],
                'RMSE': RMSEoutput}
RMSEDF = pandas.DataFrame(rmse_records)
rmse_mean = RMSEDF["RMSE"].mean(axis=0)
print("Promedio de las " + str(n_runs) + " imputaciones: " + str(rmse_mean))
# Append the mean as a final row. The row label is kept exactly as it was (sic) so
# existing readers of the CSV are unaffected.
RMSEDF.loc[len(RMSEDF.index)] = ['prmedio', rmse_mean]
RMSEDF.to_csv('rmse_'+ str(p_miss)+'.csv', index=False, header=True)

Promedio de las 10 imputaciones: 0.30108148
