In [1]:
from __future__ import print_function
import sys
sys.path.append("..")
import tensorflow as tf
import argparse
import numpy as np
import pandas as pd
import random
import os
import math
import time
import statistics
from datetime import datetime
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


# Model

In [3]:
if "concat_v2" in dir(tf):
    def concat(tensors, axis, *args, **kwargs):
        return tf.concat_v2(tensors, axis, *args, **kwargs)
else:
    def concat(tensors, axis, *args, **kwargs):
        return tf.concat(tensors, axis, *args, **kwargs)

def bn(x, is_training, scope):
    """Apply batch normalization to `x` (momentum 0.9, epsilon 1e-5, scaling on).

    NOTE(review): `is_training` and `scope` are accepted but never forwarded
    to the layer, so the call uses the library defaults for its training flag
    and its layer name -- confirm that this is intended.
    """
    layer_options = dict(momentum=0.9, epsilon=1e-5, scale=True)
    return tf.compat.v1.layers.batch_normalization(x, **layer_options)

def conv_out_size_same(size, stride):
    """Return ceil(size / stride) as a Python int."""
    ratio = float(size) / float(stride)
    return int(math.ceil(ratio))

def conv_cond_concat(x, y):
    """Append the conditioning tensor `y` to `x` along axis 3.

    Before concatenation `y` is multiplied by a ones tensor of shape
    [x.shape[0], x.shape[1], x.shape[2], y.shape[3]].
    """
    x_dims = x.get_shape()
    y_dims = y.get_shape()
    ones_shape = [x_dims[0], x_dims[1], x_dims[2], y_dims[3]]
    expanded_y = y * tf.ones(ones_shape)
    return concat([x, expanded_y], 3)

def conv2d(input_, output_dim, k_h=5, k_w=5, d_h=2, d_w=2, stddev=0.02, name="conv2d"):
    """2-D convolution with 'SAME' padding followed by a bias add.

    Variables 'w' (truncated-normal initialised with `stddev`) and 'biases'
    (zero initialised) are created inside the variable scope `name`.

    Args:
        input_: input tensor; its last dimension is used as the input channel count.
        output_dim: number of output channels.
        k_h, k_w: kernel height and width.
        d_h, d_w: strides along the two middle axes.
        stddev: standard deviation for the kernel initializer.
        name: variable scope name.

    Returns:
        The biased convolution output, reshaped to the convolution's static shape.
    """
    in_channels = input_.get_shape()[-1]
    with tf.variable_scope(name):
        kernel = tf.get_variable(
            'w', [k_h, k_w, in_channels, output_dim],
            initializer=tf.truncated_normal_initializer(stddev=stddev))
        feature_map = tf.nn.conv2d(input_, kernel, strides=[1, d_h, d_w, 1], padding='SAME')

        offset = tf.get_variable('biases', [output_dim], initializer=tf.constant_initializer(0.0))
        biased = tf.nn.bias_add(feature_map, offset)
        return tf.reshape(biased, feature_map.get_shape())

def deconv2d(input_, output_shape, k_h=5, k_w=5, d_h=2, d_w=2, name="deconv2d", stddev=0.02, with_w=False):
    """Transposed 2-D convolution followed by a bias add.

    Variables 'w' (random-normal initialised with `stddev`) and 'biases'
    (zero initialised) are created inside the variable scope `name`.

    Args:
        input_: input tensor; its last dimension is the kernel's in_channels.
        output_shape: full output shape; its last entry is the output channel count.
        k_h, k_w: kernel height and width.
        d_h, d_w: strides along the two middle axes.
        name: variable scope name.
        stddev: standard deviation for the kernel initializer.
        with_w: when truthy, also return the kernel and bias variables.

    Returns:
        The output tensor, or `(output, kernel, biases)` when `with_w` is truthy.
    """
    stride_spec = [1, d_h, d_w, 1]
    with tf.variable_scope(name):
        # Kernel layout: [height, width, output_channels, in_channels].
        kernel = tf.get_variable(
            'w', [k_h, k_w, output_shape[-1], input_.get_shape()[-1]],
            initializer=tf.random_normal_initializer(stddev=stddev))

        try:
            upsampled = tf.nn.conv2d_transpose(
                input_, kernel, output_shape=output_shape, strides=stride_spec)
        except AttributeError:
            # Fallback for TensorFlow versions before 0.7.0, where the op was
            # exposed under a different name.
            upsampled = tf.nn.deconv2d(
                input_, kernel, output_shape=output_shape, strides=stride_spec)

        offset = tf.get_variable(
            'biases', [output_shape[-1]], initializer=tf.constant_initializer(0.0))
        upsampled = tf.reshape(tf.nn.bias_add(upsampled, offset), upsampled.get_shape())

        if not with_w:
            return upsampled
        return upsampled, kernel, offset

def lrelu(x, leak=0.2, name="lrelu"):
    """Leaky ReLU: element-wise maximum of `x` and `leak * x`.

    `name` is accepted for signature compatibility but is not used.
    """
    scaled = leak * x
    return tf.maximum(x, scaled)

def linear(input_, output_size, scope=None, stddev=0.02, bias_start=0.0, with_w=False):
    """Affine projection `input_ @ Matrix + bias`.

    Variables "Matrix" (random-normal initialised with `stddev`) and "bias"
    (constant `bias_start`) are created inside the variable scope `scope`,
    or "Linear" when `scope` is falsy.

    Args:
        input_: tensor whose static shape has a known second dimension.
        output_size: width of the projection.
        scope: optional variable scope name.
        stddev: standard deviation for the weight initializer.
        bias_start: initial value of every bias entry.
        with_w: when truthy, also return the weight and bias variables.

    Returns:
        The projected tensor, or `(projected, Matrix, bias)` when `with_w` is truthy.
    """
    in_features = input_.get_shape().as_list()[1]

    with tf.variable_scope(scope or "Linear"):
        weights = tf.get_variable(
            "Matrix", [in_features, output_size], tf.float32,
            tf.random_normal_initializer(stddev=stddev))
        offset = tf.get_variable(
            "bias", [output_size],
            initializer=tf.constant_initializer(bias_start))
        projected = tf.matmul(input_, weights) + offset
        if with_w:
            return projected, weights, offset
        return projected

In [4]:

from tensorflow.python.ops.rnn_cell_impl import LayerRNNCell
from tensorflow.python.layers import base as base_layer
from tensorflow.python.ops import nn_ops
# Name suffixes used by MyGRUCell15.build when registering its variables,
# giving "gates/bias", "gates/kernel", "candidate/bias" and "candidate/kernel".
_BIAS_VARIABLE_NAME = "bias"
_WEIGHTS_VARIABLE_NAME = "kernel"

class MyGRUCell15(LayerRNNCell):
  """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078) whose
  previous state is rescaled by a vector carried inside the input.

  The last `num_units` columns of every input row are split off and
  multiplied element-wise into the previous state; the leading columns are
  the features fed to the GRU gates together with that rescaled state.

  Args:
    num_units: int, The number of units in the GRU cell.
    activation: Nonlinearity to use.  Default: `tanh`.
    reuse: (optional) Python boolean describing whether to reuse variables
      in an existing scope.  If not `True`, and the existing scope already has
      the given variables, an error is raised.
    kernel_initializer: (optional) The initializer to use for the weight and
      projection matrices.
    bias_initializer: (optional) The initializer to use for the bias.
    name: String, the name of the layer. Layers with the same name will
      share weights, but to avoid mistakes we require reuse=True in such
      cases.
  """

  def __init__(self, num_units, activation=None, reuse=None,
               kernel_initializer=None, bias_initializer=None, name=None):
    super(MyGRUCell15, self).__init__(_reuse=reuse, name=name)

    # Inputs must be 2-dimensional.
    self.input_spec = base_layer.InputSpec(ndim=2)

    if not activation:
      activation = math_ops.tanh

    self._num_units = num_units
    self._activation = activation
    self._kernel_initializer = kernel_initializer
    self._bias_initializer = bias_initializer

  @property
  def state_size(self):
    """Width of the recurrent state (`num_units`)."""
    return self._num_units

  @property
  def output_size(self):
    """Width of the per-step output (`num_units`)."""
    return self._num_units

  def build(self, inputs_shape):
    """Create the gate and candidate kernels and biases.

    The second dimension of `inputs_shape` counts both the features and the
    trailing `num_units` state-scaling columns, so the feature depth is that
    dimension minus `num_units`.

    Raises:
      ValueError: if the second dimension of `inputs_shape` is unknown.
    """
    total_width = inputs_shape[1].value
    if total_width is None:
      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
                       % inputs_shape)

    units = self._num_units
    input_depth = total_width - units
    # Both kernels multiply the concatenation [features, state].
    kernel_rows = input_depth + units

    self._gate_kernel = self.add_variable(
        "gates/%s" % _WEIGHTS_VARIABLE_NAME,
        shape=[kernel_rows, 2 * units],
        initializer=self._kernel_initializer)

    # Gate biases default to 1.0 when no bias initializer was supplied.
    gate_bias_init = self._bias_initializer
    if gate_bias_init is None:
      gate_bias_init = tf.constant_initializer(1.0, dtype=self.dtype)
    self._gate_bias = self.add_variable(
        "gates/%s" % _BIAS_VARIABLE_NAME,
        shape=[2 * units],
        initializer=gate_bias_init)

    self._candidate_kernel = self.add_variable(
        "candidate/%s" % _WEIGHTS_VARIABLE_NAME,
        shape=[kernel_rows, units],
        initializer=self._kernel_initializer)

    # Candidate biases default to zeros when no bias initializer was supplied.
    candidate_bias_init = self._bias_initializer
    if candidate_bias_init is None:
      candidate_bias_init = tf.zeros_initializer(dtype=self.dtype)
    self._candidate_bias = self.add_variable(
        "candidate/%s" % _BIAS_VARIABLE_NAME,
        shape=[units],
        initializer=candidate_bias_init)

    self.built = True

  def call(self, inputs, state):
    """Run one GRU step on the rescaled previous state.

    Args:
      inputs: 2-D tensor; the trailing `num_units` columns scale the state,
        the leading columns are the features.
      state: previous hidden state.

    Returns:
      `(new_h, new_h)`: the new hidden state, used as both output and state.
    """
    units = self._num_units
    width = inputs.get_shape().as_list()[1]
    boundary = width - units

    features = inputs[:, 0:boundary]
    state_scale = inputs[:, boundary:]
    scaled_state = math_ops.multiply(state_scale, state)

    gate_pre = math_ops.matmul(
        array_ops.concat([features, scaled_state], 1), self._gate_kernel)
    gate_pre = nn_ops.bias_add(gate_pre, self._gate_bias)

    gates = math_ops.sigmoid(gate_pre)
    r, u = array_ops.split(value=gates, num_or_size_splits=2, axis=1)

    reset_state = r * scaled_state

    candidate_pre = math_ops.matmul(
        array_ops.concat([features, reset_state], 1), self._candidate_kernel)
    candidate_pre = nn_ops.bias_add(candidate_pre, self._candidate_bias)

    c = self._activation(candidate_pre)
    new_h = u * scaled_state + (1 - u) * c
    return new_h, new_h

In [5]:

class WGAN(object):
    model_name = "WGAN_no_mask"     # name for checkpoint

    def __init__(self, sess, args, datasets):
        """Store the session, copy the configuration from `args`, and create the GRU cells.

        Args:
            sess: session object used by the other methods for `run` calls.
            args: object exposing the configuration attributes read below.
            datasets: data provider; `maxLength` sets the number of time steps
                and `len(datasets.x)` the number of batches per epoch.
        """
        self.sess = sess
        self.datasets = datasets

        # Behaviour switches.
        self.isbatch_normal = args.isBatch_normal
        self.isNormal = args.isNormal
        self.isSlicing = args.isSlicing
        self.run_type = args.run_type

        # Filesystem locations and run identity.
        self.checkpoint_dir = args.checkpoint_dir
        self.result_dir = args.result_dir
        self.log_dir = args.log_dir
        self.result_path = args.result_path
        self.model_path = args.model_path
        self.dataset_name = args.dataset_name
        self.gpus = args.gpus

        # Tensor dimensions.
        self.batch_size = args.batch_size
        self.n_inputs = args.n_inputs              # feature width of one time step
        self.n_steps = datasets.maxLength          # time steps per sequence
        self.n_hidden_units = args.n_hidden_units  # width of the GRU state
        self.n_classes = args.n_classes            # width of the label placeholder
        self.z_dim = args.z_dim                    # dimension of the noise vector
        self.gen_length = args.gen_length

        # Optimisation schedule; `lr` and `learning_rate` hold the same value.
        self.lr = args.lr
        self.learning_rate = args.lr
        self.beta1 = args.beta1
        self.epoch = args.epoch
        self.pretrain_epoch = args.pretrain_epoch
        self.impute_iter = args.impute_iter
        self.g_loss_lambda = args.g_loss_lambda
        # Number of critic iterations for one generator step.
        self.disc_iters = args.disc_iters
        # WGAN_GP parameter: the higher the value, the more stable but the
        # slower the convergence.
        self.lambd = 0.25

        # Separate recurrent cells for the discriminator and the generator.
        self.grud_cell_d = MyGRUCell15(self.n_hidden_units)
        self.grud_cell_g = MyGRUCell15(self.n_hidden_units)

        self.sample_num = 64  # number of generated samples to be saved

        self.num_batches = len(datasets.x) // self.batch_size

      
    def pretrainG(self, X, M, Delta,  Mean, Lastvalues, X_lengths, Keep_prob, is_training=True, reuse=False):
        """Run the generator GRU over whole sequences and predict every time step.

        The inputs are flattened to rows of `n_inputs`, a state-scaling vector
        exp(-max(0, Delta @ g_wr_h + g_br_h)) is appended to each row, and the
        result is fed to `self.grud_cell_g` through `dynamic_rnn`.  Every GRU
        output goes through dropout and a linear projection back to `n_inputs`.

        `M`, `Mean`, `Lastvalues` and `is_training` are accepted but not used.

        Returns:
            Tensor reshaped to [-1, n_steps, n_inputs] with the per-step predictions.
        """
        feat_dim = self.n_inputs
        hidden = self.n_hidden_units

        with tf.variable_scope("g_enerator", reuse=reuse):
            # The RNN cell's own variable scope is chosen by TensorFlow; for its
            # weights to be updated the enclosing scope name must contain
            # 'g_' or 'd_'.
            decay_w = tf.get_variable(
                "g_wr_h", shape=[feat_dim, hidden],
                initializer=tf.random_normal_initializer())
            out_w = tf.get_variable(
                "g_w_out", shape=[hidden, feat_dim],
                initializer=tf.random_normal_initializer())

            decay_b = tf.get_variable(
                "g_br_h", shape=[hidden, ],
                initializer=tf.constant_initializer(0.001))
            out_b = tf.get_variable(
                "g_b_out", shape=[feat_dim, ],
                initializer=tf.constant_initializer(0.001))

            # Not read in this method, but registered here so the variables
            # exist in the "g_enerator" scope when it is entered with reuse.
            tf.get_variable(
                "g_w_z", shape=[self.z_dim, feat_dim],
                initializer=tf.random_normal_initializer())
            tf.get_variable(
                "g_b_z", shape=[feat_dim, ],
                initializer=tf.constant_initializer(0.001))

            flat_x = tf.reshape(X, [-1, feat_dim])
            flat_delta = tf.reshape(Delta, [-1, feat_dim])

            # State-scaling vector, always in (0, 1].
            decay = tf.matmul(flat_delta, decay_w) + decay_b
            decay = math_ops.exp(-tf.maximum(0.0, decay))

            rnn_rows = tf.concat([flat_x, decay], 1)
            rnn_in = tf.reshape(rnn_rows, [-1, self.n_steps, feat_dim + hidden])

            zero_state = self.grud_cell_g.zero_state(self.batch_size, dtype=tf.float32)
            outputs, _ = tf.nn.dynamic_rnn(
                self.grud_cell_g, rnn_in,
                initial_state=zero_state,
                sequence_length=X_lengths,
                time_major=False)

            # outputs: batch_size * n_steps * n_hidden_units
            flat_out = tf.reshape(outputs, [-1, hidden])
            predicted = tf.matmul(tf.nn.dropout(flat_out, Keep_prob), out_w) + out_b
            return tf.reshape(predicted, [-1, self.n_steps, feat_dim])


    def discriminator(self, X, M, DeltaPre, Lastvalues ,DeltaSub ,SubValues , Mean,  X_lengths,Keep_prob, is_training=True, reuse=False, isTdata=True):
        """Score a batch of sequences with a GRU critic.

        Each row of `X` gets a state-scaling vector
        exp(-max(0, DeltaPre @ d_wr_h + d_br_h)) appended, the sequence is run
        through `self.grud_cell_d`, and the final state goes through dropout
        and a linear layer to a single logit.

        `Lastvalues`, `DeltaSub`, `SubValues`, `Mean`, `is_training` and
        `isTdata` are accepted but not used; `M` is reshaped but not consumed.

        Returns:
            `(out, out_logit)`: the sigmoid of the logit and the raw logit.
        """
        feat_dim = self.n_inputs
        hidden = self.n_hidden_units

        with tf.variable_scope("d_iscriminator", reuse=reuse):
            decay_w = tf.get_variable(
                "d_wr_h", shape=[feat_dim, hidden],
                initializer=tf.random_normal_initializer())
            out_w = tf.get_variable(
                "d_w_out", shape=[hidden, 1],
                initializer=tf.random_normal_initializer())
            decay_b = tf.get_variable(
                "d_br_h", shape=[hidden, ],
                initializer=tf.constant_initializer(0.001))
            out_b = tf.get_variable(
                "d_b_out", shape=[1, ],
                initializer=tf.constant_initializer(0.001))

            # The reshaped mask is not read below; the op is kept so the graph
            # matches what was built before.
            M = tf.reshape(M, [-1, feat_dim])
            flat_x = tf.reshape(X, [-1, feat_dim])
            flat_delta = tf.reshape(DeltaPre, [-1, feat_dim])

            # Time-decay vector, always in (0, 1].
            decay = tf.matmul(flat_delta, decay_w) + decay_b
            decay = math_ops.exp(-tf.maximum(0.0, decay))

            rnn_rows = tf.concat([flat_x, decay], 1)
            rnn_in = tf.reshape(rnn_rows, [self.batch_size, self.n_steps, feat_dim + hidden])

            zero_state = self.grud_cell_d.zero_state(self.batch_size, dtype=tf.float32)
            _, last_state = tf.nn.dynamic_rnn(
                self.grud_cell_d, rnn_in,
                initial_state=zero_state,
                sequence_length=X_lengths,
                time_major=False)

            out_logit = tf.matmul(tf.nn.dropout(last_state, Keep_prob), out_w) + out_b
            return tf.nn.sigmoid(out_logit), out_logit

    def generator(self, z, Keep_prob, is_training=True, reuse=False):
        """Generate a sequence of `n_steps` values from the noise `z`.

        `z` ([batch_size, z_dim]) is projected to `n_inputs` and fed to the
        generator GRU as the first step with an all-zero delta.  Every later
        step feeds the previous prediction back in, using the matching time
        slice of `self.imputed_deltapre` to build the state-scaling vector.
        The per-step predictions are concatenated along axis 1.

        Returns:
            A 7-tuple: the generated tensor, `self.imputed_deltapre`,
            `self.imputed_deltasub`, `self.imputed_m`, `self.x_lengths`, and
            two further copies of the generated tensor (multiplied by 1).
        """
        feat_dim = self.n_inputs
        hidden = self.n_hidden_units
        batch = self.batch_size

        with tf.variable_scope("g_enerator", reuse=reuse):
            decay_w = tf.get_variable(
                "g_wr_h", shape=[feat_dim, hidden],
                initializer=tf.random_normal_initializer())
            out_w = tf.get_variable(
                "g_w_out", shape=[hidden, feat_dim],
                initializer=tf.random_normal_initializer())
            decay_b = tf.get_variable(
                "g_br_h", shape=[hidden, ],
                initializer=tf.constant_initializer(0.001))
            out_b = tf.get_variable(
                "g_b_out", shape=[feat_dim, ],
                initializer=tf.constant_initializer(0.001))
            z_w = tf.get_variable(
                "g_w_z", shape=[self.z_dim, feat_dim],
                initializer=tf.random_normal_initializer())
            z_b = tf.get_variable(
                "g_b_z", shape=[feat_dim, ],
                initializer=tf.constant_initializer(0.001))

            # Change z's dimension: batch_size*z_dim --> batch_size*n_inputs.
            projected_z = tf.reshape(tf.matmul(z, z_w) + z_b, [-1, feat_dim])
            delta_zero = tf.constant(0.0, shape=[batch, feat_dim])

            # Initialize an all-zero state; every RNN call below covers one step.
            state = self.grud_cell_g.zero_state(batch, dtype=tf.float32)
            seq_len = tf.constant(1, shape=[batch])

            def _single_step(step_input, delta, prev_state):
                """Advance the GRU one step; return ([-1, 1, n_inputs] prediction, new state)."""
                decay = tf.matmul(delta, decay_w) + decay_b
                decay = math_ops.exp(-tf.maximum(0.0, decay))
                cell_rows = tf.concat([step_input, decay], 1)
                cell_in = tf.reshape(cell_rows, [-1, 1, feat_dim + hidden])

                outputs, next_state = tf.nn.dynamic_rnn(
                    self.grud_cell_g, cell_in,
                    initial_state=prev_state,
                    sequence_length=seq_len,
                    time_major=False)
                # outputs: batch_size*1*n_hidden
                flat_out = tf.reshape(outputs, [-1, hidden])
                # Fully connected projection back to the feature width.
                prediction = tf.matmul(tf.nn.dropout(flat_out, Keep_prob), out_w) + out_b
                return tf.reshape(prediction, [-1, 1, feat_dim]), next_state

            # First step: driven by the projected noise.
            step_pred, state = _single_step(projected_z, delta_zero, state)
            total_result = tf.multiply(step_pred, 1.0)

            # Remaining steps: feed the previous prediction back as input.
            for i in range(1, self.n_steps):
                previous = tf.reshape(step_pred, [batch, feat_dim])
                delta_i = tf.reshape(
                    self.imputed_deltapre[:, i:(i + 1), :], [batch, feat_dim])
                step_pred, state = _single_step(previous, delta_i, state)
                total_result = tf.concat([total_result, step_pred], 1)

            if self.isbatch_normal:
                with tf.variable_scope("g_bn", reuse=tf.AUTO_REUSE):
                    total_result = bn(total_result, is_training=is_training, scope="g_bn_imple")

            last_values = tf.multiply(total_result, 1)
            sub_values = tf.multiply(total_result, 1)

            return (total_result, self.imputed_deltapre, self.imputed_deltasub,
                    self.imputed_m, self.x_lengths, last_values, sub_values)
        
    def impute(self):
        """Return the variable "z_needtune" of shape [batch_size, z_dim].

        The variable lives in the "impute" scope (AUTO_REUSE, so repeated
        calls return the same variable) and is initialised from a normal
        distribution with mean 0 and stddev 0.1.
        """
        latent_shape = [self.batch_size, self.z_dim]
        latent_init = tf.random_normal_initializer(mean=0, stddev=0.1)
        with tf.variable_scope("impute", reuse=tf.AUTO_REUSE):
            return tf.get_variable("z_needtune", shape=latent_shape, initializer=latent_init)
            
    def build_model(self):
        """Build the full graph: placeholders, losses, optimizers, clip ops and summaries.

        Statement order matters here.  `pretrainG` is the first entry into the
        "g_enerator" scope (reuse=False) and the first `discriminator` call is
        the first entry into "d_iscriminator"; every later `generator` /
        `discriminator` call enters those scopes with reuse=True.

        Attributes created for later use include the placeholders, the losses
        (`pretrain_loss`, `d_loss`, `g_loss`, `impute_loss`), the training ops
        (`d_optim`, `g_optim`, `g_pre_optim`, `impute_optim`), the clip op
        lists, the imputation outputs (`impute_out`, `imputed`), the test
        outputs (`fake_x`, `fake_delta`) and the summary ops.
        """
        
        # Placeholders.  Those declared with `self.batch_size` as the leading
        # dimension require exactly that many rows in every feed.
        self.keep_prob = tf.placeholder(tf.float32) 
        self.x = tf.placeholder(tf.float32, [None, self.n_steps, self.n_inputs])
        # `self.y` is not referenced again in this method.
        self.y = tf.placeholder(tf.float32, [None, self.n_classes])
        self.m = tf.placeholder(tf.float32, [None, self.n_steps, self.n_inputs])
        self.mean = tf.placeholder(tf.float32, [self.n_inputs,])
        self.deltaPre = tf.placeholder(tf.float32, [None, self.n_steps, self.n_inputs])
        self.lastvalues = tf.placeholder(tf.float32, [None, self.n_steps, self.n_inputs])
        self.deltaSub = tf.placeholder(tf.float32, [None, self.n_steps, self.n_inputs])
        self.subvalues = tf.placeholder(tf.float32, [None, self.n_steps, self.n_inputs])
        self.x_lengths = tf.placeholder(tf.int32,  shape=[self.batch_size,])
        self.imputed_deltapre=tf.placeholder(tf.float32,  shape=[self.batch_size,self.n_steps,self.n_inputs])
        self.imputed_deltasub=tf.placeholder(tf.float32,  shape=[self.batch_size,self.n_steps,self.n_inputs])
        self.imputed_m = tf.placeholder(tf.float32, [None, self.n_steps, self.n_inputs])
        self.z = tf.placeholder(tf.float32, [self.batch_size, self.z_dim], name='z')

        
        

        """ Loss Function """

        # Creates the generator variables (reuse=False); must precede every
        # `generator(...)` call, which looks them up with reuse=True.
        Pre_out=self.pretrainG(self.x, self.m, self.deltaPre,  self.mean,\
                                                      self.lastvalues, self.x_lengths,self.keep_prob, \
                                                      is_training=True, reuse=False)
        
        # Squared error between the masked prediction and x, summed over all
        # entries and divided by the total of the sequence lengths.
        self.pretrain_loss=tf.reduce_sum(tf.square(tf.multiply(Pre_out,self.m)-self.x)) / tf.cast(tf.reduce_sum(self.x_lengths),tf.float32)
        
        #discriminator( X, M, DeltaPre, Lastvalues ,DeltaSub ,SubValues , Mean,  X_lengths,Keep_prob, is_training=True, reuse=False, isTdata=True):
        
        # Critic on real data; this call creates the discriminator variables.
        D_real, D_real_logits = self.discriminator(self.x, self.m, self.deltaPre,self.lastvalues,\
                                                   self.deltaSub,self.subvalues,  self.mean,\
                                                       self.x_lengths,self.keep_prob, \
                                                      is_training=True, reuse=False,isTdata=True)

        #G return total_result,self.imputed_deltapre,self.imputed_deltasub,self.imputed_m,self.x_lengths,last_values,sub_values
        g_x,g_deltapre,g_deltasub,g_m,G_x_lengths,g_last_values,g_sub_values = self.generator(self.z,self.keep_prob, is_training=True, reuse=True)
        
        # Critic on generated data, sharing the discriminator variables.
        D_fake, D_fake_logits = self.discriminator(g_x,g_m,g_deltapre,g_last_values,\
                                                   g_deltasub,g_sub_values,self.mean,\
                                                      G_x_lengths,self.keep_prob,
                                                      is_training=True, reuse=True ,isTdata=False)
        
        """
        impute loss
        """
        # Trainable noise that the imputation optimizer tunes; the generator
        # and critic weights are shared with the branches above.
        self.z_need_tune=self.impute()
        
        impute_out,impute_deltapre,impute_deltasub,impute_m,impute_x_lengths,impute_last_values,impute_sub_values=self.generator(self.z_need_tune,self.keep_prob, is_training=False, reuse=True)
        
        
        impute_fake, impute_fake_logits = self.discriminator(impute_out,impute_m,impute_deltapre,impute_last_values,\
                                                             impute_deltasub,impute_sub_values,self.mean,\
                                                      impute_x_lengths,self.keep_prob,
                                                      is_training=False, reuse=True ,isTdata=False)
        
        # loss for imputation: mean squared error of the masked output against
        # x, minus g_loss_lambda times the mean critic logit.
        
        self.impute_loss=tf.reduce_mean(tf.square(tf.multiply(impute_out,self.m)-self.x))-self.g_loss_lambda*tf.reduce_mean(impute_fake_logits)
        
        self.impute_out=impute_out
        
        #the imputed results: generated values weighted by (1 - m), plus x.
        # NOTE(review): presumably m is 1 at observed positions and x is 0 at
        # missing ones, so this fills only the gaps -- confirm against the data loader.
        self.imputed=tf.multiply((1-self.m),self.impute_out)+self.x
        # get loss for discriminator
        d_loss_real = - tf.reduce_mean(D_real_logits)
        d_loss_fake = tf.reduce_mean(D_fake_logits)


        self.d_loss = d_loss_real + d_loss_fake

        # get loss for generator
        self.g_loss = - d_loss_fake
        

        """ Training """
        # divide trainable variables into a group for D and a group for G.
        # Membership is a plain substring test on the variable name, so any
        # name containing 'd_' or 'g_' anywhere is selected.
        t_vars = tf.trainable_variables()
        d_vars = [var for var in t_vars if 'd_' in var.name]
        g_vars = [var for var in t_vars if 'g_' in var.name]
        z_vars = [self.z_need_tune]
        '''
        print("d vars:")
        for v in d_vars:
            print(v.name)
        print("g vars:")
        for v in g_vars:
            print(v.name)
        print("z vars:")
        for v in z_vars:
            print(v.name)
        '''
        
        # No explicit L2 regularization is added because dropout is used; the
        # disabled code below is kept as a string for reference.
        """
        ld = 0.0
        for w in d_vars:
            ld += tf.contrib.layers.l2_regularizer(1e-4)(w)
        lg = 0.0
        for w in g_vars:
            lg += tf.contrib.layers.l2_regularizer(1e-4)(w)
        
        self.d_loss+=ld
        self.g_loss+=lg
        """
        
        # optimizers
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        # The model can use batch normalization, so the three training ops below
        # are made to depend on whatever UPDATE_OPS have been collected so far.
            self.d_optim = tf.train.AdamOptimizer(self.learning_rate, beta1=self.beta1) \
                        .minimize(self.d_loss, var_list=d_vars)
            #self.d_optim=self.optim(self.learning_rate, self.beta1,self.d_loss,d_vars)
            # The generator uses a learning rate scaled by disc_iters.
            self.g_optim = tf.train.AdamOptimizer(self.learning_rate*self.disc_iters, beta1=self.beta1) \
                        .minimize(self.g_loss, var_list=g_vars)
            #self.g_optim=self.optim(self.learning_rate, self.beta1,self.g_loss,g_vars)
            self.g_pre_optim=tf.train.AdamOptimizer(self.learning_rate*2,beta1=self.beta1) \
                        .minimize(self.pretrain_loss,var_list=g_vars)
        # Built outside the control-dependency block; updates only z_need_tune.
        self.impute_optim=tf.train.AdamOptimizer(self.learning_rate*7,beta1=self.beta1) \
                    .minimize(self.impute_loss,var_list=z_vars)
    
        
        

        #clip weight: assign ops that clamp each variable into [-0.99, 0.99].
        # `clip_all_vals` covers every trainable variable collected above,
        # which includes z_need_tune.
        self.clip_all_vals = [p.assign(tf.clip_by_value(p, -0.99, 0.99)) for p in t_vars]
        self.clip_D = [p.assign(tf.clip_by_value(p, -0.99, 0.99)) for p in d_vars]
        self.clip_G = [p.assign(tf.clip_by_value(p, -0.99, 0.99)) for p in g_vars]
        
        
        """" Testing """
        # for test: another generator branch on self.z with is_training=False.
        self.fake_x,self.fake_delta,_,_,_,_,_ = self.generator(self.z, self.keep_prob, is_training=False, reuse=True)

        """ Summary """
        d_loss_real_sum = tf.summary.scalar("d_loss_real", d_loss_real)
        d_loss_fake_sum = tf.summary.scalar("d_loss_fake", d_loss_fake)
        d_loss_sum = tf.summary.scalar("d_loss", self.d_loss)
        g_loss_sum = tf.summary.scalar("g_loss", self.g_loss)
        g_pretrain_loss_sum=tf.summary.scalar("g_pretrain_loss", self.pretrain_loss)
        # final summary operations
        self.impute_sum=tf.summary.scalar("impute_loss", self.impute_loss)
        self.g_sum = g_loss_sum
        self.g_pretrain_sum=tf.summary.merge([g_pretrain_loss_sum])
        self.d_sum = tf.summary.merge([d_loss_real_sum,d_loss_fake_sum, d_loss_sum])
        
    def optim(self,learning_rate,beta,loss,var):
        """Build an Adam training op whose gradients are clipped by norm.

        Args:
            learning_rate: learning rate passed to the Adam optimizer.
            beta: value used for Adam's `beta1`.
            loss: tensor to minimise.
            var: variables to compute gradients for.

        Returns:
            The op that applies the gradients; each gradient that is not
            None is first passed through `tf.clip_by_norm(grad, 5)`.
        """
        adam = tf.train.AdamOptimizer(learning_rate, beta1=beta)
        grads_and_vars = adam.compute_gradients(loss, var_list=var)
        clipped = [
            (grad if grad is None else tf.clip_by_norm(grad, 5), variable)
            for grad, variable in grads_and_vars
        ]
        return adam.apply_gradients(clipped)
    def pretrain(self, start_epoch,counter,start_time):
        """Pretrain the generator on the reconstruction loss.

        For every epoch in [start_epoch, self.pretrain_epoch) the dataset is
        shuffled and `self.g_pre_optim` is run on each batch with a keep
        probability of 0.5.  The pretrain summary of each batch is written
        under a running step number and a status line is printed.

        Args:
            start_epoch: first epoch index to run; no work is done when it is
                not below `self.pretrain_epoch`.
            counter: summary step used for the first processed batch.
            start_time: reference timestamp in seconds for the elapsed-time print.

        Returns:
            The step counter after the last processed batch (equal to
            `counter` when nothing ran), so the caller can keep numbering
            summaries without reusing steps already written here.
        """
        for epoch in range(start_epoch, self.pretrain_epoch):
            # get batch data
            self.datasets.shuffle(self.batch_size, True)
            for idx, batch in enumerate(self.datasets.nextBatch()):
                # Batch layout: x, y, mean, m, deltaPre, x_lengths, lastvalues,
                # files, imputed_deltapre, imputed_m, deltaSub, subvalues,
                # imputed_deltasub
                (data_x, data_y, data_mean, data_m, data_deltaPre,
                 data_x_lengths, data_lastvalues, _, imputed_deltapre,
                 imputed_m, deltaSub, subvalues, imputed_deltasub) = batch

                feed = {self.x: data_x,
                        self.m: data_m,
                        self.deltaPre: data_deltaPre,
                        self.mean: data_mean,
                        self.x_lengths: data_x_lengths,
                        self.lastvalues: data_lastvalues,
                        self.deltaSub: deltaSub,
                        self.subvalues: subvalues,
                        self.imputed_m: imputed_m,
                        self.imputed_deltapre: imputed_deltapre,
                        self.imputed_deltasub: imputed_deltasub,
                        self.keep_prob: 0.5}

                # pretrain
                _, summary_str, p_loss = self.sess.run(
                    [self.g_pre_optim, self.g_pretrain_sum, self.pretrain_loss],
                    feed_dict=feed)
                self.writer.add_summary(summary_str, counter)
                counter += 1

                # display training status
                print("Epoch: [%2d] [%4d/%4d] time: %4.4f, pretrain_loss: %.8f" \
                      % (epoch, idx, self.num_batches, time.time() - start_time, p_loss))

        # Hand the advanced step back instead of silently dropping it.
        return counter


    def train(self):

        # graph inputs for visualize training results
        self.sample_z = np.random.standard_normal(size=(self.batch_size , self.z_dim))

        # saver to save model
        self.saver = tf.train.Saver()

        # summary writer
        self.writer = tf.summary.FileWriter(self.log_dir + '/' + self.model_name+'/'+self.model_dir)

        # restore check-point if it exits
        could_load, checkpoint_counter = self.load(self.checkpoint_dir)
        if could_load:
            start_epoch = (int)(checkpoint_counter / self.num_batches)
            #start_batch_id = checkpoint_counter - start_epoch * self.num_batches
            start_batch_id=0
            #counter = checkpoint_counter
            counter=start_epoch*self.num_batches
            print(" [*] Load SUCCESS")
            return 
        else:
            # initialize all variables
            tf.global_variables_initializer().run()
            start_epoch = 0
            start_batch_id = 0
            counter = 1
            print(" [!] Load failed...")

        # loop for epoch
        start_time = time.time()
        
        self.pretrain(start_epoch,counter,start_time)
        if start_epoch < self.pretrain_epoch:
            start_epoch=self.pretrain_epoch
        
        for epoch in range(start_epoch, self.epoch):

            # get batch data
            self.datasets.shuffle(self.batch_size,True)
            idx=0
            for data_x,data_y,data_mean,data_m,data_deltaPre,data_x_lengths,data_lastvalues,_,imputed_deltapre,imputed_m,deltaSub,subvalues,imputed_deltasub in self.datasets.nextBatch():
                
                batch_z = np.random.standard_normal(size=(self.batch_size, self.z_dim))
                #_ = self.sess.run(self.clip_D)
                _ = self.sess.run(self.clip_all_vals)
                _, summary_str, d_loss = self.sess.run([self.d_optim, self.d_sum, self.d_loss],
                                               feed_dict={self.z: batch_z,
                                                          self.x: data_x,
                                                          self.m: data_m,
                                                          self.deltaPre: data_deltaPre,
                                                          self.mean: data_mean,
                                                          self.x_lengths: data_x_lengths,
                                                          self.lastvalues: data_lastvalues,
                                                          self.deltaSub:deltaSub,
                                                          self.subvalues:subvalues,
                                                          self.imputed_m:imputed_m,
                                                          self.imputed_deltapre:imputed_deltapre,
                                                          self.imputed_deltasub:imputed_deltasub,
                                                          self.keep_prob: 0.5})
                self.writer.add_summary(summary_str, counter)

                # update G network
                if counter%self.disc_iters==0:
                    #batch_z = np.random.normal(0, 1, [self.batch_size, self.z_dim]).astype(np.float32)
                    _, summary_str, g_loss = self.sess.run([self.g_optim, self.g_sum, self.g_loss], 
                                                           feed_dict={self.z: batch_z,
                                                           self.keep_prob: 0.5,
                                                           self.deltaPre: data_deltaPre,
                                                           self.mean: data_mean,
                                                           self.x_lengths: data_x_lengths,
                                                           self.lastvalues: data_lastvalues,
                                                           self.deltaSub:deltaSub,
                                                           self.subvalues:subvalues,
                                                           self.imputed_m:imputed_m,
                                                           self.imputed_deltapre:imputed_deltapre,
                                                           self.imputed_deltasub:imputed_deltasub,
                                                           self.mean: data_mean})
                    self.writer.add_summary(summary_str, counter)
                    print("Epoch: [%2d] [%4d/%4d] time: %4.4f, d_loss: %.8f, g_loss: %.8f,counter:%4d" \
                      % (epoch, idx, self.num_batches, time.time() - start_time, d_loss, g_loss,counter))
                    #debug 

                counter += 1

                # display training status
                print("Epoch: [%2d] [%4d/%4d] time: %4.4f, d_loss: %.8f, counter:%4d" \
                      % (epoch, idx, self.num_batches, time.time() - start_time, d_loss, counter))

                # save training results for every 300 steps
                if np.mod(counter, 300) == 0 :
                    fake_x,fake_delta = self.sess.run([self.fake_x,self.fake_delta],
                                            feed_dict={self.z: batch_z,
                                                       self.deltaPre: data_deltaPre,
                                                       self.mean: data_mean,
                                                       self.x_lengths: data_x_lengths,
                                                       self.lastvalues: data_lastvalues,
                                                       self.deltaSub:deltaSub,
                                                       self.subvalues:subvalues,
                                                       self.imputed_m:imputed_m,
                                                       self.imputed_deltapre:imputed_deltapre,
                                                       self.imputed_deltasub:imputed_deltasub,
                                                       self.mean: data_mean,
                                                       self.keep_prob: 0.5})
                    if self.run_type=="train":
                        self.writeG_Samples("G_sample_x",counter,fake_x)
                        self.writeG_Samples("G_sample_delta",counter,fake_delta)
                    
                idx+=1
            # After an epoch, start_batch_id is set to zero
            # non-zero value is only for the first epoch after loading pre-trained model
            start_batch_id = 0

        
        self.save(self.checkpoint_dir, counter)

    def imputation(self,dataset,isTrain):
        """Tune the latent input per batch and write the resulting imputations.

        The dataset is shuffled with ``self.batch_size``.  For every batch the
        raw ``data_x`` is dumped to ``588/batch<id>x.npy``, ``self.z_need_tune``
        is re-initialised, and ``self.impute_optim`` is run ``self.impute_iter``
        times with ``keep_prob`` fixed at 1.0.  Every tenth global step the loss
        is printed and the summary is logged.  The ``imputed`` value of the last
        step is handed to ``save_imputation``.

        Args:
            dataset: object exposing ``shuffle(batch_size, flag)`` and a
                ``nextBatch()`` iterator yielding 13-tuples.
            isTrain: forwarded to ``save_imputation`` to select the output folder.
        """
        self.datasets = dataset
        self.datasets.shuffle(self.batch_size, True)
        tf.variables_initializer([self.z_need_tune]).run()

        t0 = time.time()
        global_step = 1
        for batchid, batch in enumerate(self.datasets.nextBatch(), start=1):
            (data_x, data_y, data_mean, data_m, data_deltaPre, data_x_lengths,
             data_lastvalues, _, imputed_deltapre, imputed_m, deltaSub,
             subvalues, imputed_deltasub) = batch

            np.save("588/batch"+str(batchid)+"x.npy", data_x)

            # Start every batch from a fresh latent variable.
            tf.variables_initializer([self.z_need_tune]).run()

            # Inputs are constant for the whole tuning loop of this batch.
            fetches = [self.impute_optim, self.impute_out, self.impute_sum,
                       self.impute_loss, self.imputed]
            feed = {
                self.x: data_x,
                self.m: data_m,
                self.deltaPre: data_deltaPre,
                self.mean: data_mean,
                self.x_lengths: data_x_lengths,
                self.lastvalues: data_lastvalues,
                self.deltaSub: deltaSub,
                self.subvalues: subvalues,
                self.imputed_m: imputed_m,
                self.imputed_deltapre: imputed_deltapre,
                self.imputed_deltasub: imputed_deltasub,
                self.keep_prob: 1.0,
            }

            tune_step = 1
            for _unused in range(self.impute_iter):
                _, impute_out, summary_str, impute_loss, imputed = self.sess.run(fetches, feed_dict=feed)
                tune_step += 1
                global_step += 1
                if global_step % 10 != 0:
                    continue
                print("Batchid: [%2d] [%4d/%4d] time: %4.4f, impute_loss: %.8f" \
                      % (batchid, tune_step, self.impute_iter, time.time() - t0, impute_loss))
                self.writer.add_summary(summary_str, global_step/10)

            # Only the result of the final tuning step is persisted.
            self.save_imputation(imputed, batchid, data_x_lengths, data_deltaPre, data_y, isTrain)
    @property
    def model_dir(self):
        """Underscore-joined hyperparameter tag used as a directory name.

        Field order: epoch, disc_iters, batch_size, z_dim, lr, impute_iter,
        isNormal, isbatch_normal, isSlicing, g_loss_lambda, beta1.
        """
        fields = (
            self.epoch, self.disc_iters,
            self.batch_size, self.z_dim,
            self.lr, self.impute_iter,
            self.isNormal, self.isbatch_normal,
            self.isSlicing, self.g_loss_lambda,
            self.beta1,
        )
        # format(v) applies the same empty format spec as "{}".format(v).
        return "_".join(format(value) for value in fields)


    def save_imputation(self,impute_out,batchid,data_x_lengths,data_times,data_y,isTrain):
        """Write one batch of imputed series and its time deltas to text files.

        Two files are created under
        ``<imputation_dir>/<model_name>/<model_dir>/``:

        * ``batch<batchid>x``     - first line: the series lengths, comma
          terminated; then, per series, a ``begin`` line, one comma-terminated
          line per time step, and an ``end`` line.
        * ``batch<batchid>delta`` - the same ``begin``/``end`` layout for
          ``data_times`` (no header line).

        Args:
            impute_out: iterable of series; every time step must expose
                ``.flat`` (e.g. a numpy array).
            batchid: batch number used in the file names.
            data_x_lengths: iterable of series lengths for the header line.
            data_times: nested iterable (series -> step -> values).
            data_y: unused; kept so existing callers keep working.
            isTrain: True writes below ``588/imputation_train_results``,
                False below ``imputation_test_results``.
        """
        if isTrain:
            imputation_dir="588/imputation_train_results"
        else:
            imputation_dir="imputation_test_results"

        out_dir=os.path.join(imputation_dir, self.model_name, self.model_dir)
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(out_dir, exist_ok=True)

        # Context managers guarantee the handles are closed even if a write fails.
        with open(os.path.join(out_dir, "batch"+str(batchid)+"x"),'w') as resultFile:
            resultFile.write("".join(str(length)+"," for length in data_x_lengths))
            resultFile.write("\r\n")
            # impute_out: ndarray
            for oneSeries in impute_out:
                resultFile.write("begin\r\n")
                for oneClass in oneSeries:
                    resultFile.write("".join(str(i)+"," for i in oneClass.flat))
                    resultFile.write("\r\n")
                resultFile.write("end\r\n")

        # data_times: list
        with open(os.path.join(out_dir, "batch"+str(batchid)+"delta"),'w') as resultFile:
            for oneSeries in data_times:
                resultFile.write("begin\r\n")
                for oneClass in oneSeries:
                    resultFile.write("".join(str(i)+"," for i in oneClass))
                    resultFile.write("\r\n")
                resultFile.write("end\r\n")
        
        
    def writeG_Samples(self,filename,step,o):
        """Dump generator samples to ``G_results/<model_name>/<model_dir>/<filename><step>``.

        Each series is written as a ``begin`` line, one comma-terminated line
        per time step, and an ``end`` line.

        Args:
            filename: file name prefix.
            step: value appended to ``filename`` (converted with ``str``).
            o: iterable of series; every time step must expose ``.flat``
                (e.g. a numpy array).
        """
        out_dir=os.path.join("G_results", self.model_name, self.model_dir)
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(out_dir, exist_ok=True)
        # The context manager closes the file even if a write raises.
        with open(os.path.join(out_dir, filename+str(step)),'w') as resultFile:
            for oneSeries in o:
                resultFile.write("begin\r\n")
                for oneClass in oneSeries:
                    resultFile.write("".join(str(i)+"," for i in oneClass.flat))
                    resultFile.write("\r\n")
                resultFile.write("end\r\n")
    
    def save(self, checkpoint_dir, step):
        """Save the session through ``self.saver`` under the run's checkpoint folder.

        The checkpoint prefix is
        ``<checkpoint_dir>/<model_name>/<model_dir>/<model_name>.model`` and
        ``step`` is passed on as ``global_step``.
        """
        checkpoint_dir = os.path.join(checkpoint_dir, self.model_name, self.model_dir )

        # exist_ok replaces the racy exists()-then-makedirs() sequence.
        os.makedirs(checkpoint_dir, exist_ok=True)

        self.saver.save(self.sess,os.path.join(checkpoint_dir, self.model_name+'.model'), global_step=step)

    def load(self, checkpoint_dir):
        """Restore the latest checkpoint of this run, if one exists.

        Looks in ``<checkpoint_dir>/<model_name>/<model_dir>``.

        Returns:
            ``(True, counter)`` after a successful restore, where ``counter`` is
            the last run of digits in the checkpoint file name, or
            ``(False, 0)`` when no checkpoint state is found.
        """
        import re
        print(" [*] Reading checkpoints...")
        checkpoint_dir = os.path.join(checkpoint_dir, self.model_name, self.model_dir)

        ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            self.saver.restore(self.sess, os.path.join(checkpoint_dir, ckpt_name))
            # Raw string: "\d" in a normal literal is an invalid escape sequence
            # and triggers a warning on current Python versions.
            # The pattern matches the last group of digits in the name.
            counter = int(next(re.finditer(r"(\d+)(?!.*\d)",ckpt_name)).group(0))
            print(" [*] Success to read {}".format(ckpt_name))
            return True, counter
        else:
            print(" [*] Failed to find a checkpoint")
            return False, 0

In [6]:
class ReadPhysionetData():
    """Load single-feature glucose series from text files and serve padded batches.

    Construction reads the files listed under ``dataPath`` (one
    ``timestamp,value`` record per line) and derives, per series:

    * ``x``          - values per time step (z-score normalised if ``isNormal``)
    * ``m``          - observation mask; a raw value of 0 counts as missing
    * ``times``      - minutes elapsed since 2021-01-01 00:00:00
    * ``deltaPre`` / ``lastvalues`` - elapsed time since, and value of, the
      most recent observation looking backwards
    * ``deltaSub`` / ``subvalues``  - the same looking forwards
    * ``x_lengths`` / ``maxLength`` - series lengths and their maximum

    Mean, std and count per feature are also written to ``588/meanAndstd``.

    Usage: call ``shuffle()`` first (it sets the batch size), then iterate
    ``nextBatch()``.
    """

    _TIME_FORMAT='%Y-%m-%d %H:%M:%S'
    # Reference instant that timestamps are measured from.
    _TIME_ORIGIN='2021-01-01 00:00:00'

    def __init__(self, dataPath, isNormal):
        """Read all series below ``dataPath`` and precompute masks and deltas.

        Args:
            dataPath: directory holding the per-series text files.
            isNormal: when truthy, observed values are z-score normalised.
        """
        # The listing now comes from ``dataPath`` itself; previously a
        # machine-specific absolute path was listed while the files were opened
        # from ``dataPath``.
        # NOTE(review): the first directory entry is skipped, as before -
        # presumably a non-data entry such as a checkpoint folder; confirm.
        fileNames=os.listdir(dataPath)[1:]
        # NOTE(review): fixed list of 10 placeholder labels, independent of the
        # number of files.
        labels=[1,0]*5

        self.dataPath = dataPath
        self.fileNames = fileNames

        dic={'time':-1,'glucose_value':0}

        self.dic=dic
        n_features=len(dic)-1
        mean=[0.0]*n_features
        meancount=[0]*n_features
        x=[]
        times=[]
        non_in_dic_count=0
        feature='glucose_value'
        # Parsed once instead of once per input line.
        origin=datetime.strptime(self._TIME_ORIGIN, self._TIME_FORMAT)
        # times: totalFilesLength*steps
        # x: totalFilesLength*steps*feature_length
        for fileName in fileNames:
            lastTime=0
            totalData=[]
            t_times=[]
            # Context manager closes the file even when a line fails to parse.
            with open(os.path.join(self.dataPath, fileName)) as f:
                for line in f:
                    words=line.strip().split(",")
                    timestamp=words[0]
                    value=words[1]

                    if timestamp!=lastTime:
                        # New time step: record elapsed minutes and the value.
                        data=[0.0]*n_features
                        t_times.append(float((datetime.strptime(timestamp, self._TIME_FORMAT) - origin).total_seconds())/60)
                        data[self.dic[feature]]=float(value)
                        totalData.append(data)
                    else:
                        # Repeated timestamp: overwrite the value of that step.
                        totalData[-1][self.dic[feature]]=float(value)

                    # NOTE(review): every parsed value (including 0, which the
                    # mask below treats as missing) contributes to the mean and
                    # to the count - confirm this is intended.
                    mean[self.dic[feature]]+=float(value)
                    meancount[self.dic[feature]]+=1
                    lastTime=timestamp

            x.append(totalData)
            times.append(t_times)

        self.x=x
        self.y=labels
        self.times=times

        print(mean)
        print(meancount)
        for i in range(len(mean)):
            if meancount[i]!=0:
                mean[i]=mean[i]/meancount[i]
        self.mean=mean

        # mask 0/1 and standard deviation, both computed on the raw values
        m=[]
        self.std=[0.0]*n_features
        for onefile in self.x:
            one_m=[]
            for oneclass in onefile:
                t_m=[0]*len(oneclass)
                for j in range(len(oneclass)):
                    if oneclass[j] !=0:
                        self.std[j]+=(oneclass[j]-self.mean[j])**2
                        t_m[j]=1
                one_m.append(t_m)
            m.append(one_m)
        for j in range(len(self.std)):
            if meancount[j]>1:
                self.std[j]=math.sqrt(1.0/(meancount[j]-1)*self.std[j])
            else:
                # Fewer than two values: no sample std exists (this used to
                # divide by zero). normalization() maps std==0 to 0.0 values.
                self.std[j]=0.0

        self.isNormal=isNormal
        self.normalization(isNormal)

        x_lengths=[]
        deltaPre=[] # time since the previous observation
        lastvalues=[] # last observed value at or before each step
        deltaSub=[] # time until the next observation
        subvalues=[] # next observed value at or after each step
        for h in range(len(self.x)):
            # oneFile: steps*value_number
            oneFile=self.x[h]
            one_time=self.times[h]
            x_lengths.append(len(oneFile))
            one_m=m[h]

            # Forward pass.
            one_deltaPre=[]
            one_lastvalues=[]
            for i in range(len(oneFile)):
                one_deltaPre.append([0.0]*len(oneFile[i]))
                one_lastvalues.append([0.0]*len(oneFile[i]))

                if i==0:
                    for j in range(len(oneFile[i])):
                        one_lastvalues[i][j]=0.0 if one_m[i][j]==0 else oneFile[i][j]
                    continue
                for j in range(len(oneFile[i])):
                    if one_m[i-1][j]==1:
                        one_deltaPre[i][j]=one_time[i]-one_time[i-1]
                    if one_m[i-1][j]==0:
                        # Previous step missing: accumulate its gap as well.
                        one_deltaPre[i][j]=one_time[i]-one_time[i-1]+one_deltaPre[i-1][j]

                    if one_m[i][j]==1:
                        one_lastvalues[i][j]=oneFile[i][j]
                    if one_m[i][j]==0:
                        one_lastvalues[i][j]=one_lastvalues[i-1][j]

            # Backward pass (mirror image of the forward pass).
            one_deltaSub=[[0.0]*len(step) for step in oneFile]
            one_subvalues=[[0.0]*len(step) for step in oneFile]
            for i in range(len(oneFile)-1,-1,-1):
                if i==len(oneFile)-1:
                    for j in range(len(oneFile[i])):
                        one_subvalues[i][j]=0.0 if one_m[i][j]==0 else oneFile[i][j]
                    continue
                for j in range(len(oneFile[i])):
                    if one_m[i+1][j]==1:
                        one_deltaSub[i][j]=one_time[i+1]-one_time[i]
                    if one_m[i+1][j]==0:
                        one_deltaSub[i][j]=one_time[i+1]-one_time[i]+one_deltaSub[i+1][j]

                    if one_m[i][j]==1:
                        one_subvalues[i][j]=oneFile[i][j]
                    if one_m[i][j]==0:
                        one_subvalues[i][j]=one_subvalues[i+1][j]

            deltaPre.append(one_deltaPre)
            lastvalues.append(one_lastvalues)
            deltaSub.append(one_deltaSub)
            subvalues.append(one_subvalues)
        self.m=m
        self.deltaPre=deltaPre
        self.lastvalues=lastvalues
        self.deltaSub=deltaSub
        self.subvalues=subvalues
        self.x_lengths=x_lengths
        self.maxLength=max(x_lengths)

        print("max_length is : "+str(self.maxLength))
        print("non_in_dic_count is : "+str(non_in_dic_count))

        # Create the output folder on demand instead of failing after all the
        # preprocessing work when it does not exist yet.
        os.makedirs("588/", exist_ok=True)
        with open(os.path.join("588/","meanAndstd"),'w') as resultFile:
            for i in range(len(self.mean)):
                resultFile.write(str(self.mean[i])+","+str(self.std[i])+","+str(meancount[i])+"\r\n")

    def normalization(self,isNormal):
        """Z-score normalise the observed (non-zero) values of ``self.x`` in place.

        Does nothing when ``isNormal`` is falsy. Features whose std is 0 are
        set to 0.0.
        """
        if not isNormal:
            return
        for onefile in self.x:
            for oneclass in onefile:
                for j in range(len(oneclass)):
                    if oneclass[j] !=0:
                        if self.std[j]==0:
                            oneclass[j]=0.0
                        else:
                            oneclass[j]=1.0/self.std[j]*(oneclass[j]-self.mean[j])

        # progress marker, kept so the printed output stays the same
        print('22')

    def nextBatch(self):
        """Yield consecutive batches of ``self.batchSize`` series, zero-padded to ``maxLength``.

        Each item is the 13-tuple
        ``(x, y, mean, m, deltaPre, x_lengths, lastvalues, files,
        imputed_deltapre, imputed_m, deltaSub, subvalues, imputed_deltasub)``.
        ``mean`` is all zeros when the data was normalised. A trailing
        incomplete batch is dropped.

        Note: padding is appended to the stored per-series lists themselves, so
        ``self.x``, ``self.m`` and the delta/value lists stay padded after the
        first pass; ``self.x_lengths`` keeps the true lengths.
        """
        # progress marker, kept so the printed output stays the same
        print('33')
        n_features=len(self.dic)-1
        i=1
        while i*self.batchSize<=len(self.x):
            x=[]
            y=[]
            m=[]
            deltaPre=[]
            x_lengths=[]
            lastvalues=[]
            deltaSub=[]
            subvalues=[]
            imputed_deltapre=[]
            imputed_m=[]
            imputed_deltasub=[]
            mean=self.mean
            files=[]

            start=(i-1)*self.batchSize
            stop=i*self.batchSize
            for j in range(start,stop):
                files.append(self.fileNames[j])

                x.append(self.x[j])
                y.append(self.y[j])
                m.append(self.m[j])
                deltaPre.append(self.deltaPre[j])
                deltaSub.append(self.deltaSub[j])

                x_lengths.append(self.x_lengths[j])
                lastvalues.append(self.lastvalues[j])
                subvalues.append(self.subvalues[j])

                # position of series j inside this batch
                jj=j-start
                while len(x[jj])<self.maxLength:
                    x[jj].append([0.0]*n_features)
                    m[jj].append([0]*n_features)
                    deltaPre[jj].append([0.0]*n_features)
                    lastvalues[jj].append([0.0]*n_features)
                    deltaSub[jj].append([0.0]*n_features)
                    subvalues[jj].append([0.0]*n_features)

            for j in range(start,stop):
                length=self.x_lengths[j]
                series_times=self.times[j]
                one_imputed_deltapre=[]
                one_imputed_deltasub=[]
                one_G_m=[]

                for h in range(0,length):
                    # Gap to the neighbouring steps; 0.0 where no neighbour
                    # exists. This also covers a single-step series, which
                    # previously hit a bare ``except`` and then used an unbound
                    # or stale value.
                    gap_before=series_times[h]-series_times[h-1] if h>0 else 0.0
                    gap_after=series_times[h+1]-series_times[h] if h<length-1 else 0.0
                    one_imputed_deltapre.append([gap_before]*n_features)
                    one_imputed_deltasub.append([gap_after]*n_features)
                    one_G_m.append([1.0]*n_features)
                while len(one_imputed_deltapre)<self.maxLength:
                    one_imputed_deltapre.append([0.0]*n_features)
                    one_imputed_deltasub.append([0.0]*n_features)
                    one_G_m.append([0.0]*n_features)
                imputed_deltapre.append(one_imputed_deltapre)
                imputed_deltasub.append(one_imputed_deltasub)
                imputed_m.append(one_G_m)

            i+=1
            if self.isNormal:
                yield  x,y,[0.0]*n_features,m,deltaPre,x_lengths,lastvalues,files,imputed_deltapre,imputed_m,deltaSub,subvalues,imputed_deltasub
            else:
                yield x,y,mean,m,deltaPre,x_lengths,lastvalues,files,imputed_deltapre,imputed_m,deltaSub,subvalues,imputed_deltasub

    def shuffle(self,batchSize=10,isShuffle=False):
        """Set the batch size and optionally shuffle all per-series lists in unison.

        After a shuffle the per-series attributes are tuples rather than lists.
        """
        self.batchSize=batchSize
        if isShuffle:
            c = list(zip(self.x,self.y,self.m,self.deltaPre,self.x_lengths,self.lastvalues,self.fileNames,self.times,self.deltaSub,self.subvalues))
            random.shuffle(c)
            self.x,self.y,self.m,self.deltaPre,self.x_lengths,self.lastvalues,self.fileNames,self.times,self.deltaSub,self.subvalues=zip(*c)

if __name__ == '__main__':
    # Smoke test of the data pipeline: load patient 588 with normalisation,
    # use batches of 5 without shuffling, and iterate over every batch once.
    
    dt=ReadPhysionetData("data/588/",isNormal=True)
    dt.shuffle(5,False)
    batchCount=1
    X_lengths=dt.x_lengths
    # NOTE(review): `Time` is not used again in this cell.
    Time=dt.times[-144:-16]
    # average series length
    print(sum(X_lengths)/len(X_lengths))
    
    for x,y,mean,m,deltaPre,x_lengths,lastvalues,files,imputed_deltapre,imputed_m,deltaSub,subvalues,imputed_deltasub in dt.nextBatch():
        #print(x)
        batchCount+=1
        # print the file names of every 100th batch only
        if batchCount%100==0:
            print(files)

def f():
    """Print the marker string ``readData``."""
    marker = "readData"
    print(marker)

[205815.0]
[1720]
22
max_length is : 344
non_in_dic_count is : 0
344.0
33


In [None]:
"""main"""
def main():
    """Train the WGAN on the patient-588 data, then impute that same data set.

    All settings are hard-coded in an ``argparse.Namespace``; nothing is read
    from the command line.  For every (beta1, epoch, g_loss_lambda)
    combination a fresh graph and session are created, the model is built and
    trained, and ``imputation`` is run on the training data.
    """
    args = argparse.Namespace(
        gpus=None,
        batch_size=5,
        gen_length=96,
        impute_iter=400,
        pretrain_epoch=5,
        run_type='train',
        data_path="data/588/",
        model_path=None,
        result_path=None,
        dataset_name=None,
        g_loss_lambda=0.15,
        beta1=0.5,
        lr=0.0005,
        epoch=15,
        n_inputs=1,
        n_hidden_units=64,
        n_classes=2,
        z_dim=256,
        checkpoint_dir='588/checkpoint',
        result_dir='588/results',
        log_dir='588/logs',
        isNormal=1,
        isBatch_normal=1,
        isSlicing=1,
        disc_iters=8,
    )

    # Turn the 0/1 switches into real booleans; other values are left alone.
    for switch in ("isBatch_normal", "isNormal", "isSlicing"):
        raw = getattr(args, switch)
        if raw == 0:
            setattr(args, switch, False)
        if raw == 1:
            setattr(args, switch, True)

    # Single-point hyperparameter grid.
    beta1_grid = (0.5,)
    epoch_grid = (15,)
    lambda_grid = (0.15,)
    for beta1_value in beta1_grid:
        for epoch_value in epoch_grid:
            for lambda_value in lambda_grid:
                args.epoch = epoch_value
                args.beta1 = beta1_value
                args.g_loss_lambda = lambda_value

                tf.reset_default_graph()
                dt_train = ReadPhysionetData(os.path.join(args.data_path), isNormal=args.isNormal)

                tf.reset_default_graph()
                config = tf.ConfigProto()
                config.gpu_options.allow_growth = True
                with tf.Session(config=config) as sess:
                    gan = WGAN(sess, args=args, datasets=dt_train)

                    # build graph
                    gan.build_model()

                    gan.train()
                    print(" [*] Training finished!")

                    gan.imputation(dt_train, True)
                    print(" [*] Train dataset Imputation finished!")

                    # NOTE(review): no test-set imputation is run before this
                    # message is printed.
                    print(" [*] Test dataset Imputation finished!")
                tf.reset_default_graph()
# Run the full train-then-impute pipeline when this cell/script is executed directly.
if __name__ == '__main__':
    main()

# Evaluation

In [8]:
# read original missing data
def read_ori(filename):
    """Return the numeric second column of a comma-separated text file.

    Lines with fewer than two fields, or whose second field is not a valid
    float, are skipped.

    Args:
        filename: path of the file to read.

    Returns:
        list of floats, in file order.
    """
    parsed = []
    with open(filename, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(',')
            if len(fields) < 2:
                continue
            try:
                parsed.append(float(fields[1].strip()))
            except ValueError:
                # non-numeric entry (e.g. a header): ignore it
                continue
    return parsed

In [9]:
# read imputed data
def read_text(filename):
    """Parse an imputation result file into a list of series.

    The first line of the file is skipped.  A ``begin`` line opens a new
    series, ``end`` lines and blank lines are ignored, and every other line is
    read as a single float after stripping surrounding commas.

    Args:
        filename: path of the result file.

    Returns:
        list of lists of floats, one inner list per ``begin`` block.
    """
    series = []
    with open(filename, 'r') as handle:
        # skip the first (header) line
        next(handle)

        for raw_line in handle:
            token = raw_line.strip()
            if token == "begin":
                series.append([])
            elif token and token != "end":
                series[-1].append(float(token.strip(',')))

    return series

In [10]:
def normalization (data, parameters=None):
  '''Min-max scale a 2-D array using one global minimum and range.

  Args:
    - data: 2-D array of original data (NaNs are ignored when the statistics
      are computed)
    - parameters: optional dict with 'min_val' and 'max_val' to reuse; when
      None they are computed from data

  Returns:
    - norm_data: (data - min_val) / (max_val + 1e-6)
    - norm_parameters: dict with 'min_val' and 'max_val', where 'max_val' is
      the maximum after the minimum has been subtracted; the passed-in dict is
      returned unchanged when parameters is given
  '''

  # Unpacking enforces a 2-D input, as before.
  _, dim = data.shape

  if parameters is not None:
    min_val = parameters['min_val']
    max_val = parameters['max_val']
    shifted = data - min_val
    norm_parameters = parameters
  else:
    # MinMax normalization over the whole array
    min_val = np.nanmin(data)
    shifted = data - min_val
    max_val = np.nanmax(shifted)
    norm_parameters = {'min_val': min_val,
                       'max_val': max_val}

  # The small constant guards against division by zero.
  norm_data = shifted / (max_val + 1e-6)

  return norm_data, norm_parameters

In [11]:
def rmse_loss (ori_data, imputed_data, data_m):
  '''Root mean squared error over the missing entries only.

  Both inputs are min-max normalized first, using the statistics of ori_data.

  Args:
    - ori_data: original data without missing values
    - imputed_data: imputed data
    - data_m: indicator matrix (1 = observed, 0 = missing)

  Returns:
    - rmse: Root Mean Squared Error restricted to entries where data_m is 0
  '''

  ori_scaled, norm_parameters = normalization(ori_data)
  imputed_scaled, _ = normalization(imputed_data, norm_parameters)

  # Weight 1 on missing entries, 0 on observed ones.
  missing = 1-data_m
  squared_error = np.sum((missing * ori_scaled - missing * imputed_scaled)**2)
  n_missing = np.sum(missing)

  return np.sqrt(squared_error/float(n_missing))

# Patient 570

In [38]:
# Patient 570: the six per-segment input files (data with missing values).
filename1 = "data/570/5701.txt"
filename2 = "data/570/5702.txt"
filename3 = "data/570/5703.txt"
filename4 = "data/570/5704.txt"
filename5 = "data/570/5705.txt"
filename6 = "data/570/5706.txt"
# read_ori() returns the numeric second column of each file as a list of floats.
value1 = read_ori(filename1)
value2 = read_ori(filename2)
value3 = read_ori(filename3)
value4 = read_ori(filename4)
value5 = read_ori(filename5)
value6 = read_ori(filename6)

In [39]:
# original data with missing values, all segments concatenated in file order
glucose_miss_ori_570 = value1 + value2 + value3 + value4 + value5 + value6 
# mask list: 1 where a value is present (non-zero), 0 where it is missing (zero)
mask_list_570 = [1 if value != 0 else 0 for value in glucose_miss_ori_570]

In [40]:
# original data without missing values ('value' column of the complete CSV)
glucose_nomiss_ori_570 = pd.read_csv('data/570_12.8_1614.csv')['value'].tolist()

In [15]:
# Mean and standard deviation used below to undo the z-score normalisation.
# presumably copied from the "meanAndstd" file written by ReadPhysionetData for this patient - TODO confirm
mean_570 = 143.54089219330854
std_570 = 65.08417318014587

In [16]:
# read normalized imputed values (read_text returns one list of floats per series)
filename_570 = "570/imputation_train_results/WGAN_no_mask/45_8_6_256_0.0005_400_True_True_True_0.15_0.5/batch1x"
glucose_imputed_norm_570 = read_text(filename_570)
# imputed_norm -> original scale: x * std + mean
glucose_imputed_570 = np.array(glucose_imputed_norm_570)*std_570+mean_570

In [25]:
# combine imputed data into one list, in the same order as the original data
# Series are concatenated in the order 5, 3, 2, 0, 4, 1, each flattened to 269 values.
# presumably this order undoes the shuffle applied before imputation - TODO confirm against the saved batch
glucose_imputed_570_final = glucose_imputed_570[5].reshape(269,).tolist() + glucose_imputed_570[3].reshape(269,).tolist() + glucose_imputed_570[2].reshape(269,).tolist() + glucose_imputed_570[0].reshape(269,).tolist() + glucose_imputed_570[4].reshape(269,).tolist() + glucose_imputed_570[1].reshape(269,).tolist()

In [26]:
# Persist the re-ordered imputed series as a single-column CSV (no index column).
pd.DataFrame(glucose_imputed_570_final).to_csv('570/570_imputed_final.csv', index=False)

In [None]:
# normalize original + imputed data (imputed data reuses the original data's min/max)
# NOTE(review): rmse_loss() normalizes both of its inputs again, so these arrays end up normalized twice.
ori_data, norm_parameters = normalization(np.array(glucose_nomiss_ori_570).reshape(1614,1))
imputed_data, _ = normalization(np.array(glucose_imputed_570_final).reshape(1614,1), norm_parameters)

In [28]:
# RMSE of the GAN imputation, evaluated only where the mask is 0 (missing values).
rmse = rmse_loss(ori_data, imputed_data, np.array(mask_list_570).reshape(1614,1))

In [29]:
# Display the RMSE of the GAN imputation for patient 570.
rmse

0.3158442876852117

In [47]:
# Baseline: impute missing values with the mean, then compute the same RMSE.
df_570_miss = pd.read_csv('data/570_miss.csv')
# NOTE(review): the fill value repeats the literal assigned to mean_570 above.
df_570_miss['value'] = df_570_miss['value'].fillna(143.54089219330854)
ori_data, norm_parameters = normalization(np.array(glucose_nomiss_ori_570).reshape(1614,1))
imputed_data, _ = normalization(np.array(df_570_miss['value']).reshape(1614,1), norm_parameters)
rmse = rmse_loss(ori_data, imputed_data, np.array(mask_list_570).reshape(1614,1))
rmse

0.2847668037678422

# Patient 559

In [50]:
# read original data: the five per-segment input files of patient 559 (with missing values)
filename1 = "data/559/5591.txt"
filename2 = "data/559/5592.txt"
filename3 = "data/559/5593.txt"
filename4 = "data/559/5594.txt"
filename5 = "data/559/5595.txt"
# NOTE: filename1..5 and value1..5 are reused from the patient 570 cells and overwritten here.
value1 = read_ori(filename1)
value2 = read_ori(filename2)
value3 = read_ori(filename3)
value4 = read_ori(filename4)
value5 = read_ori(filename5)

In [51]:
# original data with missing values, all segments concatenated in file order
glucose_miss_ori_559 = value1 + value2 + value3 + value4 + value5
# mask list: 1 where a value is present (non-zero), 0 where it is missing (zero)
mask_list_559 = [1 if value != 0 else 0 for value in glucose_miss_ori_559]

In [52]:
# original data without missing values ('value' column of the complete CSV)
glucose_nomiss_ori_559 = pd.read_csv('data/559_12.27_1290.csv')['value'].tolist()

In [16]:
# Mean and standard deviation used below to undo the z-score normalisation.
# presumably copied from the "meanAndstd" file written by ReadPhysionetData for this patient - TODO confirm
mean_559 = 127.24573643410852
std_559 = 61.36112157884535

In [17]:
# read the (normalized) imputed data; read_text returns one list of floats per series
filename_559 = "559/imputation_train_results/WGAN_no_mask/40_8_5_256_0.0005_400_True_True_True_0.15_0.5/batch1x"
glucose_imputed_norm_559 = read_text(filename_559)
# imputed_norm -> original scale: x * std + mean
glucose_imputed_559 = np.array(glucose_imputed_norm_559)*std_559+mean_559

In [32]:
# Sanity check: number of imputed values after concatenation.
# NOTE(review): this cell comes before the cell that defines glucose_imputed_559_final,
# so a fresh top-to-bottom run raises NameError here.
len(glucose_imputed_559_final)

1290

In [29]:
# combine imputed data into one list, in the same order as the original data
# Series are concatenated in the order 0, 1, 2, 4, 3, each flattened to 258 values.
# presumably this order undoes the shuffle applied before imputation - TODO confirm against the saved batch
glucose_imputed_559_final = glucose_imputed_559[0].reshape(258,).tolist() + glucose_imputed_559[1].reshape(258,).tolist() + glucose_imputed_559[2].reshape(258,).tolist() + glucose_imputed_559[4].reshape(258,).tolist() + glucose_imputed_559[3].reshape(258,).tolist()

In [30]:
# Persist the re-ordered imputed series as a single-column CSV (no index column).
pd.DataFrame(glucose_imputed_559_final).to_csv('559/559_imputed_final.csv', index=False)

In [33]:
# normalize original + imputed data, then compute the RMSE over the missing positions
# NOTE(review): rmse_loss() normalizes both of its inputs again, so these arrays end up normalized twice.
ori_data, norm_parameters = normalization(np.array(glucose_nomiss_ori_559).reshape(1290,1))
imputed_data, _ = normalization(np.array(glucose_imputed_559_final).reshape(1290,1), norm_parameters)
rmse = rmse_loss(ori_data, imputed_data, np.array(mask_list_559).reshape(1290,1))
rmse

0.24311996810677758

In [54]:
# Baseline: impute missing values with the mean, then compute the same RMSE.
df_559_miss = pd.read_csv('data/559_miss.csv')
# NOTE(review): the fill value repeats the literal assigned to mean_559 above.
df_559_miss['value'] = df_559_miss['value'].fillna(127.24573643410852)
ori_data, norm_parameters = normalization(np.array(glucose_nomiss_ori_559).reshape(1290,1))
imputed_data, _ = normalization(np.array(df_559_miss['value']).reshape(1290,1), norm_parameters)
rmse = rmse_loss(ori_data, imputed_data, np.array(mask_list_559).reshape(1290,1))
rmse

0.28791818922349305

# Patient 563

In [55]:
# read original data: the five per-segment input files of patient 563 (with missing values)
filename1 = "data/563/5631.txt"
filename2 = "data/563/5632.txt"
filename3 = "data/563/5633.txt"
filename4 = "data/563/5634.txt"
filename5 = "data/563/5635.txt"
# NOTE: filename1..5 and value1..5 are reused from the earlier patient cells and overwritten here.
value1 = read_ori(filename1)
value2 = read_ori(filename2)
value3 = read_ori(filename3)
value4 = read_ori(filename4)
value5 = read_ori(filename5)

In [56]:
# Original data (still containing missing values), all five parts joined into one list
glucose_miss_ori_563 = (
    value1 + value2 + value3 + value4 + value5
)
# Mask: 1 where the entry is non-zero, 0 where it is zero
mask_list_563 = [int(value != 0) for value in glucose_miss_ori_563]

In [57]:
# Complete reference series for patient 563 (no missing values), as a plain list
glucose_nomiss_ori_563 = (
    pd.read_csv('data/563_10.15_1726.csv')
    ['value']
    .tolist()
)

In [16]:
# Patient 563 constants; later cells use them to map normalized
# imputed values back to the original scale (value * std + mean).
mean_563, std_563 = 119.84869565217392, 51.592521406907515

In [17]:
# Load the normalized imputation results written under WGAN_no_mask
filename_563 = "563/imputation_train_results/WGAN_no_mask/30_8_5_256_0.0005_400_True_True_True_0.15_0.5/batch1x"
glucose_imputed_norm_563 = read_text(filename_563)
# Map the normalized values back to the original scale: mean + std * value
glucose_imputed_563 = mean_563 + std_563 * np.array(glucose_imputed_norm_563)

In [24]:
# Stitch the five imputed batches into one flat list, in the same order as the original data.
# Each batch is flattened to 345 values; the batch order used is 3, 0, 2, 4, 1.
glucose_imputed_563_final = [
    reading
    for batch_idx in (3, 0, 2, 4, 1)
    for reading in glucose_imputed_563[batch_idx].reshape(345,).tolist()
]

In [25]:
# Persist the recombined imputed series as a single-column CSV (no index column)
pd.DataFrame(data=glucose_imputed_563_final).to_csv(
    path_or_buf='563/563_imputed_final.csv',
    index=False,
)

In [27]:
# Score the GAN imputation for patient 563.
# The reference series is normalized first; the parameters it returns are reused
# for the imputed series so both are on the same scale.
ori_data, norm_parameters = normalization(np.reshape(glucose_nomiss_ori_563, (1725, 1)))
imputed_data, _ = normalization(np.reshape(glucose_imputed_563_final, (1725, 1)), norm_parameters)
# rmse_loss also receives the observation mask as a column vector
rmse = rmse_loss(ori_data, imputed_data, np.reshape(mask_list_563, (1725, 1)))
rmse

0.2980753930515585

In [58]:
# Baseline: fill the missing readings of patient 563 with the mean value and score the result
df_563_miss = pd.read_csv('data/563_miss.csv')
# Reuse mean_563 (defined together with std_563 earlier in the notebook) instead of
# repeating the float literal, so the baseline cannot drift from the de-normalization constant.
df_563_miss['value'] = df_563_miss['value'].fillna(mean_563)
# Normalize the reference series, then apply the same parameters to the mean-imputed series
ori_data, norm_parameters = normalization(np.array(glucose_nomiss_ori_563).reshape(1725,1))
imputed_data, _ = normalization(np.array(df_563_miss['value']).reshape(1725,1), norm_parameters)
# rmse_loss also receives the observation mask as a column vector
rmse = rmse_loss(ori_data, imputed_data, np.array(mask_list_563).reshape(1725,1))
rmse

0.2879509508751353

# Patient 575

In [59]:
# Paths of the five original data files for patient 575
filename1, filename2, filename3, filename4, filename5 = (
    "data/575/5751.txt",
    "data/575/5752.txt",
    "data/575/5753.txt",
    "data/575/5754.txt",
    "data/575/5755.txt",
)
# Read each file, in order, with read_ori
value1, value2, value3, value4, value5 = [
    read_ori(path)
    for path in (filename1, filename2, filename3, filename4, filename5)
]

In [60]:
# Original data (still containing missing values), all five parts joined into one list
glucose_miss_ori_575 = (
    value1 + value2 + value3 + value4 + value5
)
# Mask: 1 where the entry is non-zero, 0 where it is zero
mask_list_575 = [int(value != 0) for value in glucose_miss_ori_575]
# Complete reference series (no missing values), as a plain list
glucose_nomiss_ori_575 = (
    pd.read_csv('data/575_1.1_731.csv')
    ['value']
    .tolist()
)

# Patient 575 constants; later cells use them to map normalized
# imputed values back to the original scale (value * std + mean).
mean_575, std_575 = 125.80684931506849, 73.12277605521172

In [61]:
# Load the normalized imputation results written under WGAN_no_mask
filename_575 = "575/imputation_train_results/WGAN_no_mask/30_8_5_256_0.0005_400_True_True_True_0.15_0.5/batch1x"
glucose_imputed_norm_575 = read_text(filename_575)
# Map the normalized values back to the original scale: mean + std * value
glucose_imputed_575 = mean_575 + std_575 * np.array(glucose_imputed_norm_575)

In [41]:
# Sanity check: inspect the shape of the de-normalized imputed array
# (the recorded output is (5, 146), which is why the next cell flattens each row to 146 values).
glucose_imputed_575.shape

(5, 146)

In [42]:
# Stitch the five imputed batches into one flat list, in the same order as the original data.
# Each batch is flattened to 146 values; the batch order used is 1, 3, 4, 0, 2.
glucose_imputed_575_final = [
    reading
    for batch_idx in (1, 3, 4, 0, 2)
    for reading in glucose_imputed_575[batch_idx].reshape(146,).tolist()
]

In [43]:
# Persist the recombined imputed series as a single-column CSV (no index column)
pd.DataFrame(data=glucose_imputed_575_final).to_csv(
    path_or_buf='575/575_imputed_final.csv',
    index=False,
)

In [44]:
# Score the GAN imputation for patient 575.
# The reference series is normalized first; the parameters it returns are reused
# for the imputed series so both are on the same scale.
ori_data, norm_parameters = normalization(np.reshape(glucose_nomiss_ori_575, (730, 1)))
imputed_data, _ = normalization(np.reshape(glucose_imputed_575_final, (730, 1)), norm_parameters)
# rmse_loss also receives the observation mask as a column vector
rmse = rmse_loss(ori_data, imputed_data, np.reshape(mask_list_575, (730, 1)))
rmse

0.23505804566877062

In [62]:
# Baseline: fill the missing readings of patient 575 with the mean value and score the result
df_575_miss = pd.read_csv('data/575_miss.csv')
# Reuse mean_575 (defined together with std_575 earlier in the notebook) instead of
# repeating the float literal, so the baseline cannot drift from the de-normalization constant.
df_575_miss['value'] = df_575_miss['value'].fillna(mean_575)
# Normalize the reference series, then apply the same parameters to the mean-imputed series
ori_data, norm_parameters = normalization(np.array(glucose_nomiss_ori_575).reshape(730,1))
imputed_data, _ = normalization(np.array(df_575_miss['value']).reshape(730,1), norm_parameters)
# rmse_loss also receives the observation mask as a column vector
rmse = rmse_loss(ori_data, imputed_data, np.array(mask_list_575).reshape(730,1))
rmse

0.23939629001445042

# Patient 588

In [63]:
# Paths of the five original data files for patient 588
filename1, filename2, filename3, filename4, filename5 = (
    "data/588/5881.txt",
    "data/588/5882.txt",
    "data/588/5883.txt",
    "data/588/5884.txt",
    "data/588/5885.txt",
)
# Read each file, in order, with read_ori
value1, value2, value3, value4, value5 = [
    read_ori(path)
    for path in (filename1, filename2, filename3, filename4, filename5)
]

In [64]:
# Original data (still containing missing values), all five parts joined into one list
glucose_miss_ori_588 = (
    value1 + value2 + value3 + value4 + value5
)
# Mask: 1 where the entry is non-zero, 0 where it is zero
mask_list_588 = [int(value != 0) for value in glucose_miss_ori_588]
# Complete reference series (no missing values), as a plain list
glucose_nomiss_ori_588 = (
    pd.read_csv('data/588_9.17_1720.csv')
    ['value']
    .tolist()
)

# Patient 588 constants; later cells use them to map normalized
# imputed values back to the original scale (value * std + mean).
mean_588, std_588 = 119.65988372093024, 44.89488026998585

In [14]:
# Load the normalized imputation results written under WGAN_no_mask
filename_588 = "588/imputation_train_results/WGAN_no_mask/15_8_5_256_0.0005_400_True_True_True_0.15_0.5/batch1x"
glucose_imputed_norm_588 = read_text(filename_588)
# Map the normalized values back to the original scale: mean + std * value
glucose_imputed_588 = mean_588 + std_588 * np.array(glucose_imputed_norm_588)

In [23]:
# Sanity check: number of values in the recombined imputed series.
# NOTE(review): glucose_imputed_588_final is assigned in the cell *below* this one,
# so on a fresh kernel (Restart & Run All) this cell raises NameError — run/move it after that cell.
len(glucose_imputed_588_final)

1720

In [21]:
# Stitch the five imputed batches into one flat list, in the same order as the original data.
# Each batch is flattened to 344 values; the batch order used is 2, 1, 0, 3, 4.
glucose_imputed_588_final = [
    reading
    for batch_idx in (2, 1, 0, 3, 4)
    for reading in glucose_imputed_588[batch_idx].reshape(344,).tolist()
]

In [22]:
# Persist the recombined imputed series as a single-column CSV (no index column)
pd.DataFrame(data=glucose_imputed_588_final).to_csv(
    path_or_buf='588/588_imputed_final.csv',
    index=False,
)

In [24]:
# Score the GAN imputation for patient 588.
# The reference series is normalized first; the parameters it returns are reused
# for the imputed series so both are on the same scale.
ori_data, norm_parameters = normalization(np.reshape(glucose_nomiss_ori_588, (1720, 1)))
imputed_data, _ = normalization(np.reshape(glucose_imputed_588_final, (1720, 1)), norm_parameters)
# rmse_loss also receives the observation mask as a column vector
rmse = rmse_loss(ori_data, imputed_data, np.reshape(mask_list_588, (1720, 1)))
rmse

0.23615638925618718

In [65]:
# Baseline: fill the missing readings of patient 588 with the mean value and score the result
df_588_miss = pd.read_csv('data/588_miss.csv')
# Reuse mean_588 (defined together with std_588 earlier in the notebook) instead of
# repeating the float literal, so the baseline cannot drift from the de-normalization constant.
df_588_miss['value'] = df_588_miss['value'].fillna(mean_588)
# Normalize the reference series, then apply the same parameters to the mean-imputed series
ori_data, norm_parameters = normalization(np.array(glucose_nomiss_ori_588).reshape(1720,1))
imputed_data, _ = normalization(np.array(df_588_miss['value']).reshape(1720,1), norm_parameters)
# rmse_loss also receives the observation mask as a column vector
rmse = rmse_loss(ori_data, imputed_data, np.array(mask_list_588).reshape(1720,1))
rmse

0.23905359198335752

# Patient 591

In [66]:
# Paths of the five original data files for patient 591
filename1, filename2, filename3, filename4, filename5 = (
    "data/591/5911.txt",
    "data/591/5912.txt",
    "data/591/5913.txt",
    "data/591/5914.txt",
    "data/591/5915.txt",
)
# Read each file, in order, with read_ori
value1, value2, value3, value4, value5 = [
    read_ori(path)
    for path in (filename1, filename2, filename3, filename4, filename5)
]

In [67]:
# Original data (still containing missing values), all five parts joined into one list
glucose_miss_ori_591 = (
    value1 + value2 + value3 + value4 + value5
)
# Mask: 1 where the entry is non-zero, 0 where it is zero
mask_list_591 = [int(value != 0) for value in glucose_miss_ori_591]
# Complete reference series (no missing values), as a plain list
glucose_nomiss_ori_591 = (
    pd.read_csv('data/591_1.10_1476.csv')
    ['value']
    .tolist()
)

# Patient 591 constants; later cells use them to map normalized
# imputed values back to the original scale (value * std + mean).
mean_591, std_591 = 113.08338983050848, 48.667230310724904

In [64]:
# Load the normalized imputation results written under WGAN_no_mask
filename_591 = "591/imputation_train_results/WGAN_no_mask/40_8_5_256_0.0005_400_True_True_True_0.15_0.5/batch1x"
glucose_imputed_norm_591 = read_text(filename_591)
# Map the normalized values back to the original scale: mean + std * value
glucose_imputed_591 = mean_591 + std_591 * np.array(glucose_imputed_norm_591)

In [73]:
# Sanity check: number of values in the recombined imputed series.
# NOTE(review): glucose_imputed_591_final is assigned in the cell *below* this one,
# so on a fresh kernel (Restart & Run All) this cell raises NameError — run/move it after that cell.
len(glucose_imputed_591_final)

1475

In [71]:
# Stitch the five imputed batches into one flat list, in the same order as the original data.
# Each batch is flattened to 295 values; the batch order used is 2, 1, 3, 0, 4.
glucose_imputed_591_final = [
    reading
    for batch_idx in (2, 1, 3, 0, 4)
    for reading in glucose_imputed_591[batch_idx].reshape(295,).tolist()
]

In [72]:
# Persist the recombined imputed series as a single-column CSV (no index column)
pd.DataFrame(data=glucose_imputed_591_final).to_csv(
    path_or_buf='591/591_imputed_final.csv',
    index=False,
)

In [74]:
# Score the GAN imputation for patient 591.
# The reference series is normalized first; the parameters it returns are reused
# for the imputed series so both are on the same scale.
ori_data, norm_parameters = normalization(np.reshape(glucose_nomiss_ori_591, (1475, 1)))
imputed_data, _ = normalization(np.reshape(glucose_imputed_591_final, (1475, 1)), norm_parameters)
# rmse_loss also receives the observation mask as a column vector
rmse = rmse_loss(ori_data, imputed_data, np.reshape(mask_list_591, (1475, 1)))
rmse

0.19407295757402662

In [68]:
# Baseline: fill the missing readings of patient 591 with the mean value and score the result
df_591_miss = pd.read_csv('data/591_miss.csv')
# Reuse mean_591 (defined together with std_591 earlier in the notebook) instead of
# repeating the float literal, so the baseline cannot drift from the de-normalization constant.
df_591_miss['value'] = df_591_miss['value'].fillna(mean_591)
# Normalize the reference series, then apply the same parameters to the mean-imputed series
ori_data, norm_parameters = normalization(np.array(glucose_nomiss_ori_591).reshape(1475,1))
imputed_data, _ = normalization(np.array(df_591_miss['value']).reshape(1475,1), norm_parameters)
# rmse_loss also receives the observation mask as a column vector
rmse = rmse_loss(ori_data, imputed_data, np.array(mask_list_591).reshape(1475,1))
rmse

0.20663888455632368