# Model Training
In this notebook, we will train our VAE model. This involves:

1. Encoder
2. Decoder
3. Full Model

## Importing Packages
We will be using Keras to build and train our VAE

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
%matplotlib notebook
import tensorflow as tf

import os
import time
import numpy as np
import glob
import matplotlib.pyplot as plt
import PIL
import imageio
import h5py

# import keras
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Dense, Lambda, Layer, Add, Multiply, Concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import model_from_json
import mdn

import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()

import tensorflow_probability as tfp

from IPython import display
import numpy as np
import pandas as pd
from sklearn import preprocessing
from pickle import dump, load

from matplotlib.ticker import FormatStrFormatter
from IPython.display import SVG

## Definition of Constants
Hyper parameters and constants will all be located here for ease of adjustments

In [0]:
continuous_dim = 6
original_dim = 6
intermediate_dim_1 = 100
intermediate_dim_2 = 100
latent_dim = 4
batch_size = 100
epochs = 50
epsilon_std = 1.0

## Constants for the Mixture layer
N_HIDDEN = 15  # number of hidden units in the Dense layer
N_MIXES = 10  # number of mixture components
OUTPUT_DIMS = 2  # number of real-values predicted by each mixture component

# Model File Paths
full_model_file = "vae_full_model.json"
full_model_weights = "vae_full_model.h5"
encoder_file = "vae_encoder.json"
encoder_weights = "vae_encoder.h5"
decoder_file = 'vae_decoder.json'
decoder_weights = 'vae_decoder.h5'
z_meta_file = 'z_meta.npy'

## Building the Model
We have to write our own custom layer and custom loss function as these are not supported on Keras natively. There are a few things to be done:

1. Custom KLDivergence Layer
2. Custom Loss Functions
3. Building the Model

### KL Divergence Layer
To ensure modularity, we decided to create a separate layer for KL Divergence. This layer will account for the loss required. 

In [0]:
class KLDivergenceLayer(Layer):

    """ Identity transform layer that adds KL divergence
    to the final model loss.
    """

    def __init__(self, *args, **kwargs):
        self.is_placeholder = True
        super(KLDivergenceLayer, self).__init__(*args, **kwargs)

    def call(self, inputs):

        mu, log_var = inputs

        kl_batch = - .5 * K.sum(1 + log_var -
                                K.square(mu) -
                                K.exp(log_var), axis=-1)

        self.add_loss(K.mean(kl_batch), inputs=inputs)

        return inputs

### Custom Loss Functions
As the yelp dataset contains binary, categorical as well as continuous data, we will build 3 custom loss functions.

#### Binary Loss Function

In [0]:
def binary_loss(y_true, y_pred):
	# input dimension is (batchsize, 1)
    return K.binary_crossentropy(y_true, y_pred) # the dimension of return value is (batchsize , 1)

#### Categorical Loss Function

In [0]:
def categorical_loss(y_true, y_pred):
	# input dimension is (batchsize, number of categories)
  return K.categorical_crossentropy(y_true, y_pred) # the dimension of return value is (batchsize , 1)

#### Continuous Loss Function

In [0]:
def log_normal_pdf(sample, mean, logvar, raxis=1):
  log2pi = tf.math.log(2. * np.pi)
  return tf.reduce_sum(
      -.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi),
      axis=raxis) # return a tensor of shape (batch_size, 1)

# Added a postfix because now we also have poisson as a distribution
def continuous_loss_gaussian(y_true, y_pred):
	# need to return log probability for continuous gaussian loss.
	# will get a (batchsize, 6 continuous variable input) where 3 of the 6 represents mu and the others logvar
	# y_true will be (batchsize, 3)
  mu, logvar = tf.split(y_pred, num_or_size_splits = 2, axis = 1)
  return -1 * log_normal_pdf(y_true, mu, logvar) 

def continuous_loss_poisson(y_true, y_pred):
	# need to return log probability for continuous poisson loss.
	# will get a (batchsize, 6 continuous variable input) where 3 of the 6 represents mu and the others logvar
	# y_true will be (batchsize, 3)
  return tf.nn.log_poisson_loss(y_true, y_pred)

### Model Architecture
- tanh/sigmoid is used because Relu resulted in loss going to infinity
- going to delete review_log_var and review_mu since we are trying out the Poission distribution which only takes 1 parameter.

In [0]:
# x = Input(shape=(original_dim,), name='input_x')
# h = Dense(intermediate_dim_1, activation='tanh', name='hidden_enc_1')(x)
# h = Dense(intermediate_dim_2, activation='tanh', name='hidden_enc_2')(h)

# z_mu = Dense(latent_dim, name='z_mu')(h)
# z_log_var = Dense(latent_dim, name='z_log_var')(h)

# z_mu, z_log_var = KLDivergenceLayer(name='KL_Divergence')([z_mu, z_log_var])
# z_sigma = Lambda(lambda t: K.exp(.5*t))(z_log_var)

# eps = Input(name='input_eps',tensor=K.random_normal(stddev=epsilon_std,
#                                    shape=(K.shape(x)[0], latent_dim)))
# z_eps = Multiply()([z_sigma, eps])
# z = Add()([z_mu, z_eps])

# decode_1 = Dense(intermediate_dim_2, activation='tanh', name='hidden_dec_2')
# h_dec = decode_1(z)

# decode_2 = Dense(intermediate_dim_1, activation='tanh', name='hidden_dec_1')
# h_dec = decode_2(h_dec)


# ## This outputs the lambda require for poisson distribution and thats all that we need
# x_pred_passenger_count_log_lambda_layer = Dense(1,name = 'x_pred_passenger_count_log_lambda_layer')
# x_pred_passenger_count = x_pred_passenger_count_log_lambda_layer(h_dec)

# ## This outputs the lambda require for poisson distribution and thats all that we need
# x_pred_travel_duration_log_lambda_layer = Dense(1,name = 'x_pred_travel_duration_log_lambda_layer')
# x_pred_travel_duration = x_pred_travel_duration_log_lambda_layer(h_dec)


# vae = Model(inputs=[x,eps], outputs=[x_pred_coordinates,x_pred_coordinates_2, x_pred_passenger_count, x_pred_travel_duration])

In [0]:
tfk = tf.keras
tfkl = tf.keras.layers
tfpl = tfp.layers
tfd = tfp.distributions


input_shape = (original_dim,)
prior = tfd.Independent(tfd.Normal(loc=tf.zeros(latent_dim), scale=1), reinterpreted_batch_ndims=1)

## Encoder
encoder = tfk.Sequential([
    tfkl.InputLayer(input_shape=input_shape, name = 'input_x'),
    tfkl.Dense(intermediate_dim_1, activation='tanh', name='hidden_enc_1'),
    tfkl.Dense(intermediate_dim_2, activation='tanh', name='hidden_enc_2'),
    tfkl.Dense(tfpl.IndependentNormal.params_size(latent_dim), activation=None, name = 'z_probability_distribution'),
])

sampler = tfk.Sequential([
    tfkl.InputLayer(input_shape=(tfpl.IndependentNormal.params_size(latent_dim), ), name = 'input_z_params'),
    tfpl.IndependentNormal( latent_dim, name = 'sample_layer'),
    tfpl.KLDivergenceAddLoss(prior),
])
decode_1 = tfkl.Dense(intermediate_dim_2, activation='tanh', name='hidden_dec_2')
h_dec = decode_1(sampler(encoder.outputs[0]))
print(h_dec)

decode_2 = tfkl.Dense(intermediate_dim_1, activation='tanh', name='hidden_dec_1')
h_dec = decode_2(h_dec)


x_pred_coordinates_mu_layer = Dense(2, name='x_pred_coordinates_mu')
x_pred_coordinates_mu = x_pred_coordinates_mu_layer(h_dec)

x_pred_coordinates_log_var_layer = Dense(2, name='x_pred_coordinates_log_var')
x_pred_coordinates_log_var = x_pred_coordinates_log_var_layer(h_dec)

x_pred_coordinates_layer = Concatenate(axis=-1, name = 'x_pred_coordinates')
x_pred_coordinates = x_pred_coordinates_layer([x_pred_coordinates_mu, x_pred_coordinates_log_var])

x_pred_coordinates_2_mu_layer = Dense(2, name='x_pred_coordinates_2_mu')
x_pred_coordinates_2_mu = x_pred_coordinates_2_mu_layer(h_dec)

x_pred_coordinates_2_log_var_layer = Dense(2, name='x_pred_coordinates_2_log_var')
x_pred_coordinates_2_log_var = x_pred_coordinates_2_log_var_layer(h_dec)

x_pred_coordinates_2_layer = Concatenate(axis=-1, name = 'x_pred_coordinates_2')
x_pred_coordinates_2 = x_pred_coordinates_2_layer([x_pred_coordinates_2_mu, x_pred_coordinates_2_log_var])


## This outputs the lambda require for poisson distribution and thats all that we need
x_pred_travel_duration_log_lambda_layer = tfkl.Dense(1,name = 'x_pred_travel_duration_log_lambda_layer')
x_pred_travel_duration = x_pred_travel_duration_log_lambda_layer(h_dec)


x_pred_passenger_count_log_lambda_layer = tfkl.Dense(1,name = 'x_pred_passenger_count_log_lambda_layer')
x_pred_passenger_count = x_pred_passenger_count_log_lambda_layer(h_dec)

vae = tfk.Model(inputs=encoder.inputs, outputs=[x_pred_coordinates,x_pred_coordinates_2, x_pred_passenger_count, x_pred_travel_duration])

Tensor("hidden_dec_2/Identity:0", shape=(None, 100), dtype=float32)


### Compiling Model and Setting Parameters

In [0]:
optimizer = tf.keras.optimizers.Adam(lr=0.0001)
vae.compile(optimizer=optimizer, loss=[continuous_loss_gaussian, continuous_loss_gaussian, continuous_loss_poisson, continuous_loss_poisson], loss_weights=[1, 1, 1, 1])

In [0]:
vae.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_x (InputLayer)            [(None, 6)]          0                                            
__________________________________________________________________________________________________
hidden_enc_1 (Dense)            (None, 100)          700         input_x[0][0]                    
__________________________________________________________________________________________________
hidden_enc_2 (Dense)            (None, 100)          10100       hidden_enc_1[0][0]               
__________________________________________________________________________________________________
z_probability_distribution (Den (None, 8)            808         hidden_enc_2[0][0]               
______________________________________________________________________________________________

## Data Loading and Preprocessing

In [0]:
import math
def lonlat2meters(lon, lat):
    semimajoraxis = 6378137.0
    east = lon * 0.017453292519943295
    north = lat * 0.017453292519943295
    t = math.sin(north)
    return semimajoraxis * east, 3189068.5 * math.log((1 + t) / (1 - t))

def meters2lonlat(x, y):
    semimajoraxis = 6378137.0
    lon = x / semimajoraxis / 0.017453292519943295
    t = math.exp(y / 3189068.5)
    lat = math.asin((t - 1) / (t + 1)) / 0.017453292519943295
    return lon, lat

In [0]:
dataset = np.genfromtxt('../processed_nyc_train.csv',delimiter=',', skip_header=1)
dataset = dataset[~np.isnan(dataset).any(axis=1)]

def format_data(dataset, pick_up_scaler=None, drop_off_scaler = None ,save_scaler=True):
    
    pick_up_c, drop_off_c, num_passenger, travel_duration = np.split(dataset, [2, 4, 5], axis = 1)
    
    # Handling of the coordinates
    for i, c in enumerate(pick_up_c):
        lon = pick_up_c[i][0]
        lat = pick_up_c[i][1]
        x, y = lonlat2meters(lon, lat)
        pick_up_c[i][0] = x
        pick_up_c[i][1] = y
    
    if pick_up_scaler is None:
        pick_up_scaler = preprocessing.StandardScaler()
        pick_up_scaler = pick_up_scaler.fit(pick_up_c)
    
    pick_up_c = pick_up_scaler.transform(pick_up_c)
    
    for i, c in enumerate(drop_off_c):
        lon = drop_off_c[i][0]
        lat = drop_off_c[i][1]
        x, y = lonlat2meters(lon, lat)
        drop_off_c[i][0] = x
        drop_off_c[i][1] = y
    
    if drop_off_scaler is None:
        drop_off_scaler = preprocessing.StandardScaler()
        drop_off_scaler = drop_off_scaler.fit(drop_off_c)
    
    drop_off_c = drop_off_scaler.transform(drop_off_c)
    
    
    if save_scaler:
        dump(pick_up_scaler, open('pick_up_scaler.pkl', 'wb'))
        dump(drop_off_scaler, open('drop_off_scaler.pkl', 'wb'))

    final = np.concatenate([pick_up_c, drop_off_c, num_passenger, travel_duration], axis = 1)
    return final, pick_up_c, drop_off_c, num_passenger, travel_duration

dataset, pick_up_c, drop_off_c, num_passenger, travel_duration = format_data(dataset)

## Training the Model

In [0]:
for i in range(50): # change to desired number of epochs
  print('epoch ' ,i)
  vae.load_weights('weights_2.h5')
  vae.fit(dataset , [pick_up_c, drop_off_c, num_passenger, travel_duration], shuffle = True, epochs = 1, batch_size = batch_size)
  vae.save_weights('weights_2.h5')

epoch  0
epoch  1
epoch  2
epoch  3
epoch  4
epoch  5
epoch  6
epoch  7
epoch  8
epoch  9
epoch  10
epoch  11
epoch  12
epoch  13
epoch  14
epoch  15
epoch  16
epoch  17
epoch  18
epoch  19
epoch  20
epoch  21
epoch  22
epoch  23
epoch  24
epoch  25
epoch  26
epoch  27
epoch  28
epoch  29
epoch  30
epoch  31
epoch  32
epoch  33
epoch  34
epoch  35
epoch  36
epoch  37
epoch  38
epoch  39
epoch  40
epoch  41
epoch  42
epoch  43
epoch  44
epoch  45
epoch  46
epoch  47
epoch  48
epoch  49


## Building the Decoder

In [0]:
decode_input = Input(shape=(latent_dim, ), name = 'decode_input')
decode_layer_1 = decode_1(decode_input)
decode_layer_2 = decode_2(decode_layer_1)

x_pred_coordinates_mu = x_pred_coordinates_mu_layer(decode_layer_2)
x_pred_coordinates_log_var = x_pred_coordinates_log_var_layer(decode_layer_2)
decode_x_pred_coordinates = x_pred_coordinates_layer([x_pred_coordinates_mu, x_pred_coordinates_log_var])

x_pred_coordinates_2_mu = x_pred_coordinates_2_mu_layer(decode_layer_2)
x_pred_coordinates_2_log_var = x_pred_coordinates_2_log_var_layer(decode_layer_2)
decode_x_pred_coordinates_2 = x_pred_coordinates_2_layer([x_pred_coordinates_2_mu, x_pred_coordinates_2_log_var])

decode_x_pred_passenger_count = x_pred_passenger_count_log_lambda_layer(decode_layer_2)

decode_x_pred_travel_duration = x_pred_travel_duration_log_lambda_layer(decode_layer_2)


decoder = Model(decode_input, [decode_x_pred_coordinates, decode_x_pred_coordinates, decode_x_pred_passenger_count, decode_x_pred_travel_duration])

### Saving the Models and Metadata
#### Saving Models (Actually, only the decoder matter)

In [0]:
model_json = vae.to_json()
with open("./model/vae_full_model.json", "w") as json_file:
    json_file.write(model_json)
vae.save_weights("./model/vae_full_model.h5")
print("Saved model to disk")

encoder = Model(x, [z_mu, z_log_var])
model_json = encoder.to_json()
with open("./model/vae_encoder.json", "w") as json_file:
    json_file.write(model_json)
encoder.save_weights("./model/vae_encoder.h5")
print("Saved model to disk")

model_json = decoder.to_json()
with open("./model/vae_decoder.json", "w") as json_file:
    json_file.write(model_json)
decoder.save_weights("./model/vae_decoder.h5")
print("Saved model to disk")


NotImplementedError: ignored

#### Generating and Saving Metadata

In [0]:
vae_sample = dataset[np.random.choice(len(dataset), size=14000, replace=False)]
metadata = encoder.predict(vae_sample, batch_size=batch_size)
np.save('metadata.npy', metadata)

### Generating some Samples for Testing
#### Functions for data generation

In [0]:
# reverse_categorical_map = {0:0, 1:0.5, 2:1, 3:1.5, 4:2, 5:2.5, 6:3, 7:3.5, 8:4, 9:4.5, 10:5}
# def sample(model, input_mu, input_log_var, samples_per_z=1):
    # multiplied_input_mu = np.repeat(input_mu, samples_per_z, axis=0)
    # multiplied_input_log_var = np.repeat(input_log_var, samples_per_z, axis=0)
    # eps = np.random.normal(size=(multiplied_input_mu.shape[0], latent_dim))
    # z = reparameterize(multiplied_input_mu, multiplied_input_log_var, eps)
    # predictions = model.predict(z, batch_size = None, steps = 1)
    # return reconstruct(predictions)

reverse_categorical_map = {0:0, 1:0.5, 2:1, 3:1.5, 4:2, 5:2.5, 6:3, 7:3.5, 8:4, 9:4.5, 10:5}
def sample(decoder, sampler, input_z_params,samples_per_z=1):
    multiplied_input_z_params = np.repeat(input_z_params, samples_per_z, axis=0)
    z = sampler.predict(multiplied_input_z_params)
    predictions = decoder.predict(z, batch_size = None, steps = 1)
    return reconstruct(predictions)
    
def reconstruct(predictions):
    coordinates, coordinates_2, passenger_count, travel_duration= predictions
    mu, log_var = np.split(coordinates, indices_or_sections = 2,axis = 1)
    eps = np.random.normal(size=mu.shape)
    
    
    ## coordinates handled here
    coordinates_data = reparameterize(mu, log_var, eps)
    pickup_scaler = load(open('pick_up_scaler.pkl', 'rb'))
    coordinates_data = pickup_scaler.inverse_transform(coordinates_data)
    
    for i, c in enumerate(coordinates_data):
        lon = coordinates_data[i][0]
        lat = coordinates_data[i][1]
        x, y = meters2lonlat(lon, lat)
        coordinates_data[i][0] = x
        coordinates_data[i][1] = y

    for i, c in enumerate(coordinates_data):
        if c[0] > 180.0:
            coordinates_data[i][0]= 180.0
        if c[0] < -180.0:
            coordinates_data[i][0]= -180.0
        if c[1] > 180.0:
            coordinates_data[i][1]= 180.0
        if c[1] < -180.0:
            coordinates_data[i][1]= -180.0
    
    ## coordinates handled here
    mu, log_var = np.split(coordinates_2, indices_or_sections = 2,axis = 1)
    eps = np.random.normal(size=mu.shape)


    coordinates_2_data = reparameterize(mu, log_var, eps)
    dropoff_scaler = load(open('drop_off_scaler.pkl', 'rb'))
    coordinates_2_data = dropoff_scaler.inverse_transform(coordinates_2_data)
    
    for i, c in enumerate(coordinates_2_data):
        lon = coordinates_2_data[i][0]
        lat = coordinates_2_data[i][1]
        x, y = meters2lonlat(lon, lat)
        coordinates_2_data[i][0] = x
        coordinates_2_data[i][1] = y
    
    for i, c in enumerate(coordinates_2_data):
        if c[0] > 180.0:
            coordinates_2_data[i][0]= 180.0
        if c[0] < -180.0:
            coordinates_2_data[i][0]= -180.0
        if c[1] > 180.0:
            coordinates_2_data[i][1]= 180.0
        if c[1] < -180.0:
            coordinates_2_data[i][1]= -180.0
    
    ## passenger_count handled here
    exp_log_passenger_count = np.exp(passenger_count)
    passenger_count_data = np.random.poisson(lam=exp_log_passenger_count, size = passenger_count.shape)
    for i, r in enumerate(passenger_count_data):
        if r[0] < 0:
            passenger_count_data[i][0] = 0
        passenger_count_data[i][0] = float(int(passenger_count_data[i][0]))

    ## review_count handled here
    exp_log_travel_duration = np.exp(travel_duration)
    travel_duration_data = np.random.poisson(lam=exp_log_travel_duration, size = travel_duration.shape)
    for i, r in enumerate(travel_duration_data):
        if r[0] < 0:
            travel_duration_data[i][0] = 0
        travel_duration_data[i][0] = float(int(travel_duration_data[i][0]))
        # print(travel_duration_data[i][0] )
    
    return np.concatenate([coordinates_data, coordinates_2_data, passenger_count_data, travel_duration_data], axis = 1)
    

def reparameterize(input_mu, input_log_var, eps):
    sigma = np.exp(0.5*input_log_var)
    return eps*sigma + input_mu

### Generating Samples
Make use of the funciton sample to generate samples with our model. U need to supply an array of mu and their respective log var in a separate array to do that. 

In [0]:
vae_samples = sample(decoder, sampler, metadata, 5)

# # Saving the samples in a separate file
file_name = 'nyc_vae_' + str(len(metadata))+'_times_'+ str(len(vae_samples)//len(metadata)) + '.csv'
np.savetxt(file_name, vae_samples, delimiter = ',', header='pickup_longitude,pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count,trip_duration')
print(file_name)

nyc_vae_14000_times_5.csv


In [0]:
vae_samples[50:60]

array([[-7.39849722e+01,  4.07893836e+01, -7.39772578e+01,
         4.07514487e+01,  3.00000000e+00,  4.23000000e+02],
       [-7.39851313e+01,  4.07656644e+01, -7.39737269e+01,
         4.07492343e+01,  2.00000000e+00,  4.34000000e+02],
       [-7.40223500e+01,  4.07647697e+01, -7.39303982e+01,
         4.08002151e+01,  3.00000000e+00,  6.04000000e+02],
       [-7.39594563e+01,  4.07525196e+01, -7.39283271e+01,
         4.07627181e+01,  0.00000000e+00,  9.32000000e+02],
       [-7.39431459e+01,  4.07113147e+01, -7.40231765e+01,
         4.07440255e+01,  0.00000000e+00,  6.15000000e+02],
       [-7.39852891e+01,  4.07081859e+01, -7.39571242e+01,
         4.07184679e+01,  4.00000000e+00,  3.05000000e+02],
       [-7.39460589e+01,  4.07609297e+01, -7.39663621e+01,
         4.07680510e+01,  1.00000000e+00,  9.75000000e+02],
       [-7.39961156e+01,  4.07395104e+01, -7.38964835e+01,
         4.07732447e+01,  1.00000000e+00,  1.29700000e+03],
       [-7.40030389e+01,  4.07377161e+01, -7.396