In [1]:
import os

import numpy as np
import pandas as pd

In [None]:
# import umap
# import umap.plot

In [2]:
import pickle

In [3]:
from sklearn.decomposition import LatentDirichletAllocation

In [4]:
from sklearn.mixture import GaussianMixture

In [5]:
from sklearn.metrics import silhouette_score

In [6]:
import matplotlib.pyplot as plt


In [7]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [8]:
class Sampling(layers.Layer):
    """Reparameterisation layer: draws z = mean + exp(0.5 * log_var) * noise."""

    def call(self, inputs):
        """Sample a latent vector for every row of the batch.

        Args:
            inputs: a pair ``(z_mean, z_log_var)`` of 2-D tensors; the noise
                tensor is shaped from the first two dimensions of ``z_mean``.

        Returns:
            ``z_mean + exp(0.5 * z_log_var) * noise`` where ``noise`` is drawn
            from a standard normal distribution.
        """
        mean, log_var = inputs
        mean_shape = tf.shape(mean)
        n_rows, n_cols = mean_shape[0], mean_shape[1]
        noise = tf.keras.backend.random_normal(shape=(n_rows, n_cols))

        # exp(0.5 * log-variance) is the standard deviation
        std = tf.exp(0.5 * log_var)
        return mean + std * noise

class VAE(keras.Model):
    """Variational autoencoder with a custom training step.

    The first two input columns are reconstructed with mean squared error,
    all remaining columns with binary cross-entropy, and a KL term is added
    on the latent distribution produced by the encoder.

    Args:
        encoder: model mapping inputs to ``(z_mean, z_log_var, z)``.
        decoder: model mapping ``z`` back to a tensor with the input's columns.
        **kwargs: forwarded to ``keras.Model``.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Running means of each loss component, reported during fit().
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_MSE_tracker = keras.metrics.Mean(
            name="reconstruction_loss_MSE"
        )
        self.reconstruction_loss_binary_tracker = keras.metrics.Mean(
            name="reconstruction_loss_binary"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """Trackers owned by this model: total, MSE, binary and KL loss."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_MSE_tracker,
            self.reconstruction_loss_binary_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        """Run one optimisation step and return the running loss averages.

        Args:
            data: the batch handed over by ``fit``; either the feature tensor
                itself or a tuple/list whose first element is the features.

        Returns:
            dict with keys ``loss``, ``reconstruction_loss_MSE``,
            ``reconstruction_loss_binary`` and ``kl_loss``.
        """
        # fit() may deliver (x,), (x, y) or (x, y, sample_weight); only the
        # features are used here, and slicing a tuple below would fail.
        if isinstance(data, (tuple, list)):
            data = data[0]

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)

            # The loss objects already reduce to a scalar (mean over the
            # selected columns and the batch), so no extra reduction is needed.
            reconstruction_loss_MSE = keras.losses.MeanSquaredError()(
                data[:, 0:2], reconstruction[:, 0:2]
            )
            reconstruction_loss_binary = keras.losses.BinaryCrossentropy()(
                data[:, 2:], reconstruction[:, 2:]
            )

            # Closed-form KL term per latent dimension, summed over the latent
            # axis and averaged over the batch.
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))

            # Unweighted sum; a weighted variant tried previously was
            # 0.2 * MSE + 0.4 * binary + 0.4 * KL.
            total_loss = reconstruction_loss_MSE + reconstruction_loss_binary + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_MSE_tracker.update_state(reconstruction_loss_MSE)
        self.reconstruction_loss_binary_tracker.update_state(reconstruction_loss_binary)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss_MSE": self.reconstruction_loss_MSE_tracker.result(),
            "reconstruction_loss_binary": self.reconstruction_loss_binary_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

In [9]:
def get_user_history(df):
    """Add a ``user_history`` column counting an avatar's earlier requests.

    Every distinct ``(order_requests, avatar_id)`` pair is ranked within its
    ``avatar_id`` by ascending ``order_requests`` (0 for the first request,
    1 for the second, ...), and that rank is joined back onto every row of
    ``df`` carrying the same pair.

    Args:
        df: frame with at least the columns ``order_requests`` and
            ``avatar_id``. It is not modified.

    Returns:
        A new DataFrame with all columns of ``df`` plus ``user_history``.
        Rows whose key columns are missing are dropped by the inner join.
    """
    key_cols = ['order_requests', 'avatar_id']

    # One row per distinct key pair, in sorted key order. This replaces a
    # groupby loop that grew a frame with DataFrame.append (quadratic, and
    # removed in pandas 2.0). dropna() mirrors groupby's default of skipping
    # missing keys.
    pairs = (
        df[key_cols]
        .dropna()
        .drop_duplicates()
        .sort_values(key_cols)
    )
    pairs = pairs.assign(user_history=pairs.groupby('avatar_id').cumcount())

    return pd.merge(df, pairs, how='inner', on=key_cols)

In [12]:
def map_hotel_group(group):
    """Return the normalised label for a raw hotel group name.

    Args:
        group: one of the six raw group names listed in the table below.

    Returns:
        The corresponding normalised label.

    Raises:
        KeyError: if ``group`` is not in the table.
    """
    raw_to_label = dict([
        ('Boss Western', 'Boss_Western_Group'),
        ('Accar Hotels', 'Accar_Hotels'),
        ('Independant', 'Independant_Group'),
        ('Yin Yang', 'Yin_Yang'),
        ('Chillton Worldwide', 'Chillton_Worldwide'),
        ('Morriott International', 'Morriott_International'),
    ])
    label = raw_to_label[group]
    return label

def map_hotel_brand(brand):
    """Return the normalised label for a raw hotel brand name.

    Args:
        brand: one of the sixteen raw brand names listed in the table below.

    Returns:
        The corresponding normalised label.

    Raises:
        KeyError: if ``brand`` is not in the table.
    """
    raw_to_label = {
        'J.Halliday Inn': 'J_Halliday_Inn',
        'Marcure': 'Marcure',
        'Independant': 'Independant_Brand',
        'Ibas': 'Ibas',
        'Safitel': 'Safitel',
        '8 Premium': '8_Premium',
        'Tripletree': 'Tripletree',
        'CourtYord': 'CourtYord',
        'Royal Lotus': 'Royal_Lotus',
        'Boss Western': 'Boss_Western_Brand',
        'Corlton': 'Corlton',
        'Navatel': 'Navatel',
        'Ardisson': 'Ardisson',
        'Morriot': 'Morriot',
        'Chill Garden Inn': 'Chill_Garden_Inn',
        'Quadrupletree': 'Quadrupletree',
    }
    label = raw_to_label[brand]
    return label

def _attach_hotel_features(requests, hotels):
    """Join hotel attributes onto ``requests`` and normalise brand/group labels."""
    merged = pd.merge(requests, hotels, how='inner', on='hotel_id')

    # Both sides carry a `city` column, so the merge suffixes them; keep the
    # request-side value under the plain name.
    merged = merged.drop(columns='city_y').rename(columns={'city_x': 'city'})

    # Series.map relabels the single column directly instead of building a
    # row object for every record via DataFrame.apply(axis=1).
    merged['brand'] = merged['brand'].map(map_hotel_brand)
    merged['group'] = merged['group'].map(map_hotel_group)
    return merged


def load_full_feature_set():
    """Load the raw CSV files and build the training and test feature frames.

    Reads ``all_queries.csv``, ``all_prices.csv``, ``features_hotels.csv`` and
    ``test_set.csv`` from the working directory.

    Returns:
        ``(X_train, X_test)``. ``X_train`` has the request, hotel and ``price``
        columns in the order selected below; ``X_test`` has the same feature
        columns preceded by ``index`` and without ``price``.

    Raises:
        KeyError: if a brand or group value is not covered by
            ``map_hotel_brand`` / ``map_hotel_group``, or an expected column
            is missing.
    """
    # load data
    rename_query_id = {'queryId': 'order_requests'}
    queries = pd.read_csv('all_queries.csv').rename(columns=rename_query_id)
    prices = pd.read_csv('all_prices.csv').rename(columns=rename_query_id)
    hotels = pd.read_csv('features_hotels.csv')
    test = pd.read_csv('test_set.csv')

    # Optional, currently disabled: de-duplicate queries on
    # ['language', 'city', 'date', 'mobile'] and add a 'user_history' feature
    # through get_user_history() for both queries and test.

    ### X_TRAIN ###
    # merge queries, prices and hotel_features
    X_train = pd.merge(queries, prices, how='inner', on='order_requests')
    X_train = _attach_hotel_features(X_train, hotels)
    X_train = X_train.drop(columns=['avatar_name'])

    # feature ordering to match test set
    X_train = X_train[['order_requests', 'avatar_id', 'city', 'language', 'date', 'mobile',
                       # 'user_history',
                       'stock', 'group', 'brand', 'parking', 'pool', 'hotel_id',
                       'children_policy', 'price']]

    ### X_TEST ###
    # merge test_set with hotel_features
    X_test = _attach_hotel_features(test, hotels)
    X_test = X_test[['index', 'order_requests', 'avatar_id', 'city', 'language', 'date', 'mobile',
                     # 'user_history',
                     'stock', 'group', 'brand', 'parking', 'pool', 'hotel_id',
                     'children_policy']]

    return X_train, X_test

In [13]:
# Load both frames and key every row by (request, avatar, hotel).
X_train, X_test = (
    frame.set_index(['order_requests', 'avatar_id', 'hotel_id'])
    for frame in load_full_feature_set()
)

# Separate the regression target from the training features and the
# submission row ids from the test features.
y_train = X_train.pop('price')
test_idxs = X_test.pop('index')

In [14]:
# Columns that are expanded into one indicator column per observed value.
categories = ['city', 'language', 'mobile', 'group', 'brand', 'parking', 'pool', 'children_policy']

X_train = pd.get_dummies(X_train, columns=categories)
X_test = pd.get_dummies(X_test, columns=categories)

# The two frames are encoded independently, so a category value present in
# only one of them would give the frames different columns. Align the test
# frame to the training layout: missing indicators are filled with 0 and
# indicators unknown to the training data are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [15]:
# Work on an independent copy of the full encoded training frame.
# Alternative for faster experiments (fixed 200000-row subsample):
# X = X_train.sample(200000, random_state=0)
X = X_train.copy()

In [None]:
# Keep the row index of X so frames derived from it can reuse the same labels.
X_index = X.index

In [16]:
input_dim = X.shape[1]
latent_dim = 10


def _hidden(units):
    """Dense layer with the hidden-layer settings shared by encoder and decoder."""
    return layers.Dense(
        units,
        activation=tf.nn.leaky_relu,
        kernel_initializer='glorot_normal',
        bias_initializer='zeros',
    )


######################################################
# Encoder Architecture
######################################################
# `shape` must be a tuple (as for the decoder input below); a bare int is
# rejected by newer Keras versions.
encoder_inputs = keras.Input(shape=(input_dim,))
x = _hidden(64)(encoder_inputs)
x = _hidden(32)(x)
x = _hidden(16)(x)
x = _hidden(12)(x)
######################################################
# Sampling Layer
######################################################
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])

encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
encoder.summary()

######################################################
# Decoder Architecture
######################################################
latent_inputs = keras.Input(shape=(latent_dim,))
x = _hidden(12)(latent_inputs)
x = _hidden(16)(x)
x = _hidden(32)(x)
x = _hidden(64)(x)

# Two output heads: the first 2 hidden units feed a ReLU head for the first
# two output columns, the remaining hidden units feed a sigmoid head for the
# other input_dim - 2 columns.
x1 = x[:, 0:2]
x2 = x[:, 2:]
x1 = layers.Dense(2, activation='relu')(x1)
x2 = layers.Dense(input_dim - 2, activation='sigmoid')(x2)

# Join the heads with a Keras layer rather than a raw tf op so the functional
# graph is built from layers only.
decoder_outputs = layers.Concatenate(axis=1)([x1, x2])
decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
decoder.summary()

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 69)]         0           []                               
                                                                                                  
 dense (Dense)                  (None, 64)           4480        ['input_1[0][0]']                
                                                                                                  
 dense_1 (Dense)                (None, 32)           2080        ['dense[0][0]']                  
                                                                                                  
 dense_2 (Dense)                (None, 16)           528         ['dense_1[0][0]']                
                                                                                            

In [17]:
vae = VAE(encoder, decoder)
# Alternative optimiser setting tried earlier (smaller step, gradient clipping):
# vae.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001, clipnorm=1.0))
vae.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001))

# Convert the frame to one float32 matrix up front. A DataFrame mixing
# integer columns with boolean indicator columns (the get_dummies default in
# pandas >= 2.0) converts to an object array, which TensorFlow cannot ingest.
X_values = X.to_numpy(dtype=np.float32)

vae.fit(X_values, epochs=40, batch_size=64)

# Encode every row: per-dimension mean, log-variance and one sampled code.
Z_mean, Z_log_var, Z = vae.encoder.predict(X_values)

print("latent variables are ready to used")

Epoch 1/40
Epoch 2/40
Epoch 3/40
 374/9346 [>.............................] - ETA: 36s - loss: 6.1314 - reconstruction_loss_MSE: 1.2124 - reconstruction_loss_binary: 0.2486 - kl_loss: 4.5880

KeyboardInterrupt: 

In [None]:
# Display the sampled latent codes returned by the encoder.
Z

array([[ 1.1627884 ,  0.36382967, -1.9693272 , ...,  0.51040834,
        -1.8198822 , -0.18090591],
       [-0.28934494, -0.94989425,  2.0511403 , ...,  0.62608874,
         0.23161753,  0.82270646],
       [ 1.8215923 , -0.15188862, -0.2334091 , ...,  1.3688658 ,
         1.577463  , -0.85774934],
       ...,
       [-0.2635221 ,  0.2726285 ,  2.450029  , ..., -1.2220204 ,
        -0.00587447,  0.44881904],
       [-0.48593444,  2.136825  ,  2.4633138 , ...,  1.555804  ,
         0.10123453, -0.8165832 ],
       [ 0.1999354 , -0.02391236,  2.4473205 , ..., -0.5206073 ,
         0.01083857,  0.9335608 ]], dtype=float32)

In [None]:
# Dimensions of the latent-code array.
Z.shape

(598114, 10)

In [None]:
# Display the latent means (first encoder output).
Z_mean

array([[ 2.3137587e-03,  1.0750362e-03, -1.2315954e+00, ...,
         3.9957291e-03, -4.7166005e-04,  3.7779240e-03],
       [ 1.3884320e-04, -6.3873366e-03,  2.0385087e+00, ...,
         1.3800473e-03, -7.3623345e-03, -9.0158265e-03],
       [-3.3529769e-03, -3.1204445e-03, -2.1445733e-01, ...,
         2.0309589e-03,  4.1578157e-04,  1.3662793e-03],
       ...,
       [-1.7632721e-03, -5.5003604e-03,  2.4500885e+00, ...,
        -6.9552334e-04, -1.0520962e-02, -1.0599390e-02],
       [-1.7310954e-03, -5.5810427e-03,  2.4501286e+00, ...,
        -6.3820928e-04, -1.0475635e-02, -1.0613852e-02],
       [-1.8533729e-03, -5.3540058e-03,  2.4495959e+00, ...,
        -8.8807894e-04, -1.0659224e-02, -1.0468761e-02]], dtype=float32)

In [None]:
# Display the latent log-variances (second encoder output).
Z_log_var

array([[ 4.3404463e-04,  3.4793017e-03, -2.6241369e+00, ...,
        -2.1427544e-04,  5.3302320e-03,  9.2198083e-04],
       [-7.6318458e-03, -1.7413255e-02, -8.9611855e+00, ...,
        -1.1936810e-02, -2.1525843e-02,  1.4607872e-02],
       [-3.2425406e-03, -2.3295602e-03, -5.7142162e+00, ...,
        -2.7391429e-03, -1.4571417e-02,  6.7619737e-03],
       ...,
       [-9.8784845e-03, -1.7779309e-02, -8.9979858e+00, ...,
        -1.5130653e-02, -2.2583274e-02,  1.6808735e-02],
       [-9.8624537e-03, -1.7834419e-02, -9.0011520e+00, ...,
        -1.5099277e-02, -2.2615775e-02,  1.6815709e-02],
       [-9.9891908e-03, -1.7701836e-02, -8.9962368e+00, ...,
        -1.5264818e-02, -2.2556579e-02,  1.6750272e-02]], dtype=float32)

In [None]:
# np.save does not create missing directories, so make sure the output
# folder exists before writing the sampled latent codes.
os.makedirs('10d', exist_ok=True)
np.save('10d/Z_10d.npy', Z)

In [None]:
# Persist the latent means as a NumPy binary file.
np.save('10d/Z_mean_10d.npy', Z_mean)

In [None]:
# Persist the latent log-variances as a NumPy binary file.
np.save('10d/Z_log_var_10d.npy', Z_log_var)

In [None]:
# Write the frame the model was fitted on (index included) next to the latent outputs.
X.to_csv('10d/original_data.csv')

In [None]:
# Wrap the latent codes in a frame labelled with the saved row index of X.
Z_df = pd.DataFrame(data=Z, index=X_index)

In [None]:
# Write the indexed latent codes to CSV.
Z_df.to_csv('10d/Z_df.csv')