# Wide and Deep Recommender System - Expedia Hotel dataset

## Import Libraries

In [137]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf
import warnings

import sys
#sys.path.append('/Users/yas/Downloads/github/recommender_system')

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Dense

from deepctr.inputs import build_input_features, get_linear_logit, input_from_feature_columns, combined_dnn_input
from deepctr.layers.core import PredictionLayer, DNN
from deepctr.layers.utils import add_func
from deepctr.models import WDL


from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from deepctr.inputs import SparseFeat,get_feature_names

In [138]:
# Load only the first ROW_LIMIT rows of the training file; the trailing
# expression displays the resulting (rows, columns) shape.
# NOTE(review): the path is absolute and machine-specific -- adjust it before
# running the notebook on another machine.
TRAIN_CSV_PATH = '/Users/yas/Downloads/github/recommender_system/data/hotel_data/train.csv'
ROW_LIMIT = 100000
df = pd.read_csv(TRAIN_CSV_PATH, sep=',', nrows=ROW_LIMIT)
df.shape

(100000, 24)

### Read the training data

In [139]:
# Show the first two rows as a quick sanity check of the loaded columns.
df.head(2)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,1,1,2,50,628,1


In [140]:
# Print each column's non-null count and dtype for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
date_time                    100000 non-null object
site_name                    100000 non-null int64
posa_continent               100000 non-null int64
user_location_country        100000 non-null int64
user_location_region         100000 non-null int64
user_location_city           100000 non-null int64
orig_destination_distance    63078 non-null float64
user_id                      100000 non-null int64
is_mobile                    100000 non-null int64
is_package                   100000 non-null int64
channel                      100000 non-null int64
srch_ci                      99929 non-null object
srch_co                      99929 non-null object
srch_adults_cnt              100000 non-null int64
srch_children_cnt            100000 non-null int64
srch_rm_cnt                  100000 non-null int64
srch_destination_id          100000 non-null int64
srch_destination_type

### Drop `orig_destination_distance` and rows with null values

In [141]:
# Remove the distance column; it has by far the most missing values
# (63078 non-null out of 100000 rows in the info output above).
df = df.drop(columns=['orig_destination_distance'])

In [142]:
# Discard every remaining row that has a missing value in any column.
df = df.dropna()

In [143]:
# Remove, in a single call, the timestamp, the check-in/check-out dates and
# the destination id.
unused_columns = ['date_time', 'srch_ci', 'srch_co', 'srch_destination_id']
df = df.drop(columns=unused_columns)

In [144]:
# Information dictionary: maps each raw column name of the dataset to a
# human-readable description of that feature.
info_dic = {
    'date_time': 'Timestamp',
    'site_name': 'ID of the Expedia point of sale (i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, …)',
    'posa_continent': 'ID of continent associated with site_name',
    'user_location_country': 'The ID of the country the customer is located',
    'user_location_region': 'The ID of the region the customer is located',
    'user_location_city': 'The ID of the city the customer is located',
    'orig_destination_distance': 'Physical distance between a hotel and a customer at the time of search. A null means the distance could not be calculated',
    'user_id': 'ID of user',
    'is_mobile': '1 when a user connected from a mobile device, 0 otherwise',
    'is_package': '1 if the click/booking was generated as a part of a package (i.e. combined with a flight), 0 otherwise',
    'channel': 'ID of a marketing channel',
    'srch_ci': 'Checkin date',
    'srch_co': 'Checkout date',
    'srch_adults_cnt': 'The number of adults specified in the hotel room',
    'srch_children_cnt': 'The number of (extra occupancy) children specified in the hotel room',
    'srch_rm_cnt': 'The number of hotel rooms specified in the search',
    'srch_destination_id': 'ID of the destination where the hotel search was performed',
    'srch_destination_type_id': 'Type of destination',
    'hotel_continent': 'Hotel continent',
    'hotel_country': 'Hotel country',
    'hotel_market': 'Hotel market',
    'is_booking': '1 if a booking, 0 if a click',
    'cnt': 'Numer of similar events in the context of the same user session',
    'hotel_cluster': 'ID of a hotel cluster',
}

In [145]:
# Switch to recommender-style names: the hotel cluster becomes the item and
# the booking flag becomes the rating.
column_aliases = {'hotel_cluster': 'item_id', 'is_booking': 'rating'}
df = df.rename(columns=column_aliases)

In [146]:
# Re-check non-null counts and dtypes after the drops and the rename.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99929 entries, 0 to 99999
Data columns (total 19 columns):
site_name                   99929 non-null int64
posa_continent              99929 non-null int64
user_location_country       99929 non-null int64
user_location_region        99929 non-null int64
user_location_city          99929 non-null int64
user_id                     99929 non-null int64
is_mobile                   99929 non-null int64
is_package                  99929 non-null int64
channel                     99929 non-null int64
srch_adults_cnt             99929 non-null int64
srch_children_cnt           99929 non-null int64
srch_rm_cnt                 99929 non-null int64
srch_destination_type_id    99929 non-null int64
rating                      99929 non-null int64
cnt                         99929 non-null int64
hotel_continent             99929 non-null int64
hotel_country               99929 non-null int64
hotel_market                99929 non-null int64
item_id  

In [147]:
# Look up the description of one raw column in the information dictionary
# (this particular column was dropped from df in an earlier cell).
info_dic['srch_destination_id']

'ID of the destination where the hotel search was performed'

In [148]:
# Display the (rows, columns) shape of the cleaned frame.
df.shape

(99929, 19)

In [149]:
# Display the first rows of the cleaned, renamed frame.
df.head()

Unnamed: 0,site_name,posa_continent,user_location_country,user_location_region,user_location_city,user_id,is_mobile,is_package,channel,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,srch_destination_type_id,rating,cnt,hotel_continent,hotel_country,hotel_market,item_id
0,2,3,66,348,48862,12,0,1,9,2,0,1,1,0,3,2,50,628,1
1,2,3,66,348,48862,12,0,1,9,2,0,1,1,1,1,2,50,628,1
2,2,3,66,348,48862,12,0,0,9,2,0,1,1,0,1,2,50,628,1
3,2,3,66,442,35390,93,0,0,3,2,0,1,1,0,1,2,50,1457,80
4,2,3,66,442,35390,93,0,0,3,2,0,1,1,0,1,2,50,1457,21


In [150]:
# Disabled step: the code below is wrapped in a string literal, so it is never
# executed; the cell only evaluates to (and displays) that string.
# If it were enabled it would keep, for each (user_id, item_id) pair, only the
# rows carrying that pair's maximum rating, and then drop duplicate rows.
'''#Remove rows with the same user_id and item_id and different rating

max_rating = df.groupby(['user_id', 'item_id']).rating.transform(max)
df = df.loc[df.rating == max_rating]
df.drop_duplicates(keep='first',inplace=True)'''

"#Remove rows with the same user_id and item_id and different rating\n\nmax_rating = df.groupby(['user_id', 'item_id']).rating.transform(max)\ndf = df.loc[df.rating == max_rating]\ndf.drop_duplicates(keep='first',inplace=True)"

In [151]:
# All 19 columns left in the cleaned frame, in frame order.
# NOTE(review): despite the name, this list also holds ID-like columns and the
# 'rating' label, and it is not referenced by the later cells shown here --
# confirm whether it is still needed.
continuous_cols = (
    ['site_name', 'posa_continent']
    + ['user_location_country', 'user_location_region', 'user_location_city']
    + ['user_id', 'is_mobile', 'is_package', 'channel']
    + ['srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_type_id']
    + ['rating', 'cnt']
    + ['hotel_continent', 'hotel_country', 'hotel_market']
    + ['item_id']
)

In [152]:
# Label column, and the categorical columns passed to the model as sparse
# (embedding) features.
target = ['rating']
sparse_features = ['item_id', 'user_id', 'is_mobile', 'is_package']

In [153]:
# 1. Label-encode every sparse feature: each column is replaced in place by
#    the integer codes produced by a fresh LabelEncoder fitted on that column.
for column in sparse_features:
    df[column] = LabelEncoder().fit_transform(df[column])

In [154]:
# 2. Describe each sparse field to DeepCTR: its name, its number of distinct
#    values (vocabulary size) and a 4-dimensional embedding.
fixlen_feature_columns = []
for column in sparse_features:
    vocabulary_size = df[column].nunique()
    fixlen_feature_columns.append(
        SparseFeat(column, vocabulary_size, embedding_dim=4)
    )

# The wide (linear) and deep (DNN) parts share the very same list object.
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [155]:
# 3. Generate input data for the model.
# A fixed random_state makes the 80/20 split -- and therefore every metric
# computed from it -- reproducible across kernel restarts. Without it the
# split changes on every run. 1024 mirrors the seed passed to WDL.
SPLIT_SEED = 1024
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# DeepCTR expects a mapping from feature name to a 1-D array of values.
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

In [156]:
# 4. Define the model here; training, prediction and evaluation follow in the
#    next cells.
# Hyper-parameters of the Wide & Deep network, gathered in one place.
wdl_options = dict(
    dnn_hidden_units=(128, 128),
    l2_reg_linear=1e-5,
    l2_reg_embedding=1e-5,
    l2_reg_dnn=0,
    init_std=0.0001,
    seed=1024,
    dnn_dropout=0,
    dnn_activation='relu',
    task='binary',
)
model = WDL(linear_feature_columns, dnn_feature_columns, **wdl_options)

# NOTE(review): the model is built with task='binary' but optimised with an
# MSE loss -- confirm this is intended rather than binary cross-entropy.
model.compile(optimizer="adam", loss="mse", metrics=['mse'])

In [157]:
# Train for 10 epochs in mini-batches of 256, holding out 20% of the training
# rows for validation; `history` keeps the per-epoch losses and metrics.
train_labels = train[target].values
history = model.fit(
    x=train_model_input,
    y=train_labels,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

Train on 63954 samples, validate on 15989 samples
Epoch 1/10
63954/63954 - 2s - loss: 0.0868 - mean_squared_error: 0.0868 - val_loss: 0.0726 - val_mean_squared_error: 0.0724
Epoch 2/10
63954/63954 - 1s - loss: 0.0704 - mean_squared_error: 0.0702 - val_loss: 0.0731 - val_mean_squared_error: 0.0728
Epoch 3/10
63954/63954 - 1s - loss: 0.0693 - mean_squared_error: 0.0690 - val_loss: 0.0739 - val_mean_squared_error: 0.0735
Epoch 4/10
63954/63954 - 1s - loss: 0.0690 - mean_squared_error: 0.0685 - val_loss: 0.0744 - val_mean_squared_error: 0.0740
Epoch 5/10
63954/63954 - 1s - loss: 0.0687 - mean_squared_error: 0.0683 - val_loss: 0.0745 - val_mean_squared_error: 0.0741
Epoch 6/10
63954/63954 - 1s - loss: 0.0686 - mean_squared_error: 0.0681 - val_loss: 0.0744 - val_mean_squared_error: 0.0739
Epoch 7/10
63954/63954 - 1s - loss: 0.0684 - mean_squared_error: 0.0679 - val_loss: 0.0745 - val_mean_squared_error: 0.0740
Epoch 8/10
63954/63954 - 1s - loss: 0.0683 - mean_squared_error: 0.0678 - val_loss

In [158]:
# Score the held-out test rows with the trained model.
pred_ans = model.predict(x=test_model_input, batch_size=256)

In [159]:
# Mean squared error between the true test ratings and the predictions,
# reported to four decimal places.
test_mse = mean_squared_error(test[target].values, pred_ans)
print("test MSE", round(test_mse, 4))

test MSE 0.0756


In [None]:
"""
    :param linear_feature_columns: An iterable containing all the features used by linear part of the model.
    :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
    :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of DNN
    :param l2_reg_linear: float. L2 regularizer strength applied to wide part
    :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
    :param l2_reg_dnn: float. L2 regularizer strength applied to DNN
    :param init_std: float,to use as the initialize std of embedding vector
    :param seed: integer ,to use as random seed.
    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
    :param dnn_activation: Activation function to use in DNN
    :param task: str, ``"binary"`` for  binary logloss or  ``"regression"`` for regression loss
    :return: A Keras model instance.
    

    features = build_input_features(
        linear_feature_columns + dnn_feature_columns)

    inputs_list = list(features.values())

    sparse_embedding_list, dense_value_list = input_from_feature_columns(features, dnn_feature_columns,
                                                                         l2_reg_embedding, init_std, seed)

    linear_logit = get_linear_logit(features, linear_feature_columns, init_std=init_std, seed=seed, prefix='linear',
                                    l2_reg=l2_reg_linear)

    dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list)
    dnn_out = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                  False, seed)(dnn_input)
    dnn_logit = Dense(
        1, use_bias=False, activation=None)(dnn_out)

    final_logit = add_func([dnn_logit, linear_logit])

    output = PredictionLayer(task)(final_logit)

    model = Model(inputs=inputs_list, outputs=output)
    return model