# DeepFM Recommender System - Expedia Hotel dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf
import warnings
import math
from math import sqrt
import sys
import holidays
import datetime

from sklearn.metrics import roc_curve, auc,roc_auc_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from deepctr.inputs import build_input_features, get_linear_logit, input_from_feature_columns, combined_dnn_input
from deepctr.layers.core import PredictionLayer, DNN
from deepctr.layers.utils import add_func
from deepctr.models import WDL, DeepFM
from deepctr.inputs import SparseFeat,get_feature_names


from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Dense

import scipy.sparse as sparse
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
import implicit

  import pandas.util.testing as tm


In [2]:
df = pd.read_csv('../data/hotel_data/train.csv', sep=',', nrows=1500000)
destinations = pd.read_csv('../data/hotel_data/destinations.csv', sep=',')
df.shape

(1500000, 24)

In [3]:
#merge only top 10 most correlated columns with rating column
df = pd.merge(df,destinations[['srch_destination_id','d33', 'd64', 'd52', 'd120', 'd72', 'd136', 'd7', 'd59', 'd50', 'd30']],on='srch_destination_id')

In [4]:
# rename 2 columns
df = df.rename(columns={'hotel_cluster': 'item_id', 'is_booking': 'rating'})
df = df.dropna()

## Feature Engineering

In [5]:
# sort values
from pandas.tseries.offsets import Week
df = df.sort_values("date_time").reset_index()
df.drop('index',axis=1,inplace=True)

In [6]:
df["date_time"] =  pd.to_datetime(df["date_time"], infer_datetime_format=True)
df["date_time"] = df.date_time.dt.strftime('%Y-%m-%d')
#df["date_time_timestamp"] =  pd.to_datetime(df["date_time"], infer_datetime_format=True)

In [7]:
d = datetime.timedelta(days=14)
df['lagged_date_time'] = df["date_time"].apply(lambda x: datetime.datetime.strptime(x,"%Y-%m-%d") + d)

def extract_week(feature,week,lag):
    df[feature] =  pd.to_datetime(df[feature], infer_datetime_format=True)
    df[feature] = df.date_time.dt.strftime('%Y-%m-%d')
    if lag == True:
        d = datetime.timedelta(days=14)
        df['lag_date_time'] = df[feature].apply(lambda x: datetime.datetime.strptime(x,"%Y-%m-%d") + d)
        df['week'] = pd.DatetimeIndex(df['lag_date_time']).week
        df['year']=pd.DatetimeIndex(df['lag_date_time']).year
        
        # countinue week numbers for the next year
        df[week] = df['week'].where(df['year'] ==2013 , df['week']+52)
extract_week('date_time','click_week',lag=True)

# extract month from date_time
df['click_month'] = pd.DatetimeIndex(df['date_time']).month

In [8]:
df['checkin_month'] = pd.DatetimeIndex(df['srch_ci']).month
df['checkout_month'] = pd.DatetimeIndex(df['srch_co']).month

df['checkin_year'] = pd.DatetimeIndex(df['srch_ci']).year
df['checkout_year'] = pd.DatetimeIndex(df['srch_co']).year

In [9]:
# Define holidays in some countries
ca_holidays = holidays.Canada()
us_holidays = holidays.UnitedStates()

# check if checkin or checkout date is in holiday of different countries

df['north_am_ci'] = df['srch_ci'].apply(lambda x: 1 if x in (us_holidays or ca_holidays)  else 0)
df['north_am_co'] = df['srch_co'].apply(lambda x: 1 if x in (us_holidays or ca_holidays)  else 0)

In [10]:
df= df.drop(['date_time'],axis=1)
df= df.drop(['week'],axis=1)
df= df.drop(['year'],axis=1)
df= df.drop(['srch_ci'],axis=1)
df= df.drop(['srch_co'],axis=1)
df= df.drop(['lag_date_time'],axis=1)
#df= df.drop(['date_time_timestamp'],axis=1)
df= df.drop(['lagged_date_time'],axis=1)
#df= df.drop(['num_visit'],axis=1)

In [11]:
#Note that we add 1 to the raw count to prevent the logarithm from
# exploding into negative infinity in case the count is zero.
df['log_orig_destination_distance'] = np.log10(df['orig_destination_distance'] + 1)

df= df.drop(['orig_destination_distance'],axis=1)

In [12]:
from sklearn.cluster import KMeans
def create_cluster(feature):
    y = df[feature]
    X = df.drop(feature,axis=1)
    wcss=[]
    for i in range(1,11):
        kmeans = KMeans(n_clusters= i, init='k-means++', random_state=0)
        kmeans.fit(X)
        wcss.append(kmeans.inertia_)
    plt.plot(range(1,11), wcss)
    plt.title('The Elbow Method')
    plt.xlabel('number of clusters')
    plt.ylabel('wcss')
    plt.show()

In [13]:
X = df.drop("user_location_region",axis=1)
kmeansmodel = KMeans(n_clusters= 3, init='k-means++', random_state=0)
y_kmeans= kmeansmodel.fit_predict(X)
df['kmeans_user_location_region']=y_kmeans
df= df.drop(['user_location_region'],axis=1)

In [14]:
X = df.drop("user_location_city",axis=1)
kmeansmodel = KMeans(n_clusters= 3, init='k-means++', random_state=0)
y_kmeans= kmeansmodel.fit_predict(X)
df['kmeans_user_location_city']=y_kmeans
df= df.drop(['user_location_city'],axis=1)

In [15]:
condlist = [(df['srch_adults_cnt']==0) & (df['srch_children_cnt']==0),
            (df['srch_adults_cnt']==2) & (df['srch_children_cnt']==0),
            (df['srch_adults_cnt']==2) & (df['srch_children_cnt']==1),
            (df['srch_adults_cnt']==2) & (df['srch_children_cnt']==2),
           (df['srch_adults_cnt']==1) & (df['srch_children_cnt']==0),
            (df['srch_adults_cnt']>1) & (df['srch_children_cnt']>0),
           (df['srch_adults_cnt']==1) & (df['srch_children_cnt'] > 0),
           (df['srch_adults_cnt']>2) & (df['srch_children_cnt'] == 0),
           (df['srch_adults_cnt']==0) & (df['srch_children_cnt'] > 0)]

choicelist = ['empty_room',
                'couple_with_no_children',
                'couple_with_one_child',
                'couple_with_two_children',
                'single',
                'big_family',
                'single_parent',
                'friends',
                'unsupervised_children']

df['family_status'] = np.select(condlist,choicelist)

In [16]:
#Convert the family_status into dummy variables
dummies = pd.get_dummies(df['family_status'],drop_first=True)
df= pd.concat( [df.drop('family_status',axis=1),dummies],axis=1)

if "unsupervised_children" in df.columns:
    df= df.drop("unsupervised_children",axis=1)
if "empty_room" in df.columns:
    df= df.drop("empty_room",axis=1)

In [17]:
df['cnt'] = (df['cnt'] - df['cnt'].mean())/df['cnt'].std()

# Define features

In [18]:
# categ_sparse / conti_dense
sparse_features = ["site_name", #ID of the Expedia point of sale (i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, …)
"posa_continent", #ID of continent associated with site_name
"user_location_country", #The ID of the country the customer is located
"kmeans_user_location_region", #The ID of the region the customer is located clustered in 2 groups
"kmeans_user_location_city", #The ID of the city the customer is located clustered in 2 groups
"user_id", #ID of user
"is_mobile", #1 when a user connected from a mobile device, 0 otherwise
"is_package", #1 if the click/booking was generated as a part of a package (i.e. combined with a flight), 0 otherwise
"channel", #ID of a marketing channel
"cnt", #Numer of similar events in the context of the same user session
"srch_destination_id", #ID of the destination where the hotel search was performed'
"srch_destination_type_id", #Type of destination
"hotel_continent", #'Hotel continent',
"hotel_country", #Hotel country
"item_id", #(hotel_cluster)ID of a hotel cluster
"north_am_ci", # 1 if check-in date it's a holiday in north America
"north_am_co",# 1 if check-out date it's a holiday in north America
'hotel_market', #Hotel market
'couple_with_no_children','couple_with_one_child','couple_with_two_children',"friends","single","single_parent",
#hotel search latent attributes highly correlated with rating:
'd33', 'd64','d52','d120', 'd72', 'd136', 'd7', 'd59', 'd50', 'd30'] 

dense_features = ["srch_adults_cnt", #The number of adults specified in the hotel room
"srch_children_cnt", #The number of (extra occupancy) children specified in the hotel room
"srch_rm_cnt", #The number of hotel rooms specified in the search
'log_orig_destination_distance', # Log transformed physical distance between a hotel and a customer at the time of search
"click_week",
"click_month",
"checkin_month",
"checkout_month",
"checkin_year",
"checkout_year"]
target = ['rating']

### Simple preprocessing

In [19]:
# Label Encoding for sparse features,and normalization for dense numerical features
for feat in sparse_features:
    lbe = LabelEncoder()
    df[feat] = lbe.fit_transform(df[feat])

In [20]:
mms = MinMaxScaler(feature_range=(0,1))
df[dense_features] = mms.fit_transform(df[dense_features])

### Generate feature columns
For sparse features, we transform them into dense vectors by embedding techniques. For dense numerical features, we concatenate them to the input tensors of fully connected layer.

In [21]:
# count #unique features for each sparse field
fixlen_feature_columns = [SparseFeat(feat, df[feat].nunique(),embedding_dim=4)
                          for feat in sparse_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

### Generate the training samples and train the model

In [22]:
# generate input data for model
train, test = train_test_split(df, test_size=0.3)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

In [23]:
train.shape, test.shape

((658258, 45), (282111, 45))

# Best DeepFM Model after hyper-parameter tuning

In [24]:
model = DeepFM(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(128,128)
            , init_std=0.0001, seed=1024, dnn_dropout=0.5, dnn_activation='relu',task='binary',
               fm_group=['default_group'],dnn_use_bn=False)

model.compile("adam", "mse", metrics=['mse'])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [25]:
history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

Train on 526606 samples, validate on 131652 samples
Epoch 1/10
526606/526606 - 37s - loss: 0.0758 - mean_squared_error: 0.0740 - val_loss: 0.0734 - val_mean_squared_error: 0.0709
Epoch 2/10
526606/526606 - 30s - loss: 0.0730 - mean_squared_error: 0.0701 - val_loss: 0.0740 - val_mean_squared_error: 0.0712
Epoch 3/10
526606/526606 - 28s - loss: 0.0722 - mean_squared_error: 0.0691 - val_loss: 0.0745 - val_mean_squared_error: 0.0714
Epoch 4/10
526606/526606 - 28s - loss: 0.0716 - mean_squared_error: 0.0685 - val_loss: 0.0747 - val_mean_squared_error: 0.0716
Epoch 5/10
526606/526606 - 30s - loss: 0.0710 - mean_squared_error: 0.0679 - val_loss: 0.0746 - val_mean_squared_error: 0.0716
Epoch 6/10
526606/526606 - 29s - loss: 0.0705 - mean_squared_error: 0.0673 - val_loss: 0.0752 - val_mean_squared_error: 0.0721
Epoch 7/10
526606/526606 - 30s - loss: 0.0703 - mean_squared_error: 0.0670 - val_loss: 0.0760 - val_mean_squared_error: 0.0727
Epoch 8/10
526606/526606 - 33s - loss: 0.0702 - mean_square

In [26]:
pred_ans = model.predict(test_model_input, batch_size=256)


In [27]:
auc = roc_auc_score(test[target].values, pred_ans)
print("RMSE:\t%f" % np.round(math.sqrt(mean_squared_error(test[target].values, pred_ans)),3),
      "MAE:\t%f" % np.round(mean_absolute_error(test[target].values, pred_ans),3),
      "MSE:\t%f" % np.round(mean_squared_error(test[target].values, pred_ans),3),
      "AUC:\t%f" % np.round(auc,3),
      sep='\n')

RMSE:	0.271000
MAE:	0.140000
MSE:	0.073000
AUC:	0.781000


In [28]:
warnings.filterwarnings("ignore")
new_df = test[['rating','item_id','user_id']]

#replace the rating with algorithm generated output
new_df['rating']=pred_ans

In [29]:
new_df

Unnamed: 0,rating,item_id,user_id
479619,0.001050,97,17859
147990,0.024518,91,21103
769502,0.000319,59,27928
91391,0.153385,48,4788
38904,0.096915,69,18090
...,...,...,...
25656,0.000182,98,6099
478452,0.112156,4,2472
188862,0.152395,41,1606
514057,0.041339,69,31645


In [30]:
#csr_matrix((data, (row, col))
sparse_item_user = sparse.csr_matrix((new_df['rating'].astype(float),(new_df['item_id'], new_df['user_id'])))
sparse_user_item = sparse.csr_matrix((new_df['rating'].astype(float),(new_df['user_id'], new_df['item_id'])))


model = implicit.als.AlternatingLeastSquares(factors=20,regularization=0.1,iterations=20)
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')
model.fit(data_conf)




HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [31]:
def find_similar_clusters(item_id,n_similar):
    similar = model.similar_items(item_id,n_similar)
    return similar

### 65 is in hotel_resort and the results have 4 out of 5 clusters in common

In [32]:
find_similar_clusters(65,6)

[(65, 0.823982),
 (52, 0.8199171),
 (87, 0.8089151),
 (66, 0.8022266),
 (31, 0.77161545),
 (96, 0.70043224)]

### 70 is in business_hotels and the results have 2 out of 5 clusters in common

In [33]:
find_similar_clusters(70,6)

[(70, 0.85447913),
 (56, 0.76396084),
 (98, 0.62938166),
 (69, 0.594637),
 (41, 0.54545015),
 (97, 0.4288375)]

### 4 is in private_vacation_homes and the results have 2 out of 5 clusters in common

In [34]:
find_similar_clusters(4,6)

[(4, 0.9359653),
 (49, 0.47150153),
 (21, 0.42341626),
 (17, 0.41468728),
 (50, 0.39217094),
 (33, 0.38477224)]

We can compare that if the results are similar with the defined following clusters 

### Create dataframe to store clusters

In [35]:
hotel_df = pd.DataFrame(columns=['item_id','hotel_type'])
hotel_df['item_id']=list(range(100))

In [36]:
cluster = {"apartment":[5, 11, 22, 28,41, 56, 73],
          'business_hotels':[ 64,69, 70, 97],
          "condo":[3,8,36, 37, 55],
          "private_vacation_homes":[ 4, 9, 21, 49, 75, 77],
          "motel":[2,25,27, 95, 98],
          "beach_resort":[0, 17, 26, 31, 34, 80, 84, 92],
          "casino_hotel":[1, 19, 45, 54, 79,89, 93],
          "hotel_resort":[52, 65, 66, 87, 96],
          "bed_n_breakfast":[23, 39, 50, 51, 76],
          "hosetel":[12, 20, 38, 53, 57, 60, 61, 85, 86]}

## Store on dataframe

In [37]:
warnings.filterwarnings("ignore")
for i in cluster.keys():
    hotel_df['hotel_type'][cluster[i]]= i

In [38]:
hotel_df

Unnamed: 0,item_id,hotel_type
0,0,beach_resort
1,1,casino_hotel
2,2,motel
3,3,condo
4,4,private_vacation_homes
...,...,...
95,95,motel
96,96,hotel_resort
97,97,business_hotels
98,98,motel


## method 2 for finding similar clusters (Method 1 is more accurate)

In [39]:
hotel_matrix = new_df.pivot_table(index='user_id',columns='item_id',values='rating')

In [40]:
ratings = pd.DataFrame(new_df.groupby('item_id')['rating'].mean())
ratings['number_ratings'] = pd.DataFrame(new_df.groupby('item_id')['rating'].count())
ratings.head()

Unnamed: 0_level_0,rating,number_ratings
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.040107,3426
1,0.083191,3782
2,0.113924,2891
3,0.054603,1463
4,0.106419,2958


In [41]:
def find_similar_clusters(cluster_number):
    #Select user ratings for twohotel_matrixmovies 
    item_user_ratings = hotel_matrix[cluster_number]

    # Find correlations between series with corrwith (instead of corr)
    similar_to_hotel = hotel_matrix.corrwith(item_user_ratings)

    # Removing NaN values and using a DataFrame instead of a series 
    corr_hotel = pd.DataFrame(similar_to_hotel,columns=['Correlation'])
    corr_hotel.dropna(inplace=True)

    corr_hotel = corr_hotel.join(ratings['number_ratings'])

    result = corr_hotel[corr_hotel['number_ratings']>0].sort_values('Correlation',ascending=False).head()
    return result

In [42]:
warnings.filterwarnings("ignore")
find_similar_clusters(65)

Unnamed: 0_level_0,Correlation,number_ratings
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
65,1.0,5892
27,0.729484,541
66,0.676894,2497
0,0.675652,3426
40,0.630496,3193
