# Collaborative Filtering Recommender System - Expedia Hotel dataset

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
import warnings


from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

In [2]:
hotel_train_set = pd.read_csv('data/hotel_data/train.csv', sep=',', nrows=100000)
hotel_train_set.shape

(100000, 24)

### Read train and test data

In [3]:
hotel_train_set.head(n=2)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,1,1,2,50,628,1


In [4]:
hotel_train_set.columns

Index(['date_time', 'site_name', 'posa_continent', 'user_location_country',
       'user_location_region', 'user_location_city',
       'orig_destination_distance', 'user_id', 'is_mobile', 'is_package',
       'channel', 'srch_ci', 'srch_co', 'srch_adults_cnt', 'srch_children_cnt',
       'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id',
       'is_booking', 'cnt', 'hotel_continent', 'hotel_country', 'hotel_market',
       'hotel_cluster'],
      dtype='object')

In [6]:
#Define an information dectionary for features
info_dic = {'date_time':'Timestamp',
'site_name':'ID of the Expedia point of sale (i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, …)',
'posa_continent':'ID of continent associated with site_name',
'user_location_country':'The ID of the country the customer is located',
'user_location_region':'The ID of the region the customer is located',
'user_location_city':'The ID of the city the customer is located',
'orig_destination_distance':'Physical distance between a hotel and a customer at the time of search. A null means the distance could not be calculated',
'user_id':'ID of user','is_mobile':'1 when a user connected from a mobile device, 0 otherwise',
'is_package':'1 if the click/booking was generated as a part of a package (i.e. combined with a flight), 0 otherwise',
'channel':'ID of a marketing channel','srch_ci':'Checkin date','srch_co':'Checkout date',
'srch_adults_cnt':'The number of adults specified in the hotel room',
'srch_children_cnt':'The number of (extra occupancy) children specified in the hotel room',
'srch_rm_cnt':'The number of hotel rooms specified in the search',
'srch_destination_id':'ID of the destination where the hotel search was performed',
'srch_destination_type_id':'Type of destination','hotel_continent':'Hotel continent',
'hotel_country':'Hotel country','hotel_market':'Hotel market','is_booking':'1 if a booking, 0 if a click',
'cnt':'Numer of similar events in the context of the same user session','hotel_cluster':'ID of a hotel cluster'}

In [7]:
info_dic['user_id']

'ID of user'

In [8]:
len(hotel_train_set['srch_destination_type_id'].unique()),len(hotel_train_set['hotel_cluster'].unique())

(8, 100)

In [66]:
#df = hotel_train_set[['user_id','srch_destination_id','is_booking']]
df = hotel_train_set[['user_id','hotel_cluster','is_booking']]

In [67]:
df.shape

(100000, 3)

In [68]:
#rename columns
df.columns =['user_id', 'item_id', 'rating']

In [69]:
# for user 12 and item 12 we have 3 values
df.head()

Unnamed: 0,user_id,item_id,rating
0,12,1,0
1,12,1,1
2,12,1,0
3,93,80,0
4,93,21,0


### Remove rows with the same user_id and item_id and different rating

In [70]:
max_rating = df.groupby(['user_id', 'item_id']).rating.transform(max)
df = df.loc[df.rating == max_rating]
df.drop_duplicates(keep='first',inplace=True) 

In [71]:
train, test = train_test_split(df, test_size=0.3)

train.shape, test.shape

((28567, 3), (12244, 3))

# Find Similar Hotel clusters

In [33]:
ratings = pd.DataFrame(df.groupby('item_id')['rating'].mean())
ratings.head()

Unnamed: 0_level_0,rating
item_id,Unnamed: 1_level_1
0,0.117794
1,0.252396
2,0.209877
3,0.113043
4,0.184035


In [34]:
ratings['number_ratings'] = pd.DataFrame(df.groupby('item_id')['rating'].count())
ratings.head()

Unnamed: 0_level_0,rating,number_ratings
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.117794,399
1,0.252396,313
2,0.209877,486
3,0.113043,345
4,0.184035,451


In [35]:
hotel_matrix = df.pivot_table(index='user_id',columns='item_id',values='rating')

In [36]:
hotel_matrix.head()

item_id,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,,1.0,,,,,,,,,...,,,,,,,,,,
93,,,,,,,,,,,...,,,0.0,,,,,,,
501,,,,,,,,,,,...,,,,,,,,,0.0,
756,,,1.0,,,,,,,,...,,,,,0.0,,,,,
776,,,,,,,,,,,...,,,,,,,,,,


In [37]:
def find_similar_clusters(cluster_number):
    #Select user ratings for twohotel_matrixmovies 
    item_user_ratings = hotel_matrix[cluster_number]

    # Find correlations between series with corrwith (instead of corr)
    similar_to_hotel = hotel_matrix.corrwith(item_user_ratings)

    # Removing NaN values and using a DataFrame instead of a series 
    corr_hotel = pd.DataFrame(similar_to_hotel,columns=['Correlation'])
    corr_hotel.dropna(inplace=True)

    corr_hotel = corr_hotel.join(ratings['number_ratings'])

    result = corr_hotel[corr_hotel['number_ratings']>0].sort_values('Correlation',ascending=False).head()
    return result

In [38]:
warnings.filterwarnings("ignore")
find_similar_clusters(11)

Unnamed: 0_level_0,Correlation,number_ratings
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
11,1.0,411
66,0.469042,195
57,0.459353,304
35,0.404846,191
32,0.401742,336


# Recommendation Engine - collaborative filtering model from scratch

## Memory-Based CF by computing cosine similarity

In [39]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
n_users,n_items

(3478, 100)

In [41]:
train.head()

Unnamed: 0,user_id,item_id,rating
70528,269598,18,0
10567,58062,10,0
72276,282367,40,0
62491,216712,69,0
8837,51787,18,1


### Method 1 for generating user-item matrix produces error

In [42]:
# create a user-item matrix which can be used to calculate the similarity between users and items

data_matrix = np.zeros((n_users, n_items))
for line in df.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

    
# because of splitting the data into test and train we need to create two [943 x 1682] matrices

train_data_matrix = np.zeros((n_users, n_items))
# unpack the Pandas object
for line in train.itertuples():
    # adjust to count rows and cols from 0 and fill in the matrix
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

IndexError: index 3924 is out of bounds for axis 0 with size 3478

## Method 2 

In [46]:
data_matrix = np.zeros((n_users, n_items))
data_matrix = df.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

train_data_matrix = np.zeros((n_users, n_items))
train_data_matrix = train.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

test_data_matrix = np.zeros((n_users, n_items))
test_data_matrix = test.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

In [47]:
train_data_matrix.shape, test_data_matrix.shape

((3401, 100), (2967, 100))

In [48]:
# calculating the similarity by using the pairwise_distance from sklearn to calculate the cosine similarity
from sklearn.metrics.pairwise import pairwise_distances 

# user-user similarity
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')

# item-item similarity
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

user_similarity.shape , item_similarity.shape

((3401, 3401), (100, 100))

we can make a prediction by applying the following formula for user-based CF.


1) We can look at the similarity between users k and a as weights

2)  weights are multiplied by the ratings of a similar user a (corrected for the average rating of that user)

3) We need to normalize it so that the ratings stay between 1 and 5

4) As a final step, sum the average ratings for the user that you are trying to predict.

$$\hat{x}_{k,m}= \bar{x}_{k} + \frac{\sum_{u_{a}}sim_{u}(u_{k},u_{a})(x_{a,m}-\bar{x}_{u_{a}})}
{\sum_{u_{a}\left | sim_{u}(u_{k},u_{a}) \right |}}
$$

Also, we can make a prediction by applying the following formula for item-based CF.

 $$\hat{x}_{k,m}=  \frac{\sum_{i_{b}}sim_{i}(i_{m},i_{b})(x_{k,b})}
{\sum_{i_{b}\left | sim_{i}(i_{m},i_{b}) \right |}}
$$

In [49]:
#make predictions based on these similarities

def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [50]:
user_prediction = predict(train_data_matrix, user_similarity, type='user')
item_prediction = predict(train_data_matrix, item_similarity, type='item')

user_prediction.shape,item_prediction.shape

((3401, 100), (3401, 100))

## Model-Based CF by using singular value decomposition (SVD)

$$X = USV^{T}$$



In [51]:
sparsity = round(1.0-len(df)/float(n_users*n_items), 3)
print('The sparsity level of MovieLens100K is ' + str(sparsity*100) + '%')


import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k=20)
s_diag_matrix=np.diag(s)

#prediction
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

The sparsity level of MovieLens100K is 88.3%


## Evaluation with RMSE and MAE

In [54]:
user_prediction

array([[ 0.00657492,  0.00248392,  0.01460007, ...,  0.01182536,
         0.01930774,  0.01173131],
       [-0.00349015,  0.0035666 ,  0.00444869, ...,  0.00150838,
         0.00944722,  0.00150838],
       [-0.00349015,  0.0035666 ,  0.00444869, ...,  0.00150838,
         0.00944722,  0.00150838],
       ...,
       [ 0.00660781,  0.01368158,  0.01447693, ...,  0.0116184 ,
         0.01946498,  0.01152953],
       [ 0.00664416,  0.01349781,  0.0140849 , ...,  0.01174743,
         0.01949775,  0.01142064],
       [ 0.00639836,  0.01387238,  0.01467053, ...,  0.01158772,
         0.01934856,  0.0115896 ]])

In [58]:
#test_data_matrix = test_data_matrix.as_matrix()

In [72]:
#np.nonzero(test_data_matrix)

(array([   2,    5,    7, ..., 2955, 2961, 2964]),
 array([ 2, 95, 38, ..., 72, 30,  7]))

In [73]:
#test_data_matrix.nonzero()

(array([   2,    5,    7, ..., 2955, 2961, 2964]),
 array([ 2, 95, 38, ..., 72, 30,  7]))

In [80]:
test_data_matrix.shape,user_prediction.shape

((2967, 100), (3401, 100))

In [76]:
mean_squared_error(user_prediction, test_data_matrix)

ValueError: Found input variables with inconsistent numbers of samples: [3401, 2967]

In [74]:
def rmse(prediction, y):
    prediction = prediction[y.nonzero()].flatten()
    y = y[y.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, y))

def mae(prediction, y):
    prediction = prediction[y.nonzero()].flatten()
    y = y[y.nonzero()].flatten()
    return sqrt(mean_absolute_error(prediction, y))

print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))
print('Model-Based CF RMSE: ' + str(rmse(X_pred, test_data_matrix)))
print("\n")
print('User-based CF MAE: ' + str(mae(user_prediction, test_data_matrix)))
print('Item-based CF MAE: ' + str(mae(item_prediction, test_data_matrix)))
print('Model-Based CF MAE: ' + str(mae(X_pred, test_data_matrix)))

User-based CF RMSE: 0.9814690092554227


TypeError: '(array([   2,    5,    7, ..., 2955, 2961, 2964]), array([ 2, 95, 38, ..., 72, 30,  7]))' is an invalid key

## Evaluation with Precision and recall
Precision and recall are binary metrics used to evaluate models with binary output. 

We need a way to translate the ratings from 1 to 5 into a binary problem.

To do the translation we will assume that any true rating above 3.5 corresponds to a relevant item and any true rating below 3.5 is irrelevant. 

We are intrested in recommending top-N items to the user. So it makes more sense to compute precision and recall metrics in the first N items instead of all the items.

Thus the notion of precision and recall at k where k is a user definable integer that is set by the user to match the top-N recommendations objective.

In [None]:
# threshold = 3.5


### Recommend hotel cluster for a user

In [174]:
rows = []
movies = list(df.drop_duplicates(subset='item_id', keep='first').item_id)
names = list(df.drop_duplicates(subset='item_id', keep='first').item_id)

for i in range(len(movies)):
    rows.append( dict({'name': names[i], 'item_id' : movies[i], 'user_id' : 9}))
test_data = pd.DataFrame(rows)

In [175]:
for idx, (score, name) in enumerate(sorted(zip(preds, names), reverse=True)):
    if idx >= 10:
        break
    print("Score: ", round(float(score), 2), " for hotel cluster: ", name)

Score:  0.41  for hotel cluster:  33
Score:  0.31  for hotel cluster:  45
Score:  0.31  for hotel cluster:  34
Score:  0.3  for hotel cluster:  81
Score:  0.29  for hotel cluster:  32
Score:  0.25  for hotel cluster:  96
Score:  0.25  for hotel cluster:  28
Score:  0.23  for hotel cluster:  15
Score:  0.23  for hotel cluster:  88
Score:  0.22  for hotel cluster:  29


# Resources

https://blog.cambridgespark.com/nowadays-recommender-systems-are-used-to-personalize-your-experience-on-the-web-telling-you-what-120f39b89c3c

https://course.fast.ai/videos/?lesson=4

https://towardsdatascience.com/collaborative-filtering-with-fastai-3dbdd4ef4f00

https://medium.com/quantyca/deep-learning-for-collaborative-filtering-using-fastai-b28e197ccd59