In [5]:
import time
import datetime
import random
import pandas as pd
import numpy as np
import six
from tabulate import tabulate

from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering

In [6]:
# Load the user/hotel ratings CSV (relative path) and preview the first rows.
# The preview shows an "Unnamed: 0" column next to userid, Hotelid and
# OverallRating; later cells select the three named columns explicitly.
user_ratings = pd.read_csv("user_hotel_rating-1555730075105.csv")
user_ratings.head()

Unnamed: 0,userid,Hotelid,OverallRating
0,user_78131,hotel_558,3
1,user_78131,hotel_603,2
2,user_78131,hotel_610,3
3,user_78131,hotel_574,3
4,user_78131,hotel_570,3


In [7]:
# Number of distinct user ids and hotel ids present in the ratings frame.
n_users = len(user_ratings["userid"].unique())
n_hotels = len(user_ratings["Hotelid"].unique())

In [8]:
# Report how many distinct users and hotels the ratings data contains.
print("The total number of users are {f}".format(f = n_users))
print("The total number of Hotelids are {f}".format(f = n_hotels))

The total number of users are 5010
The total number of Hotelids are 130


In [9]:
from surprise import Reader
# Declare the rating scale as 1 to 5 for the Surprise loader.
reader = Reader(rating_scale = (1,5))

In [12]:
# Build a Surprise Dataset from the user, item and rating columns (passed in
# that order) using the reader defined above.
data = Dataset.load_from_df(user_ratings[['userid', 'Hotelid', 'OverallRating']], reader)

- Checking which baseline algorithm gives the best metrics

In [13]:
# Candidate algorithm classes to benchmark, in this order.
classes = (
    SVD,
    SVDpp,
    NMF,
    SlopeOne,
    KNNBasic,
    KNNWithMeans,
    KNNBaseline,
    CoClustering,
    BaselineOnly,
    NormalPredictor,
)

In [15]:
# Seed numpy's and the stdlib's global RNGs before running the comparison.
np.random.seed(12321)
random.seed(12321)

data = Dataset.load_from_df(user_ratings[['userid', 'Hotelid', 'OverallRating']], reader)
kf = KFold(random_state=12321)  # folds will be the same for all algorithms.

# Cross-validate every candidate on the same folds and collect one table row
# per algorithm: name, mean RMSE, mean MAE and wall-clock time.
table = []
for klass in classes:
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during a long cross-validation run.
    start = time.perf_counter()
    out = cross_validate(klass(), data, ['rmse', 'mae'], kf, n_jobs=-1)
    cv_time = str(datetime.timedelta(seconds=int(time.perf_counter() - start)))
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))
    # Use the short class name: the class object itself is rendered by
    # tabulate as "<class 'surprise.prediction_algorithms...'>".
    new_line = [klass.__name__, mean_rmse, mean_mae, cv_time]
    table.append(new_line)

header = ['Model','RMSE','MAE','Time']
print(tabulate(table, header, tablefmt="pipe"))

| Model                                                                |   RMSE |   MAE | Time    |
|:---------------------------------------------------------------------|-------:|------:|:--------|
| <class 'surprise.prediction_algorithms.matrix_factorization.SVD'>    |  0.832 | 0.633 | 0:00:59 |
| <class 'surprise.prediction_algorithms.matrix_factorization.SVDpp'>  |  0.826 | 0.622 | 0:08:51 |
| <class 'surprise.prediction_algorithms.matrix_factorization.NMF'>    |  0.849 | 0.643 | 0:01:10 |
| <class 'surprise.prediction_algorithms.slope_one.SlopeOne'>          |  0.842 | 0.64  | 0:00:34 |
| <class 'surprise.prediction_algorithms.knns.KNNBasic'>               |  0.846 | 0.636 | 0:04:53 |
| <class 'surprise.prediction_algorithms.knns.KNNWithMeans'>           |  0.836 | 0.632 | 0:04:50 |
| <class 'surprise.prediction_algorithms.knns.KNNBaseline'>            |  0.834 | 0.628 | 0:05:48 |
| <class 'surprise.prediction_algorithms.co_clustering.CoClustering'>  |  0.851 | 0.647 | 0:00:47 |


# Out of all the algorithms, SVD is the most efficient choice given its RMSE and the time taken for cross-validation. Hence, the recommendation system is built on SVD.


In [16]:
# Applying grid search on the SVD model.
# The grid holds 2 x 2 x 3 = 12 parameter combinations (epochs, learning rate,
# regularisation); each is scored on RMSE and MAE with cv=3 and n_jobs=-1.
from surprise.model_selection import GridSearchCV
param_grid = {'n_epochs': [10, 20], 'lr_all': [0.002, 0.005], 'reg_all': [0.02, 0.06, 0.1]}
# NOTE(review): no random_state is passed to the search, so the folds (and
# therefore the scores) may differ between runs — confirm whether that is intended.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs= -1)

# %time reports the wall time of the whole search.
%time gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

Wall time: 6min 8s
0.8392766956241249
{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.06}


In [17]:
# Using full data for training: take the estimator the grid search selected
# for RMSE and fit it on a trainset built from every rating in `data`
# (no hold-out split is kept back at this point).
algo = gs.best_estimator['rmse']
trainset = data.build_full_trainset() 
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1942478ef60>

In [18]:
# Getting data points where predictions can be made.
# NOTE(review): presumably these are the (user, hotel) pairs absent from the
# trainset, with r_ui set to a fill value rather than an observed rating (the
# predictions displayed further down all carry the same r_ui) — confirm against
# the Surprise documentation for build_anti_testset.
testset = trainset.build_anti_testset()

In [19]:
# Show how many entries the anti-testset holds.
print(len(testset))

379365


In [20]:
# Run the fitted model over every entry of the anti-testset.
predictions = algo.test(testset)

In [21]:
# Peek at the first five Prediction records.
predictions[:5]

[Prediction(uid='user_78131', iid='hotel_517', r_ui=3.2571129130123007, est=3.40291277774096, details={'was_impossible': False}),
 Prediction(uid='user_78131', iid='hotel_519', r_ui=3.2571129130123007, est=3.427317478366814, details={'was_impossible': False}),
 Prediction(uid='user_78131', iid='hotel_565', r_ui=3.2571129130123007, est=3.6144485726813462, details={'was_impossible': False}),
 Prediction(uid='user_78131', iid='hotel_630', r_ui=3.2571129130123007, est=3.674346812551754, details={'was_impossible': False}),
 Prediction(uid='user_78131', iid='hotel_601', r_ui=3.2571129130123007, est=3.5086849857268705, details={'was_impossible': False})]

In [28]:
# Fetching top 10 predictions for each user
from collections import defaultdict
def get_top_n(predictions, n=10):
    '''Build each user's best-scoring recommendations from a prediction list.

    Args:
        predictions(list of Prediction objects): Predictions as returned by
            the test method of an algorithm. Each entry must unpack into
            (user id, item id, true rating, estimated rating, details).
        n(int): How many recommendations to keep per user. Default is 10.

    Returns:
    A defaultdict(list) keyed by raw user id. Each value is a list of
        (raw item id, rating estimation) tuples ordered from the highest
        estimate to the lowest and cut to the first n entries.
    '''

    # Group the (item, estimate) pairs under the user they belong to.
    grouped = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        grouped[uid].append((iid, est))

    # Rank every user's pairs by estimate, best first, and keep the head.
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for uid in grouped:
        ranked = sorted(grouped[uid], key=lambda pair: pair[1], reverse=True)
        grouped[uid] = ranked[:n]

    return grouped
from itertools import islice

def take(n, iterable):
    "Collect at most the first n items of the iterable into a list"
    head = islice(iterable, n)
    return [item for item in head]


In [24]:
# Get the top 5 predictions for every user.
# NOTE(review): the bare `top_n.items()` below displays the entries for all
# users at once; a bounded preview such as take(10, top_n.items()) keeps the
# cell output small.
top_n = get_top_n(predictions, n=5)
top_n.items()

dict_items([('user_78131', [('hotel_528', 3.751495637028117), ('hotel_630', 3.674346812551754), ('hotel_565', 3.6144485726813462), ('hotel_515', 3.6114583413495995), ('hotel_527', 3.608653794691293)]), ('user_21002', [('hotel_528', 3.7790306026379206), ('hotel_515', 3.7444423207609736), ('hotel_527', 3.711354834893113), ('hotel_573', 3.7050293355057162), ('hotel_587', 3.7040298131840874)]), ('user_24128', [('hotel_630', 2.9548672603077497), ('hotel_570', 2.8773986084306955), ('hotel_528', 2.870577747546706), ('hotel_515', 2.8395547260336618), ('hotel_573', 2.8341068557827613)]), ('user_27174', [('hotel_528', 2.8425427789625224), ('hotel_515', 2.7737572375956714), ('hotel_573', 2.726178614898859), ('hotel_507', 2.673252424237485), ('hotel_530', 2.6632229661876496)]), ('user_36448', [('hotel_528', 3.54894089454617), ('hotel_530', 3.3755624932698174), ('hotel_573', 3.3554134079758), ('hotel_550', 3.344912255066253), ('hotel_612', 3.326477238627268)]), ('user_44791', [('hotel_630', 2.76155

In [29]:
# Preview the first 10 (user, recommendations) entries only.
take(10, top_n.items())

[('user_78131',
  [('hotel_528', 3.751495637028117),
   ('hotel_630', 3.674346812551754),
   ('hotel_565', 3.6144485726813462),
   ('hotel_515', 3.6114583413495995),
   ('hotel_527', 3.608653794691293)]),
 ('user_21002',
  [('hotel_528', 3.7790306026379206),
   ('hotel_515', 3.7444423207609736),
   ('hotel_527', 3.711354834893113),
   ('hotel_573', 3.7050293355057162),
   ('hotel_587', 3.7040298131840874)]),
 ('user_24128',
  [('hotel_630', 2.9548672603077497),
   ('hotel_570', 2.8773986084306955),
   ('hotel_528', 2.870577747546706),
   ('hotel_515', 2.8395547260336618),
   ('hotel_573', 2.8341068557827613)]),
 ('user_27174',
  [('hotel_528', 2.8425427789625224),
   ('hotel_515', 2.7737572375956714),
   ('hotel_573', 2.726178614898859),
   ('hotel_507', 2.673252424237485),
   ('hotel_530', 2.6632229661876496)]),
 ('user_36448',
  [('hotel_528', 3.54894089454617),
   ('hotel_530', 3.3755624932698174),
   ('hotel_573', 3.3554134079758),
   ('hotel_550', 3.344912255066253),
   ('hotel_61

In [30]:
# Printing top predictions: one line per user with the recommended hotel ids.
# The loop variable is deliberately not called `user_ratings`. At cell level
# that name would overwrite the ratings DataFrame loaded from the CSV earlier
# in the notebook, and re-running the cells that read it would then fail.
for uid, user_top_n in take(5,top_n.items()):
    print(uid, [iid for (iid, _) in user_top_n])

user_78131 ['hotel_528', 'hotel_630', 'hotel_565', 'hotel_515', 'hotel_527']
user_21002 ['hotel_528', 'hotel_515', 'hotel_527', 'hotel_573', 'hotel_587']
user_24128 ['hotel_630', 'hotel_570', 'hotel_528', 'hotel_515', 'hotel_573']
user_27174 ['hotel_528', 'hotel_515', 'hotel_573', 'hotel_507', 'hotel_530']
user_36448 ['hotel_528', 'hotel_530', 'hotel_573', 'hotel_550', 'hotel_612']
