- Develop a recommendation system using the ratings data available.

- Train Dataset name: user_hotel_rating-1555730075105

- Features:

- Hotelid, userid, rating

In [1]:
!pip install scikit-surprise




You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [2]:
import pandas as pd
from surprise import Dataset
from surprise import Reader, KNNWithMeans
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate

In [3]:
# Load the hotel ratings CSV (path is relative to the working directory)
# and preview the first rows.
user_ratings = pd.read_csv("user_hotel_rating-1555730075105.csv")
user_ratings.head()

Unnamed: 0,userid,Hotelid,OverallRating
0,user_78131,hotel_558,3
1,user_78131,hotel_603,2
2,user_78131,hotel_610,3
3,user_78131,hotel_574,3
4,user_78131,hotel_570,3


In [4]:
# Inspect the column dtypes of the loaded ratings frame.
user_ratings.dtypes

userid           object
Hotelid          object
OverallRating     int64
dtype: object

In [5]:
# (rows, columns) of the ratings table.
user_ratings.shape

(271935, 3)

In [6]:
# Number of distinct user ids in the ratings table.
user_ratings.userid.nunique()

5010

In [7]:
# Number of distinct hotel ids in the ratings table.
user_ratings.Hotelid.nunique()

130

In [8]:
# Lowest rating value present (lower bound for the Reader's rating_scale below).
user_ratings['OverallRating'].min()

1

In [9]:
# Highest rating value present (upper bound for the Reader's rating_scale below).
user_ratings['OverallRating'].max()

5

In [10]:
# Size of the sample used for this quick experiment.
# NOTE: despite its name, this caps the number of rating *rows* taken from the
# top of the table, not the number of distinct users.
no_of_users = 1000
## Preparing data to make it compatible for "Surprise" Package
# Reader is a class describing how ratings are parsed. For DataFrame input
# Surprise only uses `rating_scale`; `line_format` matters only when reading files.
reader = Reader(line_format = 'user item rating', rating_scale=(1, 5))
# load_from_df expects exactly three columns in (user, item, rating) order, so
# select them explicitly instead of relying on the CSV's column layout.
data = Dataset.load_from_df(
    user_ratings[['userid', 'Hotelid', 'OverallRating']].iloc[:no_of_users],
    reader,
)

In [11]:
# User-based neighbourhood model: similarities are computed between users
# (not items) using the cosine measure.
sim_parameters = dict(name='cosine', user_based=True)
algo = KNNWithMeans(sim_options=sim_parameters)

# Running  a cross_validation procedure for a given algorithm, reporting accuracy measures and computation times

In [12]:
# Seed the fold shuffling so the reported RMSE/MAE are reproducible on
# Restart & Run All; a bare cv=5 builds an unseeded KFold whose splits
# (and therefore scores) change on every run.
cross_validate(algo, data, measures=['RMSE', 'MAE'],
               cv=KFold(n_splits=5, random_state=42), verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0306  0.9324  1.0465  0.9883  0.9556  0.9907  0.0432  
MAE (testset)     0.8061  0.7554  0.8245  0.7795  0.7406  0.7812  0.0310  
Fit time          0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.03    0.02    0.02    0.02    0.02    0.02    0.00    


{'test_rmse': array([1.03058069, 0.93238155, 1.04648155, 0.98832495, 0.95563999]),
 'test_mae': array([0.80607602, 0.75538862, 0.82454975, 0.77950248, 0.74057921]),
 'fit_time': (0.01093292236328125,
  0.007974386215209961,
  0.0059833526611328125,
  0.009006977081298828,
  0.006978034973144531),
 'test_time': (0.029904603958129883,
  0.023890256881713867,
  0.022941112518310547,
  0.024901628494262695,
  0.015958547592163086)}

In [13]:
#### Training the model on all ratings in `data` (the rows loaded above, not the whole CSV)

In [14]:
# Refit on every rating held in `data` (no hold-out fold), so the model used
# for recommendations has seen all ratings in the loaded sample.
trainset = data.build_full_trainset()
# fit() is the last expression, so the notebook displays the fitted algorithm object.
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1764b6b5d68>

## Filtering instances which can be used for predictions

In [15]:
# Getting data points where predictions can be made:
# build_anti_testset() lists the (user, item) pairs that have no rating in
# `trainset`; per the Surprise docs the placeholder rating defaults to the
# trainset's global mean.
testset = trainset.build_anti_testset()

In [16]:
# Confirm the anti-testset is a plain Python list.
print (type(testset))

<class 'list'>


In [17]:
# Number of candidate (user, item) pairs that will be scored.
print (len(testset))

1562


## Making Predictions

In [18]:
# Score every candidate pair; the result is a list of Prediction records.
predictions = algo.test(testset)

In [19]:
# Peek at the first four predictions. r_ui is identical across them, so it is
# presumably the anti-testset placeholder rather than a real rating; `est` is
# the model's estimate.
predictions[0:4]

[Prediction(uid='user_78131', iid='hotel_517', r_ui=3.356, est=4.273445981397632, details={'actual_k': 7, 'was_impossible': False}),
 Prediction(uid='user_78131', iid='hotel_519', r_ui=3.356, est=3.6941598013110486, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='user_78131', iid='hotel_565', r_ui=3.356, est=3.1889780522621587, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='user_78131', iid='hotel_630', r_ui=3.356, est=3.9830660765615544, details={'actual_k': 13, 'was_impossible': False})]

#### Function to calculate top 10 predictions for each user

In [20]:
# Fetching top 10 predictions for each user
from collections import defaultdict
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n best-scored items.

    Args:
        predictions (iterable): 5-element records
            ``(uid, iid, true_rating, estimate, details)``, e.g. the list of
            Prediction objects returned by an algorithm's ``test`` method.
        n (int): Maximum number of recommendations kept per user.
            Defaults to 10.

    Returns:
        collections.defaultdict: Maps each raw user id to a list of
        ``(raw item id, estimated rating)`` tuples, ordered from highest to
        lowest estimate and truncated to at most ``n`` entries. Items with
        equal estimates keep the order in which they appeared in
        ``predictions``.
    """
    per_user = defaultdict(list)

    # Bucket every (item, estimate) pair under the user it belongs to.
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Rank each user's bucket by estimate, best first, and cut it to n items.
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

from itertools import islice

def take(n, iterable):
    """Return the first ``n`` items of ``iterable`` as a list.

    Fewer than ``n`` items are returned when the iterable is shorter.
    """
    head = islice(iterable, n)
    return [item for item in head]


In [21]:
# Keep the 10 highest-estimated hotels for each user.
top_n = get_top_n(predictions, n=10)

In [22]:
# get_top_n returns a defaultdict keyed by raw user id.
type(top_n)

collections.defaultdict

In [23]:
# NOTE(review): this displays the entire mapping (every user); the following
# cell shows a bounded preview of the same data.
top_n.items()

dict_items([('user_78131', [('hotel_536', 4.505715411903823), ('hotel_517', 4.273445981397632), ('hotel_539', 4.2203583682805945), ('hotel_545', 4.2203583682805945), ('hotel_547', 4.2203583682805945), ('hotel_524', 4.019518006073826), ('hotel_526', 4.0023255813953496), ('hotel_630', 3.9830660765615544), ('hotel_529', 3.945507488709044), ('hotel_515', 3.8215312164626667)]), ('user_21002', [('hotel_536', 4.492278719397364), ('hotel_539', 4.206921675774135), ('hotel_545', 4.206921675774135), ('hotel_547', 4.206921675774135), ('hotel_524', 4.005363578287614), ('hotel_526', 3.988888888888889), ('hotel_529', 3.9298079760462112), ('hotel_600', 3.8455296950041493), ('hotel_593', 3.841875696646769), ('hotel_629', 3.830441757916404)]), ('user_24128', [('hotel_536', 3.63928726640591), ('hotel_539', 3.3539302227826817), ('hotel_545', 3.3539302227826817), ('hotel_547', 3.3539302227826817), ('hotel_628', 3.209888665414903), ('hotel_524', 3.152940984927734), ('hotel_526', 3.135897435897436), ('hotel_

In [24]:
# Bounded preview: the first 10 (user, recommendations) entries.
take(10, top_n.items())

[('user_78131',
  [('hotel_536', 4.505715411903823),
   ('hotel_517', 4.273445981397632),
   ('hotel_539', 4.2203583682805945),
   ('hotel_545', 4.2203583682805945),
   ('hotel_547', 4.2203583682805945),
   ('hotel_524', 4.019518006073826),
   ('hotel_526', 4.0023255813953496),
   ('hotel_630', 3.9830660765615544),
   ('hotel_529', 3.945507488709044),
   ('hotel_515', 3.8215312164626667)]),
 ('user_21002',
  [('hotel_536', 4.492278719397364),
   ('hotel_539', 4.206921675774135),
   ('hotel_545', 4.206921675774135),
   ('hotel_547', 4.206921675774135),
   ('hotel_524', 4.005363578287614),
   ('hotel_526', 3.988888888888889),
   ('hotel_529', 3.9298079760462112),
   ('hotel_600', 3.8455296950041493),
   ('hotel_593', 3.841875696646769),
   ('hotel_629', 3.830441757916404)]),
 ('user_24128',
  [('hotel_536', 3.63928726640591),
   ('hotel_539', 3.3539302227826817),
   ('hotel_545', 3.3539302227826817),
   ('hotel_547', 3.3539302227826817),
   ('hotel_628', 3.209888665414903),
   ('hotel_52

## Top Predictions Matrix

In [25]:
# Printing top predictions (hotel ids only) for the first 10 users.
# The loop variable is deliberately not named `user_ratings`: at cell level that
# would overwrite the ratings DataFrame of the same name, so re-running the
# data-preparation cell afterwards would fail.
for uid, recommendations in take(10, top_n.items()):
    print(uid, [iid for (iid, _) in recommendations])

user_78131 ['hotel_536', 'hotel_517', 'hotel_539', 'hotel_545', 'hotel_547', 'hotel_524', 'hotel_526', 'hotel_630', 'hotel_529', 'hotel_515']
user_21002 ['hotel_536', 'hotel_539', 'hotel_545', 'hotel_547', 'hotel_524', 'hotel_526', 'hotel_529', 'hotel_600', 'hotel_593', 'hotel_629']
user_24128 ['hotel_536', 'hotel_539', 'hotel_545', 'hotel_547', 'hotel_628', 'hotel_524', 'hotel_526', 'hotel_630', 'hotel_529', 'hotel_600']
user_27174 ['hotel_536', 'hotel_517', 'hotel_539', 'hotel_545', 'hotel_547', 'hotel_628', 'hotel_524', 'hotel_526', 'hotel_529', 'hotel_605']
user_36448 ['hotel_536', 'hotel_539', 'hotel_545', 'hotel_547', 'hotel_524', 'hotel_526', 'hotel_529', 'hotel_600', 'hotel_593', 'hotel_605']
user_44791 ['hotel_536', 'hotel_539', 'hotel_545', 'hotel_547', 'hotel_524', 'hotel_526', 'hotel_630', 'hotel_529', 'hotel_593', 'hotel_629']
user_53199 ['hotel_593', 'hotel_605', 'hotel_600', 'hotel_628', 'hotel_566', 'hotel_560', 'hotel_599', 'hotel_506', 'hotel_517', 'hotel_519']
user_5