In [1]:
import pandas as pd
import numpy as np

## Obtain Data

In [2]:
reviews_df = pd.read_csv('../data/interim/final_ratings.csv')

In [3]:
reviews_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,lang,polarity
0,2539,55688172,2015-12-04,25160947,Peter,Great host,ENGLISH,0.6249
1,2539,97474898,2016-08-27,91513326,Liz,Nice room for the price. Great neighborhood. J...,ENGLISH,0.9228
2,2539,105340344,2016-10-01,90022459,Евгений,Very nice apt. New remodeled.,ENGLISH,0.4754
3,2539,133131670,2017-02-20,116165195,George,Great place to stay for a while. John is a gre...,ENGLISH,0.9231
4,2539,157777930,2017-06-04,1806142,Isaac,I really enjoyed my time here in deep south Br...,ENGLISH,0.9881


In [4]:
reviews_df.shape

(1053172, 8)

In [5]:
reviews_df.polarity.describe()

count    1.053172e+06
mean     8.133919e-01
std      2.722260e-01
min     -1.000000e+00
25%      7.906000e-01
50%      9.215000e-01
75%      9.666667e-01
max      1.000000e+00
Name: polarity, dtype: float64

In [6]:
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split

In [7]:
reader = Reader(rating_scale=(-1,1))

In [8]:
data = Dataset.load_from_df(reviews_df[['listing_id', 'reviewer_id', 'polarity']], reader)

In [9]:
data.read_ratings

<bound method Dataset.read_ratings of <surprise.dataset.DatasetAutoFolds object at 0x10f616be0>>

In [10]:
trainset, testset = train_test_split(data, test_size=.25)

In [11]:
algo = SVD(n_factors=10, n_epochs=10)

In [12]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

In [43]:
# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.2661


0.2660984221479662

### Using SVDpp

In [13]:
from surprise import SVDpp

In [15]:
#SVD Matrix Factorization
algo = SVDpp(n_factors=10,n_epochs=10)
algo.fit(trainset)
predictions_svd = algo.test(testset)

In [16]:
accuracy.rmse(predictions_svd)

RMSE: 0.2669


0.26688308324524357

### Knn

In [None]:
#lets configure some parameters for Collaborative Filtering Algo
sim_options = {
    'name': 'pearson', #similarity measure
    'user_based': False #item based
}
#For user based ->True
#For name ->pearson,cosine,msd,pearson_baseline


In [None]:
#this is baseline configuration for optimizing the error
bsl_options = {'method': 'als',#another option is sgd
               'n_epochs': 5, # number of iterations
               'reg_u': 12, # user-regularisation parameter
               'reg_i': 5   # item-regularisation parameter
               }

We will try KNN with maximum neighbour as 3 and minimum as 1.We train our model on trainingset and generate prediction on test set.

In [9]:
#KNN
knn = KNNBasic(sim_options=sim_options,k=3,min_k=1) #neighbours=3
knn.fit(trainset)
predictions_knn = knn.test(testset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


## Predicting top 3 reccomendations

In [17]:
from collections import defaultdict
 
def get_top3_recommendations(predictions, topN = 3):
     
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
     
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:topN]
     
    return top_recs

In [18]:
top3_recommendations = get_top3_recommendations(predictions)

In [22]:
# For each user we print 3 recommendation
i=0;
for uid, user_ratings in top3_recommendations.items():
    print(uid, [iid for (iid, _) in user_ratings])
    i=i+1;
    if(i==5):
        break;

6800277 [117180600, 223960747, 33160735]
18402330 [66761764, 46562245, 184214926]
10404638 [10234262, 1222411, 138698041]
14805984 [98518310, 98410232, 122178507]
16157731 [37187868, 89539814, 35971773]
