In [90]:
import pandas as pd
import numpy as np
from surprise.model_selection import KFold
import json

## Obtain Data

In [22]:
reviews_df = pd.read_csv('shared_final_prediction_file.csv')

In [78]:
reviews_df.describe()

Unnamed: 0,rowid,listing_id,reviewer_id,polarity
count,1053172.0,1053172.0,1053172.0,1053172.0
mean,526585.5,11312220.0,66766500.0,0.8133919
std,304024.7,8636989.0,62495760.0,0.272226
min,0.0,2539.0,1.0,-1.0
25%,263292.8,3394964.0,15035780.0,0.7906
50%,526585.5,10052590.0,45005490.0,0.9215
75%,789878.2,18399860.0,106947900.0,0.9666667
max,1053171.0,32771250.0,247006600.0,1.0


In [24]:
reviews_df.shape

(1053172, 4)

In [25]:
reviews_df.polarity.describe()

count    1.053172e+06
mean     8.133919e-01
std      2.722260e-01
min     -1.000000e+00
25%      7.906000e-01
50%      9.215000e-01
75%      9.666667e-01
max      1.000000e+00
Name: polarity, dtype: float64

In [26]:
from surprise import SVD, KNNBasic, Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split

In [27]:
reader = Reader(rating_scale=(-1,1))

In [28]:
data = Dataset.load_from_df(reviews_df[['listing_id', 'reviewer_id', 'polarity']], reader)

In [36]:
trainset, testset = train_test_split(data, test_size=.3)

In [37]:
algo = SVD(n_factors=10, n_epochs=10)

In [38]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10d056b70>

In [39]:
predictions = algo.test(testset)

In [40]:
accuracy.rmse(predictions, verbose=True)

RMSE: 0.2659


0.26592393666752945

In [43]:
# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.2661


0.2660984221479662

In [44]:
p = predictions[0]

In [45]:
p

Prediction(uid=21065293, iid=20810098, r_ui=0.7424, est=0.7428243756684929, details={'was_impossible': False})

In [48]:
output_file = open("/Users/vishu/SJSU/cmpe256/AirbnbProject/cmpe256-airbnbproject/svd_predictions.csv", "w")
output_file.write("userid,itemid,rating\n".format(user,item,rating ))

for i in range(0, len(predictions)):
    pred_tup = predictions[i]
    user = pred_tup[0]
    item = pred_tup[1]
    rating = pred_tup[3]
    output_file.write("{},{},{}\n".format(user,item,rating ))

output_file.close()

### Using SVDpp

In [13]:
from surprise import SVDpp

In [15]:
#SVD Matrix Factorization
algo = SVDpp(n_factors=10,n_epochs=10)
algo.fit(trainset)
predictions_svd = algo.test(testset)

In [16]:
accuracy.rmse(predictions_svd)

RMSE: 0.2669


0.26688308324524357

### Knn

In [12]:
#lets configure some parameters for Collaborative Filtering Algo
sim_options = {
    'name': 'pearson', #similarity measure
    'user_based': False #item based
}
#For user based ->True
#For name ->pearson,cosine,msd,pearson_baseline


In [None]:
#this is baseline configuration for optimizing the error
bsl_options = {'method': 'als',#another option is sgd
               'n_epochs': 5, # number of iterations
               'reg_u': 12, # user-regularisation parameter
               'reg_i': 5   # item-regularisation parameter
               }

We will try KNN with maximum neighbour as 3 and minimum as 1.We train our model on trainingset and generate prediction on test set.

In [13]:
#KNN
knn = KNNBasic(sim_options=sim_options,k=3,min_k=1) #neighbours=3


In [None]:
knn.fit(trainset)


Computing the pearson similarity matrix...


In [None]:
predictions_knn = knn.test(testset)

## Predicting top 10 recomendations

In [79]:
from collections import defaultdict
 
def get_top10_recommendations(predictions, topN = 10):
     
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
     
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:topN]
     
    return top_recs

In [80]:
top10_recommendations = get_top10_recommendations(predictions)

In [92]:
# For each user we print top 10 recommendations
final_reco = dict()

for uid, user_ratings in top10_recommendations.items():
    final_reco[uid] = [iid for (iid, _) in user_ratings]
    
with open('svd_user_recommendations.json', 'w') as fp:
    json.dump(final_reco, fp)

fp.close()