<a id="a7b1"></a>
### Import Packages and load Surprise Data from DataFrame

In [None]:
!pip install scikit-surprise



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_color_codes()
sns.set(style="whitegrid")
%matplotlib inline
from scipy.stats import zscore
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
# Mount Google Drive under /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')
# NOTE: at this point `products` is only the CSV *path* string; the next cell
# rebinds the same name to the DataFrame loaded from that path.
products = '/content/drive/My Drive/ratings_filtered.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# The file is read without a header row, so the four column names are supplied here.
# `products` (a path string until now) is rebound to the resulting DataFrame.
products = pd.read_csv(products, names=['userId', 'productId', 'rating','timestamp'], header=None)

# Quick sanity check of the first rows of the loaded ratings.
# NOTE(review): this comment used to mention 'ratings_Electronics.csv', but the path
# read above is 'ratings_filtered.csv' — confirm which file is intended.
print(products.head())

           userId   productId  rating   timestamp
0   AKM1MP6P0OYPR  0132793040     5.0  1365811200
1  A2CX7LUOHB2NDG  0321732944     5.0  1341100800
2  A2NWSAGRHCP8N5  0439886341     1.0  1367193600
3  A2WNBOD3WNDNKT  0439886341     3.0  1374451200
4  A1GI0U4ZRJA8WN  0439886341     1.0  1334707200


In [None]:
# Keep only the first 10,000 rows (presumably to keep the model fitting below fast;
# every result that follows is computed on this subset).
products = products.head(10000)

In [None]:
# Drop the timestamp column in place, leaving userId, productId and rating —
# the three columns handed to Dataset.load_from_df further down.
products.drop('timestamp', axis=1, inplace=True)

In [None]:

from surprise import accuracy
from surprise.model_selection.validation import cross_validate
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVD
from surprise import KNNBasic
from surprise import KNNWithMeans
# Reader is created with its default settings.
# NOTE(review): the default rating scale is assumed to fit this data (ratings such as
# 1.0–5.0 appear in the preview above) — confirm.
reader = Reader()
# Wrap the (userId, productId, rating) DataFrame as a Surprise dataset.
surprise_data = Dataset.load_from_df(products, reader)

In [None]:
# NOTE: this import rebinds `train_test_split`, shadowing the function of the same
# name imported from sklearn.model_selection near the top of the notebook.
from surprise.model_selection import train_test_split
# Hold out 30% of the ratings as the test set; random_state is fixed at 10.
trainset, testset = train_test_split(surprise_data, test_size=.3, random_state=10)

In [None]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return the ``n`` highest-estimated items for every user.

    Args:
        predictions: Iterable of 5-element records
            ``(uid, iid, true_r, est, details)``.
        n: Maximum number of ``(iid, est)`` pairs kept per user.

    Returns:
        A ``defaultdict(list)`` mapping each user id to its ``(iid, est)``
        pairs, ordered by descending estimate and truncated to ``n``
        entries. Pairs with equal estimates keep the order in which they
        appeared in ``predictions``.
    """
    grouped = defaultdict(list)

    # Bucket every (item, estimate) pair under the user it was predicted for.
    for user, item, _actual, estimate, _details in predictions:
        grouped[user].append((item, estimate))

    # Rank each user's bucket by estimate and keep the leading n pairs.
    for user in grouped:
        ranked = sorted(grouped[user], key=lambda pair: pair[1], reverse=True)
        grouped[user] = ranked[:n]

    return grouped

In [None]:
class collab_filtering_based_recommender_model():
    """Fit, evaluate and query a collaborative-filtering model.

    The wrapped ``model`` must provide ``fit(trainset)`` and ``test(testset)``
    (the interface used by the Surprise algorithms in this notebook).

    Recommendations are taken from the predictions made on ``testset`` only,
    so a user without any rating in ``testset`` gets an empty result from
    ``recommend``.
    """

    def __init__(self, model, trainset, testset, data):
        """Store the model and data; nothing is fitted here.

        Args:
            model: Algorithm instance exposing ``fit`` and ``test``.
            trainset: Training data passed to ``model.fit``.
            testset: Held-out data passed to ``model.test``.
            data: Full dataset used by ``cross_validate``.
        """
        self.model = model
        self.trainset = trainset
        self.testset = testset
        self.data = data
        # The attributes below are filled in by fit_and_predict().
        self.pred_test = None
        # Never assigned by this class; kept so existing attribute access works.
        self.recommendations = None
        self.top_n = None
        self.recommenddf = None

    @staticmethod
    def _build_recommendation_frame(top_n):
        """Flatten ``{user: [(item, estimate), ...]}`` into one DataFrame.

        The frame is built in a single constructor call instead of growing it
        with ``pd.concat`` once per user, which copied the accumulated frame
        on every iteration and concatenated onto an empty frame.

        Returns:
            DataFrame with columns ``userId``, ``productId``, ``Rating``. The
            index restarts at 0 for every user (the rank of the item within
            that user's list), matching the frame the per-user concatenation
            used to produce.
        """
        rows = []
        ranks = []
        for uid, user_ratings in top_n.items():
            for rank, (iid, est) in enumerate(user_ratings):
                rows.append((uid, iid, est))
                ranks.append(rank)
        return pd.DataFrame(rows, columns=['userId', 'productId', 'Rating'], index=ranks)

    def fit_and_predict(self):
        """Fit on the train set, predict the test set and build the recommendation table.

        Returns:
            The test RMSE rounded to 3 decimals.
        """
        print('**Fitting the train data...**')
        self.model.fit(self.trainset)

        print('**Predicting the test data...**')
        self.pred_test = self.model.test(self.testset)
        rmse = round(accuracy.rmse(self.pred_test), 3)
        print('**RMSE for the predicted result is ' + str(rmse) + '**')

        # Top items per user come from the test-set predictions only.
        self.top_n = get_top_n(self.pred_test)
        self.recommenddf = self._build_recommendation_frame(self.top_n)
        return rmse

    def cross_validate(self):
        """Cross-validate the model on the full dataset.

        Returns:
            The mean test RMSE over the folds, rounded to 3 decimals.
        """
        print('**Cross Validating the data...**')
        # The bare name below resolves to the module-level cross_validate
        # function, not to this method.
        cv_result = cross_validate(self.model, self.data, n_jobs=-1)
        cv_result = round(cv_result['test_rmse'].mean(), 3)
        print('**Mean CV RMSE is ' + str(cv_result) + '**')
        return cv_result

    def recommend(self, user_id, n=5):
        """Display and return the top ``n`` recommendations for ``user_id``.

        Args:
            user_id: Raw user id to look up in the recommendation table.
            n: Maximum number of rows returned.

        Returns:
            DataFrame with the matching rows; empty when the user has no
            prediction in the test set.

        Raises:
            RuntimeError: If ``fit_and_predict`` has not been called yet.
        """
        if self.recommenddf is None:
            raise RuntimeError('fit_and_predict() must be called before recommend()')

        # f-string so that non-string user ids do not break the message.
        print(f'**Recommending top {n} products for userid : {user_id} ...**')

        df = self.recommenddf[self.recommenddf['userId'] == user_id].head(n)

        # `display` is only injected as a global inside IPython/Jupyter;
        # import it explicitly and fall back to print elsewhere.
        try:
            from IPython.display import display
        except ImportError:
            display = print
        display(df)
        return df

In [None]:
from surprise.model_selection import RandomizedSearchCV

def find_best_model(model, parameters, data, *, n_iter=10, random_state=None):
    """Run a randomized hyper-parameter search and print the best RMSE result.

    Args:
        model: Algorithm class (not an instance) handed to RandomizedSearchCV.
        parameters: Mapping of parameter name to the candidate values to
            sample from.
        data: Dataset the search is fitted on.
        n_iter: Number of parameter settings sampled. The default of 10 is
            the value RandomizedSearchCV uses when the argument is omitted,
            so existing calls behave as before.
        random_state: Seed forwarded to the search. ``None`` (default) keeps
            the previous unseeded behaviour; pass an int to make the search
            reproducible between runs.

    Returns:
        The fitted RandomizedSearchCV object; its ``best_score``,
        ``best_params`` and ``best_estimator`` are dicts keyed by measure
        name (``'rmse'`` here).
    """
    clf = RandomizedSearchCV(
        model,
        parameters,
        n_iter=n_iter,
        measures=['rmse'],
        n_jobs=-1,
        random_state=random_state,
    )
    clf.fit(data)
    print(clf.best_score)
    print(clf.best_params)
    print(clf.best_estimator)
    return clf

**User-User Collaborative Filtering**

To find the rating R that a user U would give to an item I, the approach includes:

1. Finding users similar to U who have rated the item I<br>
2. Calculating the rating R based on the ratings of users found in the previous step

Here we are using KNN with means to remove the bias by taking into account the mean ratings of each user.

<a id="a7b3a"></a>
#### Find optimal model using RandomizedSearchCV

In [None]:
# Search space for the similarity configuration of the KNN model.
sim_options = {
    "name": ["msd", "cosine", "pearson", "pearson_baseline"],  # candidate similarity measures
    "min_support": [3, 4, 5],
    "user_based": [True],  # only user-user similarities are searched
}
# k is drawn from 30..49 (range excludes the upper bound of 50).
params = { 'k': range(30,50,1), 'sim_options': sim_options}
# The algorithm class itself (not an instance) is passed to the search helper.
clf = find_best_model(KNNWithMeans, params, surprise_data)

{'rmse': 1.4648949054737466}
{'rmse': {'k': 44, 'sim_options': {'name': 'pearson', 'min_support': 4, 'user_based': True}}}
{'rmse': <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x783396a6a770>}


In [None]:
# best_estimator is keyed by measure name; take the KNNWithMeans instance
# selected for RMSE (it is fitted on trainset later by fit_and_predict()).
knnwithmeans = clf.best_estimator['rmse']
col_fil_knnwithmeans = collab_filtering_based_recommender_model(knnwithmeans, trainset, testset, surprise_data)

In [None]:
# Fit on trainset, predict testset; the returned value is the rounded test RMSE.
knnwithmeans_rmse = col_fil_knnwithmeans.fit_and_predict()

**Fitting the train data...**
Computing the pearson similarity matrix...
Done computing similarity matrix.
**Predicting the test data...**
RMSE: 1.4749
**RMSE for the predicted result is 1.475**


In [None]:
# Mean cross-validated RMSE on the full surprise_data (rounded to 3 decimals).
knnwithmeans_cv_rmse = col_fil_knnwithmeans.cross_validate()

**Cross Validating the data...**
**Mean CV RMSE is 1.463**


In [None]:
# Recommendations are looked up in the table built from test-set predictions,
# so a user with no rating in testset yields an empty frame.
res_knn_user1 = col_fil_knnwithmeans.recommend(user_id='A3IQGFB959IR4P', n=5)
res_knn_user2 = col_fil_knnwithmeans.recommend(user_id='A1KKUYTDUZDZSA', n=5)
res_knn_user3 = col_fil_knnwithmeans.recommend(user_id='A1L7JCX9IKGKIQ', n=5)

**Recommending top 5 products for userid : A3IQGFB959IR4P ...**


Unnamed: 0,userId,productId,Rating


**Recommending top 5 products for userid : A1KKUYTDUZDZSA ...**


Unnamed: 0,userId,productId,Rating


**Recommending top 5 products for userid : A1L7JCX9IKGKIQ ...**


Unnamed: 0,userId,productId,Rating


<a id="a7b3f"></a>
#### Observations

**KNN (K-Nearest Neighbours) With Means** model has a test RMSE value of **1.475** and cross validation RMSE value of **1.463**.

As for the recommendations, **each user will have different products recommended to them** as they are inferred based on the ratings provided by similar users. To find the similar users, the KNN model uses **cosine similarity or Pearson's correlation** to find the neighbours.

<a id="a7b4"></a>
### SVD - Model Based Collaborative Filtering

The **Singular-Value Decomposition, or SVD** for short, is a matrix decomposition method for reducing a matrix to its constituent parts in order to make certain subsequent matrix calculations simpler. It provides another way to factorize a matrix, into singular vectors and singular values.

$A = U \Sigma V^T$

<a id="a7b4a"></a>
#### Find optimal model using RandomizedSearchCV

In [None]:
# Candidate values sampled by the randomized search for SVD.
params= {
    "n_epochs": [5, 10, 15, 20],  # number of training epochs
    "lr_all": [0.002, 0.005],     # learning rate applied to all parameters
    "reg_all": [0.4, 0.6]         # regularization term applied to all parameters
}
# NOTE: this rebinds `clf`, replacing the KNNWithMeans search result from above.
clf = find_best_model(SVD, params, surprise_data)

{'rmse': 1.4078162007562303}
{'rmse': {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.4}}
{'rmse': <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7833dc106920>}


<a id="a7b4b"></a>
#### Initialize

In [None]:
# Take the SVD instance selected for RMSE and wrap it with the same
# train/test split and dataset used for KNNWithMeans.
svd = clf.best_estimator['rmse']
col_fil_svd = collab_filtering_based_recommender_model(svd, trainset, testset, surprise_data)

In [None]:
# Fit on trainset, predict testset; the returned value is the rounded test RMSE.
svd_rmse = col_fil_svd.fit_and_predict()

**Fitting the train data...**
**Predicting the test data...**
RMSE: 1.4188
**RMSE for the predicted result is 1.419**


<a id="a7b4d"></a>
#### Cross Validate

In [None]:
# Mean cross-validated RMSE on the full surprise_data (rounded to 3 decimals).
svd_cv_rmse = col_fil_svd.cross_validate()

**Cross Validating the data...**
**Mean CV RMSE is 1.406**


<a id="a7b4e"></a>
#### Recommend


In [None]:
# Recommendations are looked up in the table built from test-set predictions,
# so a user with no rating in testset yields an empty frame.
res_svd_user1 = col_fil_svd.recommend(user_id='AKM1MP6P0OYPR', n=5)
res_svd_user2 = col_fil_svd.recommend(user_id='A1JLEDHBYBOLG2', n=5)
res_svd_user3 = col_fil_svd.recommend(user_id='A169NZ5I5UQLRY', n=5)

**Recommending top 5 products for userid : AKM1MP6P0OYPR ...**


Unnamed: 0,userId,productId,Rating


**Recommending top 5 products for userid : A1JLEDHBYBOLG2 ...**


Unnamed: 0,userId,productId,Rating
0,A1JLEDHBYBOLG2,972683275,4.313709


**Recommending top 5 products for userid : A169NZ5I5UQLRY ...**


Unnamed: 0,userId,productId,Rating


<a id="a7b4f"></a>
#### Observations

SVD (Singular Value Decomposition) model has a test RMSE value of **1.419** and cross validation RMSE value of **1.406**. Using this model we have a reduced RMSE value compared to KNNWithMeans.

As for the recommendations, **each user will have different products recommended to them** as they are inferred by filling out missing entries in the matrix during matrix factorization using SVD.

<a id="a7b5"></a>
### Other SURPRISE algorithms


In [None]:
from surprise import NMF
from surprise import KNNBaseline
from surprise import KNNBasic
from surprise import KNNWithZScore
from surprise import BaselineOnly
from surprise import CoClustering
# Cross-validate several Surprise algorithms with their default settings and
# rank them by mean test RMSE.
benchmark = []
# Iterate over all algorithms
for algorithm in [NMF(), KNNBaseline(), KNNBasic(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, surprise_data, measures=['RMSE'], cv=5, verbose=False)

    # Average every metric over the folds, then attach the algorithm name.
    # Series.append was deprecated and later removed from pandas; pd.concat
    # is its replacement. The class name is read directly from the instance
    # instead of being parsed out of its repr.
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([tmp, pd.Series([type(algorithm).__name__], index=['Algorithm'])])
    benchmark.append(tmp)

pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,1.405568,0.068546,0.016268
KNNBaseline,1.406959,3.104406,0.047084
KNNWithZScore,1.462662,2.207873,0.028308
CoClustering,1.46333,1.034513,0.009081
NMF,1.464006,0.82686,0.033816
KNNBasic,1.474867,1.994111,0.029922


In [None]:
#function to display dataframes side by side
from IPython.display import display_html
def build_side_by_side_html(frames):
    """Return the HTML of ``frames`` concatenated, with every table set inline.

    Only the opening ``<table`` tags receive the inline style. A plain
    replacement of the word ``table`` would also rewrite the closing
    ``</table>`` tags and any cell text containing "table" (e.g. "tablet").

    Args:
        frames: Iterable of objects exposing ``to_html()`` (DataFrames).

    Returns:
        A single HTML string; empty when ``frames`` is empty.
    """
    inline_table = '<table style="display:inline;margin-left:50px !important;margin-right: 40px !important"'
    combined = ''.join(frame.to_html() for frame in frames)
    return combined.replace('<table', inline_table)


def display_side_by_side(args):
    """Render the given DataFrames next to each other in the notebook output.

    Args:
        args: Iterable of DataFrames to show side by side.
    """
    display_html(build_side_by_side_html(args), raw=True)


In [None]:
# Show the three KNNWithMeans recommendation frames next to each other.
display_side_by_side([res_knn_user1, res_knn_user2, res_knn_user3])

Unnamed: 0,userId,productId,Rating

Unnamed: 0,userId,productId,Rating

Unnamed: 0,userId,productId,Rating


In [None]:
# Show the three SVD recommendation frames next to each other.
display_side_by_side([res_svd_user1, res_svd_user2, res_svd_user3])

Unnamed: 0,userId,productId,Rating

Unnamed: 0,userId,productId,Rating
0,A1JLEDHBYBOLG2,972683275,4.313709

Unnamed: 0,userId,productId,Rating


**KNN (K-Nearest Neighbours) With Means** model has a test RMSE value of **1.475** and cross validation RMSE value of **1.463**. **Each user will have different products recommended to them** as they are inferred based on the ratings provided by similar users.

SVD (Singular Value Decomposition) model has a test RMSE value of 1.419 and cross validation RMSE value of 1.406. Each user will have different products recommended to them.

<a id="a7b5a"></a>
#### Observations


**SVD** is a better model compared to KNN or Popularity, with a better RMSE value of **1.406**. We can also see that the products recommended by SVD are different from those of KNNWithMeans, as SVD uses matrix factorization. This is more useful when the data is sparse with many missing ratings.