In [1]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import KNNWithMeans
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate
import os
from surprise.model_selection import train_test_split
import pandas as pd

### Read and explore the given dataset.

In [2]:
df = pd.read_csv('./ratings_Electronics.csv', names = ['user', 'item', 'rating', 'timestamp'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 4 columns):
user         object
item         object
rating       float64
timestamp    int64
dtypes: float64(1), int64(1), object(2)
memory usage: 238.8+ MB


### Take a subset of the dataset to make it less sparse/ denser.	

In [3]:
names = df.user.unique()

In [4]:
df_dense = df[df['user'].isin(names[0:1000])]

In [5]:
reader = Reader(line_format='user item rating')

data = Dataset.load_from_df(df_dense[['user', 'item', 'rating']], reader=reader)

### Split the data randomly into train and test dataset.	

In [6]:
trainset, testset = train_test_split(data, test_size=.15)

### Build Popularity Recommender model	

In [7]:
algo = BaselineOnly()
algo.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7fb600d2eb90>

In [8]:
algo.predict("A2PSBXHPJIXJ6J", "1400532655", verbose = True)

user: A2PSBXHPJIXJ6J item: 1400532655 r_ui = None   est = 4.36   {'was_impossible': False}


Prediction(uid='A2PSBXHPJIXJ6J', iid='1400532655', r_ui=None, est=4.359293950137876, details={'was_impossible': False})

In [9]:
test_pred = algo.test(testset)

In [10]:
accuracy.rmse(test_pred)

RMSE: 1.1414


1.1414296845542886

### Build Collaborative Filtering model	

In [11]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fb600d70650>

In [12]:
algo.predict("A2PSBXHPJIXJ6J", "1400532655", verbose = True)

user: A2PSBXHPJIXJ6J item: 1400532655 r_ui = None   est = 4.30   {'was_impossible': True, 'reason': 'User and/or item is unkown.'}


Prediction(uid='A2PSBXHPJIXJ6J', iid='1400532655', r_ui=None, est=4.302163833075734, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'})

In [13]:
test_pred = algo.test(testset)

In [14]:
accuracy.rmse(test_pred)

RMSE: 1.2378


1.2378298253989417

### Evaluate both the models	

In [None]:
The RMSE of collaborative model is 1.2378 while that of popularity based model is 1.1414.

### Get top - K ( K = 5) recommendations

In [29]:
def getTop5(uid):
    items = df_dense.item.unique()
    predictions = [algo.predict(uid, item) for item in items]
    top = pd.DataFrame(predictions).sort_values(by = ["est"], ascending = False)
    return top[0:5]

In [30]:
getTop5('A2BLGQ2SCSKCCY')

Unnamed: 0,uid,iid,r_ui,est,details
0,A2BLGQ2SCSKCCY,0132793040,,5.0,"{'actual_k': 0, 'was_impossible': False}"
1988,A2BLGQ2SCSKCCY,B005HSG3L0,,5.0,"{'actual_k': 0, 'was_impossible': False}"
1965,A2BLGQ2SCSKCCY,B005G92RCE,,5.0,"{'actual_k': 0, 'was_impossible': False}"
1966,A2BLGQ2SCSKCCY,B005G9RB3Y,,5.0,"{'actual_k': 0, 'was_impossible': False}"
1968,A2BLGQ2SCSKCCY,B005GCSZD6,,5.0,"{'actual_k': 0, 'was_impossible': False}"


### Description of criterion	

The top recommendations are based on the estimated rating that the user would have given to the product based on the
user user similarity.