In [1]:
import pandas as pd
import numpy as np
import surprise
import os

In [2]:
ratings=pd.read_csv("ratings.txt",sep=' ',names=['uid','iid','rating'])
ratings.head()

Unnamed: 0,uid,iid,rating
0,1,1,2.0
1,1,2,4.0
2,1,3,3.5
3,1,4,3.0
4,1,5,4.0


In [3]:
lowest_rating=ratings['rating'].min()
highest_rating=ratings['rating'].max()
print("Ratings range between {0} and {1}".format(lowest_rating,highest_rating))

Ratings range between 0.5 and 4.0


In [4]:
reader=surprise.Reader(rating_scale=(lowest_rating,highest_rating))
data=surprise.Dataset.load_from_df(ratings,reader)
type(data)


surprise.dataset.DatasetAutoFolds

In [5]:
similarity_options={'name':'cosine','user_based':True}
#Default k=40
algo=surprise.KNNBasic(sim_options=similarity_options)
output=algo.fit(data.build_full_trainset())

Computing the cosine similarity matrix...
Done computing similarity matrix.


 #### Expected rating for user 50 for item 217

In [6]:
pred=algo.predict(uid='50',iid='217')
print(pred.est)

3.0028030537791928


### Total Items

In [7]:
iids = ratings['iid'].unique()
iids

array([   1,    2,    3, ..., 2069, 2070, 2071])

#### the list of the item rated by user 50

In [8]:
u_iid = ratings[ratings['uid']==50]['iid'].unique()
print(u_iid)

[  8 211   3   2 219 234  12 254 250 207  11 253 236  84  10   7 233  13
   1   5   6 252 241 216 257 206   4 217   9 215 213  17 255 220 121 245
 239 251 235]


#### the list of item not rated by user 50

In [9]:
iids_to_predict = np.setdiff1d(iids,u_iid)
print(iids_to_predict)

[  14   15   16 ... 2069 2070 2071]


In [10]:
len(iids_to_predict)

2032

#### Extracting thr estimated rating from iids_to_predict

In [11]:
testset=[[50,iid,0] for iid in iids_to_predict]
testset

[[50, 14, 0],
 [50, 15, 0],
 [50, 16, 0],
 [50, 18, 0],
 [50, 19, 0],
 [50, 20, 0],
 [50, 21, 0],
 [50, 22, 0],
 [50, 23, 0],
 [50, 24, 0],
 [50, 25, 0],
 [50, 26, 0],
 [50, 27, 0],
 [50, 28, 0],
 [50, 29, 0],
 [50, 30, 0],
 [50, 31, 0],
 [50, 32, 0],
 [50, 33, 0],
 [50, 34, 0],
 [50, 35, 0],
 [50, 36, 0],
 [50, 37, 0],
 [50, 38, 0],
 [50, 39, 0],
 [50, 40, 0],
 [50, 41, 0],
 [50, 42, 0],
 [50, 43, 0],
 [50, 44, 0],
 [50, 45, 0],
 [50, 46, 0],
 [50, 47, 0],
 [50, 48, 0],
 [50, 49, 0],
 [50, 50, 0],
 [50, 51, 0],
 [50, 52, 0],
 [50, 53, 0],
 [50, 54, 0],
 [50, 55, 0],
 [50, 56, 0],
 [50, 57, 0],
 [50, 58, 0],
 [50, 59, 0],
 [50, 60, 0],
 [50, 61, 0],
 [50, 62, 0],
 [50, 63, 0],
 [50, 64, 0],
 [50, 65, 0],
 [50, 66, 0],
 [50, 67, 0],
 [50, 68, 0],
 [50, 69, 0],
 [50, 70, 0],
 [50, 71, 0],
 [50, 72, 0],
 [50, 73, 0],
 [50, 74, 0],
 [50, 75, 0],
 [50, 76, 0],
 [50, 77, 0],
 [50, 78, 0],
 [50, 79, 0],
 [50, 80, 0],
 [50, 81, 0],
 [50, 82, 0],
 [50, 83, 0],
 [50, 85, 0],
 [50, 86, 0],
 [50, 

In [12]:
testset=[[50,iid,0] for iid in iids_to_predict]
predictions=algo.test(testset)

In [13]:
type(predictions)

list

In [14]:
predictions[:5]

[Prediction(uid=50, iid=14, r_ui=0, est=1.0249112823172175, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=50, iid=15, r_ui=0, est=2.3010819030660024, details={'actual_k': 5, 'was_impossible': False}),
 Prediction(uid=50, iid=16, r_ui=0, est=3.365656247496976, details={'actual_k': 4, 'was_impossible': False}),
 Prediction(uid=50, iid=18, r_ui=0, est=3.4750887176827825, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=50, iid=19, r_ui=0, est=2.9501774353655654, details={'actual_k': 2, 'was_impossible': False})]

In [15]:
pred_ratings= [(predictions[i].iid,predictions[i].est) for i in range(0,len(predictions))]
predicted_ratings=pd.DataFrame(pred_ratings,columns=['iid','est_rating'])
#predicted_ratings=pd.DataFrame(pred_ratings,columns=['iid','est_rating'])
predicted_ratings.sort_values(by='est_rating',ascending=False)

Unnamed: 0,iid,est_rating
562,602,4.0
1750,1790,4.0
563,603,4.0
561,601,4.0
1753,1793,4.0
...,...,...
1480,1520,0.5
1487,1527,0.5
895,935,0.5
1514,1554,0.5


## USING KFOLD 

In [16]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection import KFold

In [18]:
param_grid = {'k':np.arange(30,70,10)}
param_grid

{'k': array([30, 40, 50, 60])}

In [19]:
kfold = KFold(n_splits=5,random_state=23,shuffle=True)
gs = GridSearchCV(surprise.KNNBasic,param_grid,measures=['rmse','mae'],cv = kfold)

In [20]:
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

#### Best Score

In [21]:
print(gs.best_score['rmse'])

0.8641633357915124


#### Best Parameter

In [22]:
print(gs.best_params['mae'])

{'k': 40}


### Item Based Filtering

In [None]:
param_grid = {'k':np.arange(30,70,10),'user_based':[False]}
param_grid

In [24]:
kfold = KFold(n_splits=5,random_state=23,shuffle=True)
gs = GridSearchCV(surprise.KNNBasic,param_grid,measures=['rmse','mae'],cv = kfold)
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [25]:
print("RMSE",gs.best_score['rmse'])
print("MAE",gs.best_params['mae'])

RMSE 0.8641633357915124
MAE {'k': 40}
