In [14]:
import pandas as pd
import numpy as np
import math
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import KNNWithMeans

In [2]:
file_path = 'ratings_small.csv'

# The ratings use half-star steps (see the 2.5 in the preview), so the scale is
# declared as 0.5-5. With the previous (1, 5) scale Surprise would clip every
# estimate to >= 1 even when the true rating is lower.
# `line_format` and `sep` are omitted: they only matter when Surprise parses a
# file itself, whereas this notebook hands it a DataFrame.
reader = Reader(rating_scale=(0.5, 5))

ratings = pd.read_csv(file_path)

# Fail early if the file does not match the declared rating scale.
assert ratings['rating'].between(0.5, 5).all(), "rating outside the declared 0.5-5 scale"

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
# Hand Surprise the (user, item, rating) columns, in that order.
rating_triples = ratings.loc[:, ['userId', 'movieId', 'rating']]
data = Dataset.load_from_df(rating_triples, reader)

In [6]:
# Use every available rating for training (no hold-out split here).
trainset = data.build_full_trainset()

# Report the size of the user and item populations.
for label, count in (
    ('Number of users: ', trainset.n_users),
    ('Number of items: ', trainset.n_items),
):
    print(label, count, '\n')

Number of users:  671 

Number of items:  9066 



In [7]:
# Collect the raw (file-level) ids of all items and users, ordered by the
# inner ids that Surprise assigned in `trainset`.
def iid_converter(x):
    """Return the raw item id for the inner item id `x`."""
    return trainset.to_raw_iid(x)


def uid_converter(x):
    """Return the raw user id for the inner user id `x`."""
    return trainset.to_raw_uid(x)


trainset_iids = list(trainset.all_items())
trainset_raw_iids = np.array([iid_converter(inner_iid) for inner_iid in trainset_iids])

trainset_uids = list(trainset.all_users())
trainset_raw_uids = np.array([uid_converter(inner_uid) for inner_uid in trainset_uids])

In [8]:
# Preview the first ten raw item ids.
trainset_raw_iids[:10]

array([  31, 1029, 1061, 1129, 1172, 1263, 1287, 1293, 1339, 1343])

In [9]:
# Preview the first ten raw user ids.
trainset_raw_uids[:10]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [10]:
#Memory-based model using KNN
#We use the smaller dataset (2 MB) for this method,
#since running KNN on the whole ratings dataset (692 MB)
#would consume about 547 GiB of memory during training,
#which is too much for almost all the machines at DKU, I believe

In [22]:
# Similarity settings for the KNN model: cosine similarity computed between
# users (user_based=True), i.e. a user-based neighbourhood.
my_sim_option = dict(name='cosine', user_based=True)

In [23]:
#define the knn with means
# The neighbourhood sizes are set here, before the constructor reads them, so
# this cell runs on a fresh kernel (previously they were first assigned in a
# later cell, giving a NameError under Restart & Run All).
my_k = 20      # passed as `k`: neighbours taken into account per prediction
my_min_k = 5   # passed as `min_k`: minimum neighbours required
knnmean = KNNWithMeans(k = my_k, min_k = my_min_k, sim_options = my_sim_option, verbose = True)

In [24]:
# Neighbourhood sizes. They only influence `knnmean` if they are set before
# that object is constructed.
my_k, my_min_k = 20, 5

# 5-fold cross-validation of the KNN model, reporting RMSE on both the train
# and the test part of every fold.
results = cross_validate(
    algo=knnmean,
    data=data,
    measures=['RMSE'],
    cv=5,
    return_train_measures=True,
    verbose=True,
)
mean_test_rmse = results['test_rmse'].mean()
print(mean_test_rmse)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9221  0.9173  0.9165  0.9118  0.9251  0.9186  0.0046  
RMSE (trainset)   0.8033  0.8047  0.8029  0.8037  0.8023  0.8034  0.0008  
Fit time          1.25    1.05    1.01    0.84    0.91    1.01    0.14    
Test time         2.04    2.10    1.77    1.70    1.92    1.91    0.15    
0.9185681735289097


In [25]:
# Refit on the full trainset built earlier. The previous name `trainsetfull`
# is not defined anywhere in this notebook, so the cell failed on a fresh kernel.
knnmean.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x18cf8e96ca0>

In [27]:
# Estimated rating of raw user 1 for raw item 302.
sample_prediction = knnmean.predict(uid=1, iid=302)
sample_prediction.est

2.559303304627955

In [29]:
# Full prediction object for the same (user, item) pair.
knnmean.predict(1, 302)

In [None]:
# Build the initial dense user x item rating table.
# Rows follow `trainset_raw_uids`, columns follow `trainset_raw_iids`;
# cells that stay 0 have no rating in `ratings`.
init_useritem_table = np.zeros((trainset.n_users, trainset.n_items), dtype='float16')

# Look up the row/column position of every rating in one vectorised pass,
# instead of scanning both id arrays with np.argwhere for each rating row.
# get_indexer returns -1 for ids it cannot find, which would silently write
# into the last row/column, so that case is rejected explicitly.
row_idx = pd.Index(trainset_raw_uids).get_indexer(ratings['userId'])
col_idx = pd.Index(trainset_raw_iids).get_indexer(ratings['movieId'])
assert (row_idx >= 0).all() and (col_idx >= 0).all(), "rating refers to an id missing from trainset"

init_useritem_table[row_idx, col_idx] = ratings['rating'].to_numpy()

In [31]:
# Persist the sparse (unfilled) table so it can be reloaded without rebuilding.
init_table_path = "init_useritem_table.npy"
np.save(init_table_path, init_useritem_table)
print("save init_useritem_table.npy done")

In [32]:
# Build the filled user x item table: every cell that is still 0 in the
# initial table receives the rating estimated by the fitted `knnmean` model.
filled_useritem_table = init_useritem_table.copy()

for row, col in np.ndindex(init_useritem_table.shape):
    # Cells that already hold a rating are kept as they are.
    if init_useritem_table[row, col] != 0:
        continue
    estimate = knnmean.predict(uid=trainset_raw_uids[row], iid=trainset_raw_iids[col]).est
    filled_useritem_table[row, col] = estimate

array([[2.5 , 3.  , 3.  , ..., 2.55, 2.55, 2.55]], dtype=float16)

In [33]:
# Preview the first row (first user) of the filled table.
filled_useritem_table[:1]

save filled_useritem_table_knnmean.npy done


In [None]:
# Persist the table completed with the KNN estimates.
filled_table_path = "filled_useritem_table_knnmean.npy"
np.save(filled_table_path, filled_useritem_table)
print("save filled_useritem_table_knnmean.npy done")