In [1]:
import pandas as pd
import numpy as np
import math
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

In [5]:
file_path = 'ratings_small.csv'

ratings = pd.read_csv(file_path)

# The Reader is only used for its rating_scale here, because the Dataset is
# later built from this DataFrame rather than parsed from a text file (so no
# line_format / sep is needed).
# The scale is taken from the data itself instead of being hard-coded to (1, 5):
# the file contains half-star ratings (e.g. 2.5), and a lower bound of 1 would
# clip every prediction to >= 1 even if lower ratings are present.
reader = Reader(rating_scale=(float(ratings['rating'].min()),
                              float(ratings['rating'].max())))

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [6]:
# Surprise is given the (user, item, rating) columns in exactly this order.
rating_columns = ['userId', 'movieId', 'rating']
data = Dataset.load_from_df(ratings[rating_columns], reader)

# Train on every available rating; no hold-out split is made here.
trainset = data.build_full_trainset()

In [7]:
# Report how many distinct users and items ended up in the trainset.
for label, count in (('Number of users: ', trainset.n_users),
                     ('Number of items: ', trainset.n_items)):
    print(label, count, '\n')

Number of users:  671 

Number of items:  9066 



In [10]:
#define a svd method with bias term
#random_state pins the model's random initialisation so that re-running the
#notebook reproduces the same fitted factors (and the same saved table).
#note: this does not seed the fold shuffling done by cross_validate.
svd = SVD(biased=True, random_state=42)

In [9]:
# 5-fold cross validation of the SVD model; RMSE and MAE are printed per fold
# and the raw per-fold results are shown as the cell output.
cv_results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
cv_results

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8982  0.8962  0.8982  0.8970  0.8979  0.8975  0.0008  
MAE (testset)     0.6926  0.6927  0.6919  0.6890  0.6926  0.6918  0.0014  
Fit time          4.48    4.49    4.43    4.53    4.46    4.48    0.03    
Test time         0.15    0.25    0.15    0.25    0.15    0.19    0.05    


{'test_rmse': array([0.89820983, 0.89618547, 0.89820521, 0.89703203, 0.8979276 ]),
 'test_mae': array([0.69262555, 0.69270717, 0.69189721, 0.68895604, 0.69258143]),
 'fit_time': (4.482998371124268,
  4.494997262954712,
  4.433995485305786,
  4.531998872756958,
  4.462998151779175),
 'test_time': (0.15401053428649902,
  0.2539997100830078,
  0.15000033378601074,
  0.2479875087738037,
  0.14599990844726562)}

In [11]:
#fit the model on the full trainset, i.e. on every loaded rating with nothing held out
#(held-out accuracy is estimated separately by the cross_validate cell)
#alert (author's estimate, not re-measured): with the full dataset (700mb) this
#could consume around 15gb memory and might take 5 hours to run

svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x147c1972880>

In [12]:
#sanity check: predict a single rating for raw user id 1 and raw item id 302
#(r_ui is the reference rating that is echoed back in the Prediction)
sample_uid, sample_iid = 1, 302
svd.predict(sample_uid, sample_iid, r_ui=3)

Prediction(uid=1, iid=302, r_ui=3, est=2.6738710750190657, details={'was_impossible': False})

In [8]:
#collect the raw (dataset) ids of every item and user, ordered by the
#trainset's own iteration order over its inner ids
def iid_converter(x):
    """Translate an inner item id of the trainset into its raw item id."""
    return trainset.to_raw_iid(x)


def uid_converter(x):
    """Translate an inner user id of the trainset into its raw user id."""
    return trainset.to_raw_uid(x)


trainset_iids = list(trainset.all_items())
trainset_raw_iids = np.array([iid_converter(inner_iid) for inner_iid in trainset_iids])

trainset_uids = list(trainset.all_users())
trainset_raw_uids = np.array([uid_converter(inner_uid) for inner_uid in trainset_uids])

In [22]:
#preview the first ten raw item ids
print(trainset_raw_iids[:10])

#build the transformation from raw iid to index
def rawiid2index(rawiid):
    """Return the position of raw item id ``rawiid`` in ``trainset_raw_iids``.

    Args:
        rawiid: raw item id, compared with ``==`` against the id array.

    Returns:
        int: the index at which ``rawiid`` is stored.

    Raises:
        ValueError: if ``rawiid`` is not present exactly once.
    """
    # flatnonzero gives a 1-D array of matching positions; indexing element 0
    # avoids converting a 2-D array to a scalar (deprecated in recent NumPy).
    positions = np.flatnonzero(trainset_raw_iids == rawiid)
    if positions.size != 1:
        raise ValueError(
            "raw item id %r matches %d entries, expected exactly 1"
            % (rawiid, positions.size)
        )
    return int(positions[0])

def index2rawiid(index):
    """Look up the raw item id stored at position ``index`` of ``trainset_raw_iids``."""
    raw_iid = trainset_raw_iids[index]
    return raw_iid

[  31 1029 1061 1129 1172 1263 1287 1293 1339 1343]


In [23]:
#preview the first ten raw user ids
print(trainset_raw_uids[:10])


#build the transformation from raw uid to index
def rawuid2index(rawuid):
    """Return the position of raw user id ``rawuid`` in ``trainset_raw_uids``.

    Args:
        rawuid: raw user id, compared with ``==`` against the id array.

    Returns:
        int: the index at which ``rawuid`` is stored.

    Raises:
        ValueError: if ``rawuid`` is not present exactly once.
    """
    # flatnonzero gives a 1-D array of matching positions; indexing element 0
    # avoids converting a 2-D array to a scalar (deprecated in recent NumPy).
    positions = np.flatnonzero(trainset_raw_uids == rawuid)
    if positions.size != 1:
        raise ValueError(
            "raw user id %r matches %d entries, expected exactly 1"
            % (rawuid, positions.size)
        )
    return int(positions[0])

def index2rawuid(index):
    """Look up the raw user id stored at position ``index`` of ``trainset_raw_uids``."""
    raw_uid = trainset_raw_uids[index]
    return raw_uid

[ 1  2  3  4  5  6  7  8  9 10]


In [26]:
#build the initial table of user/item (0 means "no rating recorded")
init_useritem_table = np.zeros((trainset.n_users, trainset.n_items), dtype='float16')

# If a (user, movie) pair appears more than once, keep only its last row so the
# result matches a row-by-row fill in which later rows overwrite earlier ones.
rated = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='last')

# Translate all raw ids to row/column positions in one pass instead of scanning
# the id arrays once per rating; get_indexer marks unknown ids with -1.
row_pos = pd.Index(trainset_raw_uids).get_indexer(rated['userId'])
col_pos = pd.Index(trainset_raw_iids).get_indexer(rated['movieId'])
if (row_pos < 0).any() or (col_pos < 0).any():
    raise ValueError("ratings contains a userId or movieId that is not in the trainset")

# Single vectorised write of every known rating into its (user, item) cell.
init_useritem_table[row_pos, col_pos] = rated['rating'].to_numpy()
    

In [28]:
# Persist the matrix of observed ratings to disk in NumPy's .npy format.
np.save(file="init_useritem_table.npy", arr=init_useritem_table)
print("save init_useritem_table.npy done")

save init_useritem_table.npy done


In [29]:
#build the filled table of user/item after using SVD to predict the empty
filled_useritem_table = np.copy(init_useritem_table)

# Computing one user's estimates for all items at once replaces millions of
# individual svd.predict calls. The formula is the one SVD uses for a known
# user/item pair: global mean + user bias + item bias + dot(item factors,
# user factors) (bias terms only when the model is biased), clipped to the
# trainset's rating scale exactly as svd.predict does by default.
model_trainset = svd.trainset
lower_bound, upper_bound = model_trainset.rating_scale

# Map table columns (raw item ids) to the model's inner item ids once.
item_inner = np.array(
    [model_trainset.to_inner_iid(raw_iid) for raw_iid in trainset_raw_iids],
    dtype=int,
)
item_factors = np.asarray(svd.qi)[item_inner]
item_bias = np.asarray(svd.bi)[item_inner] if svd.biased else 0.0
baseline = model_trainset.global_mean if svd.biased else 0.0

for row, raw_uid in enumerate(trainset_raw_uids):
    # Only cells still holding the 0 sentinel are overwritten.
    missing = init_useritem_table[row] == 0
    if not missing.any():
        continue
    inner_uid = model_trainset.to_inner_uid(raw_uid)
    user_bias = svd.bu[inner_uid] if svd.biased else 0.0
    estimates = baseline + user_bias + item_bias + item_factors @ np.asarray(svd.pu)[inner_uid]
    filled_useritem_table[row, missing] = np.clip(estimates, lower_bound, upper_bound)[missing]

In [30]:
# Show the first user's row of the filled table (kept 2-D by slicing).
filled_useritem_table[:1]

array([[2.5  , 3.   , 3.   , ..., 2.736, 2.465, 2.469]], dtype=float16)

In [31]:
# Persist the SVD-completed rating matrix to disk in NumPy's .npy format.
np.save(file="filled_useritem_table_svd.npy", arr=filled_useritem_table)
print("save filled_useritem_table_svd.npy done")

save filled_useritem_table_svd.npy done
