## Library

In [53]:
import pandas as pd
import numpy as np
import pickle
import operator


from collections import Counter
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise import Reader, SVD, Dataset 
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

## Loading Data

In [2]:
smd = pd.read_csv('../the-movies-dataset/movies_metadata_equal_ratings.csv')
ratings = pd.read_csv('../the-movies-dataset/ratings_equal_movies_metadata.csv')
train = pd.read_csv('../the-movies-dataset/ratings_train.csv')
test = pd.read_csv('../the-movies-dataset/ratings_test.csv')

In [3]:
smd.shape

(9025, 26)

In [60]:
N = smd.shape[0]

In [4]:
ratings

Unnamed: 0,userId,id,rating,movieId,timestamp,movieIndex
0,0,152.0,2.5,1371,1260759135,1107
1,0,9909.0,2.5,31,1260759144,30
2,0,847.0,2.0,2193,1260759198,1739
3,0,9426.0,2.5,2455,1260759113,1958
4,0,6114.0,3.5,1339,1260759125,1079
...,...,...,...,...,...,...
99805,670,786.0,2.0,3897,1063503718,3108
99806,670,7443.0,4.0,3751,1065111939,2995
99807,670,1891.0,5.0,1196,1064890635,949
99808,670,279.0,4.0,1225,1065149143,977


## Collaborative Filter Using Suprise Framework 

In [5]:
#sim_options = {'name': 'cosine',
#               'user_based': True  # compute  similarities between items
#               }
#algo = KNNBasic(sim_options=sim_options)

In [6]:
reader = Reader()
algo = SVD()

In [7]:
train_subject = Dataset.load_from_df(train[['userId', 'movieIndex', 'rating']], reader)
cross_validate(algo, train_subject, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9116  0.9054  0.9020  0.8970  0.9119  0.9056  0.0057  
MAE (testset)     0.6975  0.6985  0.6964  0.6946  0.7042  0.6982  0.0033  
Fit time          7.11    5.82    5.47    5.31    5.23    5.79    0.69    
Test time         0.42    0.19    0.30    0.22    0.33    0.29    0.08    


{'test_rmse': array([0.91163727, 0.90537413, 0.90199026, 0.89701741, 0.91187766]),
 'test_mae': array([0.69745081, 0.69850584, 0.69636692, 0.69462958, 0.70421301]),
 'fit_time': (7.111264944076538,
  5.820726156234741,
  5.472599267959595,
  5.313228368759155,
  5.233631134033203),
 'test_time': (0.4241001605987549,
  0.19103026390075684,
  0.30051350593566895,
  0.2176826000213623,
  0.3325223922729492)}

In [8]:
train

Unnamed: 0,userId,id,rating,movieId,timestamp,movieIndex
0,0,36819.0,1.0,2968,1260759200,2375
1,0,9426.0,2.5,2455,1260759113,1958
2,0,152.0,2.5,1371,1260759135,1107
3,0,1598.0,2.0,1343,1260759131,1083
4,0,665.0,2.0,1287,1260759187,1037
...,...,...,...,...,...,...
79843,670,161.0,4.5,4963,1065111855,3838
79844,670,9377.0,4.0,2918,1065149106,2335
79845,670,134.0,4.0,4027,1063500993,3223
79846,670,165.0,3.5,2011,1063500873,1569


In [9]:
trainSet = train_subject.build_full_trainset()
algo.fit(trainSet)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f859af04c50>

In [10]:
#testSet = trainSet.build_anti_testset()
#predictions = algo.test(testSet)

In [40]:
#predict_rating_result = []
#for index, row in test.iterrows():
#    predict_rating_result.append(algo.predict(row.userId, row.movieIndex, 3).est)

In [99]:
predict_rating_result = {}
for index, row in test.iterrows():
    predict_rating_result[row.userId,row.movieIndex] = algo.predict(row.userId, row.movieIndex, 3).est

In [45]:
len(predict_rating_result)

19962

In [48]:
test['pred'] = predict_rating_result.values()    
test

Unnamed: 0,userId,id,rating,movieId,timestamp,movieIndex,pred
0,0,9909.0,2.5,31,1260759144,30,2.572320
1,0,847.0,2.0,2193,1260759198,1739,2.378862
2,0,97.0,4.0,2105,1260759139,1661,2.665476
3,0,11216.0,4.0,1172,1260759205,927,3.691603
4,0,1103.0,2.0,1129,1260759185,903,2.550290
...,...,...,...,...,...,...,...
19957,670,11310.0,2.5,432,1063503739,381,2.823411
19958,670,712.0,3.0,357,1063503998,322,3.827776
19959,670,630.0,4.0,919,1065149458,737,4.050632
19960,670,687.0,4.0,36,1065149314,34,3.834403


In [14]:
test.shape

(19962, 7)

In [49]:
se = 0
for index, row in test.iterrows():
    e = row.rating - row.pred
    se += e*e
np.sqrt(se/test.shape[0])    

0.8959977537790521

## Predict Movie Have Not Rated Yet

In [63]:
train_matrix = train[['userId','movieIndex']].values

In [133]:
def pred_movies_not_in_train(rate_matrix):
    users = rate_matrix[:,0]
    result = {}
    n_users = int(np.max(rate_matrix[:, 0])) + 1
    for i in range(n_users):
        result[i] = []
        ids = np.where(users == i)[0].astype(np.int32)
        item_ids = rate_matrix[ids,1]
        for j in range(N):
            if j not in item_ids: result[i].append(j)
    
    return result            

In [134]:
result = pred_movies_not_in_train(train_matrix)

In [136]:
len(result[0]),len(result)

(9010, 671)

In [137]:
predict_rating_result = {}
for user in result:
    for j in result[user]:
        predict_rating_result[user,j] = algo.predict(user, j, 3).est

In [140]:
user0 = {}
for i in predict_rating_result:
    if i[0] == 0: user0[i[1]] = predict_rating_result[i] 
len(user0)   

9010

In [56]:
def collaborative_fillter_recommender(userId,predict_rating_result):
    result = {}
    list_movie = []
    for i in predict_rating_result:
        if i[0] == userId:
            result[i[1]] = predict_rating_result[i]
    result = sorted(result.items(), key=operator.itemgetter(1),reverse=True)
    for i in result:
        list_movie.append(i[0])
    return list_movie       

In [57]:
collaborative_fillter_recommender(0,predict_rating_result)

[927.0, 1661.0, 30.0, 903.0, 1739.0]

In [141]:
f = open('predict_rating_result.pckl', 'wb')
pickle.dump(predict_rating_result, f)

f.close()