## Library

In [1]:
import pandas as pd
import numpy as np
import pickle
import operator


from collections import Counter
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise import Reader, SVD, Dataset 
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

## Loading Data

In [2]:
# Movie metadata table; its row count is used further down as the number of
# candidate movies (N).
# NOTE(review): the file name suggests it is restricted to movies that have
# ratings -- confirm against the preprocessing step.
smd = pd.read_csv('../the-movies-dataset/movies_metadata_equal_ratings.csv')
# Complete ratings table (userId, rating, movieIndex, ... columns).
ratings = pd.read_csv('../the-movies-dataset/ratings_equal_movies_metadata.csv')
# Pre-computed train / test portions of the ratings, read from separate files.
train = pd.read_csv('../the-movies-dataset/ratings_train.csv')
test = pd.read_csv('../the-movies-dataset/ratings_test.csv')

In [3]:
smd.shape

(9025, 26)

In [4]:
# Number of rows in the metadata frame. pred_movies_not_in_train() reads this
# global and treats range(N) as the full set of candidate movieIndex values.
N = smd.shape[0]

In [5]:
ratings

Unnamed: 0,userId,id,rating,movieId,timestamp,movieIndex
0,0,152.0,2.5,1371,1260759135,1107
1,0,9909.0,2.5,31,1260759144,30
2,0,847.0,2.0,2193,1260759198,1739
3,0,9426.0,2.5,2455,1260759113,1958
4,0,6114.0,3.5,1339,1260759125,1079
...,...,...,...,...,...,...
99805,670,786.0,2.0,3897,1063503718,3108
99806,670,7443.0,4.0,3751,1065111939,2995
99807,670,1891.0,5.0,1196,1064890635,949
99808,670,279.0,4.0,1225,1065149143,977


## Collaborative Filtering Using the Surprise Framework

In [6]:
#sim_options = {'name': 'cosine',
#               'user_based': True  # True: similarities between users; False: between items
#               }
#algo = KNNBasic(sim_options=sim_options)

In [7]:
# Reader built with no arguments, i.e. with Surprise's default rating scale.
# NOTE(review): the default scale is (1, 5); if the ratings contain values
# below 1 (e.g. 0.5), pass rating_scale explicitly -- confirm against the data.
reader = Reader()
# SVD matrix-factorisation model with default hyper-parameters.
# NOTE(review): no random_state is set, so fitted factors and the reported
# scores will vary from run to run.
algo = SVD()

In [8]:
# Wrap the (userId, movieIndex, rating) columns of the training split as a
# Surprise Dataset; movieIndex (not movieId) is used as the item id.
train_subject = Dataset.load_from_df(train[['userId', 'movieIndex', 'rating']], reader)
# 5-fold cross-validation on the training split only, reporting RMSE and MAE
# per fold (verbose=True prints the summary table).
cross_validate(algo, train_subject, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9112  0.9043  0.9068  0.9015  0.8981  0.9044  0.0045  
MAE (testset)     0.7021  0.6986  0.6965  0.6970  0.6909  0.6970  0.0036  
Fit time          2.93    2.94    3.45    3.43    2.91    3.13    0.25    
Test time         0.13    0.09    0.14    0.12    0.13    0.12    0.02    


{'test_rmse': array([0.9112441 , 0.90426223, 0.90678496, 0.90148623, 0.89809809]),
 'test_mae': array([0.7020992 , 0.6986164 , 0.69649548, 0.69697837, 0.69094787]),
 'fit_time': (2.9344067573547363,
  2.9362075328826904,
  3.4508309364318848,
  3.43106746673584,
  2.909374713897705),
 'test_time': (0.12965822219848633,
  0.08743739128112793,
  0.1382007598876953,
  0.11729931831359863,
  0.12637591361999512)}

In [9]:
train

Unnamed: 0,userId,id,rating,movieId,timestamp,movieIndex
0,0,36819.0,1.0,2968,1260759200,2375
1,0,9426.0,2.5,2455,1260759113,1958
2,0,152.0,2.5,1371,1260759135,1107
3,0,1598.0,2.0,1343,1260759131,1083
4,0,665.0,2.0,1287,1260759187,1037
...,...,...,...,...,...,...
79843,670,161.0,4.5,4963,1065111855,3838
79844,670,9377.0,4.0,2918,1065149106,2335
79845,670,134.0,4.0,4027,1063500993,3223
79846,670,165.0,3.5,2011,1063500873,1569


In [10]:
# Build a trainset from every rating in the training split (no hold-out fold)
# and fit the model on it; this fitted model is the one used for all
# predictions further down.
trainSet = train_subject.build_full_trainset()
algo.fit(trainSet)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f77a435ca58>

In [11]:
#testSet = trainSet.build_anti_testset()
#predictions = algo.test(testSet)

In [12]:
#predict_rating_result = []
#for index, row in test.iterrows():
#    predict_rating_result.append(algo.predict(row.userId, row.movieIndex, 3).est)

In [13]:
# Estimated rating for every (userId, movieIndex) pair of the test split,
# keyed by that pair. The third positional argument passed to predict() is the
# constant 3; only the `.est` field of the returned prediction is kept.
predict_rating_result = {
    (row.userId, row.movieIndex): algo.predict(row.userId, row.movieIndex, 3).est
    for _, row in test.iterrows()
}

In [14]:
len(predict_rating_result)

19962

In [15]:
# Attach the predictions by looking up each row's own (userId, movieIndex) key
# instead of assigning dict.values() positionally. The positional form only
# works while the dict has exactly one entry per row in row order; a duplicated
# pair in `test` would shrink the dict and break (or misalign) the column.
test['pred'] = [
    predict_rating_result[user_id, movie_index]
    for user_id, movie_index in zip(test['userId'], test['movieIndex'])
]
test

Unnamed: 0,userId,id,rating,movieId,timestamp,movieIndex,pred
0,0,9909.0,2.5,31,1260759144,30,2.554658
1,0,847.0,2.0,2193,1260759198,1739,2.646721
2,0,97.0,4.0,2105,1260759139,1661,2.657589
3,0,11216.0,4.0,1172,1260759205,927,3.351501
4,0,1103.0,2.0,1129,1260759185,903,2.599211
...,...,...,...,...,...,...,...
19957,670,11310.0,2.5,432,1063503739,381,2.755469
19958,670,712.0,3.0,357,1063503998,322,3.720541
19959,670,630.0,4.0,919,1065149458,737,3.960282
19960,670,687.0,4.0,36,1065149314,34,3.937466


In [16]:
test.shape

(19962, 7)

In [17]:
# Root-mean-squared error between the true and predicted ratings on the test
# split, computed column-wise rather than with a Python-level row loop.
squared_error = (test['rating'] - test['pred']) ** 2
np.sqrt(squared_error.mean())

0.8965533636633798

## Predict Ratings for Movies Not Yet Rated

In [18]:
# 2-column array of [userId, movieIndex] pairs, one row per training rating.
train_matrix = train[['userId','movieIndex']].values

In [19]:
def pred_movies_not_in_train(rate_matrix, n_items=None):
    """Map every user id to the movie indices that user has no rating for.

    Parameters
    ----------
    rate_matrix : array-like with at least two columns
        Column 0 holds user ids and column 1 holds movie indices, one row
        per known rating.
    n_items : int, optional
        Candidate movie indices are ``range(n_items)``. When omitted, the
        notebook-level ``N`` is used.

    Returns
    -------
    dict
        ``{user: [movie_index, ...]}`` with one entry for every integer in
        ``range(max_user_id + 1)``. Each list is in ascending order; a user
        id with no rows in ``rate_matrix`` gets every candidate index.
        An empty ``rate_matrix`` yields an empty dict.
    """
    if n_items is None:
        n_items = N

    rate_matrix = np.asarray(rate_matrix)
    if rate_matrix.size == 0:
        # No ratings at all: there is no user to report on.
        return {}

    user_col = rate_matrix[:, 0]
    item_col = rate_matrix[:, 1]

    # Group the rated items per user in a single pass. Set membership is O(1),
    # whereas `j in ndarray` rescans the whole array for every candidate.
    rated_by_user = {}
    for user, item in zip(user_col.tolist(), item_col.tolist()):
        rated_by_user.setdefault(user, set()).add(item)

    n_users = int(np.max(user_col)) + 1
    nothing_rated = frozenset()
    result = {}
    for i in range(n_users):
        rated = rated_by_user.get(i, nothing_rated)
        result[i] = [j for j in range(n_items) if j not in rated]

    return result

In [20]:
# user id -> list of movie indices that have no rating from that user in the
# training split.
result = pred_movies_not_in_train(train_matrix)

In [21]:
len(result[0]),len(result)

(9010, 671)

In [22]:
# Estimated rating for every (user, movie index) pair that has no rating in
# the training split, keyed by that pair. This rebinds predict_rating_result.
predict_rating_result = {
    (user, movie_index): algo.predict(user, movie_index, 3).est
    for user, unrated in result.items()
    for movie_index in unrated
}

In [23]:
# Sanity check: keep only the predictions whose key belongs to user 0,
# re-keyed by movie index, and count them.
user0 = {
    key[1]: estimate
    for key, estimate in predict_rating_result.items()
    if key[0] == 0
}
len(user0)

9010

In [24]:
def collaborative_fillter_recommender(userId,predict_rating_result):
    """Rank one user's movies by predicted rating, best first.

    Parameters
    ----------
    userId
        User whose predictions are selected (compared against key[0]).
    predict_rating_result : dict
        Maps ``(user, movie_index)`` keys to a predicted rating.

    Returns
    -------
    list
        The movie indices (key[1]) found for ``userId``, ordered by
        descending predicted rating; ties keep the dict's iteration order.
        Empty when the user has no entries.
    """
    scores = {
        key[1]: estimate
        for key, estimate in predict_rating_result.items()
        if key[0] == userId
    }
    # sorted() is stable, so equal scores stay in insertion order.
    return sorted(scores, key=scores.get, reverse=True)

In [25]:
# Movie indices for user 0, highest predicted rating first.
# NOTE(review): this rebinds `result`, which until here held the per-user
# dict returned by pred_movies_not_in_train(); re-running earlier cells after
# this one will see a list instead of that dict.
result = collaborative_fillter_recommender(0,predict_rating_result)

In [26]:
smd.iloc[result]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,description,year
743,False,,1400000,['Drama'],,705,tt0042192,en,All About Eve,From the moment she glimpses her idol at the s...,...,138.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,It's all about women... and their men!,All About Eve,False,8.0,367.0,From the moment she glimpses her idol at the s...,1950
4219,False,,15000000,"['Fantasy', 'Adventure', 'Animation', 'Family']",http://movies.disney.com/spirited-away,129,tt0245429,ja,千と千尋の神隠し,A ten year old girl who wanders away from her ...,...,125.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,The tunnel led Chihiro to a mysterious town...,Spirited Away,False,8.3,3968.0,A ten year old girl who wanders away from her ...,2001
1327,False,,12000000,"['Crime', 'Drama', 'Romance', 'Thriller']",,9281,tt0090329,en,Witness,A sheltered Amish child is the sole witness of...,...,113.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}, {'iso...",Released,A big city cop. A small country boy. They have...,Witness,False,7.0,390.0,A sheltered Amish child is the sole witness of...,1985
4974,False,,31000000,"['Drama', 'Adventure', 'Action', 'History']",,79,tt0299977,zh,英雄,One man defeated three assassins who sought to...,...,99.0,"[{'iso_639_1': 'zh', 'name': '普通话'}]",Released,One man's strength will unite an empire.,Hero,False,7.3,647.0,One man defeated three assassins who sought to...,2002
3412,False,"{'id': 2150, 'name': 'Shrek Collection', 'post...",60000000,"['Adventure', 'Animation', 'Comedy', 'Family',...",http://www.shrek.com/,808,tt0126029,en,Shrek,It ain't easy bein' green -- especially if you...,...,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The greatest fairy tale never told.,Shrek,False,7.3,4183.0,It ain't easy bein' green -- especially if you...,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,False,"{'id': 286162, 'name': 'Power Rangers Collecti...",15000000,"['Action', 'Adventure', 'Science Fiction', 'Fa...",http://www.powerrangers.com/,9070,tt0113820,en,Mighty Morphin Power Rangers: The Movie,Power up with six incredible teens who out-man...,...,92.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Power Is On!,Mighty Morphin Power Rangers: The Movie,False,5.2,153.0,Power up with six incredible teens who out-man...,1995
2157,False,,170000000,"['Action', 'Adventure', 'Comedy', 'Science Fic...",,8487,tt0120891,en,Wild Wild West,Legless Southern inventor Dr. Arliss Loveless ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It's a whole new west.,Wild Wild West,False,5.1,1042.0,Legless Southern inventor Dr. Arliss Loveless ...,1999
1238,False,"{'id': 43064, 'name': 'Speed Collection', 'pos...",160000000,"['Action', 'Adventure', 'Thriller']",,1639,tt0120179,en,Speed 2: Cruise Control,Sandra Bullock and Jason Patric star as a youn...,...,121.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"As the stakes get higher, the ride gets even f...",Speed 2: Cruise Control,False,4.1,439.0,Sandra Bullock and Jason Patric star as a youn...,1997
1239,False,"{'id': 120794, 'name': 'Batman Collection', 'p...",125000000,"['Action', 'Crime', 'Fantasy']",,415,tt0118688,en,Batman & Robin,Along with crime-fighting partner Robin and ne...,...,125.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Strength. Courage. Honor. And loyalty.,Batman & Robin,False,4.2,1447.0,Along with crime-fighting partner Robin and ne...,1997


In [27]:
# Persist the prediction dict to disk. The context manager closes the file
# handle even if pickle.dump raises part-way through.
with open('SVD_predict_rating_result.pckl', 'wb') as f:
    pickle.dump(predict_rating_result, f)