In [1]:
import pandas as pd
import numpy as np
import joblib

from collections import defaultdict
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

In [2]:
ratings_df = pd.read_csv('movielens/ratings.csv')

In [3]:
 # instantiate a reader and read in our rating data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['userId','movieId','rating']], reader)
 
# train SVD on 75% of known rates
trainset, testset = train_test_split(data, test_size=.25, random_state=60)

In [16]:
model = SVD()
model.fit(trainset)

model_filename = 'svd-model.pkl'
joblib.dump(model, model_filename)

['svd-model.pkl']

In [4]:
model = joblib.load('svd-model.pkl')

In [5]:
predictions = model.test(testset)

In [15]:
accuracy.rmse(predictions, verbose=True)

RMSE: 0.6991


0.6991493234777875

In [7]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [8]:
top_n = get_top_n(predictions, n=5)

In [13]:
top_n

defaultdict(list,
            {118670: [(356, 5),
              (457, 4.977576491183835),
              (527, 4.931337971420746),
              (1246, 4.8699245785874385),
              (1198, 4.796269609865979)],
             156108: [(1148, 5),
              (924, 5),
              (34, 5),
              (608, 4.860273103214339),
              (720, 4.850807500398439)],
             35588: [(1210, 4.554563381776305),
              (2706, 4.445961460038665),
              (296, 4.333998502826593),
              (745, 4.242346845404747),
              (3578, 4.2372136457313125)],
             35973: [(858, 3.8930068820248924),
              (2571, 3.6417021001628465),
              (2692, 3.454419416634362),
              (6796, 3.400027606454364),
              (59810, 3.396162889804855)],
             136511: [(50, 4.715617270233328),
              (7153, 4.6533689524299575),
              (2706, 4.635248504379179),
              (4963, 4.590301559656079),
              (2959, 4.5788

In [51]:
uid_list = []
ratings_list = []
for uid, user_ratings in top_n.items():
    uid_list.append(uid)
    ratings_list.append(user_ratings)

In [52]:
res = pd.DataFrame(list(zip(uid_list, ratings_list)),
               columns =['uid', 'ratings'])

In [54]:
res = res[res['ratings'].apply(lambda x: len(x) == 5)]

In [70]:
res_ratings = res.explode('ratings')

In [74]:
res_ratings

Unnamed: 0,uid,ratings
0,118670,"(356, 5)"
0,118670,"(457, 4.977576491183835)"
0,118670,"(527, 4.931337971420746)"
0,118670,"(1246, 4.8699245785874385)"
0,118670,"(1198, 4.796269609865979)"
...,...,...
162414,42939,"(1617, 4.276170900102013)"
162414,42939,"(1676, 3.6223080311957734)"
162414,42939,"(198, 3.4699881005777184)"
162414,42939,"(3977, 3.23546400230794)"


In [75]:
res_ratings_split = pd.DataFrame(
    res_ratings['ratings'].tolist(), 
    index=res_ratings.index
    )

In [76]:
res_ratings_split

Unnamed: 0,0,1
0,356,5.000000
0,457,4.977576
0,527,4.931338
0,1246,4.869925
0,1198,4.796270
...,...,...
162414,1617,4.276171
162414,1676,3.622308
162414,198,3.469988
162414,3977,3.235464


In [77]:
res_ratings

Unnamed: 0,uid,ratings
0,118670,"(356, 5)"
0,118670,"(457, 4.977576491183835)"
0,118670,"(527, 4.931337971420746)"
0,118670,"(1246, 4.8699245785874385)"
0,118670,"(1198, 4.796269609865979)"
...,...,...
162414,42939,"(1617, 4.276170900102013)"
162414,42939,"(1676, 3.6223080311957734)"
162414,42939,"(198, 3.4699881005777184)"
162414,42939,"(3977, 3.23546400230794)"


In [138]:
final_res = pd.merge(
    res_ratings[['uid']], 
    res_ratings_split, 
    left_index=True, 
    right_index=True
    )

In [139]:
len(final_res)

3847075

In [140]:
final_res.drop_duplicates(inplace=True)

In [141]:
len(final_res)

769415

In [142]:
final_res.rename(
    columns={0:'movieId', 1: 'predictedRating'},
    inplace=True
)

In [143]:
final_res.head(10)

Unnamed: 0,uid,movieId,predictedRating
0,118670,356,5.0
0,118670,457,4.977576
0,118670,527,4.931338
0,118670,1246,4.869925
0,118670,1198,4.79627
1,156108,1148,5.0
1,156108,924,5.0
1,156108,34,5.0
1,156108,608,4.860273
1,156108,720,4.850808


In [144]:
imdb_df = pd.read_csv('imdb-titles-df.csv')
movies_df = pd.read_csv('movielens/movies.csv')

In [145]:
final_res = pd.merge(
    final_res,
    movies_df[['movieId', 'title']],
    how='left',
    on='movieId'
)

In [146]:
final_res['Title'] = np.where(
    final_res['title'].str.contains(', The'),
    'The ' + final_res['title'].str.replace(', The', ''),
    final_res['title']
)

In [147]:
imdb_df

Unnamed: 0.1,Unnamed: 0,imdbId,Title
0,936,tt0000947,L
1,1037,tt0001049,G
2,2127,tt0002152,D
3,2568,tt0002595,
4,2700,tt0002727,B
...,...,...,...
8182,8463864,tt8760684,Apollo 11 (2019)
8183,8469033,tt8772262,Midsommar (2019)
8184,8616017,tt9092964,Who am I
8185,8686297,tt9243946,El Camino: A Breaking Bad Movie (2019)


In [148]:
final_res = pd.merge(
    final_res,
    imdb_df[['Title', 'imdbId']],
    how='left',
    on='Title'
)

In [159]:
final_res.uid.unique()[0:100]

array([118670, 156108,  35588,  35973, 136511, 122569,  89897, 149446,
       110833,  10037,  65956, 125355,  78960, 115102,  75659,  33349,
       120692,  80482, 103763,  58401, 110782,  54570, 117977,  76620,
        33844, 127681, 135594,  77495,  87879, 104739,  77979,  86442,
        24350, 143564,  30048,  74628, 151911,  10402,  84872, 139528,
        50099,  11491,  38208,  88005,  61412, 132366,  72315, 133990,
       108539, 134504, 145658,   9457,  76724, 129263,   6836,  65447,
       160014, 159070,  44812, 160925, 151068, 133040, 145447,  84516,
       148063, 157059,  66695, 116717, 139131,  77534, 138897,  61737,
        34616, 119917, 121522,  92198,  16874,  19041,  96459,  93123,
       116733,  58294, 142842, 102329, 144472,  49901,  24856, 147312,
        96584, 130060,  15148,  68338,  71533,  62685,  19615,  16041,
        67150,  72886, 148145, 130337])

In [155]:
final_res.to_csv('svd-model-predictions-df.csv')