In [1]:
import sys
from collections import defaultdict

import numpy as np
import pandas as pd
from surprise import NMF, SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection.validation import cross_validate

from baselines import GlobalMean, MeanofMeans

In [2]:
# Tell Surprise how to parse the ratings file: comma-separated rows whose
# leading fields are user, item, rating (in that order), on a 1-5 scale,
# with one header line to skip.
reader = Reader(
    name=None,
    line_format='user item rating',
    sep=',',
    rating_scale=(1, 5),
    skip_lines=1,
)

In [3]:
# Parse the ratings file with the reader configured above.
data = Dataset.load_from_file(
    '../data/movies/ratings.csv',
    reader=reader,
)

# No hold-out split: every known rating is used for training ...
trainset = data.build_full_trainset()

# ... and the "test" set is the anti-testset built from that trainset
# (per the Surprise docs: the user/item pairs with no rating in the trainset),
# i.e. the candidate items to score for each user.
testset = trainset.build_anti_testset()

In [4]:
def get_top_n(predictions, n=5):
    """Return the ``n`` highest-estimated items for every user.

    Args:
        predictions: iterable of 5-element records
            ``(uid, iid, true_r, est, details)``; only ``uid``, ``iid`` and
            ``est`` are used.
        n: maximum number of items to keep per user.

    Returns:
        A ``defaultdict(list)`` mapping each uid to a list of ``(iid, est)``
        tuples ordered by ``est`` descending (ties keep input order) and
        truncated to ``n`` entries.
    """
    per_user = defaultdict(list)

    # Bucket every (item, estimate) pair under the user it was predicted for.
    for user, item, _true_rating, estimate, _details in predictions:
        per_user[user].append((item, estimate))

    # Rank each user's bucket by estimate, best first, and keep the head.
    for user in per_user:
        ranked = sorted(per_user[user], key=lambda pair: pair[1], reverse=True)
        per_user[user] = ranked[:n]

    return per_user

In [5]:
def recs_dict(top_n, name_df):
    """Translate per-user top-N item ids into movie titles.

    Args:
        top_n: mapping of user id -> list of ``(item id, estimate)`` pairs,
            as produced by ``get_top_n``. Ids may be strings; they are
            converted with ``int()``.
        name_df: DataFrame with a ``title`` column, indexed by the integer
            item id.

    Returns:
        dict mapping ``int(uid)`` to the list of titles for that user's
        items, in the same order as the input pairs.

    Raises:
        KeyError: if an item id is not present in ``name_df``'s index.
        ValueError: if a user or item id cannot be converted to ``int``.
    """
    # Single .loc[row, column] lookup instead of chained .loc[row]['title'],
    # which would materialise the whole row just to read one cell.
    return {
        int(uid): [name_df.loc[int(iid), 'title'] for iid, _ in user_ratings]
        for uid, user_ratings in top_n.items()
    }

In [6]:
# Movie metadata used to turn item ids into titles. The first CSV column
# becomes the index (presumably the movie id -- recs_dict looks rows up by
# int(item id)); the first row is the header.
name_df = pd.read_csv(
    '../data/movies/movies.csv',
    header=0,
    index_col=0,
)

In [7]:
# SVD matrix-factorisation recommender.
# SVD starts from randomly initialised factors, so without a fixed
# random_state every kernel restart yields different recommendations.
# Pinning the seed keeps the notebook reproducible under Restart & Run All.
algo = SVD(n_epochs=20, n_factors=50, lr_all=0.01, reg_all=0.02, init_mean=0,
           random_state=42)
algo.fit(trainset)

# Score every candidate pair in the anti-testset.
predictions = algo.test(testset)

# Keep the 10 best-scored items per user and resolve them to movie titles.
top_n = get_top_n(predictions, n=10)
svd_dict = recs_dict(top_n, name_df)

In [8]:
# Baseline: GlobalMean from the project-local `baselines` module.
# NOTE(review): presumably predicts the same global average for every pair,
# in which case the "top 10" is just the first 10 candidates -- confirm.
algo = GlobalMean()
algo.fit(trainset)

# Score every candidate pair in the anti-testset.
predictions = algo.test(testset)

# Keep 10 items per user and resolve them to movie titles.
top_n = get_top_n(predictions=predictions, n=10)
glob_mean_dict = recs_dict(top_n=top_n, name_df=name_df)

In [9]:
# Baseline: MeanofMeans from the project-local `baselines` module.
algo = MeanofMeans()
algo.fit(trainset)

# Score every candidate pair in the anti-testset.
predictions = algo.test(testset)

# Keep the 10 best-scored items per user and resolve them to movie titles.
top_n = get_top_n(predictions=predictions, n=10)
mean_mean_dict = recs_dict(top_n=top_n, name_df=name_df)

In [None]:
# Non-negative matrix factorisation recommender (biased variant, 1 factor).
# NMF starts from randomly initialised factors; a fixed random_state makes
# the recommendations reproducible across kernel restarts.
algo = NMF(n_epochs=50, n_factors=1, biased=True, random_state=42)
algo.fit(trainset)

# Score every candidate pair in the anti-testset.
predictions = algo.test(testset)

# Keep the 10 best-scored items per user and resolve them to movie titles.
top_n = get_top_n(predictions, n=10)
nmf_dict = recs_dict(top_n, name_df)

In [14]:
def compare_preds(user_id):
    """Print one user's recommendations from each model, one after another.

    Reads the module-level dicts ``glob_mean_dict``, ``mean_mean_dict`` and
    ``svd_dict`` (user id -> list of titles) and prints a heading followed by
    the list for each, with blank lines between the sections.

    Args:
        user_id: key to look up in each recommendation dict.

    Raises:
        KeyError: if ``user_id`` is missing from one of the dicts.
    """
    def _show(heading, recs_by_user):
        # One section: heading line, then the raw list of titles.
        print(heading)
        print(recs_by_user[user_id])

    _show('Global Means Recs:', glob_mean_dict)
    print('\n')

    _show('Means of Means Recs:', mean_mean_dict)
    print('\n')

    _show('SVD Recs:', svd_dict)

In [15]:
# Side-by-side look at what each model recommends for user 100.
compare_preds(user_id=100)

Global Means Recs:
['Dangerous Minds (1995)', 'Dumbo (1941)', 'Sleepers (1996)', 'Escape from New York (1981)', 'Cinema Paradiso (Nuovo cinema Paradiso) (1989)', 'Deer Hunter, The (1978)', 'Ben-Hur (1959)', 'Gandhi (1982)', "Dracula (Bram Stoker's Dracula) (1992)", 'Cape Fear (1991)']


Means of Means Recs:
['One Magic Christmas (1985)', 'Step Into Liquid (2002)', 'Art of War, The (2000)', "Taste of Cherry (Ta'm e guilass) (1997)", 'King Is Alive, The (2000)', 'Innocence (2000)', 'Maelström (2000)', 'Seconds (1966)', 'Amazing Grace (2006)', 'Unvanquished, The (Aparajito) (1957)']


SVD Recs:
['On the Waterfront (1954)', 'Godfather, The (1972)', 'Godfather: Part II, The (1974)', 'To Kill a Mockingbird (1962)', 'City of God (Cidade de Deus) (2002)', 'All About Eve (1950)', 'Shawshank Redemption, The (1994)', 'Chinatown (1974)', 'Cinema Paradiso (Nuovo cinema Paradiso) (1989)', 'Prestige, The (2006)']
