In [24]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import SVDpp
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import PredefinedKFold as pkf


import matplotlib.pyplot as plt
import numpy as np


In [10]:
trainset = np.loadtxt('./data/train.txt').astype(int)
testset = np.loadtxt('./data/test.txt').astype(int)
data = np.loadtxt('./data/data.txt').astype(int)

In [25]:
# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k')

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)


In [31]:
#use SVD algorithm.
algo = SVD()

param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.9644974907716897
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [32]:
# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x109687e10>

In [33]:
V_surprise = algo.qi
U_surprise = algo.pu

In [34]:
def visualize_V(Vtilt, id_movie, name_movie, figname):
    """
    2D visualizatin of Vtilt.
    Input: 
    Vtilt: 2xn matrix 
    id_movie: the id of the movies to visualize
    name_movie: the names of the movies to visualize
    figname: the figure name you want to save
    """   
    #normalize Vtilt such that each row has variance of 1.0
    Vtilt_row_means = np.mean(Vtilt, axis=1)
    Vtilt_row_std = np.std(Vtilt, axis=1)
    Vtiltp = np.zeros_like(Vtilt)
    for irow in range(len(Vtilt)):
        Vtiltp[irow] = np.divide(Vtilt[irow] - Vtilt_row_means[irow],Vtilt_row_std[irow])
        
    fig = plt.figure()
    fig.set_size_inches(40,30)
    ax = fig.add_subplot(1, 1, 1)
    for idx in range(len(id_movie)):
        plt.scatter(Vtiltp[0][id_movie[idx]], Vtiltp[1][id_movie[idx]], s=5000, marker='o', color='blue')
        plt.text(Vtiltp[0][id_movie[idx]], Vtiltp[1][id_movie[idx]]-0.3, name_movie[idx], 
                 rotation=45, bbox=dict(facecolor='red', alpha=0.05), fontsize=50,
                 horizontalalignment='center', verticalalignment='center')
        #print(str(Vtiltp[0][id_movie[idx]])+', '+str(Vtiltp[1][id_movie[idx]])+', '+name_movie[idx])
    #plt.xlim(-1.0*np.amax(Vtiltp[0]), np.amax(Vtiltp[0]))
    #plt.ylim(-1.0*np.amax(Vtiltp[1]), np.amax(Vtiltp[1]))
    #plt.xlim(np.amin(Vtiltp[0]), np.amax(Vtiltp[0]))
    #plt.ylim(np.amin(Vtiltp[1]), np.amax(Vtiltp[1]))
    plt.xlim(-2.5,2.5)
    plt.ylim(-2.5,2.5)
    ax.spines['left'].set_position('center')
    ax.spines['bottom'].set_position('center')
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    ax.spines['left'].set_linewidth(5.5)
    ax.spines['bottom'].set_linewidth(5.5)
    ax.spines['right'].set_linewidth(5.5)
    ax.spines['top'].set_linewidth(5.5)
    #ax.set_yticklabels([])
    #ax.set_xticklabels([])
    plt.xticks([-2,-1,1,2], fontsize=60)
    plt.yticks([-2,-1,1,2], fontsize=60)
    plt.xlabel('V[0]',horizontalalignment='right', x=1.0, fontsize=60)
    plt.ylabel('V[1]',horizontalalignment='right', y=1.0, fontsize=60)
    #plt.show()
    plt.savefig('plots/'+figname+'.pdf')
    plt.savefig('plots/'+figname+'.png')

In [35]:
## read the movies.txt file, get the list of all movies ids, names, and their genres
ratings_all = np.loadtxt('data/data.txt')
movies_all = []
with open('data/movies.txt') as f:
    lines = f.readlines()  
    for line in lines:
        line_items = line.strip('\n').split(';')
        line_items_int_str = []
        for idx in range(len(line_items)):
            if idx == 1:
                line_items_int_str.append(line_items[idx][0:line_items[idx].find('(')]) #remove the year of the movie from the name
            else:
                line_items_int_str.append(int(line_items[idx]))
        movies_all.append(line_items_int_str)

In [36]:
## 10 random movies, 10 most popular movies, and 10 best movies
average_ratings_all = np.zeros(len(movies_all))
num_of_ratings_all = np.zeros(len(movies_all))
for rating in ratings_all:
    num_of_ratings_all[int(rating[1])-1] += 1
    average_ratings_all[int(rating[1])-1] += rating[2]
average_ratings_all = np.divide(average_ratings_all, num_of_ratings_all)
id_most_popular_10 = num_of_ratings_all.argsort()[-10:][::-1]
id_best_10 = average_ratings_all.argsort()[-10:][::-1]

name_most_popular_10 = []
for idx in range(len(id_most_popular_10)):
    name_most_popular_10.append(movies_all[id_most_popular_10[idx]][1]+"(%.0f, %.1f)"%(num_of_ratings_all[id_most_popular_10[idx]], average_ratings_all[id_most_popular_10[idx]]))
    
name_best_10 = []
for idx in range(len(id_best_10)):
    name_best_10.append(movies_all[id_best_10[idx]][1]+"(%.0f, %.1f)"%(num_of_ratings_all[id_best_10[idx]], average_ratings_all[id_best_10[idx]]))
    #print(average_ratings_all[id_best_10[idx]])
    #print(num_of_ratings_all[id_best_10[idx]])

## 10 random movies
id_all = np.arange(len(movies_all))
np.random.shuffle(id_all)
id_random10 = id_all[:10]
name_random10 = []
for idx in range(len(id_random10)):
    name_random10.append(movies_all[id_random10[idx]][1]+"(%.0f, %.1f)"%(num_of_ratings_all[id_random10[idx]], average_ratings_all[id_random10[idx]]))

## 10 random movies of genre = action
print(id_all)
id_random_action = []
id_random_documentary = []
id_random_drama = []
name_random_action = []
name_random_documentary = []
name_random_drama = []

for idx in id_all:
    if movies_all[idx][3] == 1 and num_of_ratings_all[idx] > 50:
        id_random_action.append(idx)
        name_random_action.append(movies_all[idx][1]+"(%.0f, %.1f)"%(num_of_ratings_all[idx], average_ratings_all[idx]))
    if movies_all[idx][9] == 1 and num_of_ratings_all[idx] > 20:
        id_random_documentary.append(idx)
        name_random_documentary.append(movies_all[idx][1]+"(%.0f, %.1f)"%(num_of_ratings_all[idx], average_ratings_all[idx]))
    if movies_all[idx][10] == 1 and num_of_ratings_all[idx] > 50:
        id_random_drama.append(idx)
        name_random_drama.append(movies_all[idx][1]+"(%.0f, %.1f)"%(num_of_ratings_all[idx], average_ratings_all[idx]))

id_random10_action = id_random_action[:10]
name_random10_action = name_random_action[:10]
id_random10_documentary = id_random_documentary[:10]
name_random10_documentary = name_random_documentary[:10]
id_random10_drama = id_random_drama[:10]
name_random10_drama = name_random_drama[:10]

print('10 random movies')
print(id_random10)
print(name_random10)
print('10 random action movies')
print(id_random10_action)
print(name_random10_action)
print('10 random documentary movies')
print(id_random10_documentary)
print(name_random10_documentary)
print('10 random drama movies')
print(id_random10_drama)
print(name_random10_drama)

print('10 most popular movies')
print(id_most_popular_10)
print(name_most_popular_10)
print('10 best movies')
print(id_best_10)
print(name_best_10)

[ 176  767 1197 ...  433 1343  410]
10 random movies
[ 176  767 1197   25 1419  482 1075  355  583 1386]
['Good, The Bad and The Ugly, The (137, 3.9)', 'Casper (52, 3.1)', 'Purple Noon (7, 3.1)', 'Brothers McMullen, The (73, 3.5)', "Gilligan's Island: The Movie (3, 1.3)", 'Casablanca (243, 4.5)', 'Pagemaster, The (12, 2.2)', 'Client, The (97, 3.4)', 'Secret Garden, The (79, 3.5)', 'Fall (3, 2.0)']
10 random action movies
[176, 294, 183, 596, 549, 448, 146, 630, 173, 719]
['Good, The Bad and The Ugly, The (137, 3.9)', 'Breakdown (77, 3.4)', 'Army of Darkness (116, 3.4)', 'Eraser (206, 3.2)', 'Die Hard: With a Vengeance (151, 3.3)', 'Star Trek: The Motion Picture (117, 3.0)', 'Long Kiss Goodnight, The (185, 3.5)', 'Crying Game, The (119, 3.6)', 'Raiders of the Lost Ark (420, 4.3)', 'First Knight (86, 3.0)']
10 random documentary movies
[1141, 1083, 31, 643, 47, 846, 633, 1064, 644, 1021]
['When We Were Kings (44, 4.0)', 'Anne Frank Remembered (21, 3.9)', 'Crumb (81, 3.8)', 'Thin Blue Lin

In [None]:
## perform SVD to V and get the A, Sigma, B
U = U_surprise.transpose()
V = V_surprise.transpose()
Vp = np.zeros_like(V)
V_row_means = np.mean(V, axis=1)
for irow in range(len(V)):
    Vp[irow] = V[irow] - V_row_means[irow]
A, S, B =  np.linalg.svd(Vp)
A12 = A[:,[0,1]]
A12T = A12.transpose()
Vtilt = np.matmul(A12T, V)

## visualize the movies of choices
visualize_V(Vtilt, id_random10, name_random10, 'visualize_V_surprise_random10movies')
visualize_V(Vtilt, id_random10_action, name_random10_action, 'visualize_V_surprise_random10movies_action')
visualize_V(Vtilt, id_random10_documentary, name_random10_documentary, 'visualize_V_surprise_random10movies_documentary')
visualize_V(Vtilt, id_random10_drama, name_random10_drama, 'visualize_V_surprise_random10movies_drama')
visualize_V(Vtilt, id_best_10, name_best_10, 'visualize_V_surprise_best10movies')
visualize_V(Vtilt, id_most_popular_10, name_most_popular_10, 'visualize_V_surprise_mostpopular10movies')


In [None]:
biased=False

In [None]:
param_grid = {'n_factors': [110, 120, 140, 160], 'n_epochs': [90, 100, 110], 'lr_all': [0.001, 0.003, 0.005, 0.008],
              'reg_all': [0.08, 0.1, 0.15]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
algo = gs.best_estimator['rmse']
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Use the new parameters with the train data
algo = SVD(n_factors=160, n_epochs=100, lr_all=0.005, reg_all=0.1)
algo.fit(trainset)
test_pred = algo.test(testset)
print("SVD : Test Set")
accuracy.rmse(test_pred, verbose=True)

In [None]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)