In [None]:
# Movie recommender system (collaborative filtering)

In [17]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.io import loadmat
import scipy.optimize

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_seq_items', None)
 
#%config InlineBackend.figure_formats = {'pdf',}
%matplotlib inline

import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')

In [2]:
# Load the ratings data: Y has one row per movie and one column per user;
# R has the same layout and is used later as the "was rated" mask.
data = loadmat('data/ex8_movies.mat')
print(data.keys())
Y = data['Y']
R = data['R']
num_movies, num_users = Y.shape
print(num_movies, num_users, sep='\n')

dict_keys(['R', 'Y', '__version__', '__header__', '__globals__'])
1682
943


In [3]:
# number of features learned per movie (columns of X) and per user (columns of Theta)
num_features=10

In [11]:
# unpack X and Theta into vectors
def unpack(X,Theta):
    """Flatten X and Theta into one 1-D parameter vector.

    The result holds every entry of X in row-major order, followed by
    every entry of Theta in row-major order. It is always a new array,
    so modifying it never touches X or Theta.
    """
    flat_parts = [X.ravel(), Theta.ravel()]
    return np.concatenate(flat_parts)
# pack vectors into two arrays
def pack(params,num_movies,num_users,num_features):
    """Split a flat parameter vector back into the matrices X and Theta.

    The first num_movies*num_features entries of params become X with
    shape (num_movies, num_features); the remaining entries become Theta
    with shape (num_users, num_features). Both are filled in row-major
    order. Returns the tuple (X, Theta).
    """
    boundary = num_movies * num_features
    movie_part, user_part = params[:boundary], params[boundary:]
    X = np.reshape(movie_part, (num_movies, num_features))
    Theta = np.reshape(user_part, (num_users, num_features))
    return X, Theta

In [12]:
# cost function
def costFunction(params,reg,Y,R,num_movies,num_users,num_features):
    """Regularized squared-error cost of the factorization X.dot(Theta.T).

    params is the flat vector of X followed by Theta (see pack). The error
    term is half the sum of squared differences between the predictions
    and Y, weighted element-wise by R, so entries where R is 0 add nothing.
    The penalty is reg/2 times the sum of squares of Theta plus reg/2
    times the sum of squares of X. Returns the scalar cost.
    """
    X, Theta = pack(params, num_movies, num_users, num_features)
    residual = X.dot(Theta.T) - Y
    error_term = 0.5 * np.sum(R * np.square(residual))
    theta_penalty = reg / 2.0 * np.sum(np.square(Theta))
    x_penalty = reg / 2.0 * np.sum(np.square(X))
    return error_term + theta_penalty + x_penalty

In [13]:
# Check costFunction on the X and Theta stored in ex8_movieParams.mat;
# the cell's last expression (the cost) is displayed as the output.
data = loadmat('data/ex8_movieParams.mat')
print(data.keys())
X = data['X']
Theta = data['Theta']
params = unpack(X, Theta)
reg = 1
costFunction(params, reg, Y, R, num_movies, num_users, num_features)

dict_keys(['X', '__header__', 'Theta', 'num_movies', 'num_features', '__globals__', '__version__', 'num_users'])


32520.682450229557

In [14]:
# gradient function
def gradFunction(params,reg,Y,R,num_movies,num_users,num_features):
    """Gradient of costFunction with respect to the flat parameter vector.

    params is unpacked into X and Theta (see pack). The prediction error
    X.dot(Theta.T) - Y is weighted element-wise by R, then projected onto
    Theta (gradient for X) and onto X (gradient for Theta); each part
    also gets its regularization term reg*X or reg*Theta. The two
    gradients are returned flattened in the same order as params:
    X entries first, then Theta entries.
    """
    X, Theta = pack(params, num_movies, num_users, num_features)
    masked_error = R * (X.dot(Theta.T) - Y)
    grad_movies = masked_error.dot(Theta) + reg * X
    grad_users = masked_error.T.dot(X) + reg * Theta
    return np.concatenate([grad_movies.ravel(), grad_users.ravel()])
# Evaluate the gradient at the current params/reg; the returned vector is shown as the cell output.
gradFunction(params,reg,Y,R,num_movies,num_users,num_features)

array([-5.21315594,  2.0591285 , -5.68148384, ..., -5.27650042,
        4.22109195,  2.11819114])

In [69]:
# add myself: append one extra user column holding my own ratings
# (movie row index -> rating; every movie not listed stays 0 = "not rated")
my_rated = {
    0: 4,
    6: 3,
    11: 5,
    53: 4,
    63: 5,
    65: 3,
    68: 5,
    97: 2,
    182: 4,
    225: 5,
    354: 5,
}

# Size the column from Y itself rather than a hard-coded row count, so the
# cell keeps working if the ratings matrix has a different number of movies.
my_ratings = np.zeros((Y.shape[0], 1))
for movie_idx, rating in my_rated.items():
    my_ratings[movie_idx] = rating

# Extended ratings matrix and matching "was rated" mask with me as the last user.
Yext = np.c_[Y, my_ratings]
Rext = np.c_[R, my_ratings > 0]

num_movies, num_users = np.shape(Yext)
print(num_movies)
print(num_users)

1682
944


In [96]:
# normalize the ratings: subtract each movie's mean over its rated entries
ratings_per_movie = np.sum(Rext, 1)
# A movie with no ratings would give 0/0 = NaN, and NaN survives the R mask in
# the cost (0 * NaN is NaN), turning the whole cost and gradient into NaN.
# Clamp the divisor to 1 so such a movie simply gets a mean of 0.
# Masking with Rext makes the sum cover rated entries only, whatever value
# Yext holds elsewhere.
meanY = np.sum(Yext * Rext, 1) / np.maximum(ratings_per_movie, 1)
Ynorm = Yext - meanY.reshape(-1, 1)

In [125]:
# now train the system
# random initialization, seeded so that Restart & Run All reproduces the same fit
np.random.seed(0)
X = np.random.rand(num_movies, num_features)
Theta = np.random.rand(num_users, num_features)
params = unpack(X, Theta)
reg = 10.0

# Minimize the cost with conjugate gradient, using the analytic gradient.
# With full_output=True the result is a tuple whose first element is the
# optimized flat parameter vector.
result = scipy.optimize.fmin_cg(costFunction, x0=params, fprime=gradFunction,
                                args=(reg, Ynorm, Rext, num_movies, num_users, num_features),
                                maxiter=50, disp=True, full_output=True)


         Current function value: 38977.803915
         Iterations: 50
         Function evaluations: 82
         Gradient evaluations: 82


In [126]:
# reshape the flat optimizer result back into the X and Theta matrices
# (result[0] is the optimized parameter vector returned by fmin_cg)
trainedX,trainedTheta=pack(result[0],num_movies,num_users,num_features)

In [132]:
# make predictions for the last user only (the last row of trainedTheta),
# instead of building the full movies-by-users prediction matrix and
# discarding every column but one; then add the per-movie mean back
my_ratings_pre = trainedX.dot(trainedTheta[-1]) + meanY

In [133]:
# make recommendations: movie indices ordered from highest to lowest predicted rating
ascending_idxs = np.argsort(my_ratings_pre)
pred_idxs_sorted = ascending_idxs[::-1]
print(pred_idxs_sorted[:10])

[1598 1121 1188 1292  813 1535 1466 1652 1499 1200]


In [129]:
# Build the list of movie titles (one per line of the file) to reference later
movies = []
with open('data/movie_ids.txt') as f:
    # keep everything after the first space, i.e. drop the leading token
    # (presumably the numeric movie id -- verify against the file)
    movies.extend(line.strip('\n').partition(' ')[2] for line in f)

In [134]:
# print the top recommendations, then the ratings they were based on
print("Top recommendations for you:")
# slicing (rather than range(10)) also copes with fewer than 10 movies
for movie_idx in pred_idxs_sorted[:10]:
    print('Predicting rating %0.1f for movie %s.' %
          (my_ratings_pre[movie_idx], movies[movie_idx]))

print("\nOriginal ratings provided:")
# my_ratings is a column vector; flatten it so every rating is a scalar.
# Formatting a 1-element array with %d relies on a NumPy array-to-scalar
# conversion that is deprecated.
for movie_idx, rating in enumerate(my_ratings.ravel()):
    if rating > 0:
        print('Rated %d for movie %s.' % (rating, movies[movie_idx]))

Top recommendations for you:
Predicting rating 5.0 for movie Someone Else's America (1995).
Predicting rating 5.0 for movie They Made Me a Criminal (1939).
Predicting rating 5.0 for movie Prefontaine (1997).
Predicting rating 5.0 for movie Star Kid (1997).
Predicting rating 5.0 for movie Great Day in Harlem, A (1994).
Predicting rating 5.0 for movie Aiqing wansui (1994).
Predicting rating 5.0 for movie Saint of Fort Washington, The (1993).
Predicting rating 5.0 for movie Entertaining Angels: The Dorothy Day Story (1996).
Predicting rating 5.0 for movie Santa with Muscles (1996).
Predicting rating 5.0 for movie Marlene Dietrich: Shadow and Light (1996) .

Original ratings provided:
Rated 4 for movie Toy Story (1995).
Rated 3 for movie Twelve Monkeys (1995).
Rated 5 for movie Usual Suspects, The (1995).
Rated 4 for movie Outbreak (1995).
Rated 5 for movie Shawshank Redemption, The (1994).
Rated 3 for movie While You Were Sleeping (1995).
Rated 5 for movie Forrest Gump (1994).
Rated 2 for