In [1]:
import numpy as np
from scipy.io import loadmat
from scipy import optimize
import pandas as pd

In [2]:
# read the rated-indicator (r) and rating (y) tables and keep both the
# DataFrames (a, t) and their plain ndarray views (r, y)
a = pd.read_csv('data/r_movies.csv')
t = pd.read_csv('data/y_movies.csv')
r, y = a.to_numpy(), t.to_numpy()

# movie titles come from the "name" column of the movie_ids csv file;
# they are looked up by row index when printing recommendations
movie_ids = pd.read_csv('data/movie_ids.csv')
movieList = movie_ids["name"].to_list()

In [3]:
# function to normalize the dataset
def mean_normal(a, r):
    """Mean-normalize every row of a ratings matrix over its rated entries.

    Parameters
    ----------
    a : 2-D array-like
        Ratings, one row per item. Entries where ``r`` is 0 are ignored.
    r : 2-D array-like, same shape as ``a``
        Indicator matrix: 1 where the corresponding entry of ``a`` is an
        actual rating, 0 otherwise.

    Returns
    -------
    (normalized, means) : tuple of ndarray
        ``normalized`` has the shape of ``a``; rated entries hold the rating
        minus the row mean, unrated entries are 0.
        ``means`` has shape ``(a.shape[0], 1)`` and holds each row's mean
        over its rated entries (0.0 for a row without any rating).
    """
    a = np.asarray(a, dtype=float)
    r = np.asarray(r)

    # Sum and count only the rated entries of every row.
    rated_total = (a * r).sum(axis=1, keepdims=True)
    rated_count = r.sum(axis=1, keepdims=True)

    # Size the mean vector from the argument itself (not from a global) and
    # leave it at 0 for rows with no ratings instead of dividing by zero,
    # which would otherwise spread NaN into the normalized matrix.
    yMean = np.zeros((a.shape[0], 1), dtype=float)
    np.divide(rated_total, rated_count, out=yMean, where=rated_count != 0)

    # Multiplying by r zeroes the unrated positions again.
    return (a - yMean) * r, yMean

# ratings given by the new user: movie row index -> rating
new_user_stars = {
    49: 4, 55: 5, 63: 5, 68: 5, 70: 4, 97: 4,
    126: 4, 177: 5, 195: 4, 0: 4, 10: 5, 180: 4,
}

# column vector with one slot per movie; 0 means "not rated"
my_ratings = np.zeros((y.shape[0], 1), dtype = float)
for movie_idx, stars in new_user_stars.items():
    my_ratings[movie_idx] = stars

# indicator column: 1 where the new user supplied a rating
my_r = (my_ratings != 0) * 1

# put the new user in front, so this user becomes column 0 of y and r
# NOTE: y and r are rebuilt from themselves, so running this cell again
# prepends one more column each time
y = np.hstack((my_ratings, y))
r = np.hstack((my_r, r))

In [4]:
# regularization strength applied to both X and Theta in the cost
Lambda = 10
# number of latent features learned per movie (and per user)
n = 11
# rows of y are movies, columns are users
no_of_movies, no_of_users = y.shape

# mean-normalized ratings and the per-movie means used to undo it later
yPrime, yMean = mean_normal(y, r)

In [5]:
# initializing the variables
# A fixed seed makes the random starting point (and therefore the trained
# model and the printed recommendations) reproducible across kernel restarts.
SEED = 0
rng = np.random.default_rng(SEED)

# uniform [0, 1) starting values: one row of n features per movie / per user
X = rng.random((no_of_movies, n))
Theta = rng.random((no_of_users, n))

# the optimizer works on a single flat vector: all of X first, then all of Theta
initial_theta = np.concatenate((X, Theta), axis = None)

# extra positional arguments forwarded to the cost and gradient functions
args = (yPrime, Lambda)

In [6]:
# function for calculating the cost
def f(initial_theta, *args):
    """Regularized squared-error cost of a flattened (X, Theta) vector.

    ``initial_theta`` holds the movie features followed by the user
    parameters; ``args`` is ``(yPrime, Lambda)``.  The indicator matrix ``r``
    and the sizes ``no_of_movies``, ``no_of_users`` and ``n`` are read from
    the notebook's globals.  Returns the scalar cost.
    """
    ratings, reg_strength = args

    # undo the flattening: first block is X, the remainder is Theta
    split_at = no_of_movies * n
    features = np.reshape(initial_theta[:split_at], (no_of_movies, n))
    weights = np.reshape(initial_theta[split_at:], (no_of_users, n))

    # only entries flagged in r contribute to the squared error
    residual = features.dot(weights.T) - ratings
    squared_error = 0.5 * np.sum((residual * residual) * r)

    # L2 penalties on both parameter matrices
    weight_penalty = (reg_strength / 2) * np.sum(weights * weights)
    feature_penalty = (reg_strength / 2) * np.sum(features * features)

    return squared_error + weight_penalty + feature_penalty

In [7]:
# function for calculating the gradients
def grads(initial_theta, *args):
    """Gradient of the cost ``f`` with respect to the flattened (X, Theta).

    Takes the same arguments as ``f`` (``args`` is ``(yPrime, Lambda)``) and
    reads ``r``, ``no_of_movies``, ``no_of_users`` and ``n`` from the
    notebook's globals.  Returns a flat vector laid out like
    ``initial_theta``: the gradient for X followed by the gradient for Theta.
    """
    ratings, reg_strength = args

    # undo the flattening: first block is X, the remainder is Theta
    split_at = no_of_movies * n
    features = np.reshape(initial_theta[:split_at], (no_of_movies, n))
    weights = np.reshape(initial_theta[split_at:], (no_of_users, n))

    # prediction error restricted to the entries flagged in r
    masked_residual = (features.dot(weights.T) - ratings) * r

    feature_grad = masked_residual.dot(weights) + reg_strength * features
    weight_grad = masked_residual.T.dot(features) + reg_strength * weights

    # axis=None flattens both matrices into one 1-D vector
    return np.concatenate((feature_grad, weight_grad), axis=None)

In [8]:
# options handed to the conjugate-gradient optimizer
opts = dict(
    maxiter = 30,                  # non-default value.
    disp = True,                   # non-default value.
    gtol = 1e-5,                   # default value.
    norm = np.inf,                 # default value.
    eps = 1.4901161193847656e-08,  # default value.
)

# training the model; the returned result object keeps the flattened
# (X, Theta) solution in its .x attribute
optimal_rolled_theta = optimize.minimize(
    fun = f,
    x0 = initial_theta,
    args = args,
    method = 'CG',
    jac = grads,
    options = opts,
)

print(optimal_rolled_theta.x)

         Current function value: 38895.603152
         Iterations: 30
         Function evaluations: 48
         Gradient evaluations: 48
[ 0.60288791  0.05234486  0.07904991 ... -0.32654178 -0.18132563
 -0.69575801]


In [9]:
# split the optimizer's flat solution back into the two parameter matrices
solution = optimal_rolled_theta.x
boundary = no_of_movies * n
X = np.reshape(solution[:boundary], (no_of_movies, n))
Theta = np.reshape(solution[boundary:], (no_of_users, n))

# predictions in the mean-normalized space (movies x users)
prediction = X.dot(Theta.T)

# add every movie's mean rating back onto its row to undo the normalization;
# yMean is a (movies x 1) column, so it broadcasts across all users
actualPreds = prediction + yMean

In [10]:
# predictions for new user
# (the new user's ratings were stacked in front of y, so this user is column 0)
my_preds = actualPreds[:,0]

In [11]:
# rank every movie index by the new user's predicted rating, best first
# (the sort is stable, so equal predictions keep ascending index order)
ranked_ids = sorted(range(no_of_movies), key = lambda idx: my_preds[idx], reverse = True)

# movie id -> predicted rating, stored in ranked order
suggestedMovies = {idx: my_preds[idx] for idx in ranked_ids}

In [12]:
# print the titles of the best-ranked movies the user has not rated yet
topNmovies = 20   # how many titles to show
loopControl = 0   # how many titles have been printed so far
for key in suggestedMovies:
    if my_r[key] != 1:  # already rated movies are skipped
        print(movieList[key])
        loopControl += 1
        if loopControl >= topNmovies:
            break

Prefontaine (1997)
Santa with Muscles (1996)
Someone Else's America (1995)
Saint of Fort Washington, The (1993)
Great Day in Harlem, A (1994)
They Made Me a Criminal (1939)
Aiqing wansui (1994)
Entertaining Angels: The Dorothy Day Story (1996)
Star Kid (1997)
Marlene Dietrich: Shadow and Light (1996) 
Schindler's List (1993)
Pather Panchali (1955)
Maya Lin: A Strong Clear Vision (1994)
Some Mother's Son (1996)
Anna (1996)
Titanic (1997)
Everest (1998)
Good Will Hunting (1997)
Casablanca (1942)
Usual Suspects, The (1995)


In [15]:
# ad-hoc inspection: echo the rating matrix y (rows are movies, columns are users)
y

array([[4., 5., 4., ..., 5., 0., 0.],
       [0., 3., 0., ..., 0., 0., 5.],
       [0., 4., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
# ad-hoc spot check: the rating stored at row 1, column 1 of y
y[1,1]

3.0

In [20]:
# ad-hoc check of y's dimensions; no_of_movies and no_of_users are read from these
y.shape

(1682, 944)

In [None]:
## Template: generate a new user's rating vectors and append them to the original matrices

# my_ratings = np.zeros((y.shape[0], 1), dtype = float)


# # generating user r vector from the user ratings
# my_r = (my_ratings != 0) * 1

# # adding new user y vector to y rating matrix 
# y = np.hstack((y, my_ratings))

# # adding new user r vector to original r matrix 
# r = np.hstack((r, my_r))