In [316]:
import numpy as np
from scipy.io import loadmat
from scipy import optimize
import pandas as pd

In [317]:
# function to normalize the dataset 
def mean_normal(y, r):
    temp = np.zeros(y.shape, dtype = float)
    yMean = np.zeros((y.shape[0], 1), dtype = float)
    for i in range(len(r)):
        mean = sum(y[i]) / sum(r[i])
        
        temp[i] = y[i] - mean
        yMean[i] = mean
       
    return temp * r, yMean;

In [318]:
# function for calculating the cost 
def f(initial_theta, *args):
    yPrime, Lambda = args
    no_of_movies = yPrime.shape[0] 
    no_of_users = yPrime.shape[1]
    X, Theta = initial_theta[:no_of_movies*n,], initial_theta[no_of_movies*n:,]
    X = np.reshape(X, (no_of_movies, n))
    Theta = np.reshape(Theta, (no_of_users, n))
    Error = ((X.dot(np.transpose(Theta)))-yPrime).copy()
    J = (.5) * ((Error * Error)* r).sum()
    Reg_term_theta = (Lambda / 2) * ((Theta * Theta).sum())
    Reg_term_X = (Lambda / 2) * ((X * X).sum())
    J = J + Reg_term_theta + Reg_term_X
    return J

In [319]:
# function for calculating the gradients
def grads(initial_theta, *args):
    yPrime, Lambda = args
    no_of_movies = yPrime.shape[0] 
    no_of_users = yPrime.shape[1]
    X, Theta = initial_theta[:no_of_movies*n,], initial_theta[no_of_movies*n:,]
    X = np.reshape(X, (no_of_movies, n))
    Theta = np.reshape(Theta, (no_of_users, n))
    Error = ((X.dot(np.transpose(Theta)))-yPrime).copy()
    X_grad = ((Error*r).dot(Theta) + Lambda * X).copy()
    Theta_grad = ((np.transpose(Error*r)).dot(X) + Lambda * Theta).copy()
    rolled_up_grads = np.concatenate((X_grad, Theta_grad), axis=None)
    return rolled_up_grads

In [320]:
def recommend(userID):
    # extracting the y and r matrices from the csv files
    a = pd.read_csv('data/r_movies.csv')
    r = a.to_numpy()
    t = pd.read_csv('data/y_movies.csv')
    y = t.to_numpy()

    # creating the movie list from the movie_ids csv file
    movieList = pd.read_csv('data/movie_ids.csv')["name"].to_list()
    
    my_ratings = np.zeros((y.shape[0], 1), dtype = float)
    temp = y[:, userID]
    my_ratings = temp.reshape((y.shape[0], 1))
    my_r = (my_ratings != 0) * 1
    
    
    Lambda= 10  # regularization parameter 
    n = 11 # total number of features for the movies
    no_of_movies = y.shape[0] # total number of movies
    no_of_users = y.shape[1]   # total number of users
    

    # calculating normalized y values
    yPrime, yMean = mean_normal(y, r)
    
    # initializing the machine learning matrices
    X = np.random.rand(no_of_movies, n)
    Theta = np.random.rand(no_of_users, n)
    initial_theta = np.concatenate((X, Theta), axis = None)
    args = (yPrime, Lambda)
    
    # setting options for the optimizer function 
    opts = {'maxiter' : 30,    # non-default value.
            'disp' : True,    # non-default value.
            'gtol' : 1e-5,    # default value.
            'norm' : np.inf,  # default value.
            'eps' : 1.4901161193847656e-08}  # default value.

    # training the model 
    optimal_rolled_theta = optimize.minimize(f, initial_theta, jac = grads, args = args,
                             method = 'CG', options = opts)

    print(optimal_rolled_theta.x)
    
    # Converting the matrices to their original shape 
    X, Theta = optimal_rolled_theta.x[:no_of_movies*n,], optimal_rolled_theta.x[no_of_movies*n:,]
    X = np.reshape(X, (no_of_movies, n))
    Theta = np.reshape(Theta, (no_of_users, n))

    # calculating the normalized predictions
    prediction = (X.dot(np.transpose(Theta))).copy()

    # calculating the actual predictions on a scale of 1.00 to 5.00
    actualPreds = np.zeros(yPrime.shape, dtype = float)
    for i in range(no_of_movies):
        temp = yMean[i]
        actualPreds[i] = prediction[i] + temp
        
    # predictions for new user 
    my_preds = actualPreds[:, userID]
    
    # calculating the suggested movies for the new user 
    suggestedMovies = dict(); # contains the movie ids as key and ratings as values

    for i in range(no_of_movies):
        suggestedMovies[i] = my_preds[i];

    suggestedMovies = {k: v for k, v in sorted(suggestedMovies.items(), key = lambda item: item[1], reverse = True)}
    
    # creating the top movies for the user dict
    movie_dict = dict()
    for key, item in suggestedMovies.items():
        if(my_r[key] == 1):  #so that already rated movies don't appear 
            continue
        movie_dict[key] = movieList[key]

    return movie_dict  

In [321]:
# insert any user id here to get recommendations for that user
movie_dict = recommend(105)
movie_dict

         Current function value: 38827.803273
         Iterations: 30
         Function evaluations: 47
         Gradient evaluations: 47
[ 0.0505324   0.28022432  0.20231211 ... -0.21312529 -0.02071381
 -0.73027745]


{813: 'Great Day in Harlem, A (1994)',
 1188: 'Prefontaine (1997)',
 1200: 'Marlene Dietrich: Shadow and Light (1996) ',
 1499: 'Santa with Muscles (1996)',
 1121: 'They Made Me a Criminal (1939)',
 1466: 'Saint of Fort Washington, The (1993)',
 1652: 'Entertaining Angels: The Dorothy Day Story (1996)',
 1598: "Someone Else's America (1995)",
 1535: 'Aiqing wansui (1994)',
 1292: 'Star Kid (1997)',
 1448: 'Pather Panchali (1955)',
 1593: 'Everest (1998)',
 482: 'Casablanca (1942)',
 1397: 'Anna (1996)',
 1641: "Some Mother's Son (1996)",
 118: 'Maya Lin: A Strong Clear Vision (1994)',
 113: 'Wallace & Gromit: The Best of Aardman Animation (1996)',
 602: 'Rear Window (1954)',
 407: 'Close Shave, A (1995)',
 97: 'Silence of the Lambs, The (1991)',
 177: '12 Angry Men (1957)',
 356: "One Flew Over the Cuckoo's Nest (1975)",
 168: 'Wrong Trousers, The (1993)',
 426: 'To Kill a Mockingbird (1962)',
 495: "It's a Wonderful Life (1946)",
 514: 'Boot, Das (1981)',
 1190: 'Letter From Death Row