In [6]:
import numpy as np
from numpy import loadtxt
import pandas as pd
import tensorflow as tf
from tensorflow import keras

In [23]:
def normalizeRatings(Y, R):
    """
    Preprocess data by subtracting mean rating for every movie (every row).
    Only include real ratings R(i,j)=1.
    [Ynorm, Ymean] = normalizeRatings(Y, R) normalized Y so that each movie
    has a rating of 0 on average. Unrated moves then have a mean rating (0)
    Returns the mean rating in Ymean.
    """
    Ymean = (np.sum(Y*R,axis=1)/(np.sum(R, axis=1)+1e-12)).reshape(-1,1)
    Ynorm = Y - np.multiply(Ymean, R) 
    return(Ynorm, Ymean)

def load_precalc_params_small():

    file = open('small_movies_X.csv', 'rb')
    X = loadtxt(file, delimiter = ",")

    file = open('small_movies_W.csv', 'rb')
    W = loadtxt(file,delimiter = ",")

    file = open('small_movies_b.csv', 'rb')
    b = loadtxt(file,delimiter = ",")
    b = b.reshape(1,-1)
    num_movies, num_features = X.shape
    num_users,_ = W.shape
    return(X, W, b, num_movies, num_features, num_users)
    
def load_ratings_small():
    file = open('small_movies_Y.csv', 'rb')
    Y = loadtxt(file,delimiter = ",")

    file = open('small_movies_R.csv', 'rb')
    R = loadtxt(file,delimiter = ",")
    return(Y,R)

def load_Movie_List_pd():
    """ returns df with and index of movies in the order they are in in the Y matrix """
    df = pd.read_csv('small_movie_list.csv', header=0, index_col=0,  delimiter=',', quotechar='"')
    mlist = df["title"].to_list()
    return(mlist, df)

In [29]:
# Load data
X, W, b, mum_movies, num_features, num_users = load_precalc_params_small()
Y, R = load_ratings_small()
num_movies = X.shape[0]

print("Y", Y.shape, "R", R.shape)
print("X", X.shape)
print("W", W.shape)
print("b", b.shape)
print("num_features", num_features)
print("num_movies", num_movies)
print("num_users", num_users)

Y (4778, 443) R (4778, 443)
X (4778, 10)
W (443, 10)
b (1, 443)
num_features 10
num_movies 4778
num_users 443


In [30]:
# Cost function
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R
    J = 0.5 + tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

In [41]:
movieList, movieList_df = load_Movie_List_pd()

my_ratings = np.zeros(num_movies)

my_ratings[2700] = 5   # Toy Story 3 (2010)
my_ratings[2609] = 2   # Persuasion (2007)
my_ratings[929]  = 5   # Lord of the Rings: The Return of the King, The
my_ratings[246]  = 5   # Shrek (2001)
my_ratings[2716] = 3   # Inception
my_ratings[1150] = 5   # Incredibles, The (2004)
my_ratings[382]  = 2   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
my_ratings[366]  = 5   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
my_ratings[622]  = 5   # Harry Potter and the Chamber of Secrets (2002)
my_ratings[988]  = 3   # Eternal Sunshine of the Spotless Mind (2004)
my_ratings[2925] = 1   # Louis Theroux: Law & Disorder (2008)
my_ratings[2937] = 1   # Nothing to Declare (Rien à déclarer)
my_ratings[793]  = 5   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
my_rated = [i for i in range(len(my_ratings)) if my_ratings[i] > 0]

In [42]:
# Realod ratings and add new ratings
Y, R = load_ratings_small()
Y = np.c_[my_ratings, Y]
R = np.c_[(my_ratings != 0).astype(int), R]

# Normalize the Dataset
Ynorm, Ymean = normalizeRatings(Y, R)

In [43]:
# Useful Values
num_movies, num_users = Y.shape
num_features = 100

# Set Initial Parameters (W, X), use tf.Variable to track these variables
tf.random.set_seed(1)
W = tf.Variable(tf.random.normal((num_users,  num_features), dtype=tf.float64), name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features), dtype=tf.float64), name='X')
b = tf.Variable(tf.random.normal((1,          num_users),    dtype=tf.float64), name='b')

# Instantiate an optimizer
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

In [44]:
iterations = 200
lambda_ = 1

for iter in range(iterations):
    with tf.GradientTape() as tape:
        cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)
    
    # Use gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient(cost_value, [X, W, b])
    
    # Run on step of gradient descent by updating 
    # The value of the variables to minimize the loss.
    optimizer.apply_gradients(zip(grads, [X, W, b]))
    
    # Log periodically
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

Training loss at iteration 0: 4215173.6
Training loss at iteration 20: 195728.6
Training loss at iteration 40: 88283.7
Training loss at iteration 60: 49451.5
Training loss at iteration 80: 30768.1
Training loss at iteration 100: 20483.5
Training loss at iteration 120: 14327.4
Training loss at iteration 140: 10443.6
Training loss at iteration 160: 7903.1
Training loss at iteration 180: 6195.8


In [45]:
# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

# restore the mean
pm = p + Ymean
my_predictions = pm[:, 0]

# sort predictions
ix = tf.argsort(my_predictions, direction='DESCENDING')

for i in range(17):
    j = ix[i]
    if j not in my_rated:
        print(f'Predicting rating {my_predictions[j]:0.2f} for movie {movieList[j]}')
        
print('\n\nOriginal vs Predicted ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(f'Original {my_ratings[i]}, Predicted {my_predictions[i]:0.2f} for {movieList[i]}')

Predicting rating 5.30 for movie Argo (2012)
Predicting rating 5.22 for movie Raise Your Voice (2004)
Predicting rating 5.18 for movie The Butterfly Effect (2004)
Predicting rating 5.12 for movie Tom Segura: Mostly Stories (2016)
Predicting rating 5.11 for movie L.A. Slasher (2015)
Predicting rating 5.10 for movie My Love (2006)
Predicting rating 5.09 for movie Kung Fu Panda: Secrets of the Masters (2011)
Predicting rating 5.07 for movie The Hunger Games (2012)
Predicting rating 5.07 for movie Buzzard (2015)
Predicting rating 5.06 for movie The Blue Planet (2001)
Predicting rating 5.06 for movie A Perfect Day (2015)
Predicting rating 5.04 for movie Enter the Void (2009)
Predicting rating 5.04 for movie A Detective Story (2003)
Predicting rating 5.04 for movie The Girl with All the Gifts (2016)
Predicting rating 5.04 for movie Scooby-Doo! Abracadabra-Doo (2010)
Predicting rating 5.04 for movie Martin Lawrence Live: Runteldat (2002)
Predicting rating 5.04 for movie Justice League: Doom (

In [49]:
filter=(movieList_df["number of ratings"] > 20)
movieList_df["pred"] = my_predictions
movieList_df = movieList_df.reindex(columns=["pred", "mean rating", "number of ratings", "title"])
movieList_df.loc[ix[:300]].loc[filter].sort_values("mean rating", ascending=False)

Unnamed: 0,pred,mean rating,number of ratings,title
929,4.943962,4.118919,185,"Lord of the Rings: The Return of the King, The..."
2700,4.930078,4.109091,55,Toy Story 3 (2010)
3527,4.778574,4.047619,21,Captain Phillips (2013)
2608,4.660523,4.022388,67,Shutter Island (2010)
3802,4.658177,4.02,50,The Imitation Game (2014)
4066,4.705751,4.0,48,The Martian (2015)
3083,4.682473,3.993421,76,"Dark Knight Rises, The (2012)"
3618,5.008456,3.993151,73,Interstellar (2014)
1142,4.931356,3.986842,38,The Machinist (2004)
3283,5.298095,3.982143,28,Argo (2012)
