In [1]:
import numpy as np
import pandas as pd

# Read the pre-cleaned movie and review tables and discard the 'Unnamed: 0'
# column from each (presumably a leftover index column from an earlier
# to_csv call -- TODO confirm).
movies = pd.read_csv('data/movies_clean.csv').drop(columns='Unnamed: 0')
reviews = pd.read_csv('data/reviews_clean.csv').drop(columns='Unnamed: 0')

In [2]:
def create_train_test(reviews, order_by, training_size, testing_size):
    '''
    Sort a dataframe by one column and cut two consecutive slices out of it.

    INPUT:
    reviews - (pandas df) dataframe to split into train and test
    order_by - (string) column name to sort by
    training_size - (int) number of rows in the training set
    testing_size - (int) number of rows in the test set

    OUTPUT:
    training_df - (pandas df) the first training_size rows of the sorted dataframe
    validation_df - (pandas df) the testing_size rows that follow the training rows
    '''
    ordered = reviews.sort_values(order_by)

    # Positional boundaries of the two slices in the sorted frame
    split_at = training_size
    stop_at = training_size + testing_size

    return ordered.iloc[:split_at], ordered.iloc[split_at:stop_at]

In [3]:
# Oldest 8000 reviews (by 'date') form the training set, the next 2000 the validation set
train_df, val_df = create_train_test(
    reviews,
    order_by='date',
    training_size=8000,
    testing_size=2000,
)

In [4]:
def FunkSVD(ratings_mat, latent_features=12, learning_rate=0.0001, iters=100, seed=None):
    '''
    This function performs matrix factorization using a basic form of FunkSVD with no regularization

    INPUT:
    ratings_mat - (numpy array) a matrix with users as rows, movies as columns, and ratings as values
    latent_features - (int) the number of latent features used
    learning_rate - (float) the learning rate
    iters - (int) the number of iterations
    seed - (int or None) optional seed for the random initialisation of both matrices;
           when None the global numpy random state is used

    OUTPUT:
    user_mat - (numpy array) a user by latent feature matrix
    movie_mat - (numpy array) a latent feature by movie matrix

    Prints the mean squared error of the rated cells after every iteration
    ("nan" when the matrix has no rated cell).
    '''

    n_users = ratings_mat.shape[0]
    n_movies = ratings_mat.shape[1]

    # Only strictly positive cells are trained on. NaN > 0 is False, so missing
    # ratings are skipped. The pairs are collected once, in row-major order
    # (user by user, then movie by movie), instead of rescanning every cell on
    # every iteration.
    with np.errstate(invalid='ignore'):
        rated_pairs = np.argwhere(ratings_mat > 0)

    # The MSE denominator is the number of cells that actually contribute to the error
    num_ratings = len(rated_pairs)

    # User and movie matrices with random values in [0, 1)
    if seed is None:
        user_mat = np.random.rand(n_users, latent_features)
        movie_mat = np.random.rand(latent_features, n_movies)
    else:
        rng = np.random.RandomState(seed)
        user_mat = rng.rand(n_users, latent_features)
        movie_mat = rng.rand(latent_features, n_movies)

    # Tracking iteration and MSE
    print("Optimizaiton Statistics")
    print("Iterations | Mean Squared Error ")

    for iteration in range(iters):

        # sum of squared errors accumulated over this iteration
        sse_accum = 0

        for i, j in rated_pairs:

            # Error as the actual minus the dot product of the user and movie latent features
            diff = ratings_mat[i, j] - np.dot(user_mat[i, :], movie_mat[:, j])

            # The error is measured before this pair's update is applied
            sse_accum += diff**2

            # Update all latent features at once in the direction of the gradient.
            # The movie update reads the user row that was just updated.
            user_mat[i, :] += learning_rate * (2*diff*movie_mat[:, j])
            movie_mat[:, j] += learning_rate * (2*diff*user_mat[i, :])

        # Guard against a matrix without any rated cell
        mse = sse_accum / num_ratings if num_ratings else float('nan')
        print("%d \t\t %f" % (iteration+1, mse))

    return user_mat, movie_mat

In [5]:
# Pivot the training reviews into a user-by-movie rating matrix
# (rows: user_id, columns: movie_id, NaN where a user has not rated a movie).
# If a user-movie pair occurs more than once, the highest rating is kept.
train_user_item = train_df[['user_id', 'movie_id', 'rating', 'timestamp']]
train_data_df = train_user_item.groupby(['user_id', 'movie_id'])['rating'].max().unstack()
train_data_np = np.array(train_data_df)

# FunkSVD initialises its matrices from numpy's global random generator;
# seeding it makes the fit reproducible on a fresh kernel.
SEED = 42
np.random.seed(SEED)

# Fitting FunkSVD with the specified hyper parameters to the training data
user_mat, movie_mat = FunkSVD(train_data_np, latent_features=15, learning_rate=0.005, iters=250)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 10.607507
2 		 5.990875
3 		 4.184772
4 		 3.126891
5 		 2.430543
6 		 1.938303
7 		 1.573601
8 		 1.294657
9 		 1.076504
10 		 0.903101
11 		 0.763545
12 		 0.650095
13 		 0.557061
14 		 0.480161
15 		 0.416108
16 		 0.362360
17 		 0.316935
18 		 0.278286
19 		 0.245201
20 		 0.216722
21 		 0.192090
22 		 0.170696
23 		 0.152047
24 		 0.135737
25 		 0.121433
26 		 0.108854
27 		 0.097767
28 		 0.087972
29 		 0.079299
30 		 0.071603
31 		 0.064761
32 		 0.058666
33 		 0.053225
34 		 0.048359
35 		 0.043999
36 		 0.040085
37 		 0.036567
38 		 0.033397
39 		 0.030538
40 		 0.027954
41 		 0.025617
42 		 0.023498
43 		 0.021576
44 		 0.019830
45 		 0.018241
46 		 0.016794
47 		 0.015475
48 		 0.014271
49 		 0.013170
50 		 0.012164
51 		 0.011242
52 		 0.010397
53 		 0.009623
54 		 0.008912
55 		 0.008258
56 		 0.007658
57 		 0.007105
58 		 0.006596
59 		 0.006127
60 		 0.005694
61 		 0.005295
62 		 0.004927
63 		 0.004586
64 		 

In [6]:
def predict_rating(user_matrix, movie_matrix, user_id, movie_id, user_item_df=None):
    '''
    Predict a rating as the dot product of a user's and a movie's latent features.

    INPUT:
    user_matrix - user by latent factor matrix
    movie_matrix - latent factor by movie matrix
    user_id - the user_id from the reviews df
    movie_id - the movie_id according the movies df
    user_item_df - (pandas df, optional) user-by-movie dataframe whose index and columns
                   give the row/column ordering of the two matrices; defaults to the
                   notebook-level train_data_df

    OUTPUT:
    pred - the predicted rating for user_id-movie_id according to FunkSVD

    RAISES:
    IndexError - if user_id or movie_id does not occur in user_item_df
    '''
    if user_item_df is None:
        user_item_df = train_data_df

    # Users and movies in the same order as the rows/columns of the matrices
    user_ids_series = np.array(user_item_df.index)
    movie_ids_series = np.array(user_item_df.columns)

    # Positions of the requested user and movie
    user_hits = np.where(user_ids_series == user_id)[0]
    if user_hits.size == 0:
        raise IndexError("user_id {} is not present in the training data".format(user_id))

    movie_hits = np.where(movie_ids_series == movie_id)[0]
    if movie_hits.size == 0:
        raise IndexError("movie_id {} is not present in the training data".format(movie_id))

    user_row = user_hits[0]
    movie_col = movie_hits[0]

    # Dot product to make prediction
    pred = np.dot(user_matrix[user_row, :], movie_matrix[:, movie_col])

    return pred

In [7]:
# Predicted rating of user 8 for movie 2844
pred_val = predict_rating(user_mat, movie_mat, user_id=8, movie_id=2844)
pred_val

6.514886261895935

In [8]:
def print_prediction_summary(user_id, movie_id, prediction, movies_df=None):
    '''
    Print a one-sentence summary of a predicted rating, including the movie title.

    INPUT:
    user_id - the user_id from the reviews df
    movie_id - the movie_id according the movies df
    prediction - the predicted rating for user_id-movie_id
    movies_df - (pandas df, optional) dataframe with 'movie_id' and 'movie' columns used
                to look up the title; defaults to the notebook-level movies dataframe

    OUTPUT:
    Nothing returned - the summary is printed
    '''
    if movies_df is None:
        movies_df = movies

    # Read the title value directly instead of slicing the text representation of
    # the Series, which depends on the index width and can truncate long titles.
    matches = movies_df.loc[movies_df['movie_id'] == movie_id, 'movie']
    if len(matches) > 0:
        movie_name = str(matches.iloc[0])
    else:
        # Keep the summary printable when the id is not in the movies dataframe
        movie_name = "<unknown movie_id {}>".format(movie_id)

    print("For user {} we predict a {} rating for the movie {}.".format(user_id, round(prediction, 2), movie_name))

In [9]:
# Report the prediction computed above for user 8 and movie 2844
print_prediction_summary(user_id=8, movie_id=2844, prediction=pred_val)

For user 8 we predict a 6.51 rating for the movie  Fantômas - À l'ombre de la guillotine (1913).


In [10]:
def validation_comparison(val_df, num_preds):
    '''
    Print the actual and the predicted rating for the first rows of the validation set.

    Predictions are made with the notebook-level user_mat and movie_mat.

    INPUT:
    val_df - the validation dataset created in the third cell above
    num_preds - (int) the number of rows (going in order) you would like to make predictions for;
                values larger than the number of rows in val_df are capped

    OUTPUT:
    Nothing returned - print a statement about the prediction made for each row of val_df from row 0 to num_preds
    '''
    val_users = np.array(val_df['user_id'])
    val_movies = np.array(val_df['movie_id'])
    val_ratings = np.array(val_df['rating'])

    # Never read past the end of the validation set
    n_rows = min(num_preds, len(val_users))

    for idx in range(n_rows):
        try:
            pred = predict_rating(user_mat, movie_mat, val_users[idx], val_movies[idx])
        except IndexError:
            # predict_rating raises IndexError when the user or the movie has no
            # row/column in the training matrix; report it and keep going
            print("No prediction can be made for user {} on movie {} because the user or the movie is not in the training data.".format(val_users[idx], val_movies[idx]))
            continue

        print("The actual rating for user {} on movie {} is {}.\n While the predicted rating is {}.".format(val_users[idx], val_movies[idx], val_ratings[idx], round(pred))) 

        
# Compare actual and predicted ratings for the first 6 validation rows.
validation_comparison(val_df, num_preds=6)

The actual rating for user 49056 on movie 1598822 is 8.
 While the predicted rating is 6.
The actual rating for user 49056 on movie 289879 is 9.
 While the predicted rating is 8.
The actual rating for user 49056 on movie 1563738 is 9.
 While the predicted rating is 7.
The actual rating for user 49056 on movie 1458175 is 4.
 While the predicted rating is 9.
The actual rating for user 28599 on movie 103639 is 8.
 While the predicted rating is 8.
The actual rating for user 50593 on movie 1560985 is 4.
 While the predicted rating is 4.


In [11]:
# Same comparison as the previous cell: first 6 validation rows.
validation_comparison(val_df, num_preds=6)

The actual rating for user 49056 on movie 1598822 is 8.
 While the predicted rating is 6.
The actual rating for user 49056 on movie 289879 is 9.
 While the predicted rating is 8.
The actual rating for user 49056 on movie 1563738 is 9.
 While the predicted rating is 7.
The actual rating for user 49056 on movie 1458175 is 4.
 While the predicted rating is 9.
The actual rating for user 28599 on movie 103639 is 8.
 While the predicted rating is 8.
The actual rating for user 50593 on movie 1560985 is 4.
 While the predicted rating is 4.
