In [1]:
# Author: Sylvain Lapeyrade (sylla801)
# File: TSKS11-Lab6

import numpy as np
from scipy import sparse
import pandas as pd
import matplotlib.pyplot as plt
import math

In [2]:
# Validation data set.
# RMSE noted for it: 0.886 (training data), 0.898 (test data).
train_file2, test_file2, movie_file2 = (
    './validate_code_netflix/validate_code.training',
    './validate_code_netflix/validate_code.test',
    './validate_code_netflix/validate_code.moviename',
)

# Personal data set (sylvainl).
# RMSE noted for it: 0.886 (training data), 0.8992 (test data).
train_file, test_file, movie_file = (
    './sylvainl/training/sylvainl.training',
    './sylvainl/test/sylvainl.test',
    './sylvainl/moviename/sylvainl.moviename',
)

# Data set for Task 2.
train_file3, test_file3, movie_file3 = (
    './sylvainl/training/task---2.training',
    './sylvainl/test/task---2.test',
    './sylvainl/moviename/task---2.moviename',
)

In [3]:
# *** TRAINING ***
# The training file is read as comma-separated integers; the first three
# columns are used as user id, movie id and rating.
train_edge_list = np.loadtxt(train_file, delimiter=',', dtype=int)
train_users = train_edge_list[:, 0]
train_movies = train_edge_list[:, 1]
train_ratings = train_edge_list[:, 2]

print("Train edge list:")
print("User, Movie, Rating")
print(train_edge_list)

# Number of samples, and the largest user / movie id, which is used as the
# number of users / movies.
train_size = train_edge_list.shape[0]
nr_train_users = train_users.max()
nr_train_movies = train_movies.max()

print("\nTrain size: {}".format(train_size))
print("Training users: {}".format(nr_train_users))
print("Training movies: {}".format(nr_train_movies))

# Training matrix (A): the rating of user u for movie m is stored at
# [u - 1, m - 1] (ids are shifted to zero-based indices); cells that are
# never written keep the value 0.
train_matrix = np.zeros(shape=(nr_train_users, nr_train_movies))
for user_id, movie_id, stars in zip(train_users, train_movies, train_ratings):
    row, col = int(user_id) - 1, int(movie_id) - 1
    train_matrix[row, col] = int(stars)

print("\nTraining Matrix:\n {}".format(train_matrix))

Train edge list:
User, Movie, Rating
[[   8    1    5]
 [  11    1    5]
 [  13    1    5]
 ...
 [1801 1500    4]
 [1819 1500    5]
 [1988 1500    4]]

Train size: 184757
Training users: 2000
Training movies: 1500

Training Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 3. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [3. 0. 0. ... 0. 0. 0.]]


In [6]:
# TRAINING indices: zero-based (user, movie) positions of the non-zero
# entries of the training matrix, i.e. where a user rated a movie.
train_indices = np.transpose(np.nonzero(train_matrix))

# Order the pairs by movie first and by user second. np.lexsort uses its
# LAST key as the primary sort key, so the movie column goes last.
movie_then_user = np.lexsort((train_indices[:, 0], train_indices[:, 1]))
train_indices = train_indices[movie_then_user]

print("Train Indices")
print("User, Movie\n", train_indices)

Train Indices
User, Movie
 [[   7    0]
 [  10    0]
 [  12    0]
 ...
 [1800 1499]
 [1818 1499]
 [1987 1499]]


In [4]:
# *** TESTING ***
# The test file is read as comma-separated integers; the first three
# columns are used as user id, movie id and rating.
test_edge_list = np.loadtxt(test_file, delimiter=',', dtype=int)
test_users = test_edge_list[:, 0]
test_movies = test_edge_list[:, 1]
test_ratings = test_edge_list[:, 2]

print("Test edge list:")
print("User, Movie, Rating")
print(test_edge_list)

# Number of samples, and the largest user / movie id, which is used as the
# number of users / movies.
test_size = test_edge_list.shape[0]
nr_test_users = test_users.max()
nr_test_movies = test_movies.max()

print("\nTest size: {}".format(test_size))
print("Testing users: {}".format(nr_test_users))
print("Testing movies: {}".format(nr_test_movies))

# Test matrix (A): the rating of user u for movie m is stored at
# [u - 1, m - 1] (ids are shifted to zero-based indices); cells that are
# never written keep the value 0.
test_matrix = np.zeros(shape=(nr_test_users, nr_test_movies))
for user_id, movie_id, stars in zip(test_users, test_movies, test_ratings):
    row, col = int(user_id) - 1, int(movie_id) - 1
    test_matrix[row, col] = int(stars)

print("\nTesting Matrix:\n {}".format(test_matrix))

Test edge list:
User, Movie, Rating
[[   3    1    5]
 [  36    1    5]
 [  59    1    2]
 ...
 [ 995 1500    4]
 [1440 1500    4]
 [1695 1500    4]]

Test size: 20529
Testing users: 2000
Testing movies: 1500

Testing Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [5. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [5]:
# TESTING indices: zero-based (user, movie) positions of the non-zero
# entries of the test matrix, i.e. where a user rated a movie.
test_indices = np.transpose(np.nonzero(test_matrix))

# Order the pairs by movie first and by user second. np.lexsort uses its
# LAST key as the primary sort key, so the movie column goes last.
movie_then_user = np.lexsort((test_indices[:, 0], test_indices[:, 1]))
test_indices = test_indices[movie_then_user]

print("Test Indices")
print("User, Movie\n", test_indices)

Test Indices
User, Movie
 [[   2    0]
 [  35    0]
 [  58    0]
 ...
 [ 994 1499]
 [1439 1499]
 [1694 1499]]


In [7]:
# *** BASELINE PREDICTOR ***  r_um = r_bar + b_U,u + b_M,m
# where b_U,u and b_M,m minimize the root mean-square error (RMSE)

# r_bar: average rating over all users and movies, computed as the sum of
# every entry of the training matrix divided by the number of training samples
total_rating = train_matrix.sum()
average_rating = total_rating / train_size
print("Average Rating Training:", average_rating)

Average Rating Training: 3.6119064500939073


In [8]:
# TASK 0: *** SIMPLE BASELINE PREDICTOR ***

# b_U,u: bias of user u compared to the average r_bar, i.e. the mean of the
# user's non-zero ratings minus the global average.
# Row sums and per-row rating counts are taken in one pass each instead of
# looping over users in Python.
ratings_sum_per_user = train_matrix.sum(axis=1)
ratings_count_per_user = np.count_nonzero(train_matrix, axis=1)
# A user without any training rating would give 0 / 0 (NaN); such a user
# keeps a bias of 0, so the prediction falls back to the global average.
bias_user = np.zeros(shape=nr_train_users)
has_ratings = ratings_count_per_user > 0
bias_user[has_ratings] = (
    ratings_sum_per_user[has_ratings] / ratings_count_per_user[has_ratings]
    - average_rating)
print("Bias user:", bias_user)

# b_M,m: bias of movie m compared to the average r_bar, i.e. the mean of the
# movie's non-zero ratings minus the global average.
ratings_sum_per_movie = train_matrix.sum(axis=0)
ratings_count_per_movie = np.count_nonzero(train_matrix, axis=0)
# Same guard as above for movies that nobody rated in the training data.
bias_movie = np.zeros(shape=nr_train_movies)
was_rated = ratings_count_per_movie > 0
bias_movie[was_rated] = (
    ratings_sum_per_movie[was_rated] / ratings_count_per_movie[was_rated]
    - average_rating)
print("\nBias movie:", bias_movie)

Bias user: [ 0.47142688 -0.44523978  0.15080541 ...  0.01418051  0.2342474
 -0.08249469]

Bias movie: [ 0.51722741 -0.3539879  -0.86190645 ... -1.17072998  0.03653105
  0.23555118]


In [9]:
# TRAINING r_um: predicted rating for user u of movie m
#   r_um = average_rating + bias_user[u] + bias_movie[m]
# The biases are looked up through the edge list itself (ids are 1-based,
# hence the -1), so prediction k always belongs to the same (user, movie)
# pair as train_ratings[k], whatever the row order of the training file is.
predicted_ratings_training = (
    average_rating + bias_user[train_users - 1] + bias_movie[train_movies - 1])

# Truncate predicted ratings by setting predictions that fall below 1 to 1
#  and any prediction that exceeds 5 to 5
predicted_ratings_training = np.clip(predicted_ratings_training, 1, 5)

print("Simple baseline solution")
print("Predicted Training Ratings:\n", predicted_ratings_training)
print("Actual Training Ratings:\n", train_ratings)

# TRAINING Root Mean Squared Error
# The residual (actual - predicted) is computed once and reused below.
gradient_error_training = train_ratings - predicted_ratings_training
print("\nGradient Error Training:", gradient_error_training)
avg_gradient_error_training = np.mean(gradient_error_training)
print("Average Gradient Error Training:\n", avg_gradient_error_training)
# math.sqrt keeps rmse_training a plain Python float, as before.
rmse_training = math.sqrt(np.mean(gradient_error_training ** 2))
print("Root Squared Mean Error Training:", rmse_training)

Simple baseline solution
Predicted Training Ratings:
 [4.12947231 4.23151312 4.34262423 ... 3.87047181 4.33465028 3.43555118]
Actual Training Ratings:
 [5 5 5 ... 4 5 4]

Gradient Error Training: [0.87052769 0.76848688 0.65737577 ... 0.12952819 0.66534972 0.56444882]
Average Gradient Error Training:
 0.0020989346353743193
Root Squared Mean Error Training: 0.909629009558173


In [10]:
# TESTING r_um: predicted rating for user u of movie m
#   r_um = average_rating + bias_user[u] + bias_movie[m]
# The biases are looked up through the test edge list itself (ids are
# 1-based, hence the -1), so prediction k always belongs to the same
# (user, movie) pair as test_ratings[k], whatever the row order of the test
# file is.
# NOTE(review): the bias vectors are sized from the training data, so a test
# id larger than the training maximum raises IndexError here.
predicted_ratings_testing = (
    average_rating + bias_user[test_users - 1] + bias_movie[test_movies - 1])

# Truncate predicted ratings by setting predictions that fall below 1 to 1
#  and any prediction that exceeds 5 to 5
predicted_ratings_testing = np.clip(predicted_ratings_testing, 1, 5)

print("Simple baseline solution")
print("Predicted Testing Ratings:\n", predicted_ratings_testing)
print("Actual Testing Ratings:\n", test_ratings)

# TESTING Root Mean Squared Error
# The residual (actual - predicted) is computed once and reused below.
gradient_error_testing = test_ratings - predicted_ratings_testing
print("\nGradient Error Testing:", gradient_error_testing)
avg_gradient_error_testing = np.mean(gradient_error_testing)
print("Average Gradient Error Testing:\n", avg_gradient_error_testing)
# math.sqrt keeps rmse_testing a plain Python float, as before.
rmse_testing = math.sqrt(np.mean(gradient_error_testing ** 2))
print("Root Squared Mean Error Testing:", rmse_testing)

Simple baseline solution
Predicted Testing Ratings:
 [4.27993927 4.96884031 4.13216994 ... 3.58948376 3.55297856 4.31811998]
Actual Testing Ratings:
 [5 5 2 ... 4 4 4]

Gradient Error Testing: [ 0.72006073  0.03115969 -2.13216994 ...  0.41051624  0.44702144
 -0.31811998]
Average Gradient Error Testing:
 0.001967901290099149
Root Squared Mean Error Testing: 0.9263492191061035


In [11]:
# TASK 1: ***LEAST SQUARE SOLUTION***

# Every observed rating contributes one equation
#     b_U,u + b_M,m = rating - average_rating
# Row k of A has a 1 in column u (user part) and a 1 in column
# nr_train_users + m (movie part); y holds the mean-centred ratings.
# NOTE(review): A is dense with train_size x (nr_train_users + nr_train_movies)
# float64 entries and at most two non-zeros per row; a scipy.sparse matrix
# would need far less memory - worth considering.
matrix_A = np.zeros((train_size, nr_train_users + nr_train_movies))
vector_y = np.zeros(train_size)

# Use at most train_size index pairs (same truncation as pairing the
# indices with range(train_size)).
observed = train_indices[:train_size]
equation_ids = np.arange(len(observed))
user_columns = observed[:, 0]
movie_columns = observed[:, 1] + nr_train_users

matrix_A[equation_ids, user_columns] = 1
matrix_A[equation_ids, movie_columns] = 1
vector_y[:len(observed)] = train_matrix[observed[:, 0], observed[:, 1]] - average_rating

print("Matrix A:\n", matrix_A)
print("\nVector y:\n", vector_y)

# Bias: least-squares solution of A b = y. The first nr_train_users entries
# are the user biases, the remaining ones the movie biases.
# rcond=1e-3 is the relative cut-off for small singular values of A.
bias_lstsq = np.linalg.lstsq(matrix_A, vector_y, rcond=1e-3)[0]
print("\nLeast squared solutions (bias):\n", bias_lstsq)

Matrix A:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]

Vector y:
 [1.38809355 1.38809355 1.38809355 ... 0.38809355 1.38809355 0.38809355]

Least squared solutions (bias):
 [-0.0721962  -0.71452885 -0.2149811  ... -1.12434221  0.14135516
  0.35246021]


In [12]:
# TRAINING r_um: predicted rating for user u of movie m
#   r_um = average_rating + bias_lstsq[u] + bias_lstsq[nr_train_users + m]
# bias_lstsq stores the user biases first and the movie biases after them.
# The biases are looked up through the edge list itself (ids are 1-based,
# hence the -1), so prediction k always belongs to the same (user, movie)
# pair as train_ratings[k], whatever the row order of the training file is.
user_bias_training = bias_lstsq[train_users - 1]
movie_bias_training = bias_lstsq[train_movies - 1 + nr_train_users]
predicted_ratings_training = (
    average_rating + user_bias_training + movie_bias_training)

# Truncate predicted ratings by setting predictions that fall below 1 to 1
#  and any prediction that exceeds 5 to 5
predicted_ratings_training = np.clip(predicted_ratings_training, 1, 5)

print("Least Squared Solution")
print("Predicted Training Ratings:\n", predicted_ratings_training)
print("Actual Training Ratings:\n", train_ratings)

# TRAINING Root Mean Squared Error
# The residual (actual - predicted) is computed once and reused below.
gradient_error_training = train_ratings - predicted_ratings_training
print("\nGradient Error Training:", gradient_error_training)
avg_gradient_error_training = np.mean(gradient_error_training)
print("Average Gradient Error Training:\n", avg_gradient_error_training)
# math.sqrt keeps rmse_training a plain Python float, as before.
rmse_training = math.sqrt(np.mean(gradient_error_training ** 2))
print("Root Squared Mean Error Training:", rmse_training)

Least Squared Solution
Predicted Training Ratings:
 [4.19541185 4.1363209  4.15562285 ... 3.74162581 4.16993549 3.32069999]
Actual Training Ratings:
 [5 5 5 ... 4 5 4]

Gradient Error Training: [0.80458815 0.8636791  0.84437715 ... 0.25837419 0.83006451 0.67930001]
Average Gradient Error Training:
 0.0004059375679277457
Root Squared Mean Error Training: 0.8835469255427403


In [13]:
# TESTING r_um: predicted rating for user u of movie m
#   r_um = average_rating + bias_lstsq[u] + bias_lstsq[nr_train_users + m]
# bias_lstsq stores the user biases first and the movie biases after them.
# The biases are looked up through the test edge list itself (ids are
# 1-based, hence the -1), so prediction k always belongs to the same
# (user, movie) pair as test_ratings[k], whatever the row order of the test
# file is.
# NOTE(review): bias_lstsq is sized from the training data, so a test movie
# id above the training maximum raises IndexError, and a test user id above
# it would silently read a movie bias - confirm the test ids are covered.
user_bias_testing = bias_lstsq[test_users - 1]
movie_bias_testing = bias_lstsq[test_movies - 1 + nr_train_users]
predicted_ratings_testing = (
    average_rating + user_bias_testing + movie_bias_testing)

# Truncate predicted ratings by setting predictions that fall below 1 to 1
#  and any prediction that exceeds 5 to 5
predicted_ratings_testing = np.clip(predicted_ratings_testing, 1, 5)

print("Least Squared Solution")
print("Predicted Testing Ratings:\n", predicted_ratings_testing)
print("Actual Testing Ratings:\n", test_ratings)

# TESTING Root Mean Squared Error
# The residual (actual - predicted) is computed once and reused below.
gradient_error_testing = test_ratings - predicted_ratings_testing
print("\nGradient Error Testing:", gradient_error_testing)
avg_gradient_error_testing = np.mean(gradient_error_testing)
print("Average Gradient Error Testing:\n", avg_gradient_error_testing)
# math.sqrt keeps rmse_testing a plain Python float, as before.
rmse_testing = math.sqrt(np.mean(gradient_error_testing ** 2))
print("Root Squared Mean Error Testing:", rmse_testing)

Least Squared Solution
Predicted Testing Ratings:
 [4.0067127  4.56569567 4.1958912  ... 3.76630292 3.84100552 3.94379735]
Actual Testing Ratings:
 [5 5 2 ... 4 4 4]

Gradient Error Testing: [ 0.9932873   0.43430433 -2.1958912  ...  0.23369708  0.15899448
  0.05620265]
Average Gradient Error Testing:
 0.00040553458619866435
Root Squared Mean Error Testing: 0.8992469584163942
