In [1]:
# Author: Sylvain Lapeyrade (sylla801)
# File: TSKS11-Lab6 task2_sylvainl.ipynb

import numpy as np
from scipy.spatial import distance
import math

In [2]:
# Three data sets are declared here; each one is a (training, test, moviename)
# triple of file paths. Only the unsuffixed "Task 2" names are read by the
# cells that follow.

# Validation data. Reference results recorded for it:
#   RMSE, training data = 0.886
#   RMSE, test data     = 0.898
train_file1, test_file1, movie_file1 = (
    './validate_code_netflix/validate_code.training',
    './validate_code_netflix/validate_code.test',
    './validate_code_netflix/validate_code.moviename',
)

# Personal files (sylvainl). Reference results recorded for them:
#   RMSE, training data = 0.8835
#   RMSE, test data     = 0.8992
train_file2, test_file2, movie_file2 = (
    './sylvainl/training/sylvainl.training',
    './sylvainl/test/sylvainl.test',
    './sylvainl/moviename/sylvainl.moviename',
)

# Task 2 data set (the one actually loaded below):
#   RMSE, training data = ???
#   RMSE, test data     = ???
train_file, test_file, movie_file = (
    './sylvainl/training/task---2.training',
    './sylvainl/test/task---2.test',
    './sylvainl/moviename/task---2.moviename',
)

In [3]:
# *** TRAINING ***
# Read the comma-separated training file as integers; the three columns are
# used as (user id, movie id, rating).
train_edge_list = np.loadtxt(train_file, delimiter=',', dtype=int)
train_users = train_edge_list[:, 0]
train_movies = train_edge_list[:, 1]
train_ratings = train_edge_list[:, 2]

print("Train edge list:")
print("User, Movie, Rating")
print(train_edge_list)

# Number of samples, and the largest user / movie id seen; the largest ids
# are used as the matrix dimensions below.
train_size = train_edge_list.shape[0]
nr_train_users = train_users.max()
nr_train_movies = train_movies.max()

print("\nTrain size: {}".format(train_size))
print("Training users: {}".format(nr_train_users))
print("Training movies: {}".format(nr_train_movies))

# Training matrix (A): one row per user, one column per movie. Cells that are
# never written stay 0, which the later cells treat as "no rating".
train_matrix = np.zeros(shape=(nr_train_users, nr_train_movies))
# Ids are shifted by -1 to become 0-based positions. All ratings are written
# in one indexed assignment; for a repeated (user, movie) pair the last row of
# the file wins.
train_matrix[train_users - 1, train_movies - 1] = train_ratings

print("\nTraining Matrix:\n {}".format(train_matrix))

Train edge list:
User, Movie, Rating
[[   4    1    4]
 [   7    1    3]
 [  12    1    4]
 ...
 [1885 1500    3]
 [1934 1500    4]
 [1989 1500    4]]

Train size: 183183
Training users: 2000
Training movies: 1500

Training Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [4]:
# TRAINING indices: 0-based (user, movie) positions of every non-zero entry
# of the training matrix, i.e. every place where a user rated a movie.
train_nz_users, train_nz_movies = np.nonzero(train_matrix)

# Order the pairs by movie first and by user within each movie.
# np.lexsort treats its LAST key as the primary sort key.
train_order = np.lexsort((train_nz_users, train_nz_movies))
train_indices = np.column_stack(
    (train_nz_users[train_order], train_nz_movies[train_order]))

print("Train Indices")
print("User, Movie\n", train_indices)

Train Indices
User, Movie
 [[   3    0]
 [   6    0]
 [  11    0]
 ...
 [1884 1499]
 [1933 1499]
 [1988 1499]]


In [5]:
# *** TESTING ***
# Read the comma-separated test file as integers; the three columns are used
# as (user id, movie id, rating).
test_edge_list = np.loadtxt(test_file, delimiter=',', dtype=int)
test_users = test_edge_list[:, 0]
test_movies = test_edge_list[:, 1]
test_ratings = test_edge_list[:, 2]

print("Test edge list:")
print("User, Movie, Rating")
print(test_edge_list)

# Number of samples, and the largest user / movie id seen; the largest ids
# are used as the matrix dimensions below.
test_size = test_edge_list.shape[0]
nr_test_users = test_users.max()
nr_test_movies = test_movies.max()

print("\nTest size: {}".format(test_size))
print("Testing users: {}".format(nr_test_users))
print("Testing movies: {}".format(nr_test_movies))

# Test matrix (A): one row per user, one column per movie. Cells that are
# never written stay 0, which the later cells treat as "no rating".
test_matrix = np.zeros(shape=(nr_test_users, nr_test_movies))
# Ids are shifted by -1 to become 0-based positions. All ratings are written
# in one indexed assignment; for a repeated (user, movie) pair the last row of
# the file wins.
test_matrix[test_users - 1, test_movies - 1] = test_ratings

print("\nTesting Matrix:\n {}".format(test_matrix))

Test edge list:
User, Movie, Rating
[[   1    1    5]
 [  28    1    4]
 [  66    1    4]
 ...
 [ 921 1500    3]
 [1058 1500    4]
 [1580 1500    4]]

Test size: 20354
Testing users: 2000
Testing movies: 1500

Testing Matrix:
 [[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [6]:
# TESTING indices: 0-based (user, movie) positions of every non-zero entry
# of the test matrix, i.e. every place where a user rated a movie.
test_nz_users, test_nz_movies = np.nonzero(test_matrix)

# Order the pairs by movie first and by user within each movie.
# np.lexsort treats its LAST key as the primary sort key.
test_order = np.lexsort((test_nz_users, test_nz_movies))
test_indices = np.column_stack(
    (test_nz_users[test_order], test_nz_movies[test_order]))

print("Test Indices")
print("User, Movie\n", test_indices)

Test Indices
User, Movie
 [[   0    0]
 [  27    0]
 [  65    0]
 ...
 [ 920 1499]
 [1057 1499]
 [1579 1499]]


In [7]:
# *** BASELINE PREDICTOR ***
# Model: r_um = r_bar + b_U,u + b_M,m
# where the user bias b_U,u and the movie bias b_M,m are chosen to minimize
# the root mean-square error (RMSE).

# r_bar: average rating over all users and movies. Unrated cells of the
# training matrix are 0, so the matrix total equals the sum of the ratings;
# it is divided by the number of training samples.
average_rating = train_matrix.sum() / train_size
print("Average Rating Training:", average_rating)

Average Rating Training: 3.5720508999197524


In [8]:
# TASK 0: *** SIMPLE BASELINE PREDICTOR ***
# Each bias is "mean rating of that user (or movie) minus the global average".
# Unrated cells of train_matrix are 0, so a row/column sum is the sum of the
# ratings and the non-zero count is the number of ratings.

# b_U,u: bias of user u compared to the average r_bar
user_rating_sums = train_matrix.sum(axis=1)
user_rating_counts = np.count_nonzero(train_matrix, axis=1)
users_with_ratings = user_rating_counts > 0
# A user without any training rating has no mean; dividing by the zero count
# would give NaN and poison every prediction that uses it. Such users keep a
# bias of 0 instead.
bias_user = np.zeros(shape=nr_train_users)
bias_user[users_with_ratings] = (
    user_rating_sums[users_with_ratings] / user_rating_counts[users_with_ratings]
    - average_rating)
print("Bias user:", bias_user)

# b_M,m: bias of movie m compared to the average r_bar
movie_rating_sums = train_matrix.sum(axis=0)
movie_rating_counts = np.count_nonzero(train_matrix, axis=0)
movies_with_ratings = movie_rating_counts > 0
# Same guard as for users: movies that nobody rated keep a bias of 0.
bias_movie = np.zeros(shape=nr_train_movies)
bias_movie[movies_with_ratings] = (
    movie_rating_sums[movies_with_ratings] / movie_rating_counts[movies_with_ratings]
    - average_rating)
print("\nBias movie:", bias_movie)

Bias user: [ 0.64223481 -0.47555967  0.49094123 ...  0.59461577 -0.25900742
  0.51128243]

Bias movie: [ 0.57385074 -0.41861175 -0.53431505 ...  0.04184871  0.6463399
  0.16070772]


In [9]:
# TRAINING r_um: predicted rating for user u of movie m (simple baseline)
# train_indices has one row per rated (user, movie) cell, 0-based.
train_user_idx = train_indices[:, 0]
train_movie_idx = train_indices[:, 1]

# Take the true ratings from the matrix at exactly the cells being predicted.
# Comparing against train_ratings directly is only correct when the input
# file happens to be in the same order as train_indices and has no duplicate
# pairs or zero ratings.
actual_ratings_training = train_matrix[train_user_idx, train_movie_idx].astype(
    train_ratings.dtype)

# Truncate predicted ratings by setting predictions that fall below 1 to 1
#  and any prediction that exceeds 5 to 5
predicted_ratings_training = np.clip(
    average_rating + bias_user[train_user_idx] + bias_movie[train_movie_idx], 1, 5)

print("Simple baseline solution")
print("Predicted Training Ratings:\n", predicted_ratings_training)
print("Actual Training Ratings:\n", actual_ratings_training)

# TRAINING Root Mean Squared Error
# "Gradient error" is the per-sample residual: actual minus predicted.
gradient_error_training = actual_ratings_training - predicted_ratings_training
print("\nGradient Error Training:", gradient_error_training)
avg_gradient_error_training = np.mean(gradient_error_training)
print("Average Gradient Error Training:\n", avg_gradient_error_training)
rmse_training = math.sqrt(np.mean(gradient_error_training ** 2))
print("Root Squared Mean Error Training:", rmse_training)

Simple baseline solution
Predicted Training Ratings:
 [4.2032625  4.31459148 3.66750633 ... 3.9237135  4.06514458 3.41843968]
Actual Training Ratings:
 [4 3 4 ... 3 4 4]

Gradient Error Training: [-0.2032625  -1.31459148  0.33249367 ... -0.9237135  -0.06514458
  0.58156032]
Average Gradient Error Training:
 0.0019502172812095047
Root Squared Mean Error Training: 0.9220415429071303


In [10]:
# TESTING r_um: predicted rating for user u of movie m (simple baseline)
# test_indices has one row per rated (user, movie) cell, 0-based.
# NOTE(review): assumes every test user / movie index also exists in
# bias_user / bias_movie (sized from the training data) -- confirm for new data.
test_user_idx = test_indices[:, 0]
test_movie_idx = test_indices[:, 1]

# Take the true ratings from the matrix at exactly the cells being predicted.
# Comparing against test_ratings directly is only correct when the input file
# happens to be in the same order as test_indices and has no duplicate pairs
# or zero ratings.
actual_ratings_testing = test_matrix[test_user_idx, test_movie_idx].astype(
    test_ratings.dtype)

# Truncate predicted ratings by setting predictions that fall below 1 to 1
#  and any prediction that exceeds 5 to 5
predicted_ratings_testing = np.clip(
    average_rating + bias_user[test_user_idx] + bias_movie[test_movie_idx], 1, 5)

print("Simple baseline solution")
print("Predicted Testing Ratings:\n", predicted_ratings_testing)
print("Actual Testing Ratings:\n", actual_ratings_testing)

# TESTING Root Mean Squared Error
# "Gradient error" is the per-sample residual: actual minus predicted.
gradient_error_testing = actual_ratings_testing - predicted_ratings_testing
print("\nGradient Error Testing:", gradient_error_testing)
avg_gradient_error_testing = np.mean(gradient_error_testing)
print("Average Gradient Error Testing:\n", avg_gradient_error_testing)
rmse_testing = math.sqrt(np.mean(gradient_error_testing ** 2))
print("Root Squared Mean Error Testing:", rmse_testing)

Simple baseline solution
Predicted Testing Ratings:
 [4.78813645 3.28218407 5.         ... 3.60138569 3.50616227 4.10515217]
Actual Testing Ratings:
 [5 4 4 ... 3 4 4]

Gradient Error Testing: [ 0.21186355  0.71781593 -1.         ... -0.60138569  0.49383773
 -0.10515217]
Average Gradient Error Testing:
 0.003039158661004881
Root Squared Mean Error Testing: 0.934499976267962


In [11]:
# TASK 1: ***LEAST SQUARE SOLUTION***
# Build the linear system A x = y with one equation per training rating.
# Columns [0, nr_train_users) of A stand for the user biases and the
# following nr_train_movies columns for the movie biases; each row has a 1 in
# the column of its user and a 1 in the column of its movie.
# NOTE(review): A is a dense float array of shape
# (train_size, nr_train_users + nr_train_movies); with the sizes printed above
# that is roughly 5 GB -- confirm the machine has enough memory.
matrix_A = np.zeros((train_size, nr_train_users + nr_train_movies))
vector_y = np.zeros(train_size)

# Use at most train_size index pairs, one per equation.
rated_pairs = train_indices[:train_size]
equation_rows = np.arange(len(rated_pairs))
user_columns = rated_pairs[:, 0]
movie_columns = rated_pairs[:, 1]

matrix_A[equation_rows, user_columns] = 1
matrix_A[equation_rows, movie_columns + nr_train_users] = 1
# Right-hand side: the rating with the global average removed.
vector_y[equation_rows] = train_matrix[user_columns, movie_columns] - average_rating

print("Matrix A:\n", matrix_A)
print("\nVector y:\n", vector_y)

# Bias: least-squares solution of A x = y. Only the solution vector is kept;
# rcond=1e-3 makes lstsq treat singular values below 1e-3 times the largest
# one as zero.
bias_lstsq = np.linalg.lstsq(matrix_A, vector_y, rcond=1e-3)[0]
print("\nLeast squared solutions (bias):\n", bias_lstsq)

Matrix A:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]

Vector y:
 [ 0.4279491 -0.5720509  0.4279491 ... -0.5720509  0.4279491  0.4279491]

Least squared solutions (bias):
 [ 0.2733828  -0.7259397   0.29693682 ...  0.13029496  0.81813896
  0.28751214]


In [21]:
# TRAINING r_um: predicted rating for user u of movie m (least squares)
# train_indices has one row per rated (user, movie) cell, 0-based.
train_user_idx = train_indices[:, 0]
train_movie_idx = train_indices[:, 1]

# bias_lstsq stores the user biases first, then the movie biases, so a movie
# bias sits at offset nr_train_users.
raw_predictions_training = (average_rating + bias_lstsq[train_user_idx]
                            + bias_lstsq[train_movie_idx + nr_train_users])

# Matrix view of the predictions; it keeps the raw (untruncated) values and
# stays 0 wherever there is no training rating.
train_ratings_predicted = np.zeros(shape=(nr_train_users, nr_train_movies))
train_ratings_predicted[train_user_idx, train_movie_idx] = raw_predictions_training

# Truncate predicted ratings by setting predictions that fall below 1 to 1
#  and any prediction that exceeds 5 to 5
predicted_ratings_training = np.clip(raw_predictions_training, 1, 5)

# Take the true ratings from the matrix at exactly the cells being predicted.
# Comparing against train_ratings directly is only correct when the input
# file happens to be in the same order as train_indices and has no duplicate
# pairs or zero ratings.
actual_ratings_training = train_matrix[train_user_idx, train_movie_idx].astype(
    train_ratings.dtype)

print("Least Squared Solution")
print("Predicted Training Ratings:\n", predicted_ratings_training)
print("Actual Training Ratings:\n", actual_ratings_training)

# TRAINING Root Mean Squared Error
# "Gradient error" is the per-sample residual: actual minus predicted.
gradient_error_training = actual_ratings_training - predicted_ratings_training
print("\nGradient Error Training:", gradient_error_training)
avg_gradient_error_training = np.mean(gradient_error_training)
print("Average Gradient Error Training:\n", avg_gradient_error_training)
rmse_training = math.sqrt(np.mean(gradient_error_training ** 2))
print("Root Squared Mean Error Training:", rmse_training)

Least Squared Solution
Predicted Training Ratings:
 [4.30348489 3.85870244 3.787168   ... 4.20698457 4.27459659 3.24544658]
Actual Training Ratings:
 [4 3 4 ... 3 4 4]

Gradient Error Training: [-0.30348489 -0.85870244  0.212832   ... -1.20698457 -0.27459659
  0.75455342]
Average Gradient Error Training:
 0.0004871461785553018
Root Squared Mean Error Training: 0.8957232336188603


In [13]:
# TESTING r_um: predicted rating for user u of movie m (least squares)
# test_indices has one row per rated (user, movie) cell, 0-based.
# NOTE(review): assumes every test user / movie index has a matching entry in
# bias_lstsq (sized from the training data) -- confirm for new data.
test_user_idx = test_indices[:, 0]
test_movie_idx = test_indices[:, 1]

# bias_lstsq stores the user biases first, then the movie biases, so a movie
# bias sits at offset nr_train_users.
# Truncate predicted ratings by setting predictions that fall below 1 to 1
#  and any prediction that exceeds 5 to 5
predicted_ratings_testing = np.clip(
    average_rating + bias_lstsq[test_user_idx]
    + bias_lstsq[test_movie_idx + nr_train_users], 1, 5)

# Take the true ratings from the matrix at exactly the cells being predicted.
# Comparing against test_ratings directly is only correct when the input file
# happens to be in the same order as test_indices and has no duplicate pairs
# or zero ratings.
actual_ratings_testing = test_matrix[test_user_idx, test_movie_idx].astype(
    test_ratings.dtype)

print("Least Squared Solution")
print("Predicted Testing Ratings:\n", predicted_ratings_testing)
print("Actual Testing Ratings:\n", actual_ratings_testing)

# TESTING Root Mean Squared Error
# "Gradient error" is the per-sample residual: actual minus predicted.
gradient_error_testing = actual_ratings_testing - predicted_ratings_testing
print("\nGradient Error Testing:", gradient_error_testing)
avg_gradient_error_testing = np.mean(gradient_error_testing)
print("Average Gradient Error Testing:\n", avg_gradient_error_testing)
rmse_testing = math.sqrt(np.mean(gradient_error_testing ** 2))
print("Root Squared Mean Error Testing:", rmse_testing)

Least Squared Solution
Predicted Testing Ratings:
 [4.5143465  3.54752802 5.         ... 3.56867373 3.37054391 4.07890416]
Actual Testing Ratings:
 [5 4 4 ... 3 4 4]

Gradient Error Testing: [ 0.4856535   0.45247198 -1.         ... -0.56867373  0.62945609
 -0.07890416]
Average Gradient Error Testing:
 0.0018268726759106677
Root Squared Mean Error Testing: 0.9101572909183229
