In [14]:
# Author: Sylvain Lapeyrade (sylla801)
# File: TSKS11-Lab6

import math

import numpy as np
from scipy import spatial


In [15]:
# Validation data set. Reference results for a correct implementation:
#   RMSE, training data = 0.886
#   RMSE, test data     = 0.898
train_file, test_file, movie_file = (
    './validate_code_netflix/validate_code.training',
    './validate_code_netflix/validate_code.test',
    './validate_code_netflix/validate_code.moviename',
)

# Personal data set (sylvainl).
train_file2, test_file2, movie_file2 = (
    './training/sylvainl.training',
    './test/sylvainl.test',
    './test/sylvainl.moviename',
)

# Task 2 data set.
train_file3, test_file3, movie_file3 = (
    './training/task---2.training',
    './test/task---2.test',
    './test/task---2.moviename',
)

In [16]:
# *** TRAINING ***
# Every row of the training file is one rating: "user,movie,rating".
# ndmin=2 keeps the result two-dimensional even when the file holds a single
# row, so the column slicing below always works.
train_edge_list = np.loadtxt(train_file, delimiter=',', dtype=int, ndmin=2)
train_users, train_movies, train_ratings = (
    train_edge_list[:, 0], train_edge_list[:, 1], train_edge_list[:, 2])

print("Train edge list:")
print("User, Movie, Rating")
print(train_edge_list)

train_size = len(train_edge_list)
# Ids are used as 1-based positions below, so the largest id gives the
# number of rows / columns needed.
nr_train_users = train_users.max()
nr_train_movies = train_movies.max()

print("\nTrain size: {}".format(train_size))
print("Training users: {}".format(nr_train_users))
print("Training movies: {}".format(nr_train_movies))

# Training matrix (A): entry [u - 1, m - 1] holds the rating user u gave to
# movie m; entries that are never written stay 0.
# Integer-array assignment replaces the per-rating Python loop; if a
# (user, movie) pair is repeated in the file the last rating wins, exactly as
# it did with the loop.
train_matrix = np.zeros(shape=(nr_train_users, nr_train_movies))
train_matrix[train_users - 1, train_movies - 1] = train_ratings

print("\nTraining Matrix:\n {}".format(train_matrix))

Train edge list:
User, Movie, Rating
[[   6    1    3]
 [  10    1    1]
 [  16    1    4]
 ...
 [1867 1500    3]
 [1887 1500    3]
 [1890 1500    4]]

Train size: 185850
Training users: 2000
Training movies: 1500

Training Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 4. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 3. 0.]
 ...
 [0. 4. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [17]:
# TRAINING indices: zero-based (user, movie) pairs that carry a rating
nz_train_users, nz_train_movies = np.nonzero(train_matrix)

# Order the pairs by movie first and by user within each movie.
# np.lexsort treats its LAST key as the primary one.
movie_major_order = np.lexsort((nz_train_users, nz_train_movies))
train_indices = np.column_stack(
    (nz_train_users[movie_major_order], nz_train_movies[movie_major_order]))

print("Train Indices")
print("User, Movie\n", train_indices)

Train Indices
User, Movie
 [[   5    0]
 [   9    0]
 [  15    0]
 ...
 [1866 1499]
 [1886 1499]
 [1889 1499]]


In [18]:
# *** TESTING ***
# Every row of the test file is one rating: "user,movie,rating".
# ndmin=2 keeps the result two-dimensional even when the file holds a single
# row, so the column slicing below always works.
test_edge_list = np.loadtxt(test_file, delimiter=',', dtype=int, ndmin=2)
test_users, test_movies, test_ratings = (
    test_edge_list[:, 0], test_edge_list[:, 1], test_edge_list[:, 2])

print("Test edge list:")
print("User, Movie, Rating")
print(test_edge_list)

test_size = len(test_edge_list)
# Ids are used as 1-based positions below, so the largest id gives the
# number of rows / columns needed.
nr_test_users = test_users.max()
nr_test_movies = test_movies.max()

print("\nTest size: {}".format(test_size))
print("Testing users: {}".format(nr_test_users))
print("Testing movies: {}".format(nr_test_movies))

# Test matrix (A): entry [u - 1, m - 1] holds the rating user u gave to
# movie m; entries that are never written stay 0.
# Integer-array assignment replaces the per-rating Python loop; if a
# (user, movie) pair is repeated in the file the last rating wins, exactly as
# it did with the loop.
test_matrix = np.zeros(shape=(nr_test_users, nr_test_movies))
test_matrix[test_users - 1, test_movies - 1] = test_ratings

print("\nTesting Matrix:\n {}".format(test_matrix))

Test edge list:
User, Movie, Rating
[[   4    1    5]
 [  60    1    3]
 [ 134    1    4]
 ...
 [1139 1500    1]
 [1371 1500    4]
 [1923 1500    5]]

Test size: 20651
Testing users: 2000
Testing movies: 1500

Testing Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [19]:
# TESTING indices: zero-based (user, movie) pairs that carry a rating
nz_test_users, nz_test_movies = np.nonzero(test_matrix)

# Order the pairs by movie first and by user within each movie.
# np.lexsort treats its LAST key as the primary one.
movie_major_order = np.lexsort((nz_test_users, nz_test_movies))
test_indices = np.column_stack(
    (nz_test_users[movie_major_order], nz_test_movies[movie_major_order]))

print("Test Indices")
print("User, Movie\n", test_indices)

Test Indices
User, Movie
 [[   3    0]
 [  59    0]
 [ 133    0]
 ...
 [1138 1499]
 [1370 1499]
 [1922 1499]]


In [20]:
# *** BASELINE PREDICTOR ***  r_um = r_bar + b_U,u + b_M,m
# where b_U,u and b_M,m minimize the root mean-square error (RMSE)

# r_bar: sum of every entry of the training matrix divided by the number of
# training ratings (unrated entries are 0 and add nothing to the sum)
total_training_rating = train_matrix.sum()
average_rating = total_training_rating / train_size
print("Average Rating Training:", average_rating)

Average Rating Training: 3.6066505246166263


In [21]:
# TASK 0: *** SIMPLE BASELINE PREDICTOR ***

# b_U,u: bias of user u compared to the average r_bar, i.e. the mean of the
# user's non-zero (rated) entries minus the global average.
# A user without any training rating would give 0 / 0 = NaN, which would then
# propagate into every prediction for that user and into the RMSE; such users
# keep a bias of 0 instead.
user_rating_counts = np.count_nonzero(train_matrix, axis=1)
user_rating_sums = train_matrix.sum(axis=1)
users_with_ratings = user_rating_counts > 0
bias_user = np.zeros(shape=nr_train_users)
bias_user[users_with_ratings] = (
    user_rating_sums[users_with_ratings] / user_rating_counts[users_with_ratings]
    - average_rating)
print("Bias user:", bias_user)

# b_M,m: bias of movie m compared to the average r_bar, computed the same way
# along the movie axis; movies without any training rating keep a bias of 0.
movie_rating_counts = np.count_nonzero(train_matrix, axis=0)
movie_rating_sums = train_matrix.sum(axis=0)
movies_with_ratings = movie_rating_counts > 0
bias_movie = np.zeros(shape=nr_train_movies)
bias_movie[movies_with_ratings] = (
    movie_rating_sums[movies_with_ratings] / movie_rating_counts[movies_with_ratings]
    - average_rating)
print("\nBias movie:", bias_movie)

Bias user: [ 0.3164264   0.65650737  0.214245   ... -0.06818899  0.19890503
 -1.04820897]

Bias movie: [-0.42616272  0.24205796 -0.47621574 ... -0.66725659  0.10917681
  0.58922576]


In [22]:
# TRAINING r_um: predicted rating for user u of movie m
# (global average + user bias + movie bias, for every rated pair at once)
train_user_idx = train_indices[:, 0]
train_movie_idx = train_indices[:, 1]
predicted_ratings_training = (
    average_rating + bias_user[train_user_idx] + bias_movie[train_movie_idx])

# Truncate predicted ratings by setting predictions that falls below 1 to 1
#  and any prediction that exceeds 5 to 5
predicted_ratings_training = np.clip(predicted_ratings_training, 1, 5)

# Read the actual ratings from the training matrix at the very same
# (user, movie) pairs, so predictions and targets are aligned regardless of
# the row order of the training file.
actual_ratings_training = train_matrix[train_user_idx, train_movie_idx].astype(int)

print("Simple baseline solution")
print("Predicted Training Ratings:\n", predicted_ratings_training)
print("Actual Training Ratings:\n", actual_ratings_training)

# TRAINING Root Mean Squared Error
# ("gradient error" is the per-rating residual: actual - predicted)
gradient_error_training = actual_ratings_training - predicted_ratings_training
print("\nGradient Error Training:", gradient_error_training)
avg_gradient_error_training = np.mean(gradient_error_training)
print("Average Gradient Error Training:\n", avg_gradient_error_training)
rmse_training = float(np.sqrt(np.mean(gradient_error_training ** 2)))
print("Root Squared Mean Error Training:", rmse_training)

Simple baseline solution
Predicted Training Ratings:
 [3.01243377 3.66686054 3.27195049 ... 3.65438867 4.42971656 4.30669788]
Actual Training Ratings:
 [3 1 4 ... 3 3 4]

Gradient Error Training: [-0.01243377 -2.66686054  0.72804951 ... -0.65438867 -1.42971656
 -0.30669788]
Average Gradient Error Training:
 0.0021524344043788255
Root Squared Mean Error Training: 0.9113285879625816


In [23]:
# TESTING r_um: predicted rating for user u of movie m
# (global average + user bias + movie bias, for every rated test pair at once)
test_user_idx = test_indices[:, 0]
test_movie_idx = test_indices[:, 1]
predicted_ratings_testing = (
    average_rating + bias_user[test_user_idx] + bias_movie[test_movie_idx])

# Truncate predicted ratings by setting predictions that falls below 1 to 1
#  and any prediction that exceeds 5 to 5
predicted_ratings_testing = np.clip(predicted_ratings_testing, 1, 5)

# Read the actual ratings from the test matrix at the very same
# (user, movie) pairs, so predictions and targets are aligned regardless of
# the row order of the test file.
actual_ratings_testing = test_matrix[test_user_idx, test_movie_idx].astype(int)

print("Simple baseline solution")
print("Predicted Testing Ratings:\n", predicted_ratings_testing)
print("Actual Testing Ratings:\n", actual_ratings_testing)

# TESTING Root Mean Squared Error
# ("gradient error" is the per-rating residual: actual - predicted)
gradient_error_testing = actual_ratings_testing - predicted_ratings_testing
print("\nGradient Error Testing:", gradient_error_testing)
avg_gradient_error_testing = np.mean(gradient_error_testing)
print("Average Gradient Error Testing:\n", avg_gradient_error_testing)
rmse_testing = float(np.sqrt(np.mean(gradient_error_testing ** 2)))
print("Root Squared Mean Error Testing:", rmse_testing)

Simple baseline solution
Predicted Testing Ratings:
 [3.77295232 1.80741392 3.35432509 ... 4.06422576 3.42584548 4.25589243]
Actual Testing Ratings:
 [5 3 4 ... 1 4 5]

Gradient Error Testing: [ 1.22704768  1.19258608  0.64567491 ... -3.06422576  0.57415452
  0.74410757]
Average Gradient Error Testing:
 -0.0006655708384069903
Root Squared Mean Error Testing: 0.9257029440118905


In [24]:
# TASK 1: ***LEAST SQUARE SOLUTION***

# One equation per rated (user, movie) pair:
#     b_U,u + b_M,m = r_um - r_bar
# Columns [0, nr_train_users) of A select the user bias, the remaining
# nr_train_movies columns select the movie bias.
# NOTE: matrix_A is a dense train_size x (nr_train_users + nr_train_movies)
# array of floats.
ls_users = train_indices[:, 0]
ls_movies = train_indices[:, 1]
n_equations = len(train_indices)
equation_rows = np.arange(n_equations)

matrix_A = np.zeros((train_size, nr_train_users + nr_train_movies))
matrix_A[equation_rows, ls_users] = 1
matrix_A[equation_rows, ls_movies + nr_train_users] = 1

vector_y = np.zeros(train_size)
vector_y[:n_equations] = train_matrix[ls_users, ls_movies] - average_rating

print("Matrix A:\n", matrix_A)
print("\nVector y:\n", vector_y)

# Bias: first element of the lstsq result is the solution vector
# (user biases followed by movie biases)
lstsq_result = np.linalg.lstsq(matrix_A, vector_y, rcond=1e-3)
bias_lstsq = lstsq_result[0]
print("\nLeast squared solutions (bias):\n", bias_lstsq)

Matrix A:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]

Vector y:
 [-0.60665052 -2.60665052  0.39334948 ... -0.60665052 -0.60665052
  0.39334948]

Least squared solutions (bias):
 [-0.19754469  0.25400514 -0.17869378 ... -0.47943225  0.22523022
  0.75502689]


In [31]:
# TRAINING r_um: predicted rating for user u of movie m
# bias_lstsq stores the user biases first, followed by the movie biases, so
# movie indices are shifted by nr_train_users.
train_user_idx = train_indices[:, 0]
train_movie_idx = train_indices[:, 1]
predicted_ratings_training = (
    average_rating
    + bias_lstsq[train_user_idx]
    + bias_lstsq[train_movie_idx + nr_train_users])

# Truncate predicted ratings by setting predictions that falls below 1 to 1
#  and any prediction that exceeds 5 to 5
predicted_ratings_training = np.clip(predicted_ratings_training, 1, 5)

# Read the actual ratings from the training matrix at the very same
# (user, movie) pairs, so predictions and targets are aligned regardless of
# the row order of the training file.
actual_ratings_training = train_matrix[train_user_idx, train_movie_idx].astype(int)

print("Least Squared Solution")
print("Predicted Training Ratings:\n", predicted_ratings_training)
print("Actual Training Ratings:\n", actual_ratings_training)

# TRAINING Root Mean Squared Error
# ("gradient error" is the per-rating residual: actual - predicted)
gradient_error_training = actual_ratings_training - predicted_ratings_training
print("\nGradient Error Training:", gradient_error_training)
avg_gradient_error_training = np.mean(gradient_error_training)
print("Average Gradient Error Training:\n", avg_gradient_error_training)
rmse_training = float(np.sqrt(np.mean(gradient_error_training ** 2)))
print("Root Squared Mean Error Training:", rmse_training)

Least Squared Solution
Predicted Training Ratings:
 [2.926481   3.17494391 3.22156063 ... 3.78964329 4.73287026 4.25806373]
Actual Training Ratings:
 [3 1 4 ... 3 3 4]

Gradient Error Training: [ 0.073519   -2.17494391  0.77843937 ... -0.78964329 -1.73287026
 -0.25806373]
Average Gradient Error Training:
 0.0004761822560547787
Root Squared Mean Error Training: 0.8859758692598938


In [32]:
# TESTING r_um: predicted rating for user u of movie m
# bias_lstsq stores the user biases first, followed by the movie biases, so
# movie indices are shifted by nr_train_users.
test_user_idx = test_indices[:, 0]
test_movie_idx = test_indices[:, 1]
predicted_ratings_testing = (
    average_rating
    + bias_lstsq[test_user_idx]
    + bias_lstsq[test_movie_idx + nr_train_users])

# Truncate predicted ratings by setting predictions that falls below 1 to 1
#  and any prediction that exceeds 5 to 5
predicted_ratings_testing = np.clip(predicted_ratings_testing, 1, 5)

# Read the actual ratings from the test matrix at the very same
# (user, movie) pairs, so predictions and targets are aligned regardless of
# the row order of the test file.
actual_ratings_testing = test_matrix[test_user_idx, test_movie_idx].astype(int)

print("Least Squared Solution")
print("Predicted Testing Ratings:\n", predicted_ratings_testing)
print("Actual Testing Ratings:\n", actual_ratings_testing)

# TESTING Root Mean Squared Error
# ("gradient error" is the per-rating residual: actual - predicted)
gradient_error_testing = actual_ratings_testing - predicted_ratings_testing
print("\nGradient Error Testing:", gradient_error_testing)
avg_gradient_error_testing = np.mean(gradient_error_testing)
print("Average Gradient Error Testing:\n", avg_gradient_error_testing)
rmse_testing = float(np.sqrt(np.mean(gradient_error_testing ** 2)))
print("Root Squared Mean Error Testing:", rmse_testing)

Least Squared Solution
Predicted Testing Ratings:
 [3.66226549 2.14073463 3.11560797 ... 4.04612729 3.56584903 4.18088163]
Actual Testing Ratings:
 [5 3 4 ... 1 4 5]

Gradient Error Testing: [ 1.33773451  0.85926537  0.88439203 ... -3.04612729  0.43415097
  0.81911837]
Average Gradient Error Testing:
 -0.00023389577710912534
Root Squared Mean Error Testing: 0.8975070398165736
