In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split

Loading data

In [2]:
ratings_path = 'movielens_1m/ratings.dat'
users_path = 'movielens_1m/users.dat'
movies_path = 'movielens_1m/movies.dat'

In [3]:
ratings = pd.read_csv(ratings_path, sep="::", names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

  """Entry point for launching an IPython kernel.


In [4]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


Split into train and test data

In [21]:
np.random.seed(1234)
# train data ratio
x = 0.8
train, test = train_test_split(ratings, train_size=x)

Count the number of unique users (m) and movies (n) in train set.

In [22]:
unique_users = sorted(list(set(train.UserID)))
unique_movies = sorted(list(set(train.MovieID)))
user_map, movie_map = {}, {}
for idx, userID in enumerate(unique_users):
    user_map[userID] = idx
for idx, itemID in enumerate(unique_movies):
    movie_map[itemID] = idx
n_users = len(unique_users)
n_movies = len(unique_movies)
print(f'\nNumber of users: {n_users} \n')
print(f'Number of movies: {n_movies}')



Number of users: 6040 

Number of movies: 3683


Convert the data above to a sparse m x n matrix, where each row is one user, 
each column is one movie, and the value is the rating. This is so-called 
"customer-product ratings matrix".

In [23]:
R = train.pivot(index='UserID', columns='MovieID', values='Rating')

In [24]:
print(R)

MovieID  1     2     3     4     5     6     7     8     9     10    ...  \
UserID                                                               ...   
1         5.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         NaN   NaN   NaN   NaN   NaN   2.0   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
6036      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6037      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6038      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6039      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6040      3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

MovieID  39

Compute the sparsity ratio.

In [25]:
total_count = n_users * n_movies
na_count = R.isna().sum().sum()
sparsity = round(float(na_count) / total_count, 5)
print(f'The sparsity ratio is {sparsity}')

The sparsity ratio is 0.96403


The paper suggests two approaches of filling the matrix: 


1) Using the average ratings for a customer,

2) or using the average ratings for a product.

Here we use the second approach to remove the sparsity.

In [26]:
 
for idx_movie in unique_movies:
    col = R[idx_movie]
    col.fillna(col.mean(), inplace=True)

In [27]:
print(R)

MovieID      1         2         3         4         5         6         7     \
UserID                                                                          
1        5.000000  3.178131  3.028133  2.764706  2.983193  3.862299  3.415301   
2        4.153614  3.178131  3.028133  2.764706  2.983193  3.862299  3.415301   
3        4.153614  3.178131  3.028133  2.764706  2.983193  3.862299  3.415301   
4        4.153614  3.178131  3.028133  2.764706  2.983193  3.862299  3.415301   
5        4.153614  3.178131  3.028133  2.764706  2.983193  2.000000  3.415301   
...           ...       ...       ...       ...       ...       ...       ...   
6036     4.153614  3.178131  3.028133  2.764706  2.983193  3.862299  3.415301   
6037     4.153614  3.178131  3.028133  2.764706  2.983193  3.862299  3.415301   
6038     4.153614  3.178131  3.028133  2.764706  2.983193  3.862299  3.415301   
6039     4.153614  3.178131  3.028133  2.764706  2.983193  3.862299  3.415301   
6040     3.000000  3.178131 

The paper suggests two normalization techniques: 
conversion of ratings to z-scores and subtraction of customer average for a product.

Here we use the second technique.

In [28]:
# First convert R to a numpy array

R_norm = R.copy().to_numpy()
mean_vals = []

for idx_user in range(n_users):
    row = R_norm[idx_user]
    mean_val = np.mean(row)
    mean_vals.append(mean_val)
    R_norm[idx_user] -= mean_val

mean_vals = np.array(mean_vals)
    

Set dimension of the latent vector space, and compute the SVD.

In [29]:
k = 6
Uk, Sk, Vk_t = svds(R_norm, k=k)

In [30]:
print(f'The shape of Uk is {Uk.shape}\n'
      f'The shape of Sk is {Sk.shape}\n'
      f'The shape of Vk_t is {Vk_t.shape}')



The shape of Uk is (6040, 6)
The shape of Sk is (6,)
The shape of Vk_t is (6, 3683)


In [31]:
Sk_sqrt = np.sqrt(Sk)
Sk_mat = np.diag(Sk_sqrt)

In [32]:
U = np.matmul(Uk, Sk_mat)
Vt = np.matmul(Sk_mat, Vk_t)
V = np.transpose(Vt)

In [33]:
def predict(U, V, mean_vals, idx_user, idx_item):
    mean_val = mean_vals[idx_user]
    user_vec = U[idx_user, :]
    item_vec = V[idx_item, :]
    pred_val = mean_val + np.inner(user_vec, item_vec)
    return pred_val

In [34]:
true_ratings = R.to_numpy()

In [35]:
idx = 0

In [44]:
idx += 1
uID = ratings['UserID'][idx]
mID = ratings['MovieID'][idx]
idx_user = user_map[uID]
idx_movie = movie_map[mID]
true_val = true_ratings[idx_user, idx_movie]
pred_val = predict(U, V, mean_vals, idx_user, idx_movie)
print(f'UserID = {uID}, MovieID = {mID}\n')
print(f'True rating is {true_val}\n')
print(f'Pred rating is {pred_val}\n')
print(f'Error is {abs(true_val - pred_val)}\n')

    

UserID = 1, MovieID = 919

True rating is 4.249107780157031

Pred rating is 4.336189338086129

Error is 0.087081557929098



Computing mean absolute error (MAE) of the predictions.

In [46]:
def compute_norm(matrix, mode='l2', axis=0):
    matrix = np.array(matrix)
    assert matrix.ndim > axis, 'something is wrong!'
    if mode == 'l2':
        squared = np.square(matrix)
        added = np.sum(squared, axis=axis)
        norm = np.sqrt(added)
    elif mode == 'l1':
        abs = np.abs(matrix)
        norm = np.sum(abs, axis=axis)
    elif mode == 'linf':
        abs = np.abs(matrix)
        norm = np.max(abs, axis=axis)
    else:
        raise ValueError('wrong choice of norm!')
    return norm


In [56]:
test.reset_index(inplace=True)
n_test_ratings = test.shape[0]
errors = []
for idx in range(n_test_ratings):
    uID = test['UserID'][idx]
    mID = test['MovieID'][idx]
    if uID not in user_map or mID not in movie_map:
        continue
    idx_user = user_map[uID]
    idx_movie = movie_map[mID]
    true_val = test['Rating'][idx]
    pred_val = predict(U, V, mean_vals, idx_user, idx_movie)
    diff = true_val - pred_val
    errors.append(diff)
mae_error = compute_norm(errors, mode='l1') / len(errors)
print(f'x = {x}, mae = {mae_error}')



x = 0.8, mae = 0.7433810836698924


In [54]:
test['UserID']

468372    2887
450418    2776
314880    1880
411694    2472
403138    2407
          ... 
469741    2894
604451    3675
835768    5025
687451    4115
595695    3624
Name: UserID, Length: 200042, dtype: int64