In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split

In [14]:
# Relative locations of the three MovieLens-1M data files used by this notebook.
ratings_path, users_path, movies_path = (
    'movielens_1m/ratings.dat',
    'movielens_1m/users.dat',
    'movielens_1m/movies.dat',
)

In [15]:
# The separator "::" is longer than one character, which pandas' C parser cannot
# handle; pandas would silently fall back to the Python parser and emit a
# ParserWarning. Selecting the Python engine explicitly yields the same frame
# without the warning.
# Column names are supplied via `names=` because they are not read from the file.
ratings = pd.read_csv(
    ratings_path,
    sep="::",
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    engine='python',
)

  """Entry point for launching an IPython kernel.


In [16]:
# Preview the first five parsed rows as a sanity check on the column layout.
ratings.head(n=5)

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


Split into train and test data

In [17]:
# Seed NumPy's global RNG before splitting. No random_state is passed to
# train_test_split below, so the split presumably draws from this global state
# (scikit-learn's documented behavior for random_state=None — confirm for the
# installed version). Keep the seed call ahead of the split.
np.random.seed(1234)
# Fraction of the rating rows that goes into the training set.
x = 0.8
train, test = train_test_split(ratings, train_size=x)

In [18]:
# Distinct raw IDs that occur in the training split, in ascending order.
unique_users = sorted(set(train.UserID))
unique_movies = sorted(set(train.MovieID))

# Map each raw ID to its position in the sorted list, giving contiguous
# zero-based row indices for the factor matrices.
user_map = {user_id: pos for pos, user_id in enumerate(unique_users)}
movie_map = {movie_id: pos for pos, movie_id in enumerate(unique_movies)}

n_users, n_movies = len(unique_users), len(unique_movies)
print(f'\nNumber of users: {n_users} \n')
print(f'Number of movies: {n_movies}')



Number of users: 6040 

Number of movies: 3683


In [19]:
# Give the training frame a clean 0..n-1 index. Rebinding (instead of
# `inplace=True` without `drop`) keeps this cell re-runnable: the in-place
# variant inserts an 'index' column and fails on a second run because that
# column already exists.
train = train.reset_index(drop=True)

# Build the (user_index, movie_index, rating) triples in one vectorised pass
# instead of ~n label lookups on Series objects inside a Python loop.
# Every ID in `train` is a key of user_map / movie_map because the maps were
# built from this same frame.
samples = list(zip(
    train['UserID'].map(user_map).tolist(),
    train['MovieID'].map(movie_map).tolist(),
    train['Rating'].tolist(),
))

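Initialize the factor matrices P (users) and Q (movies), the user and movie bias vectors, and the global mean rating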

In [20]:
# Number of latent factors per user / movie.
k = 10

# Factor matrices start as small Gaussian noise with standard deviation 1/k.
# P is drawn before Q so the random stream is consumed in the same order.
P = np.random.normal(loc=0.0, scale=1./k, size=(n_users, k))
Q = np.random.normal(loc=0.0, scale=1./k, size=(n_movies, k))

# Per-user and per-movie biases start at zero.
b_u, b_i = np.zeros(n_users), np.zeros(n_movies)

# Global mean of the training ratings (third element of every sample triple).
mu = np.mean([rating for _, _, rating in samples])

In [21]:
def predict(P, Q, b_u, b_i, mu, u, i):
    """Return the model's rating estimate for user index ``u`` and item index ``i``.

    The estimate is ``mu + b_u[u] + b_i[i] + <P[u], Q[i]>``.

    Args:
        P: 2-D array of user factors, indexed by user row.
        Q: 2-D array of item factors, indexed by item row.
        b_u: per-user bias values, indexable by ``u``.
        b_i: per-item bias values, indexable by ``i``.
        mu: global offset added to every prediction.
        u: row index into ``P`` / ``b_u``.
        i: row index into ``Q`` / ``b_i``.

    Raises:
        AssertionError: if ``P`` and ``Q`` have a different number of columns.
    """
    assert P.shape[1] == Q.shape[1], 'Shapes of P and Q does not match!'
    interaction = np.inner(P[u, :], Q[i, :])
    return mu + b_u[u] + b_i[i] + interaction

def sgd(P, Q, b_u, b_i, mu, samples, lr=0.1, lambd=0.01):
    """Run one stochastic-gradient pass over ``samples``, updating the parameters in place.

    Args:
        P, Q: user / item factor matrices (modified in place).
        b_u, b_i: user / item bias vectors (modified in place).
        mu: global offset passed through to ``predict``.
        samples: iterable of ``(u, i, r)`` triples: user index, item index, rating.
        lr: step size applied to every update.
        lambd: regularisation weight applied to the parameter being updated.

    Returns:
        The same ``P, Q, b_u, b_i`` objects after the pass.
    """
    for u, i, r in samples:
        # Residual of the current model on this sample.
        e = r - predict(P, Q, b_u, b_i, mu, u, i)

        # Bias updates (item first, then user).
        b_i[i] += lr * (e - lambd*b_i[i])
        b_u[u] += lr * (e - lambd*b_u[u])

        # Row views into the factor matrices; in-place ops below write through.
        user_vec, item_vec = P[u, :], Q[i, :]

        # Both steps are computed from the pre-update factors, so the user
        # update does not see the freshly modified item factors.
        item_step = lr * (e*user_vec - lambd*item_vec)
        user_step = lr * (e*item_vec - lambd*user_vec)

        item_vec += item_step
        user_vec += user_step
    return P, Q, b_u, b_i

def mse(P, Q, b_u, b_i, mu, samples):
    """Mean squared error of the model over the observed ``samples``.

    Only the (user, item) pairs present in ``samples`` are evaluated; the dense
    users x items prediction matrix is never materialised. Each prediction is
    ``mu + b_u[u] + b_i[i] + <P[u], Q[i]>``.

    Args:
        P, Q: user / item factor matrices with the same number of columns.
        b_u, b_i: user / item bias vectors.
        mu: global offset added to every prediction.
        samples: sized sequence of ``(u, i, r)`` triples: user index, item
            index, observed rating.

    Returns:
        The average of ``(r - prediction) ** 2`` over all samples.

    Raises:
        ZeroDivisionError: if ``samples`` is empty (the mean is undefined).
    """
    if len(samples) == 0:
        raise ZeroDivisionError('mse is undefined for an empty set of samples')

    # Split the triples into three parallel index / value arrays.
    users, items, ratings = zip(*samples)
    users = np.asarray(users, dtype=np.intp)
    items = np.asarray(items, dtype=np.intp)
    ratings = np.asarray(ratings, dtype=float)

    # Row-wise dot product of the selected user and item factor vectors.
    interaction = np.einsum('nk,nk->n', P[users], Q[items])
    predicted = mu + np.asarray(b_u)[users] + np.asarray(b_i)[items] + interaction

    return np.mean((ratings - predicted) ** 2)

def get_full_matrix(P, Q, b_u, b_i, mu):
    """Return the dense matrix of predictions for every (user, item) pair.

    Entry ``[u, i]`` equals ``<P[u], Q[i]> + b_u[u] + b_i[i] + mu``. The biases
    and the global offset are added by broadcasting, in the same order as the
    terms above, rather than by Python loops and temporary nested lists.

    Args:
        P: user factor matrix, one row per user.
        Q: item factor matrix, one row per item.
        b_u: per-user biases, one value per row of ``P``.
        b_i: per-item biases, one value per row of ``Q``.
        mu: scalar offset added to every entry.

    Returns:
        A new 2-D array of shape ``(len(b_u), len(b_i))``; inputs are not modified.

    Raises:
        ValueError: if the bias lengths do not match the rows of ``P`` / ``Q``.
    """
    b_u = np.asarray(b_u)
    b_i = np.asarray(b_i)
    interaction = np.matmul(P, Q.transpose())

    # Without this check a length-1 bias vector would silently broadcast.
    if interaction.shape != (len(b_u), len(b_i)):
        raise ValueError(
            'bias lengths (%d, %d) do not match factor matrix shape %s'
            % (len(b_u), len(b_i), interaction.shape)
        )

    # Column vector broadcasts across items, row vector across users.
    return interaction + b_u[:, np.newaxis] + b_i[np.newaxis, :] + mu
        

In [22]:
# Training hyper-parameters: number of passes over the data, SGD step size,
# and regularisation weight.
n_iter = 100
lr = 0.1
lambd = 0.01

# History of (zero-based iteration index, training MSE) pairs.
training_process = []
for epoch in range(1, n_iter + 1):
    # Visit the samples in a fresh random order on every pass.
    np.random.shuffle(samples)
    P, Q, b_u, b_i = sgd(P, Q, b_u, b_i, mu, samples, lr=lr, lambd=lambd)
    error = mse(P, Q, b_u, b_i, mu, samples)
    training_process.append((epoch - 1, error))
    # Reporting interval is 1, i.e. every pass is logged.
    if epoch % 1 == 0:
        print("Iteration: %d ; error = %.4f" % (epoch, error))
    

Iteration: 1 ; error = 0.8061
Iteration: 2 ; error = 0.7645
Iteration: 3 ; error = 0.7441
Iteration: 4 ; error = 0.7350
Iteration: 5 ; error = 0.7303
Iteration: 6 ; error = 0.7314
Iteration: 7 ; error = 0.7273
Iteration: 8 ; error = 0.7239
Iteration: 9 ; error = 0.7286
Iteration: 10 ; error = 0.7271
Iteration: 11 ; error = 0.7237
Iteration: 12 ; error = 0.7221
Iteration: 13 ; error = 0.7197
Iteration: 14 ; error = 0.7203
Iteration: 15 ; error = 0.7222
Iteration: 16 ; error = 0.7189
Iteration: 17 ; error = 0.7240
Iteration: 18 ; error = 0.7209
Iteration: 19 ; error = 0.7216
Iteration: 20 ; error = 0.7230
Iteration: 21 ; error = 0.7220
Iteration: 22 ; error = 0.7217
Iteration: 23 ; error = 0.7186
Iteration: 24 ; error = 0.7223
Iteration: 25 ; error = 0.7204
Iteration: 26 ; error = 0.7240
Iteration: 27 ; error = 0.7196
Iteration: 28 ; error = 0.7216
Iteration: 29 ; error = 0.7221
Iteration: 30 ; error = 0.7156
Iteration: 31 ; error = 0.7243
Iteration: 32 ; error = 0.7187
Iteration: 33 ; e