In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split

In [2]:
# Paths to the ratings, users and movies .dat files, all under movielens_1m/
# relative to the notebook's working directory.
ratings_path, users_path, movies_path = (
    'movielens_1m/ratings.dat',
    'movielens_1m/users.dat',
    'movielens_1m/movies.dat',
)

In [3]:
# The file has no header row, so the column names are supplied explicitly.
# "::" is longer than one character, so pandas interprets it as a regular
# expression, which only the Python parsing engine supports. Requesting that
# engine explicitly avoids the ParserWarning emitted when pandas has to fall
# back from the default C engine on its own.
ratings = pd.read_csv(
    ratings_path,
    sep="::",
    engine="python",
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
)

  """Entry point for launching an IPython kernel.


In [4]:
# Preview the first rows to check that the four named columns were parsed.
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


Split into train and test data

In [5]:
# Seed NumPy's global RNG. train_test_split is called below without an explicit
# random_state, so this seed is presumably what makes the split repeatable
# (scikit-learn documents falling back to NumPy's global RNG in that case).
# NOTE(review): later np.random calls in this notebook draw from the same
# global stream, so the order of these statements matters.
np.random.seed(1234)
# train data ratio
x = 0.8
# Split the rating rows; a fraction x goes to `train`, the rest to `test`.
train, test = train_test_split(ratings, train_size=x)

In [6]:
# Collect the distinct user and movie IDs that occur in the training split,
# in ascending order.
unique_users = sorted(set(train.UserID))
unique_movies = sorted(set(train.MovieID))

# Map every raw ID to its position in the sorted list, giving contiguous
# 0-based indices that can address rows of the factor matrices.
user_map = {user_id: position for position, user_id in enumerate(unique_users)}
movie_map = {movie_id: position for position, movie_id in enumerate(unique_movies)}

n_users, n_movies = len(unique_users), len(unique_movies)
print(f'\nNumber of users: {n_users} \n')
print(f'Number of movies: {n_movies}')



Number of users: 6040 

Number of movies: 3683


In [7]:
# Give the training frame a 0..n-1 index, keeping the old labels in an 'index'
# column. The guard makes the cell safe to re-run: an unconditional
# reset_index would push yet another copy of the index into the columns on
# every execution and eventually raise.
if 'index' not in train.columns:
    train = train.reset_index()

# Translate raw IDs to matrix row indices one column at a time rather than with
# a chained label lookup per row, then build (user_idx, movie_idx, rating)
# triples. Every ID in `train` is a key of the maps because the maps were built
# from this same frame.
samples = list(zip(
    train['UserID'].map(user_map).tolist(),
    train['MovieID'].map(movie_map).tolist(),
    train['Rating'].tolist(),
))

Initialize the latent-factor matrices P (one row per user) and Q (one row per movie)

In [8]:
# Number of latent factors per user / per movie.
k = 10
# Draw both factor matrices from a zero-mean normal distribution with standard
# deviation 1/k. The generator is consumed left to right, so P (n_users x k)
# is sampled before Q (n_movies x k).
P, Q = (
    np.random.normal(loc=0.0, scale=1.0 / k, size=(n_rows, k))
    for n_rows in (n_users, n_movies)
)

In [9]:
def predict(P, Q, u, i):
    """Return the predicted rating for user row ``u`` and item row ``i``.

    The prediction is the inner product of row ``u`` of ``P`` with row ``i``
    of ``Q``. An AssertionError is raised if the two matrices do not have the
    same number of columns.
    """
    user_dim, item_dim = P.shape[1], Q.shape[1]
    assert user_dim == item_dim, 'Shapes of P and Q does not match!'
    return np.inner(P[u], Q[i])

def sgd(P, Q, samples, lr=0.1, lambd=0.01):
    """Run one stochastic-gradient pass over ``samples``, updating P and Q in place.

    For each ``(u, i, r)`` triple the residual ``r - predict(P, Q, u, i)`` is
    computed, then row ``i`` of ``Q`` and row ``u`` of ``P`` are each moved by
    ``lr * (residual * other_row - lambd * own_row)``. The update of ``P`` uses
    the value row ``i`` of ``Q`` had before it was modified.

    Returns the same ``P`` and ``Q`` objects that were passed in.
    """
    for user, item, rating in samples:
        residual = rating - predict(P, Q, user, item)
        # Snapshot the item factors so the user update sees the pre-step values.
        item_factors_before = Q[item].copy()
        Q[item] += lr * (residual * P[user] - lambd * Q[item])
        P[user] += lr * (residual * item_factors_before - lambd * P[user])
    return P, Q

def mse(P, Q, samples):
    """Return the mean squared prediction error over ``samples``.

    Only the (user, item) pairs listed in ``samples`` are scored. The dense
    ``P @ Q.T`` matrix is deliberately not built: it was never read, and for
    large user/item counts it is by far the most expensive part of the call.

    Args:
        P: 2-D array; row ``u`` holds the latent factors of user index ``u``.
        Q: 2-D array; row ``i`` holds the latent factors of item index ``i``.
        samples: sized sequence of ``(u, i, r)`` triples, where ``u`` and ``i``
            are row indices into ``P`` and ``Q`` and ``r`` is the observed rating.

    Returns:
        The mean of ``(r - P[u] . Q[i]) ** 2`` over all samples, as a float.

    Raises:
        ValueError: if ``P`` and ``Q`` have a different number of columns.
        ZeroDivisionError: if ``samples`` is empty.
    """
    if P.shape[1] != Q.shape[1]:
        raise ValueError('P and Q must have the same number of columns')

    n_samples = len(samples)
    users = np.fromiter((s[0] for s in samples), dtype=np.intp, count=n_samples)
    items = np.fromiter((s[1] for s in samples), dtype=np.intp, count=n_samples)
    ratings = np.fromiter((s[2] for s in samples), dtype=float, count=n_samples)

    # Row-wise inner products of the selected user and item factor rows.
    predictions = (P[users] * Q[items]).sum(axis=1)
    residuals = ratings - predictions
    # Plain Python division so an empty sample list raises ZeroDivisionError
    # instead of silently yielding NaN.
    return float(np.dot(residuals, residuals)) / n_samples
        

In [10]:
# Training hyper-parameters.
n_iter = 100      # number of passes over the training samples
lr = 0.1          # SGD step size passed to sgd()
lambd = 0.01      # regularisation weight passed to sgd()
print_every = 1   # report the training error every `print_every` iterations

# NOTE(review): with these settings the logged training error hovers around
# 1.1 from iteration to iteration instead of decreasing steadily; a smaller
# step size may be worth trying.
training_process = []  # (iteration index, training MSE) pairs
for idx in range(n_iter):
    # Visit the samples in a fresh random order on every pass.
    np.random.shuffle(samples)
    P, Q = sgd(P, Q, samples, lr=lr, lambd=lambd)
    error = mse(P, Q, samples)
    training_process.append((idx, error))
    if (idx + 1) % print_every == 0:
        print("Iteration: %d ; error = %.4f" % (idx+1, error))
    

Iteration: 1 ; error = 1.1813
Iteration: 2 ; error = 1.1885
Iteration: 3 ; error = 1.1623
Iteration: 4 ; error = 1.1257
Iteration: 5 ; error = 1.1325
Iteration: 6 ; error = 1.1358
Iteration: 7 ; error = 1.1465
Iteration: 8 ; error = 1.1180
Iteration: 9 ; error = 1.1415
Iteration: 10 ; error = 1.1112
Iteration: 11 ; error = 1.1121
Iteration: 12 ; error = 1.1311
Iteration: 13 ; error = 1.1218
Iteration: 14 ; error = 1.1065
Iteration: 15 ; error = 1.1303
Iteration: 16 ; error = 1.1144
Iteration: 17 ; error = 1.1254
Iteration: 18 ; error = 1.1226
Iteration: 19 ; error = 1.1253
Iteration: 20 ; error = 1.1035
Iteration: 21 ; error = 1.1184
Iteration: 22 ; error = 1.1264
Iteration: 23 ; error = 1.1102
Iteration: 24 ; error = 1.1352
Iteration: 25 ; error = 1.1173
Iteration: 26 ; error = 1.1188
Iteration: 27 ; error = 1.1186
Iteration: 28 ; error = 1.1321
Iteration: 29 ; error = 1.1071
Iteration: 30 ; error = 1.1067
Iteration: 31 ; error = 1.1286
Iteration: 32 ; error = 1.1119
Iteration: 33 ; e