In this project, I implement a simple matrix factorization model for the MovieLens 25M dataset.

Importing Libraries

In [1]:
import warnings
from itertools import product

import numpy as np
import pandas  as pd
import matplotlib.pyplot as plt

Dataset:

In [2]:
# Movie metadata table (the displayed frame below has movieId, title and genres columns).
movies = pd.read_csv(filepath_or_buffer='ml-25m/movies.csv')

In [3]:
# Rating events table (the displayed frame below has userId, movieId, rating and timestamp columns).
ratings = pd.read_csv(filepath_or_buffer='ml-25m/ratings.csv')

In [4]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [5]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


Removing the columns we don't need

In [6]:
# Genres are not used by the factorization model, so discard the column.
movies = movies.drop(columns=['genres'])

In [7]:
# Timestamps are not used by the factorization model, so discard the column.
ratings = ratings.drop(columns=['timestamp'])

Merging the two datasets

In [8]:
# Attach the movie title to every rating row by joining on the shared movieId key.
df = pd.merge(ratings, movies, on='movieId')

In [9]:
df

Unnamed: 0,userId,movieId,rating,title
0,1,296,5.0,Pulp Fiction (1994)
1,3,296,5.0,Pulp Fiction (1994)
2,4,296,4.0,Pulp Fiction (1994)
3,5,296,4.0,Pulp Fiction (1994)
4,7,296,4.0,Pulp Fiction (1994)
...,...,...,...,...
25000090,162358,200192,2.0,Den frusna leoparden (1986)
25000091,162358,200194,2.0,Tough Luck (2004)
25000092,162386,139970,3.5,I Don't Speak English (1995)
25000093,162386,200726,4.0,The Graduates (1995)


Dropping samples from the dataset to reduce run time

In [10]:
# Keep 0.4% of the ratings so training finishes in reasonable time.
# A fixed random_state makes the retained users/movies the same on every run.
# NOTE: this cell overwrites `df`, so re-running it shrinks the sample again;
# re-run the merge cell first if the full frame is needed.
df = df.sample(frac=0.004, random_state=42)

In [11]:
df

Unnamed: 0,userId,movieId,rating,title
16016070,62590,61986,3.0,Appaloosa (2008)
6485386,151558,7458,4.0,Troy (2004)
7627381,104209,63062,4.5,Changeling (2008)
19387121,47894,2724,4.0,Runaway Bride (1999)
4379094,154093,778,4.0,Trainspotting (1996)
...,...,...,...,...
3432599,97663,4993,1.0,"Lord of the Rings: The Fellowship of the Ring,..."
15672506,162074,2916,3.5,Total Recall (1990)
13153586,28212,22,4.0,Copycat (1995)
10246584,54311,592,3.5,Batman (1989)


Splitting the data into train, validation, and test sets

In [12]:
# Sorted arrays of the distinct user ids and movie ids present in the sample.
uniqueU, uniqueM = (np.unique(df[column]) for column in ('userId', 'movieId'))

In [13]:
# Matrix dimensions: one row per distinct user, one column per distinct movie.
n_users, n_movies = len(uniqueU), len(uniqueM)

In [14]:
n_users

54909

In [15]:
n_movies

10187

In [16]:
# Lookup tables from raw ids to dense matrix coordinates:
# usersrow maps a userId to its row index, moviescolumn maps a movieId to its column index.
usersrow = {user_id: row for row, user_id in enumerate(uniqueU)}
moviescolumn = {movie_id: column for column, movie_id in enumerate(uniqueM)}

In [17]:
train_size = 0.8
validate_size = 0.1

# Shuffle once (seeded, so the split is reproducible) and cut the shuffled frame
# at the 80% and 90% marks. Plain positional slicing replaces np.split, which
# relies on the deprecated DataFrame.swapaxes when given a DataFrame.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(train_size * len(shuffled))
validate_end = int((validate_size + train_size) * len(shuffled))

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

In [18]:
train 

Unnamed: 0,userId,movieId,rating,title
5642529,73082,4246,3.0,Bridget Jones's Diary (2001)
21141857,45496,110553,1.0,The Amazing Spider-Man 2 (2014)
14114652,59802,2005,3.0,"Goonies, The (1985)"
4262418,11517,593,4.0,"Silence of the Lambs, The (1991)"
1081273,119327,356,3.5,Forrest Gump (1994)
...,...,...,...,...
15239714,43978,6331,3.0,Spellbound (2002)
15941532,113260,44199,3.5,Inside Man (2006)
11998314,120777,1721,2.0,Titanic (1997)
18845092,5402,81562,4.0,127 Hours (2010)


In [19]:
validate

Unnamed: 0,userId,movieId,rating,title
20667572,160348,4734,1.0,Jay and Silent Bob Strike Back (2001)
12994349,92963,2094,0.5,"Rocketeer, The (1991)"
21998775,14042,2828,2.0,Dudley Do-Right (1999)
12676782,42428,1219,5.0,Psycho (1960)
8047790,146554,79132,4.0,Inception (2010)
...,...,...,...,...
15926858,87214,33836,2.5,Bewitched (2005)
9052883,62854,182823,2.5,Bright (2017)
10718564,64042,1183,3.5,"English Patient, The (1996)"
24238808,105301,1383,1.0,Adrenalin: Fear the Rush (1996)


In [20]:
test

Unnamed: 0,userId,movieId,rating,title
71520,145814,296,5.0,Pulp Fiction (1994)
17167561,74082,647,3.0,Courage Under Fire (1996)
16521949,133109,2712,3.0,Eyes Wide Shut (1999)
8096728,83703,81591,5.0,Black Swan (2010)
16315833,15756,6350,4.0,Laputa: Castle in the Sky (Tenkû no shiro Rapy...
...,...,...,...,...
9109450,97180,1220,4.0,"Blues Brothers, The (1980)"
18410450,7949,2527,4.0,Westworld (1973)
8604184,115272,106920,3.5,Her (2013)
14905761,52559,3418,3.5,Thelma & Louise (1991)


Converting the dataframes into matrices

In [21]:
# Dense user x movie rating matrices for the train, test and validation splits.
# A zero entry means "no rating observed"; each matrix is an independent array.
R, R_test, R_valid = (np.zeros((n_users, n_movies)) for _ in range(3))

In [22]:
# Vectorised fill: translate ids to matrix coordinates once and write every
# training rating in a single indexed assignment instead of a row-by-row
# `iterrows` loop.
train_rows = train['userId'].map(usersrow).to_numpy(dtype=np.intp)
train_cols = train['movieId'].map(moviescolumn).to_numpy(dtype=np.intp)
R[train_rows, train_cols] = train['rating'].to_numpy()


In [23]:
R

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
# Vectorised fill: translate ids to matrix coordinates once and write every
# validation rating in a single indexed assignment instead of a row-by-row
# `iterrows` loop.
valid_rows = validate['userId'].map(usersrow).to_numpy(dtype=np.intp)
valid_cols = validate['movieId'].map(moviescolumn).to_numpy(dtype=np.intp)
R_valid[valid_rows, valid_cols] = validate['rating'].to_numpy()

In [25]:
R_valid

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [26]:
# Vectorised fill: translate ids to matrix coordinates once and write every
# test rating in a single indexed assignment instead of a row-by-row
# `iterrows` loop.
test_rows = test['userId'].map(usersrow).to_numpy(dtype=np.intp)
test_cols = test['movieId'].map(moviescolumn).to_numpy(dtype=np.intp)
R_test[test_rows, test_cols] = test['rating'].to_numpy()

In [27]:
R_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Model

In [28]:
def predict(user_id, movie_id, P, Q, b, bi, bu):
    """Return the model's rating estimate for one (user, movie) pair.

    The raw ids are translated to matrix coordinates through the module-level
    ``usersrow`` and ``moviescolumn`` dictionaries; an id missing from either
    dictionary raises ``KeyError``.

    Note the bias argument order here is ``b, bi, bu`` (global, item, user).

    Parameters
    ----------
    user_id, movie_id : keys of ``usersrow`` / ``moviescolumn``.
    P : array indexed as ``P[user_row, :]`` (user factors).
    Q : array indexed as ``Q[movie_column, :]`` (movie factors).
    b : global bias term.
    bi : array of per-movie biases, indexed by movie column.
    bu : array of per-user biases, indexed by user row.

    Returns
    -------
    ``b + bu[row] + bi[column] + P[row] . Q[column]``
    """
    row = usersrow[user_id]
    column = moviescolumn[movie_id]
    interaction = np.dot(P[row, :], Q[column, :].T)
    return b + bu[row] + bi[column] + interaction

In [29]:
def rmse(E, P, Q, bu, bi, b):
    """Root-mean-squared error of the model over the observed entries of ``E``.

    An entry of ``E`` counts as observed when it is non-zero. The prediction for
    entry ``(u, i)`` is ``b + bu[u] + bi[i] + P[u] . Q[i]``.

    Note the bias argument order here is ``bu, bi, b`` (user, item, global).

    Parameters
    ----------
    E : 2-D rating array; zeros are treated as "no rating".
    P : array indexed as ``P[u, :]`` (user factors).
    Q : array indexed as ``Q[i, :]`` (item factors).
    bu : array of per-user biases, indexed by row of ``E``.
    bi : array of per-item biases, indexed by column of ``E``.
    b : global bias term.

    Returns
    -------
    The RMSE as a float, or ``nan`` when ``E`` has no observed entries.
    """
    users, items = E.nonzero()
    # Nothing to score (e.g. an empty validation split): report NaN rather than
    # dividing by zero.
    if users.size == 0:
        return float('nan')

    # Row-wise dot products P[u] . Q[i] for every observed (u, i) pair at once.
    interactions = np.einsum('ij,ij->i', P[users, :], Q[items, :])
    residuals = E[users, items] - (b + bu[users] + bi[items] + interactions)

    # The mean runs over exactly the entries that were scored, so the numerator
    # and denominator always refer to the same set of ratings.
    return np.sqrt(np.mean(residuals ** 2))

In [30]:
def improvedRegSvd(E, E_test, k, lr, reg, user_id, *, n_epochs=50, seed=None):
    """Train a biased, regularised matrix factorisation with SGD and recommend movies.

    The model predicts ``b + b_u[u] + b_i[i] + P[u] . Q[i]`` and is fitted on the
    non-zero entries of ``E``. After every pass over the training entries the
    RMSE on ``E`` and on ``E_test`` is recorded with the module-level ``rmse``.
    When training is finished, the ten highest-scoring movies for ``user_id``
    are returned (movies the user already rated are not filtered out).

    Relies on the module-level ``usersrow`` / ``moviescolumn`` id lookups.

    Parameters
    ----------
    E : 2-D training rating array; zeros are treated as "no rating".
    E_test : 2-D held-out rating array scored after each epoch (same layout as ``E``).
    k : number of latent factors.
    lr : SGD learning rate.
    reg : L2 regularisation strength applied to factors and biases.
    user_id : raw user id to produce recommendations for.
    n_epochs : keyword-only. The training loop runs ``n_epochs + 1`` passes, so
        the default of 50 yields 51 entries in each error list (index 50 is the
        last one), matching what the existing callers index into.
    seed : keyword-only. When given, factors are initialised from
        ``np.random.default_rng(seed)`` for reproducible runs; when ``None`` the
        global ``np.random`` state is used.

    Returns
    -------
    (train_errors, valid_errors, df_result)
        Two lists of per-epoch RMSE values and a DataFrame with columns
        ``UserID``, ``MovieID``, ``Prediction`` holding up to ten rows sorted by
        descending prediction. ``df_result`` is empty when ``user_id`` is not in
        ``usersrow``.

    Raises
    ------
    ValueError
        If ``E`` has no non-zero entries to train on.
    """
    users, items = E.nonzero()
    if len(users) == 0:
        raise ValueError("E contains no observed (non-zero) ratings to train on")

    n_rows, n_cols = E.shape
    if seed is None:
        P = np.random.rand(n_rows, k)
        Q = np.random.rand(n_cols, k)
    else:
        rng = np.random.default_rng(seed)
        P = rng.random((n_rows, k))
        Q = rng.random((n_cols, k))
    b_u = np.zeros(n_rows)
    b_i = np.zeros(n_cols)
    b = np.mean(E[users, items])  # global bias = mean observed training rating

    train_errors = []
    valid_errors = []
    for _ in range(n_epochs + 1):
        for u, i in zip(users, items):
            error = E[u, i] - (b + b_u[u] + b_i[i] + np.dot(P[u, :], Q[i, :]))
            # Keep the pre-update user factors so both factor gradients are
            # evaluated at the same point (simultaneous update).
            p_old = P[u, :].copy()
            P[u, :] += lr * (error * Q[i, :] - reg * P[u, :])
            Q[i, :] += lr * (error * p_old - reg * Q[i, :])
            b_u[u] += lr * (error - reg * b_u[u])
            b_i[i] += lr * (error - reg * b_i[i])
        train_errors.append(rmse(E, P, Q, b_u, b_i, b))
        valid_errors.append(rmse(E_test, P, Q, b_u, b_i, b))

    # Recommendations are built once, from the final model.
    result_columns = ['UserID', 'MovieID', 'Prediction']
    if user_id not in usersrow:
        warnings.warn(
            f"user_id {user_id!r} is not present in usersrow; returning no recommendations"
        )
        return train_errors, valid_errors, pd.DataFrame(columns=result_columns)

    user_row = usersrow[user_id]
    # Score every movie column for this user in one shot.
    scores = b + b_u[user_row] + b_i + Q @ P[user_row, :]
    column_to_movie = {column: movie_id for movie_id, column in moviescolumn.items()}
    # Stable sort keeps ties in column order; slicing copes with < 10 movies.
    top_columns = np.argsort(-scores, kind='stable')[:10]
    records = [
        {
            'UserID': user_id,
            'MovieID': int(column_to_movie[int(column)]),
            'Prediction': float(scores[column]),
        }
        for column in top_columns
    ]
    df_result = pd.DataFrame(records, columns=result_columns)
    return train_errors, valid_errors, df_result

Grid Search 

In [None]:
# Exhaustive search over latent dimension, learning rate and regularisation.
K = [2,3,5,7,9]
LearningRates = [0.01, 0.001, 0.0001]
Regularizations = [0.1, 0.01, 0.001, 0.0001]

# NOTE(review): user 60148 has to be a key of `usersrow` for the recommendation
# step inside improvedRegSvd; confirm it is present in the sampled data.
for k1, l1, r1 in product(K, LearningRates, Regularizations):
  # improvedRegSvd only reads the rating matrices, so they are passed directly
  # instead of duplicating the dense arrays for every parameter combination.
  train_errors, valid_errors, _ = improvedRegSvd(R, R_valid, k1, l1, r1, 60148)
  print(f"Params: {k1}, {l1}, {r1};\ttrain loss: {train_errors[-1]:.3f},\tvalid loss: {valid_errors[-1]:.3f}")

In [None]:
# Final fit scored against the test split: k=3 latent factors, learning rate
# 0.001, regularisation 0.01, recommendations for user 50.
# The previous call passed seven positional arguments (an extra epoch count of 50
# after k), which the six-parameter improvedRegSvd rejects with a TypeError.
# NOTE(review): user 50 has to be a key of `usersrow`; confirm it is present
# in the sampled data.
mf = improvedRegSvd(R, R_test, 3, 0.001, 0.01, 50)

In [None]:
# Learning curves: RMSE after each epoch on the training and test matrices.
train_curve, test_curve = mf[0], mf[1]
# Derive the x-axis from the recorded history instead of hard-coding 51 epochs.
epochs = range(len(train_curve))

fig, ax = plt.subplots()
ax.plot(epochs, train_curve, marker='o', label='Training Data')
ax.plot(epochs, test_curve, marker='v', label='Test_data')
ax.set_title('Gradient Descent Learning Curve')
ax.set_xlabel('Number of Epochs')
ax.set_ylabel(' Improved RegSVD RMSE')
ax.legend()
ax.grid()
plt.show()