In this project, I implement a simple matrix factorization model for the MovieLens 25M dataset.

Importing Libraries

In [1]:
import numpy as np
import pandas  as pd
import matplotlib.pyplot as plt

Dataset:

In [2]:
# Load the movie metadata table (movieId, title, genres) from the MovieLens 25M
# CSV; the path is relative to the notebook's working directory.
movies = pd.read_csv('ml-25m/movies.csv')

In [3]:
# Load the ratings table (userId, movieId, rating, timestamp) from the
# MovieLens 25M CSV; the path is relative to the notebook's working directory.
ratings = pd.read_csv('ml-25m/ratings.csv')

In [4]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [5]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


Removing the columns we don't need

In [6]:
# Keep only movieId and title. errors='ignore' makes the cell safe to re-run:
# without it a second execution raises KeyError because 'genres' is already gone.
movies = movies.drop(columns='genres', errors='ignore')

In [7]:
# The timestamp is not used by the model. errors='ignore' makes the cell safe to
# re-run: without it a second execution raises KeyError on the missing column.
ratings = ratings.drop(columns='timestamp', errors='ignore')

Merging the two datasets

In [8]:
# Attach each rating's movie title by joining on movieId (pandas' default
# inner join: only movieIds present in both tables are kept).
df = pd.merge(ratings, movies, on='movieId')

In [9]:
df

Unnamed: 0,userId,movieId,rating,title
0,1,296,5.0,Pulp Fiction (1994)
1,3,296,5.0,Pulp Fiction (1994)
2,4,296,4.0,Pulp Fiction (1994)
3,5,296,4.0,Pulp Fiction (1994)
4,7,296,4.0,Pulp Fiction (1994)
...,...,...,...,...
25000090,162358,200192,2.0,Den frusna leoparden (1986)
25000091,162358,200194,2.0,Tough Luck (2004)
25000092,162386,139970,3.5,I Don't Speak English (1995)
25000093,162386,200726,4.0,The Graduates (1995)


Dropping most samples from the dataset to reduce run time

In [10]:
# Keep a random 0.4% of the ratings so training finishes in reasonable time.
# A fixed random_state makes the subset (and therefore n_users / n_movies and
# every downstream result) reproducible across kernel restarts.
df = df.sample(frac=0.004, random_state=42)

In [11]:
df

Unnamed: 0,userId,movieId,rating,title
17993221,37563,50923,3.0,"Astronaut Farmer, The (2007)"
2873625,103410,2762,4.0,"Sixth Sense, The (1999)"
23946772,90691,6094,4.5,Liquid Sky (1982)
20971002,20674,34336,3.0,Must Love Dogs (2005)
16623584,117545,673,1.0,Space Jam (1996)
...,...,...,...,...
4968341,32886,2105,3.5,Tron (1982)
23841135,148793,30850,5.0,"Merchant of Venice, The (2004)"
10981163,88435,329,3.0,Star Trek: Generations (1994)
3472376,89413,4995,2.5,"Beautiful Mind, A (2001)"


Splitting the data into train, validation, and test sets

In [12]:
# Sorted, de-duplicated user and movie ids that survive the down-sampling.
uniqueU = np.unique(df['userId'].to_numpy())
uniqueM = np.unique(df['movieId'].to_numpy())

In [13]:
# Number of distinct users / movies; these become the rating-matrix dimensions.
n_users, n_movies = uniqueU.size, uniqueM.size

In [14]:
n_users

54872

In [15]:
n_movies

10233

In [16]:
# Map raw ids to dense 0-based indices: a user id -> its row in the rating
# matrix, a movie id -> its column. Indices follow the order of uniqueU / uniqueM.
usersrow = {user_id: row for row, user_id in enumerate(uniqueU)}
moviescolumn = {movie_id: col for col, movie_id in enumerate(uniqueM)}

In [17]:
usersrow

{2: 0,
 3: 1,
 4: 2,
 8: 3,
 12: 4,
 13: 5,
 19: 6,
 20: 7,
 25: 8,
 28: 9,
 31: 10,
 35: 11,
 36: 12,
 38: 13,
 43: 14,
 46: 15,
 48: 16,
 52: 17,
 57: 18,
 59: 19,
 61: 20,
 64: 21,
 65: 22,
 66: 23,
 67: 24,
 68: 25,
 69: 26,
 70: 27,
 72: 28,
 75: 29,
 76: 30,
 85: 31,
 91: 32,
 93: 33,
 95: 34,
 96: 35,
 107: 36,
 113: 37,
 120: 38,
 123: 39,
 125: 40,
 127: 41,
 132: 42,
 134: 43,
 158: 44,
 159: 45,
 164: 46,
 166: 47,
 169: 48,
 171: 49,
 172: 50,
 175: 51,
 176: 52,
 177: 53,
 181: 54,
 185: 55,
 186: 56,
 187: 57,
 188: 58,
 189: 59,
 195: 60,
 200: 61,
 201: 62,
 204: 63,
 215: 64,
 216: 65,
 218: 66,
 219: 67,
 224: 68,
 225: 69,
 226: 70,
 233: 71,
 235: 72,
 236: 73,
 240: 74,
 243: 75,
 245: 76,
 248: 77,
 250: 78,
 256: 79,
 259: 80,
 260: 81,
 261: 82,
 265: 83,
 277: 84,
 278: 85,
 280: 86,
 281: 87,
 284: 88,
 285: 89,
 288: 90,
 293: 91,
 294: 92,
 296: 93,
 298: 94,
 299: 95,
 302: 96,
 304: 97,
 309: 98,
 313: 99,
 318: 100,
 319: 101,
 322: 102,
 331: 103,
 333: 

In [18]:
train_size = 0.8
validate_size = 0.1

# Shuffle once with a fixed seed so the split is reproducible, then cut the
# shuffled frame at the 80% and 90% marks. Plain iloc slicing replaces
# np.split, which triggers a deprecation warning when applied to a DataFrame
# on recent pandas versions; the slice boundaries are unchanged.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(train_size * len(df))
validate_end = int((validate_size + train_size) * len(df))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

In [19]:
train

Unnamed: 0,userId,movieId,rating,title
24159831,108645,7193,1.0,"Adventures of Ford Fairlane, The (1990)"
12502713,38670,938,3.5,Gigi (1958)
7421682,86529,57669,4.5,In Bruges (2008)
14442669,34362,2460,3.5,"Texas Chainsaw Massacre 2, The (1986)"
23691843,27551,2148,4.5,House (1986)
...,...,...,...,...
17325158,124042,1958,2.5,Terms of Endearment (1983)
11861539,566,1573,4.0,Face/Off (1997)
13768043,19615,1405,2.5,Beavis and Butt-Head Do America (1996)
23093346,32103,71282,3.0,"Food, Inc. (2008)"


In [20]:
validate

Unnamed: 0,userId,movieId,rating,title
17080435,105964,156,5.0,Blue in the Face (1995)
16384995,128305,353,4.5,"Crow, The (1994)"
24387564,103232,109846,3.5,Mr. Peabody & Sherman (2014)
4653867,5262,1213,4.5,Goodfellas (1990)
5297006,117031,3702,3.5,Mad Max (1979)
...,...,...,...,...
13167508,111188,31,2.5,Dangerous Minds (1995)
588029,148079,8360,4.0,Shrek 2 (2004)
13066571,157501,3868,3.0,"Naked Gun: From the Files of Police Squad!, Th..."
4162765,106449,173,2.0,Judge Dredd (1995)


In [21]:
test

Unnamed: 0,userId,movieId,rating,title
4695478,44900,1214,5.0,Alien (1979)
15392102,105451,44761,3.5,Brick (2005)
21904028,91493,68205,4.5,Crank: High Voltage (2009)
16523274,146711,2712,3.0,Eyes Wide Shut (1999)
185533,79771,2011,5.0,Back to the Future Part II (1989)
...,...,...,...,...
2042994,26812,1291,4.0,Indiana Jones and the Last Crusade (1989)
18492012,29677,1892,2.0,"Perfect Murder, A (1998)"
19679783,3369,1081,1.0,Victor/Victoria (1982)
11872238,87480,1573,3.0,Face/Off (1997)


Converting the dataframes into matrices

In [22]:
# One dense (n_users x n_movies) matrix per split; a 0 entry means "no rating".
# Each matrix is a full float64 array, so memory grows with n_users * n_movies.
R, R_test, R_valid = (np.zeros((n_users, n_movies)) for _ in range(3))

In [23]:
# Fill the training matrix in one vectorised assignment instead of iterating
# over rows with iterrows(), which is a slow Python-level loop.
# Every id in `train` comes from `df`, so every lookup in the maps succeeds.
train_rows = train['userId'].map(usersrow).to_numpy()
train_cols = train['movieId'].map(moviescolumn).to_numpy()
R[train_rows, train_cols] = train['rating'].to_numpy()


In [24]:
R

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
# Fill the validation matrix in one vectorised assignment instead of iterating
# over rows with iterrows(), which is a slow Python-level loop.
valid_rows = validate['userId'].map(usersrow).to_numpy()
valid_cols = validate['movieId'].map(moviescolumn).to_numpy()
R_valid[valid_rows, valid_cols] = validate['rating'].to_numpy()

In [26]:
R_valid

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
# Fill the test matrix in one vectorised assignment instead of iterating
# over rows with iterrows(), which is a slow Python-level loop.
test_rows = test['userId'].map(usersrow).to_numpy()
test_cols = test['movieId'].map(moviescolumn).to_numpy()
R_test[test_rows, test_cols] = test['rating'].to_numpy()

In [28]:
R_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Model

In [29]:
def predict(user_id, movie_id, P, Q):
    """Return the predicted rating of `movie_id` by `user_id`.

    The raw ids are translated to matrix indices through the module-level
    `usersrow` / `moviescolumn` maps; the prediction is the dot product of the
    user's row of `P` with the movie's row of `Q`.

    Raises:
        KeyError: if either id is missing from its lookup map.
    """
    user_factors = P[usersrow[user_id], :]
    movie_factors = Q[moviescolumn[movie_id], :]
    return np.dot(user_factors, movie_factors.T)

In [30]:
def rmse(E, P, Q):
    """Root-mean-square error of the factorisation P @ Q.T on the observed entries of E.

    Args:
        E: 2-D rating matrix; entries equal to 0 are treated as unobserved.
        P: user-factor matrix, one row per row of E.
        Q: item-factor matrix, one row per column of E.

    Returns:
        The RMSE over all non-zero entries of E, or NaN when E has no
        non-zero entry (the error is undefined in that case).
    """
    users, items = E.nonzero()
    # Without this guard an empty matrix divides 0 by 0.
    if users.size == 0:
        return float('nan')
    # Row-wise dot products P[u] . Q[i] for every observed (u, i) pair.
    predicted = np.sum(P[users] * Q[items], axis=1)
    residuals = E[users, items] - predicted
    # np.mean divides by the number of entries actually summed, keeping the
    # numerator and denominator consistent with E.nonzero().
    return np.sqrt(np.mean(residuals ** 2))

In [31]:
def basic_mf(E, E_test, k, learning_rate, regularization, user_id, epochs=50):
    """Train a regularised matrix factorisation with SGD and recommend movies.

    Args:
        E: training rating matrix (users x movies); zeros mean "not rated".
        E_test: held-out rating matrix of the same shape, used only to report
            the error after every epoch.
        k: number of latent factors.
        learning_rate: SGD step size.
        regularization: L2 penalty applied to both factor matrices.
        user_id: raw user id (a key of `usersrow`) to produce recommendations for.
        epochs: training runs for `epochs + 1` passes over the observed
            ratings (default 50, i.e. 51 passes).

    Returns:
        (train_errors, test_errors, df_result): the per-epoch RMSE on E and
        E_test, and a DataFrame with columns UserID / MovieID / Prediction
        holding the 10 movies with the highest predicted rating for `user_id`,
        best first.

    Raises:
        KeyError: if `user_id` is not a key of `usersrow`.
    """
    # Resolve the user up front so an unknown id fails before training starts.
    user_row = usersrow[user_id]

    train_errors = []
    test_errors = []
    users, items = E.nonzero()
    num_users, num_movies = E.shape
    P = np.random.rand(num_users, k)
    Q = np.random.rand(num_movies, k)

    for epoch in range(epochs + 1):
        for u, i in zip(users, items):
            error = E[u, i] - np.dot(P[u, :], Q[i, :])
            # Keep the pre-update user factors so that both gradient steps
            # are computed from the same point.
            p_u = P[u, :].copy()
            P[u, :] += learning_rate * (error * Q[i, :] - regularization * P[u, :])
            Q[i, :] += learning_rate * (error * p_u - regularization * Q[i, :])
        train_errors.append(rmse(E, P, Q))
        test_errors.append(rmse(E_test, P, Q))

    # Score every movie for the user once, after training has finished.
    movie_ids = np.array(list(moviescolumn.keys()))
    movie_cols = np.array(list(moviescolumn.values()))
    scores = Q[movie_cols] @ P[user_row]

    # Build all top-10 rows in one frame.
    top = np.argsort(-scores)[:10]
    df_result = pd.DataFrame({
        'UserID': user_id,
        'MovieID': movie_ids[top].astype(int),
        'Prediction': scores[top]})
    return train_errors, test_errors, df_result

In [32]:
from itertools import product

# Hyper-parameter grid: number of latent factors, SGD step size, L2 penalty.
K = [2,3,5,7,9]
LearningRates = [0.01, 0.001, 0.0001]
Regularizations = [0.1, 0.01, 0.001, 0.0001]
prods = list(product(K, LearningRates, Regularizations))

# Keep every result so the best combination can be selected afterwards
# instead of being read off the printed log.
grid_results = []
for k1, l1, r1 in prods:
    # basic_mf only reads the rating matrices, so the multi-GB arrays are
    # passed directly rather than copied for every combination.
    funk = basic_mf(R, R_valid, k1, l1, r1, 60148)
    # [-1] is the final epoch regardless of how many epochs basic_mf ran.
    train_loss, valid_loss = funk[0][-1], funk[1][-1]
    grid_results.append((k1, l1, r1, train_loss, valid_loss))
    print(f"Params: {k1}, {l1}, {r1};\ttrain loss: {train_loss:.3f},\tvalid loss: {valid_loss:.3f}")

epoch=0
epoch=1
epoch=2
epoch=3
epoch=4
epoch=5
epoch=6
epoch=7
epoch=8
epoch=9
epoch=10
epoch=11
epoch=12
epoch=13
epoch=14
epoch=15
epoch=16
epoch=17
epoch=18
epoch=19
epoch=20
epoch=21
epoch=22
epoch=23
epoch=24
epoch=25
epoch=26
epoch=27
epoch=28
epoch=29
epoch=30
epoch=31
epoch=32
epoch=33
epoch=34
epoch=35
epoch=36
epoch=37
epoch=38
epoch=39
epoch=40
epoch=41
epoch=42
epoch=43
epoch=44
epoch=45
epoch=46
epoch=47
epoch=48
epoch=49
epoch=50
Params: 2, 0.01, 0.1;	train loss: 0.505,	valid loss: 1.756
epoch=0
epoch=1
epoch=2
epoch=3
epoch=4
epoch=5
epoch=6
epoch=7
epoch=8
epoch=9
epoch=10
epoch=11
epoch=12
epoch=13
epoch=14
epoch=15
epoch=16
epoch=17
epoch=18
epoch=19
epoch=20
epoch=21
epoch=22
epoch=23
epoch=24
epoch=25
epoch=26
epoch=27
epoch=28
epoch=29
epoch=30
epoch=31
epoch=32
epoch=33
epoch=34
epoch=35
epoch=36
epoch=37
epoch=38
epoch=39
epoch=40
epoch=41
epoch=42
epoch=43
epoch=44
epoch=45
epoch=46
epoch=47
epoch=48
epoch=49
epoch=50
Params: 2, 0.01, 0.01;	train loss: 0.468,	v

KeyboardInterrupt: 

In [None]:
# Final training run evaluated on the test split: k=3, learning_rate=0.001,
# regularization=0.01, recommendations for user 50.
# basic_mf takes (E, E_test, k, learning_rate, regularization, user_id); the
# previous call passed an extra positional argument and raised TypeError.
# NOTE(review): user 50 must be present in the sampled data (a key of
# `usersrow`), otherwise basic_mf raises KeyError - confirm or pick another id.
basicmf = basic_mf(R, R_test, 3, 0.001, 0.01, 50)

epoch=0
train_rmse=2.590783238746999
test_rmse=2.599918221406997
epoch=1
train_rmse=2.3392463195501474
test_rmse=2.3569553215538495
epoch=2
train_rmse=2.160892095974894
test_rmse=2.1878202899151744
epoch=3
train_rmse=2.024355423014294
test_rmse=2.0608566175192413
epoch=4
train_rmse=1.9151689159047587
test_rmse=1.9612805098604857
epoch=5
train_rmse=1.8253475365648237
test_rmse=1.880884024354407
epoch=6
train_rmse=1.7498894901760154
test_rmse=1.8145383343460355
epoch=7
train_rmse=1.6854066845805322
test_rmse=1.758796081633051
epoch=8
train_rmse=1.6294963519204857
test_rmse=1.7112366564111188
epoch=9
train_rmse=1.580406332043046
test_rmse=1.6701130219516316
epoch=10
train_rmse=1.5368339786791803
test_rmse=1.6341390989585336
epoch=11
train_rmse=1.4977952472692408
test_rmse=1.602352184193444
epoch=12
train_rmse=1.4625355240315363
test_rmse=1.5740201807386995
epoch=13
train_rmse=1.430467463570944


In [None]:
# Learning curves from the final run. The result lives in `basicmf`
# (the previously referenced name `mf` is never defined in this notebook).
train_curve, test_curve = basicmf[0], basicmf[1]
# Derive the x axis from the data so it always matches the number of epochs.
epoch_axis = range(len(train_curve))
plt.plot(epoch_axis, train_curve, marker='o', label='Training Data')
plt.plot(epoch_axis, test_curve, marker='v', label='Test_data')
plt.title('Gradient Descent Learning Curve')
plt.xlabel('Number of Epochs')
plt.ylabel(' Improved RegSVD RMSE')
plt.legend()
plt.grid()