In [None]:
# Read the ratings CSV into a pandas DataFrame and preview its first five rows.

import pandas as pd

ratings_df = pd.read_csv('sample_data/ratings.csv')

print(ratings_df.head(n=5))

In [None]:
# Hold out 30% of the ratings, then cut that hold-out in half:
# train = 70%, dev = 15%, test = 15%. Both calls use the same fixed random_state.

from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(
    ratings_df,
    test_size=0.3,
    random_state=42,
)
dev_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
)

In [None]:
# High-level explanation of matrix factorization trained with SGD:
# 1. We initialize a user embedding table and an item embedding table.
# 2. We use a DataLoader to split the ratings into mini-batches.
# 3. Forward pass
#      Let's say the user with id 1 rated the movie with id 4.
#      We look up the user embedding for userid=1 and the item embedding for itemid=4.
#      We multiply the two embeddings element-wise and sum the result (i.e. take their dot product).
#      This is the predicted rating.
# 4. Backward pass
#      We compare the predicted rating with the actual (label) rating using MSE loss.
#      We backpropagate that loss and take an SGD step to update the user and item embedding tables.

In [None]:
import torch
import numpy as np
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Define the model: one embedding table for users and one for items, with one row per unique user / item.
# The forward pass multiplies the looked-up embeddings element-wise and sums them (a dot product).

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization rating model backed by two embedding tables.

    The predicted rating for a (user, item) pair is the dot product of the
    user's embedding and the item's embedding.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of every embedding vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # One row of length n_factors per user index.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        # One row of length n_factors per item index.
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small non-negative weights drawn uniformly between 0 and 0.05.
        torch.nn.init.uniform_(self.user_factors.weight, 0, 0.05)
        torch.nn.init.uniform_(self.item_factors.weight, 0, 0.05)

    def forward(self, data):
        """Predict ratings for a batch.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one predicted rating per row of `data`.
        """
        return self.predict(data[:, 0], data[:, 1])

    def predict(self, user, item):
        """Predict ratings for matching user / item index tensors.

        Args:
            user: integer tensor of user indices (a single 0-dim index or a batch).
            item: integer tensor of item indices with the same shape as `user`.

        Returns:
            Tensor of predicted ratings with the same shape as `user`.
        """
        # Reduce over the factor axis (always the last one) so that a single
        # 0-dim index works as well as a batch of indices.
        return (self.user_factors(user) * self.item_factors(item)).sum(dim=-1)

In [None]:
# We need this to load the ratings pandas df into torch tensors. 

class Loader(Dataset):
    """Dataset that exposes a ratings DataFrame as (indices, rating) tensor pairs.

    Raw user and movie ids are translated to contiguous row indices so they
    can be used to look up rows of an embedding table.

    Args:
        ratings_df: DataFrame with `userId`, `movieId` and `rating` columns.
        userid2idx: optional mapping from raw user id to row index. When
            omitted, a mapping is built from the users in `ratings_df`, in
            order of first appearance.
        movieid2idx: optional mapping from raw movie id to row index, built
            the same way when omitted.

    Pass the same mappings to every split (train / dev / test) that is fed to
    one model; otherwise each split numbers its users and movies independently
    and the indices of different splits do not refer to the same rows.

    Raises:
        KeyError: if `ratings_df` contains an id missing from a supplied mapping.
    """

    def __init__(self, ratings_df, userid2idx=None, movieid2idx=None):
        self.ratings = ratings_df.copy()

        if userid2idx is None:
            userid2idx = {o: i for i, o in enumerate(ratings_df['userId'].unique())}
        if movieid2idx is None:
            movieid2idx = {o: i for i, o in enumerate(ratings_df['movieId'].unique())}

        self.userid2idx = dict(userid2idx)
        self.movieid2idx = dict(movieid2idx)

        # Inverse lookups: row index -> raw id.
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Fail loudly on unknown ids; Series.map would silently turn them into NaN.
        unknown_users = set(ratings_df['userId'].unique()).difference(self.userid2idx)
        unknown_movies = set(ratings_df['movieId'].unique()).difference(self.movieid2idx)
        if unknown_users or unknown_movies:
            raise KeyError(
                f'ids missing from the supplied mappings: '
                f'{len(unknown_users)} user id(s), {len(unknown_movies)} movie id(s)'
            )

        self.ratings['movieId'] = ratings_df['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings_df['userId'].map(self.userid2idx)

        # Column 0 = user index, column 1 = movie index. Selecting the columns
        # by name keeps this order independent of the CSV's column layout.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
        # Rating given by that user to that movie.
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ([user_idx, movie_idx], rating) pair at `index`."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of ratings in the dataset."""
        return len(self.ratings)

In [None]:
def train_eval_dev(train_df, dev_df, sgd_param):
  """Train a MatrixFactorization model with SGD and report its MAE on the dev split.

  User and movie ids are translated to embedding rows with a single mapping
  built from the notebook-level `ratings_df`, so the train and dev splits
  address the same rows of the embedding tables.

  Args:
    train_df: ratings to fit on; the userId, movieId and rating columns are read.
    dev_df: held-out ratings to score; same columns.
    sgd_param: dict with the keys 'n_factors', 'lr' and 'epochs'.

  Returns:
    The mean absolute error on dev_df (the same value that is printed).

  Raises:
    KeyError: if a split holds a userId or movieId that is absent from ratings_df.
  """
  # One id -> embedding-row mapping shared by every split.
  user_index = {uid: idx for idx, uid in enumerate(ratings_df['userId'].unique())}
  item_index = {mid: idx for idx, mid in enumerate(ratings_df['movieId'].unique())}

  def to_dataset(df):
    # Encode a ratings frame as ([user_row, item_row], rating) tensors.
    users = df['userId'].map(user_index)
    items = df['movieId'].map(item_index)
    # Series.map yields NaN for ids that are not in the mapping.
    if users.isna().any() or items.isna().any():
      raise KeyError('split contains userId/movieId values that are missing from ratings_df')
    pairs = torch.tensor(np.stack([users.to_numpy(), items.to_numpy()], axis=1), dtype=torch.long)
    ratings = torch.tensor(df['rating'].to_numpy())
    return torch.utils.data.TensorDataset(pairs, ratings)

  # Run on the GPU when one is present, otherwise fall back to the CPU.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  model = MatrixFactorization(len(user_index), len(item_index), n_factors=sgd_param['n_factors'])
  model = model.to(device)
  optimizer = optim.SGD(model.parameters(), lr=sgd_param['lr'])
  loss_fn = nn.MSELoss()

  # The DataLoader splits the dataset into shuffled mini-batches.
  train_loader = DataLoader(to_dataset(train_df), batch_size=128, shuffle=True)

  model.train()
  for epoch in range(sgd_param['epochs']):
    for data, target in train_loader:
      data, target = data.to(device), target.to(device)
      optimizer.zero_grad()
      # forward pass
      output = model(data)
      # calculate loss
      loss = loss_fn(output, target.float())
      # calculate gradient
      loss.backward()
      # update parameters (embeddings in this case)
      optimizer.step()

  dev_loader = DataLoader(to_dataset(dev_df), batch_size=128, shuffle=False)

  # model to eval mode
  model.eval()
  dev_preds, dev_targets = [], []
  with torch.no_grad():
    for data, target in dev_loader:
      output = model(data.to(device))
      # collect predictions and labels on the CPU for scikit-learn
      dev_preds.extend(output.cpu().numpy())
      dev_targets.extend(target.numpy())

  mae = mean_absolute_error(dev_targets, dev_preds)

  print(f"dev_mae for {sgd_param['n_factors']} factors, {sgd_param['lr']} lr, {sgd_param['epochs']} epochs: {mae}")
  return mae

In [None]:
# Exhaustive grid search: train and score one model per
# (n_factors, epochs, learning-rate) combination on the dev split.
for n_f in (10, 20, 50, 100):
  for n_ep in (10, 50, 100):
    for lr in (0.001, 0.01, 0.05):
      train_eval_dev(
        train_df,
        dev_df,
        dict(n_factors=n_f, epochs=n_ep, lr=lr),
      )

In [None]:
# Results

# dev_mae for 10 factors, 0.001 lr, 10 epochs: 3.4900412809292685
# dev_mae for 10 factors, 0.01 lr, 10 epochs: 3.480439918096354
# dev_mae for 10 factors, 0.05 lr, 10 epochs: 3.028467126331669
# dev_mae for 10 factors, 0.001 lr, 50 epochs: 3.486756268621376
# dev_mae for 10 factors, 0.01 lr, 50 epochs: 3.026092983281317
# dev_mae for 10 factors, 0.05 lr, 50 epochs: 1.3760415739293925
# dev_mae for 10 factors, 0.001 lr, 100 epochs: 3.480752333602614
# dev_mae for 10 factors, 0.01 lr, 100 epochs: 2.1650907983948624
# dev_mae for 10 factors, 0.05 lr, 100 epochs: 1.1044064729262975
# dev_mae for 20 factors, 0.001 lr, 10 epochs: 3.483121689989763
# dev_mae for 20 factors, 0.01 lr, 10 epochs: 3.4644898156541255
# dev_mae for 20 factors, 0.05 lr, 10 epochs: 2.8658078690881816
# dev_mae for 20 factors, 0.001 lr, 50 epochs: 3.476532301745354
# dev_mae for 20 factors, 0.01 lr, 50 epochs: 2.869004440047278
# dev_mae for 20 factors, 0.05 lr, 50 epochs: 1.3476828530524387
# dev_mae for 20 factors, 0.001 lr, 100 epochs: 3.4646104475111694
# dev_mae for 20 factors, 0.01 lr, 100 epochs: 2.0680492953973366
# dev_mae for 20 factors, 0.05 lr, 100 epochs: 1.0965616834833602
# dev_mae for 50 factors, 0.001 lr, 10 epochs: 3.4622471793977434
# dev_mae for 50 factors, 0.01 lr, 10 epochs: 3.4174553854509817
# dev_mae for 50 factors, 0.05 lr, 10 epochs: 2.6361976552181994
# dev_mae for 50 factors, 0.001 lr, 50 epochs: 3.445969564824434
# dev_mae for 50 factors, 0.01 lr, 50 epochs: 2.6354201712233962
# dev_mae for 50 factors, 0.05 lr, 50 epochs: 1.3040099250541246
# dev_mae for 50 factors, 0.001 lr, 100 epochs: 3.4169696260114346
# dev_mae for 50 factors, 0.01 lr, 100 epochs: 1.9297473189003211
# dev_mae for 50 factors, 0.05 lr, 100 epochs: 1.0835130303438045
# dev_mae for 100 factors, 0.001 lr, 10 epochs: 3.4275661815249725
# dev_mae for 100 factors, 0.01 lr, 10 epochs: 3.3410197442210903
# dev_mae for 100 factors, 0.05 lr, 10 epochs: 2.438167753674767
# dev_mae for 100 factors, 0.001 lr, 50 epochs: 3.396717071175329
# dev_mae for 100 factors, 0.01 lr, 50 epochs: 2.440149415787094
# dev_mae for 100 factors, 0.05 lr, 50 epochs: 1.2641956796488487
# dev_mae for 100 factors, 0.001 lr, 100 epochs: 3.341480087737653
# dev_mae for 100 factors, 0.01 lr, 100 epochs: 1.8127504039392
# dev_mae for 100 factors, 0.05 lr, 100 epochs: 1.0718658247723067 <- Best

In [None]:
# Train the final model with the configuration used for the final run
# (100 factors, lr=0.05, 100 epochs) and report MAE / MSE on the test split.

# One id -> embedding-row mapping for the whole ratings table. Train and test
# must share it; numbering each split separately would make the test split
# look up rows that belong to different users and movies.
user_index = {uid: idx for idx, uid in enumerate(ratings_df['userId'].unique())}
item_index = {mid: idx for idx, mid in enumerate(ratings_df['movieId'].unique())}
n_users = len(user_index)
n_items = len(item_index)


def encode_split(split_df):
    """Encode a ratings frame as a TensorDataset of ([user_row, item_row], rating)."""
    users = split_df['userId'].map(user_index)
    items = split_df['movieId'].map(item_index)
    # Series.map yields NaN for ids that are not in the mapping.
    if users.isna().any() or items.isna().any():
        raise KeyError('split contains userId/movieId values that are missing from ratings_df')
    pairs = torch.tensor(np.stack([users.to_numpy(), items.to_numpy()], axis=1), dtype=torch.long)
    ratings = torch.tensor(split_df['rating'].to_numpy())
    return torch.utils.data.TensorDataset(pairs, ratings)


# Seed the global torch RNG so weight initialisation and batch shuffling
# start from a known state.
torch.manual_seed(42)

# Run on the GPU when one is present, otherwise fall back to the CPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda' if cuda else 'cpu')

model = MatrixFactorization(n_users, n_items, n_factors=100).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.05)

loss_fn = nn.MSELoss()
train_set = encode_split(train_df)
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

model.train()

for epoch in range(100):
    total_loss = 0
    for data, target in train_loader:
        # Move the whole batch tensor; iterating over it would split it into rows.
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Average mini-batch loss over the epoch.
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')

test_set = encode_split(test_df)
test_loader = DataLoader(test_set, batch_size=128, shuffle=False)

model.eval()
test_preds, test_targets = [], []
with torch.no_grad():
    for data, target in test_loader:
        output = model(data.to(device))
        # Collect predictions and labels on the CPU for scikit-learn.
        test_preds.extend(output.cpu().numpy())
        test_targets.extend(target.numpy())

mae = mean_absolute_error(test_targets, test_preds)
mse = mean_squared_error(test_targets, test_preds)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')

In [None]:
# Results (Trained using lr=0.05, num_epochs=100, num_factors=100)

# Epoch 1, Loss: 12.657633456630982
# Epoch 2, Loss: 11.971283441004546
# Epoch 3, Loss: 11.017647568730341
# Epoch 4, Loss: 9.792050778001979
# Epoch 5, Loss: 8.440822230732959
# Epoch 6, Loss: 7.183058524477309
# Epoch 7, Loss: 6.155719561853271
# Epoch 8, Loss: 5.361280183861221
# Epoch 9, Loss: 4.741093813509181
# Epoch 10, Loss: 4.246194690897845
# Epoch 11, Loss: 3.8416456921377042
# Epoch 12, Loss: 3.505483899427497
# Epoch 13, Loss: 3.2230168663073275
# Epoch 14, Loss: 2.9834001491899076
# Epoch 15, Loss: 2.778348690141802
# Epoch 16, Loss: 2.6011971783810766
# Epoch 17, Loss: 2.4458745169466822
# Epoch 18, Loss: 2.3102050827465197
# Epoch 19, Loss: 2.1896196316549745
# Epoch 20, Loss: 2.08339984131896
# Epoch 21, Loss: 1.9878152844266614
# Epoch 22, Loss: 1.9015246370564336
# Epoch 23, Loss: 1.825072992755019
# Epoch 24, Loss: 1.7540460373612419
# Epoch 25, Loss: 1.6904544553894927
# Epoch 26, Loss: 1.6323633815931238
# Epoch 27, Loss: 1.5781546265318773
# Epoch 28, Loss: 1.529779164687447
# Epoch 29, Loss: 1.483659920701082
# Epoch 30, Loss: 1.4420938186239505
# Epoch 31, Loss: 1.402962129941021
# Epoch 32, Loss: 1.3663139747104782
# Epoch 33, Loss: 1.333198330540588
# Epoch 34, Loss: 1.300657980144024
# Epoch 35, Loss: 1.271040684826996
# Epoch 36, Loss: 1.2437281118354935
# Epoch 37, Loss: 1.2176391514747038
# Epoch 38, Loss: 1.1924792952511623
# Epoch 39, Loss: 1.1692685065925985
# Epoch 40, Loss: 1.1471906673649084
# Epoch 41, Loss: 1.1271093536330306
# Epoch 42, Loss: 1.1075532258204792
# Epoch 43, Loss: 1.0887491647963938
# Epoch 44, Loss: 1.0709219538215278
# Epoch 45, Loss: 1.0545896566000537
# Epoch 46, Loss: 1.0384291540021482
# Epoch 47, Loss: 1.0232921919744948
# Epoch 48, Loss: 1.0088078245736551
# Epoch 49, Loss: 0.9950371460206267
# Epoch 50, Loss: 0.9821465224891469
# Epoch 51, Loss: 0.9693458156957142
# Epoch 52, Loss: 0.9573148699558299
# Epoch 53, Loss: 0.9458891057233879
# Epoch 54, Loss: 0.9350817700227102
# Epoch 55, Loss: 0.9240121945090916
# Epoch 56, Loss: 0.9145387292340181
# Epoch 57, Loss: 0.9041371773118558
# Epoch 58, Loss: 0.8947801643955535
# Epoch 59, Loss: 0.8860369059054748
# Epoch 60, Loss: 0.8772839566935664
# Epoch 61, Loss: 0.8685462648669878
# Epoch 62, Loss: 0.8604699603241422
# Epoch 63, Loss: 0.8526288756857747
# Epoch 64, Loss: 0.8452572154178135
# Epoch 65, Loss: 0.8381637550782466
# Epoch 66, Loss: 0.8308001554746559
# Epoch 67, Loss: 0.8246787493319615
# Epoch 68, Loss: 0.8175459005802438
# Epoch 69, Loss: 0.8111940079193184
# Epoch 70, Loss: 0.8048043349298878
# Epoch 71, Loss: 0.7990637609492177
# Epoch 72, Loss: 0.7934688431197319
# Epoch 73, Loss: 0.7872064919143483
# Epoch 74, Loss: 0.7821175379921561
# Epoch 75, Loss: 0.7767087594754454
# Epoch 76, Loss: 0.7714976891875267
# Epoch 77, Loss: 0.7664382398344468
# Epoch 78, Loss: 0.7614117902366148
# Epoch 79, Loss: 0.7570925200546997
# Epoch 80, Loss: 0.7517694356864777
# Epoch 81, Loss: 0.747075438823389
# Epoch 82, Loss: 0.7431324055229408
# Epoch 83, Loss: 0.7385387542671051
# Epoch 84, Loss: 0.734258241681517
# Epoch 85, Loss: 0.7298031257654446
# Epoch 86, Loss: 0.725728320794693
# Epoch 87, Loss: 0.7220588968391868
# Epoch 88, Loss: 0.718305023090131
# Epoch 89, Loss: 0.7143210173193095
# Epoch 90, Loss: 0.7102118381231591
# Epoch 91, Loss: 0.7065887288122937
# Epoch 92, Loss: 0.7030698830882708
# Epoch 93, Loss: 0.6994529672614906
# Epoch 94, Loss: 0.6958090996612674
# Epoch 95, Loss: 0.6922917447552301
# Epoch 96, Loss: 0.688913749233968
# Epoch 97, Loss: 0.6858701647083828
# Epoch 98, Loss: 0.6822459414277388
# Epoch 99, Loss: 0.6793227819860845
# Epoch 100, Loss: 0.6761781242878541
# Mean Absolute Error: 1.0772188565851786
# Mean Squared Error: 1.801882512378324