In [17]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from matplotlib import pyplot as plt
from collections import defaultdict
import random
import csv

In [60]:
# Relative paths of the CSV rating splits.
# -- splits used for fitting / early stopping
train_file = 'train_set.csv'
valid_file = 'valid_set.csv'
# -- held-out splits
test_file = 'test_set.csv'
seen_test_file = 'seen_test_set.csv'
unseen_test_file = 'unseen_test_set.csv'

def get_all_u_i_r(file_name):
    """Load (user, item, rating) triples from a CSV file.

    Parameters
    ----------
    file_name : str or path-like
        Comma-separated file with at least the columns ``user_idx``,
        ``item_idx`` and ``rating``.

    Returns
    -------
    list of [int, int, number]
        One ``[user_idx, item_idx, rating]`` entry per CSV row, in file order.
    """
    df = pd.read_csv(file_name, sep=',')

    # Convert whole columns at once instead of looping over DataFrame.iterrows():
    # iterrows builds a Series per row (slow on large rating files) and upcasts
    # every value in the row to one common dtype. tolist() yields native Python
    # scalars.
    users = df['user_idx'].astype(int).tolist()
    items = df['item_idx'].astype(int).tolist()
    ratings = df['rating'].tolist()

    return [[user, item, rating] for user, item, rating in zip(users, items, ratings)]

train_u_i_r = get_all_u_i_r(train_file)

# The validation triples are used by the next cell (global average rating and
# user/item index ranges), so they must be loaded here for the notebook to run
# top to bottom on a fresh kernel.
valid_u_i_r = get_all_u_i_r(valid_file)

# test_file is read by the evaluation cell further down; seen_test_file and
# unseen_test_file can be loaded the same way with get_all_u_i_r when needed.

In [47]:
# Global mean rating over the train and validation splits
avg_rating = np.mean(
    [rating for _, _, rating in train_u_i_r]
    + [rating for _, _, rating in valid_u_i_r]
)

# Largest user / item index that appears in train+valid
n_users = max(
    max(user for user, _, _ in train_u_i_r),
    max(user for user, _, _ in valid_u_i_r),
)
n_items = max(
    max(item for _, item, _ in train_u_i_r),
    max(item for _, item, _ in valid_u_i_r),
)
print(n_users)
print(n_items)

44683
1019


In [48]:
class U_I_R_Dataset(Dataset):
    """Dataset over a list of [user_idx, item_idx, rating] rows.

    Indexing returns a ``(user, item, rating)`` tuple of 0-dim tensors with
    dtypes ``long``, ``long`` and ``float32`` respectively.
    """

    def __init__(self, u_i_r_list):
        # Keep a reference to the rows; nothing is copied or converted up front
        self.u_i_r_list = u_i_r_list

    def __len__(self):
        return len(self.u_i_r_list)

    def __getitem__(self, idx):
        user, item, rating = self.u_i_r_list[idx]
        sample = (
            torch.tensor(user, dtype=torch.long),
            torch.tensor(item, dtype=torch.long),
            torch.tensor(rating, dtype=torch.float32),
        )
        return sample

In [49]:
# reproducibility: seed every RNG this notebook imports, so that embedding
# initialisation and DataLoader shuffling give the same run after a kernel restart
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# tunable parameters
K = 5   # dimension of gamma_u / gamma_i
alpha_init = avg_rating # initial value of the global offset alpha; maybe no need to tune this
# L2 regularization strengths for the bias terms and the latent factors
lamb_beta_u = 0.01
lamb_beta_i = 0.01
lamb_gamma_u = 0.01
lamb_gamma_i = 0.01
# optimisation settings
learning_rate = 0.01
batch_size = 1024
max_train_step = 50  # upper bound on the number of passes of the training loop

In [50]:
# Wrap the training triples in a Dataset and serve them in shuffled mini-batches
train_dataset = U_I_R_Dataset(train_u_i_r)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [69]:
class LatentFactorModel(nn.Module):
    """Biased latent-factor rating model.

    The predicted rating for a (user, item) pair is
    ``alpha + beta_u + beta_i + <gamma_u, gamma_i>``.

    Index 0 of every embedding table is reserved for unseen users/items
    (``padding_idx=0``); its row is kept at zero so that an unseen user or item
    contributes neither a bias nor a latent-factor term to the prediction.

    Parameters
    ----------
    alpha_init : float
        Initial value of the global offset ``alpha``.
    K : int
        Dimension of the latent factors ``gamma_u`` / ``gamma_i``.
    lamb_beta_u, lamb_beta_i, lamb_gamma_u, lamb_gamma_i : float
        L2 regularization weights used by :meth:`reg`.
    num_users, num_items : int
        Largest user / item index; each table has ``num + 1`` rows.
    """

    def __init__(self, alpha_init, K, lamb_beta_u, lamb_beta_i, lamb_gamma_u, lamb_gamma_i, num_users, num_items):
        super(LatentFactorModel, self).__init__()

        # Initialize scalar average rating
        self.alpha = nn.Parameter(torch.tensor(alpha_init, dtype=torch.float32))

        # Bias terms for users and items
        # user/item idx = 0 means unseen user/item
        self.betaU = nn.Embedding(num_users+1, 1, padding_idx=0)
        self.betaI = nn.Embedding(num_items+1, 1, padding_idx=0)

        # Latent factors for users and items
        self.gammaU = nn.Embedding(num_users+1, K, padding_idx=0)
        self.gammaI = nn.Embedding(num_items+1, K, padding_idx=0)
        self.lamb_beta_u = lamb_beta_u
        self.lamb_beta_i = lamb_beta_i
        self.lamb_gamma_u = lamb_gamma_u
        self.lamb_gamma_i = lamb_gamma_i

        # Initialize embeddings with small random values
        # If training doesn't converge, try other initialization strategies, e.g. initialize all parameters to 0
        tables = (self.betaU, self.betaI, self.gammaU, self.gammaI)
        for table in tables:
            nn.init.normal_(table.weight, mean=0.0, std=0.001)

        # nn.init.normal_ overwrites the whole weight matrix, including the
        # padding row that nn.Embedding had set to zero. Reset it so the
        # "unseen" index really maps to a zero bias / zero factor vector.
        with torch.no_grad():
            for table in tables:
                table.weight[table.padding_idx].zero_()

    # u / i should have the shape of (b,)
    def predict(self, u, i):
        """Return predicted ratings, shape (b,), for index tensors u and i of shape (b,)."""
        # squeeze(-1) drops only the trailing size-1 embedding dim; a bare
        # squeeze() would also drop the batch dim when b == 1
        beta_u = self.betaU(u).squeeze(-1)  # (b,)
        beta_i = self.betaI(i).squeeze(-1)  # (b,)
        gamma_u = self.gammaU(u)            # (b, K)
        gamma_i = self.gammaI(i)            # (b, K)
        p = self.alpha + beta_u + beta_i + torch.sum(gamma_u * gamma_i, dim=-1)
        return p    # (b,)

    # Regularizer
    def reg(self):
        """Return the weighted sum of squared entries of all four embedding tables."""
        return (
            self.lamb_beta_u * torch.sum(self.betaU.weight**2) +
            self.lamb_beta_i * torch.sum(self.betaI.weight**2) +
            self.lamb_gamma_u * torch.sum(self.gammaU.weight**2) +
            self.lamb_gamma_i * torch.sum(self.gammaI.weight**2)
        )

    # Loss
    # u, i, r should have the shape of (b,)
    def forward(self, u, i, r):
        """Return the mean squared error between predict(u, i) and r (no regularizer)."""
        pred = self.predict(u, i)
        return torch.nn.functional.mse_loss(pred, r, reduction='mean')


# evaluate on valid/test set
# u, i should be torch.long tensors with the shape of (b,)
# r should be torch.float32 tensors with the shape of (b,)
def evaluate(model, u, i, r):
    """Return the mean squared error of ``model.predict(u, i)`` against ``r`` as a float.

    The model is switched to eval mode for the prediction and put back into
    whatever mode (train or eval) it was in when the function was called.
    """
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for evaluation; without no_grad() an autograd
        # graph over the whole evaluation set would be built on every call.
        with torch.no_grad():
            pred = model.predict(u, i)  # (b,)
            mse = torch.nn.functional.mse_loss(pred, r, reduction='mean').item()
    finally:
        # restore the caller's mode instead of unconditionally forcing train mode
        model.train(was_training)
    return mse


# training function
# early stop if mse of valid set starts to increase
def training_step(model, dataloader, optimizer, valid_u, valid_i, valid_r, pre_valid_mse=None):
    """Run one pass over ``dataloader``, checking validation MSE after every batch.

    Parameters
    ----------
    model : module whose call ``model(u, i, r)`` returns the data loss and
        which exposes ``reg()`` for the regularization term.
    dataloader : iterable of ``(u, i, r)`` batches.
    optimizer : optimizer over the model parameters.
    valid_u, valid_i, valid_r : validation tensors passed to ``evaluate``.
    pre_valid_mse : float or None
        Validation MSE to compare the first batch against (None = no comparison).

    Returns
    -------
    (avg_loss, early_stop, valid_mse)
        ``avg_loss`` is the mean regularized loss over the batches actually
        processed (NaN if the dataloader yielded none), ``early_stop`` tells
        whether the pass was aborted, and ``valid_mse`` is the last validation
        MSE measured (``pre_valid_mse`` if no batch was processed).

    NOTE(review): the stop rule compares consecutive mini-batches with no
    patience, so a single noisy step ends training; consider checking once per
    epoch or adding a patience counter.
    """
    model.train()  # Set the model to training mode
    total_loss = 0.0
    batches_done = 0
    early_stop = False
    # defined up front so the return below is valid even for an empty dataloader
    valid_mse = pre_valid_mse

    for u, i, r in dataloader:
        optimizer.zero_grad()  # Zero the gradients

        # data loss plus L2 regularization
        loss = model(u, i, r) + model.reg()

        loss.backward()  # Backward pass: compute gradients
        optimizer.step()  # Optimizer step: update weights

        total_loss += loss.item()
        batches_done += 1

        # evaluate on valid set to check if we need to early stop
        valid_mse = evaluate(model, valid_u, valid_i, valid_r)
        got_worse = pre_valid_mse is not None and pre_valid_mse < valid_mse
        if got_worse or np.isnan(valid_mse):
            early_stop = True
            break
        pre_valid_mse = valid_mse

    # Average over the batches that actually ran: dividing by len(dataloader)
    # would under-report the loss whenever the loop stops early.
    avg_loss = total_loss / batches_done if batches_done else float('nan')
    return avg_loss, early_stop, valid_mse

In [70]:
# Load the validation split and turn its three columns into tensors
valid_u_i_r = get_all_u_i_r(valid_file)
valid_u = torch.tensor([row[0] for row in valid_u_i_r], dtype=torch.long)
valid_i = torch.tensor([row[1] for row in valid_u_i_r], dtype=torch.long)
valid_r = torch.tensor([row[2] for row in valid_u_i_r], dtype=torch.float32)

# Model and its Adam optimizer
modelLFM = LatentFactorModel(
    alpha_init, K,
    lamb_beta_u, lamb_beta_i, lamb_gamma_u, lamb_gamma_i,
    n_users, n_items,
)
optimizer = torch.optim.Adam(modelLFM.parameters(), lr=learning_rate)

# Training loop: at most max_train_step passes, aborted as soon as
# training_step reports an early stop
pre_valid_mse = None
for epoch in range(max_train_step):
    avg_loss, early_stop, valid_mse = training_step(
        modelLFM, train_dataloader, optimizer,
        valid_u, valid_i, valid_r,
        pre_valid_mse,
    )
    if early_stop:
        print(f"Early stop at epoch {epoch + 1}, valid_mse = {valid_mse:.4f}")
        break
    # report every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}, average loss = {avg_loss:.4f}")
    # carry the latest validation MSE into the next pass
    pre_valid_mse = valid_mse


Early stop at epoch 1, valid_mse = 1.1493


In [81]:
# Score the trained model on the full test split
print("evalute on test set (seen test + unseen test)")
test_u_i_r = get_all_u_i_r(test_file)

# one tensor per column: user indices, item indices, ratings
test_u = torch.tensor([row[0] for row in test_u_i_r], dtype=torch.long)
test_i = torch.tensor([row[1] for row in test_u_i_r], dtype=torch.long)
test_r = torch.tensor([row[2] for row in test_u_i_r], dtype=torch.float32)

test_mse = evaluate(modelLFM, test_u, test_i, test_r)
print(f"test_mse: {test_mse:.4f}")

evalute on test set (seen test + unseen test)
test_mse: 1.1274
