In [25]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from matplotlib import pyplot as plt
from collections import defaultdict
import random
import csv
import os

In [26]:
# CSV files for each data split (paths are relative to the notebook's working directory)
train_file, valid_file = 'train_set.csv', 'valid_set.csv'
seen_test_file, unseen_test_file = 'seen_test_set.csv', 'unseen_test_set.csv'
test_file = 'test_set.csv'

def get_all_data(file_name):
    """Load one interaction CSV into a list of per-row records.

    Parameters
    ----------
    file_name : str or path-like
        Comma-separated file containing at least the columns ``user_idx``,
        ``item_idx``, ``rating``, ``size_idx``, ``fit_idx``,
        ``user_attr_idx``, ``model_attr_idx``, ``category_idx``,
        ``brand_idx``, ``year_idx`` and ``split_idx``.

    Returns
    -------
    list[list]
        One ``[user, item, rating, size, fit, user_attr, model_attr,
        category, brand, year, split]`` list per CSV row, in file order.
        Every ``*_idx`` entry is a Python ``int``; ``rating`` is the value
        pandas parsed for that column, as a plain Python scalar.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    ValueError
        If an index column holds a value that cannot be cast to an integer
        (for example NaN).
    """
    df = pd.read_csv(file_name, sep=',')

    # Listed in the order the values appear in each record (rating is
    # inserted after user/item below).
    index_columns = ['user_idx', 'item_idx',
                     'size_idx', 'fit_idx',
                     'user_attr_idx', 'model_attr_idx',
                     'category_idx', 'brand_idx',
                     'year_idx', 'split_idx']

    # Cast whole columns at once instead of walking the frame with
    # ``iterrows``, which materialises a Series per row and is very slow on
    # large files.
    indices = df[index_columns].astype('int64')
    ratings = df['rating'].tolist()

    columns_in_record_order = (
        [indices['user_idx'].tolist(), indices['item_idx'].tolist(), ratings]
        + [indices[name].tolist() for name in index_columns[2:]]
    )
    return [list(record) for record in zip(*columns_in_record_order)]

# Parse the training split first, then the validation split, into record lists
train_data, valid_data = [get_all_data(csv_path) for csv_path in (train_file, valid_file)]

In [27]:
# Global mean rating over the training and validation records together
avg_rating = np.mean([record[2] for record in train_data + valid_data])
print(f'average rating: {avg_rating}')

# Largest user / item index that occurs in either split
n_users = max(max(record[0] for record in split) for split in (train_data, valid_data))
n_items = max(max(record[1] for record in split) for split in (train_data, valid_data))
print(n_users)
print(n_items)

average rating: 4.2093936748808165
44683
1019


In [28]:
# Dataset that serves interaction records as tensors for the latent factor
# model with features
class U_I_R_features_Dataset(Dataset):
    """Map-style dataset over a list of interaction records.

    Each record is a sequence of eleven values in the order
    ``[user_idx, item_idx, rating, size_idx, fit_idx, user_attr_idx,
    model_attr_idx, category_idx, brand_idx, year_idx, split_idx]``.
    """

    def __init__(self, data_list):
        # The list is stored as-is (no copy); records are converted lazily.
        self.data_list = data_list

    def __len__(self):
        """Number of stored records."""
        return len(self.data_list)

    def __getitem__(self, idx):
        """Return record ``idx`` as an 11-tuple of 0-d tensors.

        The rating (third entry) is ``float32``; every other entry is
        ``long``.
        """
        (user, item, rating,
         size, fit,
         user_attr, model_attr,
         category, brand,
         year, split) = self.data_list[idx]

        def as_index(value):
            return torch.tensor(value, dtype=torch.long)

        categorical = (size, fit, user_attr, model_attr, category, brand, year, split)
        return (as_index(user), as_index(item),
                torch.tensor(rating, dtype=torch.float32),
                *map(as_index, categorical))

In [29]:
# ---- tunable hyper-parameters ----

# model size and starting point
K = 5                     # dimension of gamma_u / gamma_i
alpha_init = avg_rating   # global offset starts at the mean rating; probably needs no tuning

# L2 penalty weights for the bias terms and the latent factors
lamb_beta_u, lamb_beta_i = 0.01, 0.01
lamb_gamma_u, lamb_gamma_i = 0.01, 0.01

# one shared penalty for all feature embeddings
# (could be split into one lambda per feature)
lamb_attr_vec = 0.01

# optimisation settings
learning_rate, batch_size = 0.01, 1024
max_train_step = 50

In [30]:
# Create the datasets and dataloaders.
# Training batches are reshuffled every time the loader is iterated.
train_dataset = U_I_R_features_Dataset(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# The validation set is served as one full-size batch. Shuffling a single
# batch does not change which samples are scored; it only costs a random
# permutation on every evaluation and perturbs float summation order, so it
# is switched off.
valid_dataset = U_I_R_features_Dataset(valid_data)
valid_dataloader = DataLoader(valid_dataset, batch_size=len(valid_data), shuffle=False)

In [32]:
class LatentFactorModelWithFeatures(nn.Module):
    """Latent factor rating model with additive categorical feature embeddings.

    The predicted rating of one (user, item, features) triple is

        alpha + beta_u + beta_i + <gamma_u + sum_f rho_f, gamma_i>

    where ``rho_f`` runs over the size, fit, user-attribute,
    model-attribute, category, brand, year and split embeddings.

    Index 0 of the user, item, size, fit, user-attribute, model-attribute,
    category and brand tables is a padding slot (unseen user/item or missing
    feature value). Its row is exactly zero after construction and receives
    no gradient from predictions. Year and split have no missing-value slot,
    so their index 0 is an ordinary, trainable category.

    Parameters
    ----------
    alpha_init : float
        Initial value of the global offset ``alpha``.
    K : int
        Dimension of the latent factors and feature embeddings.
    lamb_beta_u, lamb_beta_i, lamb_gamma_u, lamb_gamma_i : float
        L2 penalty weights for the user/item biases and latent factors.
    lamb_attr_vec : float
        L2 penalty weight shared by all feature embeddings.
    num_users, num_items : int
        Largest user / item index; tables get one extra row for index 0.
    n_size, n_fit, n_user_attr, n_model_attr, n_category, n_brand : int
        Number of real values per feature; tables get one extra row for
        index 0.
    n_year, n_split : int
        Number of rows of the year / split tables (no extra row).
    """

    def __init__(self, alpha_init, K, lamb_beta_u, lamb_beta_i, lamb_gamma_u, lamb_gamma_i, lamb_attr_vec,
                 num_users, num_items,
                 n_size=9, n_fit=5,
                 n_user_attr=2, n_model_attr=2,
                 n_category=4, n_brand=31,
                 n_year=10, n_split=3):
        super(LatentFactorModelWithFeatures, self).__init__()

        # Learnable global offset, started at the supplied average rating
        self.alpha = nn.Parameter(torch.tensor(alpha_init, dtype=torch.float32))

        # Bias terms for users and items
        # user/item idx = 0 means unseen user/item
        self.betaU = nn.Embedding(num_users+1, 1, padding_idx=0)
        self.betaI = nn.Embedding(num_items+1, 1, padding_idx=0)

        # Latent factors for users and items
        self.gammaU = nn.Embedding(num_users+1, K, padding_idx=0)
        self.gammaI = nn.Embedding(num_items+1, K, padding_idx=0)

        # Feature embeddings; idx = 0 is the missing-value slot
        self.rhoSize = nn.Embedding(n_size+1, K, padding_idx=0)
        self.rhoFit = nn.Embedding(n_fit+1, K, padding_idx=0)
        self.rhoUserAttr = nn.Embedding(n_user_attr+1, K, padding_idx=0)
        self.rhoModelAttr = nn.Embedding(n_model_attr+1, K, padding_idx=0)
        self.rhoCategory = nn.Embedding(n_category+1, K, padding_idx=0)
        self.rhoBrand = nn.Embedding(n_brand+1, K, padding_idx=0)
        # Year and split have no nan value, so index 0 is a real category.
        # Declaring it as padding would block every data gradient for that
        # category, hence no padding_idx here.
        # NOTE(review): assumes year_idx / split_idx are 0-based - confirm
        # against the preprocessing that produced the CSV files.
        self.rhoYear = nn.Embedding(n_year, K)
        self.rhoSplit = nn.Embedding(n_split, K)

        self.lamb_beta_u = lamb_beta_u
        self.lamb_beta_i = lamb_beta_i
        self.lamb_gamma_u = lamb_gamma_u
        self.lamb_gamma_i = lamb_gamma_i
        self.lamb_attr_vec = lamb_attr_vec

        self._init_embeddings()

    def _feature_tables(self):
        """Return the eight feature embedding tables in a fixed order."""
        return (self.rhoSize, self.rhoFit,
                self.rhoUserAttr, self.rhoModelAttr,
                self.rhoCategory, self.rhoBrand,
                self.rhoYear, self.rhoSplit)

    def _init_embeddings(self, std=0.001):
        """Fill every table with small Gaussian noise and re-zero padding rows.

        ``nn.init.normal_`` overwrites the whole weight matrix, including the
        row that ``nn.Embedding`` zeroed for ``padding_idx``, so that row has
        to be cleared again afterwards.
        If training doesn't converge, try other initialization strategies,
        e.g. initialize all parameters to 0.
        """
        tables = (self.betaU, self.betaI, self.gammaU, self.gammaI) + self._feature_tables()
        with torch.no_grad():
            for table in tables:
                nn.init.normal_(table.weight, mean=0.0, std=std)
                if table.padding_idx is not None:
                    table.weight[table.padding_idx].zero_()

    def predict(self, u, i, size, fit, user_attr, model_attr, category, brand, year, split):
        """Predict ratings for a batch.

        All arguments are ``torch.long`` index tensors of shape ``(b,)``.
        Returns a tensor of shape ``(b,)``.
        """
        # squeeze(-1) drops only the embedding dimension, so a batch of one
        # keeps its batch axis.
        beta_u = self.betaU(u).squeeze(-1)  # (b,)
        beta_i = self.betaI(i).squeeze(-1)  # (b,)
        gamma_u = self.gammaU(u)            # (b, K)
        gamma_i = self.gammaI(i)            # (b, K)

        # Feature embeddings, each (b, K), shift the user factor additively
        user_side = (gamma_u +
                     self.rhoSize(size) + self.rhoFit(fit) +
                     self.rhoUserAttr(user_attr) + self.rhoModelAttr(model_attr) +
                     self.rhoCategory(category) + self.rhoBrand(brand) +
                     self.rhoYear(year) + self.rhoSplit(split))

        interaction = torch.sum(user_side * gamma_i, dim=-1)  # (b,)
        return self.alpha + beta_u + beta_i + interaction     # (b,)

    def reg(self):
        """Return the weighted L2 penalty over all embedding tables (scalar tensor).

        ``alpha`` is not penalised.
        """
        attr_penalty = sum(torch.sum(table.weight**2) for table in self._feature_tables())
        return (
            self.lamb_beta_u * torch.sum(self.betaU.weight**2) +
            self.lamb_beta_i * torch.sum(self.betaI.weight**2) +
            self.lamb_gamma_u * torch.sum(self.gammaU.weight**2) +
            self.lamb_gamma_i * torch.sum(self.gammaI.weight**2) +
            self.lamb_attr_vec * attr_penalty
        )

    def forward(self, u, i, r, size, fit, user_attr, model_attr, category, brand, year, split):
        """Return the mean squared error between predictions and ``r``.

        ``u``, ``i`` and the feature tensors are ``(b,)`` index tensors and
        ``r`` is a ``(b,)`` float tensor. The regulariser is not included;
        add ``reg()`` separately.
        """
        pred = self.predict(u, i, size, fit, user_attr, model_attr, category, brand, year, split)
        return torch.nn.functional.mse_loss(pred, r, reduction='mean')


# evaluate on valid/test set
def evaluate(model, dataloader):
    """Return the mean squared error of ``model`` over all of ``dataloader``.

    Parameters
    ----------
    model : nn.Module
        Must expose ``predict(u, i, size, fit, user_attr, model_attr,
        category, brand, year, split)`` returning a ``(b,)`` tensor.
    dataloader : iterable
        Yields 11-tuples ``(u, i, r, size, fit, user_attr, model_attr,
        category, brand, year, split)``; ``u``, ``i`` and the features are
        ``torch.long`` tensors of shape ``(b,)`` and ``r`` is a
        ``torch.float32`` tensor of shape ``(b,)``. A single full-size batch
        works, and so do several batches: squared errors are summed and
        divided by the total sample count.

    Returns
    -------
    float
        Mean squared error over every sample that was yielded.

    Raises
    ------
    ValueError
        If the dataloader yields no samples.
    """
    was_training = model.training
    model.eval()

    squared_error = 0.0
    n_samples = 0
    try:
        # Scoring needs no gradients; without no_grad an autograd graph over
        # the whole evaluation set would be built on every call.
        with torch.no_grad():
            for u, i, r, size, fit, user_attr, model_attr, category, brand, year, split in dataloader:
                pred = model.predict(u, i, size, fit, user_attr, model_attr, category, brand, year, split)  # (b,)
                squared_error += torch.nn.functional.mse_loss(pred, r, reduction='sum').item()
                n_samples += r.numel()
    finally:
        # Put the model back into whichever mode the caller had it in.
        model.train(was_training)

    if n_samples == 0:
        raise ValueError("evaluate() received a dataloader that yields no samples")
    return squared_error / n_samples


# training function
# early stop if mse of valid set starts to increase
def training_step(model, dataloader, optimizer, valid_dataloader, pre_valid_mse=None):
    """Run one pass over ``dataloader``, validating after every batch.

    After each optimizer update the model is scored with
    ``evaluate(model, valid_dataloader)``. The pass stops early as soon as
    that score is NaN or larger than the previous score (``pre_valid_mse``
    for the first batch, the preceding batch's score afterwards).

    NOTE(review): the check runs per mini-batch, so a single noisy uptick
    ends training; confirm this is intended rather than a per-epoch check.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(u, i, r, <8 features>)`` to obtain the data loss;
        ``model.reg()`` supplies the regularisation term.
    dataloader : iterable
        Yields 11-tuples of training tensors.
    optimizer : torch.optim.Optimizer
        Optimizer over the model parameters.
    valid_dataloader
        Passed through to ``evaluate``.
    pre_valid_mse : float or None
        Validation MSE to compare the first batch against; ``None`` skips
        the comparison for the first batch.

    Returns
    -------
    tuple
        ``(average_loss, early_stop, valid_mse)`` where ``average_loss`` is
        the mean regularised loss over the batches actually processed and
        ``valid_mse`` is the last validation score computed.

    Raises
    ------
    ValueError
        If ``dataloader`` yields no batches.
    """
    model.train()  # Set the model to training mode
    total_loss = 0.0
    batches_seen = 0
    early_stop = False
    valid_mse = pre_valid_mse

    for u, i, r, size, fit, user_attr, model_attr, category, brand, year, split in dataloader:
        optimizer.zero_grad()  # Zero the gradients

        # Data loss plus regularisation
        loss = model(u, i, r, size, fit, user_attr, model_attr, category, brand, year, split)
        loss = loss + model.reg()

        loss.backward()   # Backward pass: compute gradients
        optimizer.step()  # Optimizer step: update weights

        total_loss += loss.item()
        batches_seen += 1

        # evaluate on valid set to check if we need to early stop
        valid_mse = evaluate(model, valid_dataloader)
        print(f"valid_mse: {valid_mse:.4f}")
        if (pre_valid_mse is not None and pre_valid_mse < valid_mse) or np.isnan(valid_mse):
            early_stop = True
            break
        pre_valid_mse = valid_mse

    if batches_seen == 0:
        raise ValueError("training_step() received a dataloader that yields no batches")

    # Average over the batches that ran; dividing by len(dataloader) would
    # under-report the loss whenever the pass stops early.
    return total_loss / batches_seen, early_stop, valid_mse

In [33]:
# Seed the random number generators so a fresh "Run All" reproduces the same
# run. Parameter initialisation and DataLoader shuffling draw from torch's
# global generator; Python's and NumPy's generators are seeded as well so any
# code relying on them stays repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Initialize model
modelLFMWithFeatures = LatentFactorModelWithFeatures(alpha_init, K, lamb_beta_u, lamb_beta_i, lamb_gamma_u, lamb_gamma_i, lamb_attr_vec, n_users, n_items)

# Optimizer
optimizer = torch.optim.Adam(modelLFMWithFeatures.parameters(), lr=learning_rate)

# train loop: each iteration is one pass over the training loader; the last
# validation MSE of a pass seeds the early-stop comparison of the next pass
pre_valid_mse = None
for epoch in range(max_train_step):
    avg_loss, early_stop, valid_mse = training_step(modelLFMWithFeatures, train_dataloader, optimizer, valid_dataloader, pre_valid_mse)
    if early_stop:
        print(f"Early stop at epoch {epoch + 1}, valid_mse = {valid_mse:.4f}")
        break
    if epoch % 10 == 9:
        print(f"Epoch {epoch + 1}, average loss = {avg_loss:.4f}")
    pre_valid_mse = valid_mse

# save model
# The whole module object is pickled (not just a state_dict), so loading it
# later requires this class definition to be importable.
model_path = 'model/LFM_with_features.pth'
model_dir = os.path.dirname(model_path)
if model_dir:
    # torch.save does not create missing parent directories
    os.makedirs(model_dir, exist_ok=True)
torch.save(modelLFMWithFeatures, model_path)
print("model saved")

valid_mse: 1.1822
valid_mse: 1.1805
valid_mse: 1.1786
valid_mse: 1.1764
valid_mse: 1.1733
valid_mse: 1.1701
valid_mse: 1.1669
valid_mse: 1.1641
valid_mse: 1.1610
valid_mse: 1.1580
valid_mse: 1.1550
valid_mse: 1.1526
valid_mse: 1.1498
valid_mse: 1.1476
valid_mse: 1.1458
valid_mse: 1.1443
valid_mse: 1.1430
valid_mse: 1.1417
valid_mse: 1.1406
valid_mse: 1.1398
valid_mse: 1.1395
valid_mse: 1.1393
valid_mse: 1.1395
Early stop at epoch 1, valid_mse = 1.1395
model saved


In [35]:

# load model
if os.path.exists(model_path):
    # The checkpoint written by this notebook is a whole pickled nn.Module,
    # not a state_dict, so it cannot be read with weights_only=True (the
    # default in recent PyTorch releases). Unpickling executes arbitrary
    # code: only load checkpoint files you created yourself.
    model = torch.load(model_path, weights_only=False)
    print("model loaded")

    # evaluate on test set
    print("evalute on test set (seen test + unseen test)")
    test_data = get_all_data(test_file)

    # One full-size batch; shuffling a single batch would change nothing
    # about which samples are scored, so it is left off.
    test_dataset = U_I_R_features_Dataset(test_data)
    test_dataloader = DataLoader(test_dataset, batch_size=len(test_data), shuffle=False)

    test_mse = evaluate(model, test_dataloader)
    print(f"test_mse: {test_mse:.4f}")
else:
    print("model doesn't exist")

model loaded
evalute on test set (seen test + unseen test)
test_mse: 1.1153
