In [2]:
import time
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder
from data_utils import *
from evaluate_cbf import *
from models import *

%load_ext autoreload
%autoreload 2

In [3]:
# Initialise parameters

# --- Reproducibility -------------------------------------------------------
# A single seed is pushed into the Python, NumPy and PyTorch generators.
seed = 4242
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Re-applies the thread count PyTorch already reports.
torch.set_num_threads(torch.get_num_threads())

# --- Optimisation ----------------------------------------------------------
lr = 0.001          # learning rate given to the optimizer
batch_size = 100    # batch size given to the training DataLoader
epochs = 10         # number of passes of the training loop
dropout = 0.0       # not referenced by the cells visible here
device = "cpu"      # target of every `.to(device)` call

# --- Model dimensions ------------------------------------------------------
num_categories = 368        # passed to ContentBasedModel
num_visual_features = 512   # passed to ContentBasedModel
hidden_dim = 32             # passed to ContentBasedModel
emb_size = 100              # not referenced by the cells visible here
embedding_dim = 32          # not referenced by the cells visible here

# --- Evaluation ------------------------------------------------------------
top_k = [10, 20, 50, 100]   # passed to `metrics`
diversity_param = 0         # passed to `metrics`

# --- Logging / output locations -------------------------------------------
log_name = "log"
model_path = "./models/"

In [4]:
# Load data
# The result of `load_data()` is unpacked into 13 names, one per line so each
# can be located at a glance.
(
    user_num,
    item_num,
    train_dict,
    valid_dict,
    test_dict,
    train_data,
    valid_gt,
    test_gt,
    category_features_onehot,
    visual_features,
    train_user_profiles,
    valid_user_profiles,
    test_user_profiles,
) = load_data()

# Create training dataset
train_dataset = CBFData(
    user_item_pairs=train_data,
    num_items=item_num,
    category_features=category_features_onehot,
    visual_features=visual_features,
    user_profiles=train_user_profiles,
    train_dict=train_dict,
    is_training=True,
)

# Create dataloader object (shuffled mini-batches, 4 worker processes)
train_loader = data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)

Number of users: 506, Number of items: 1674
Training samples: 12358, Validation samples: 406, Test samples: 406
Category features shape: (1674, 368)
Visual features shape: (512,)


In [7]:
# Instantiate and train the model
#
# Trains ContentBasedModel with Adam and binary cross-entropy for `epochs`
# passes over `train_loader`. After every epoch the model is scored with
# `metrics` on the validation split, and its weights are checkpointed whenever
# the best validation F1 improves.

model = ContentBasedModel(num_categories, num_visual_features, hidden_dim)
# Batches are moved to `device` below, so the model has to live there as well.
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# BCELoss expects probabilities in [0, 1]; presumably the model ends in a
# sigmoid -- TODO confirm in models.py.
criterion = nn.BCELoss()

# One fixed checkpoint file, overwritten on every improvement. This is the same
# path the test cell passes to `torch.load`, so the best weights can actually
# be reloaded afterwards.
best_model_file = "./best_model.pth"

best_f1_score = 0

for epoch in range(epochs):
    model.train()
    start_time = time.time()
    total_loss = 0

    for batch in train_loader:
        user_category, user_visual, item_category, item_visual, labels = batch
        user_category, user_visual = user_category.to(device), user_visual.to(device)
        item_category, item_visual = item_category.to(device), item_visual.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # reshape(-1) flattens like squeeze() did, but keeps a 1-element batch
        # as shape (1,) instead of collapsing it to a 0-d tensor.
        predictions = model(user_category, user_visual, item_category, item_visual).reshape(-1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print('---'*18)
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}, Time elapsed: {time.time() - start_time:.2f}s")

    # Evaluation: switch to inference mode and skip autograd bookkeeping.
    # `model.train()` at the top of the loop restores training mode.
    # NOTE(review): the recorded run stopped with
    # `TypeError: must be real number, not dict` after the epoch line and before
    # the validation line was printed, so the error is most likely raised inside
    # `metrics` -- check its argument order/types in evaluate_cbf.py.
    model.eval()
    with torch.no_grad():
        (
            recommends,
            avg_f1,
            avg_ndcg,
            avg_ild,
            avg_recall,
            f1_scores,
            ndcg_scores,
            diversity_scores,
            recall_scores,
        ) = metrics(
            model,
            top_k,
            train_dict,
            valid_dict,
            train_user_profiles,
            category_features_onehot,
            visual_features,
            device,
            diversity_param,
            is_training=True,
        )
    print(f"Validation Metrics - F1: {f1_scores}, NDCG: {ndcg_scores}, ILD: {diversity_scores}, Recall: {recall_scores}")

    # Update best F1 score and save model if necessary.
    # If `metrics` returns a dict (presumably keyed by k -- TODO confirm),
    # max() over the dict itself would compare keys, so compare values instead.
    f1_values = f1_scores.values() if isinstance(f1_scores, dict) else f1_scores
    current_best_f1 = max(f1_values)
    if current_best_f1 > best_f1_score:
        best_f1_score = current_best_f1
        # Save the model checkpoint
        torch.save(model.state_dict(), best_model_file)
        print(f"New best model saved with F1 score: {best_f1_score}, model path: {best_model_file}")
    print('---'*18)

print("Training completed.")
print("Best F1 score: ", best_f1_score)

------------------------------------------------------
Epoch 1, Loss: 0.6849391037417997, Time elapsed: 28.57s


TypeError: must be real number, not dict

In [None]:
# Test the model
#
# Rebuilds the network, restores the checkpointed weights from
# "./best_model.pth" and reports the ranking metrics on the test split.

model = ContentBasedModel(num_categories, num_visual_features, hidden_dim)
# map_location remaps the stored tensors onto `device`, so a checkpoint written
# on another device (e.g. a GPU) still loads here.
model.load_state_dict(torch.load("./best_model.pth", map_location=device))
model.to(device)

# Inference only: evaluation mode, and no autograd graph is recorded.
model.eval()
with torch.no_grad():
    (
        recommends,
        avg_f1,
        avg_ndcg,
        avg_ild,
        avg_recall,
        f1_scores,
        ndcg_scores,
        diversity_scores,
        recall_scores,
    ) = metrics(
        model,
        top_k,
        train_dict,
        test_dict,
        test_user_profiles,
        category_features_onehot,
        visual_features,
        device,
        diversity_param,
        is_training=False,
    )
print(f"Test Metrics - F1: {f1_scores}, NDCG: {ndcg_scores}, ILD: {diversity_scores}, Recall: {recall_scores}")