In [1]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [2]:
import torch

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch.nn.functional as F
from torch import nn, optim, Tensor

from torch_geometric.utils import structured_negative_sampling
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn import LGConv

In [4]:
# Load the MovieLens dataset
ratings = pd.read_csv('./dataset/ml-latest-small/ratings.csv')

In [5]:
# Map user and movie IDs to indices
user_map = {user_id: i for i, user_id in enumerate(ratings['userId'].unique())}
movie_map = {movie_id: i for i, movie_id in enumerate(ratings['movieId'].unique())}

# Count the number of users, movies and total entities in the dataset
num_users = len(user_map)
num_movies = len(movie_map)
num_total = num_users + num_movies

In [6]:
# Create a tensor for user and movie indices based on the user ratings in the dataset
user_ids = torch.LongTensor([user_map[user_id] for user_id in ratings['userId']])
movie_ids = torch.LongTensor([movie_map[movie_id] for movie_id in ratings['movieId']])
edge_index = torch.stack([user_ids, movie_ids])

In [7]:
# Split the dataset into training and test sets
train_index, test_index = train_test_split(range(len(ratings)), test_size=0.2, random_state=0)
val_index, test_index = train_test_split(test_index, test_size=0.5, random_state=0)

In [8]:
train_edge_index = edge_index[:, train_index]
val_edge_index = edge_index[:, val_index]
test_edge_index = edge_index[:, test_index]

In [9]:
# Generate a batch of random indices using np.random.choice
def sample_mini_batch(edge_index):
  index = np.random.choice(range(edge_index.shape[1]), size=BATCH_SIZE)

  # Generate negative samples
  edge_index = structured_negative_sampling(edge_index)
  edge_index = torch.stack(edge_index, dim=0)

  # Select the user, positive and negative samples
  user_index = edge_index[0, index]
  pos_movie_index = edge_index[1, index]
  neg_movie_index = edge_index[2, index]

  return user_index, pos_movie_index, neg_movie_index

In [10]:
# LightGCN Model
class LightGCN(nn.Module):
  def __init__(self, num_users, num_movies, num_layers=4, dim_h=64):
    super().__init__()
    self.num_users = num_users
    self.num_movies = num_movies
    self.num_layers = num_layers
    self.emb_users = nn.Embedding(num_embeddings=self.num_users, embedding_dim=dim_h)
    self.emb_movies = nn.Embedding(num_embeddings=self.num_movies, embedding_dim=dim_h)
    self.convs = nn.ModuleList(LGConv() for _ in range(num_layers))

    # Initialize the weights
    nn.init.normal_(self.emb_users.weight, std=0.1)
    nn.init.normal_(self.emb_movies.weight, std=0.1)

  def forward(self, edge_index):
    emb = torch.cat([self.emb_users.weight, self.emb_movies.weight])
    embs = [emb]
    for conv in self.convs:
      emb = conv(x=emb, edge_index=edge_index)
      embs.append(emb)
    
    emb_final = torch.mean(torch.stack(embs, dim=1), dim=1)
    
    emb_users_final, emb_movies_final = torch.split(emb_final, [self.num_users, self.num_movies])
    return emb_users_final, self.emb_users.weight, emb_movies_final, self.emb_movies.weight

In [11]:
# Calculate the loss
def bpr_loss(emb_users_final, emb_users, emb_pos_movies_final, emb_pos_movies, emb_neg_movies_final, emb_neg_movies):
  reg_loss = LAMBDA * (emb_users.norm().pow(2) +
                       emb_pos_movies.norm().pow(2) +
                       emb_neg_movies.norm().pow(2))
  pos_ratings = torch.mul(emb_users_final, emb_pos_movies_final).sum(dim=-1)
  neg_ratings = torch.mul(emb_users_final, emb_neg_movies_final).sum(dim=-1)
  bpr_loss = torch.mean(torch.nn.functional.softplus(pos_ratings - neg_ratings))

  return -bpr_loss + reg_loss

In [12]:
def get_user_items(edge_index):
  user_items = dict()
  for i in range(edge_index.shape[1]):
    user = edge_index[0][i].item()
    item = edge_index[1][i].item()
    if user not in user_items:
      user_items[user] = []
    user_items[user].append(item)
  return user_items

In [13]:
def compute_recall_at_k(items_ground_truth, items_predicted):
  num_correct_pred = np.sum(items_predicted, axis=1)
  num_total_pred = np.array([len(items_ground_truth[i]) for i in range(len(items_ground_truth))])

  recall = np.mean(num_correct_pred / num_total_pred)

  return recall

In [14]:
def compute_ndcg_at_k(items_ground_truth, items_predicted):
  test_matrix = np.zeros((len(items_predicted), K))

  for i, items in enumerate(items_ground_truth):
    length = min(len(items), K)
    test_matrix[i, :length] = 1
  
  max_r = test_matrix
  idcg = np.sum(max_r * 1. / np.log2(np.arange(2, K + 2)), axis=1)
  dcg = items_predicted * (1. / np.log2(np.arange(2, K + 2)))
  dcg = np.sum(dcg, axis=1)
  idcg[idcg == 0.] = 1.
  ndcg = dcg / idcg
  ndcg[np.isnan(ndcg)] = 0.
  
  return np.mean(ndcg)

In [15]:
# wrapper function to get evaluation metrics
def get_metrics(model, edge_index, exclude_edge_indices):

  ratings = torch.matmul(model.emb_users.weight, model.emb_movies.weight.T)

  for exclude_edge_index in exclude_edge_indices:
    user_pos_items = get_user_items(exclude_edge_index)
    exclude_users = []
    exclude_items = []
    for user, items in user_pos_items.items():
      exclude_users.extend([user] * len(items))
      exclude_items.extend(items)
    ratings[exclude_users, exclude_items] = -1024

  # get the top k recommended items for each user
  _, top_K_items = torch.topk(ratings, k=K)

  # get all unique users in evaluated split
  users = edge_index[0].unique()

  test_user_pos_items = get_user_items(edge_index)

  # convert test user pos items dictionary into a list
  test_user_pos_items_list = [test_user_pos_items[user.item()] for user in users]

  # determine the correctness of topk predictions
  items_predicted = []
  for user in users:
    ground_truth_items = test_user_pos_items[user.item()]
    label = list(map(lambda x: x in ground_truth_items, top_K_items[user]))
    items_predicted.append(label)

  recall = compute_recall_at_k(test_user_pos_items_list, items_predicted)
  ndcg = compute_ndcg_at_k(test_user_pos_items_list, items_predicted)

  return recall, ndcg

In [16]:
# wrapper function to evaluate model
def test(model, edge_index, exclude_edge_indices):
  emb_users_final, emb_users, emb_items_final, emb_items = model.forward(edge_index)
  user_indices, pos_item_indices, neg_item_indices = structured_negative_sampling(edge_index, contains_neg_self_loops=False)

  emb_users_final, emb_users = emb_users_final[user_indices], emb_users[user_indices]
  emb_pos_items_final, emb_pos_items = emb_items_final[pos_item_indices], emb_items[pos_item_indices]
  emb_neg_items_final, emb_neg_items = emb_items_final[neg_item_indices], emb_items[neg_item_indices]

  loss = bpr_loss(emb_users_final, emb_users, emb_pos_items_final, emb_pos_items, emb_neg_items_final, emb_neg_items).item()

  recall, ndcg = get_metrics(model, edge_index, exclude_edge_indices)

  return loss, recall, ndcg

## Training Model

In [17]:
K = 20
LAMBDA = 1e-6
BATCH_SIZE = 1024

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
model = LightGCN(num_users, num_movies)
model = model.to(device)
edge_index = edge_index.to(device)
train_edge_index = train_edge_index.to(device)
val_edge_index = val_edge_index.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [19]:
num_batch = int(len(train_index) / BATCH_SIZE)
for epoch in range(1, 31):
  model.train()
  for batch in range(num_batch):
    optimizer.zero_grad()

    emb_users_final, emb_users, emb_movies_final, emb_movies = model.forward(train_edge_index)
    
    user_index, pos_movies_index, neg_movies_index = sample_mini_batch(train_edge_index)
    
    emb_users_final, emb_users = emb_users_final[user_index], emb_users[user_index]
    emb_pos_movies_final, emb_pos_movies = emb_movies_final[pos_movies_index], emb_movies_final[pos_movies_index]
    emb_neg_movies_final, emb_neg_movies = emb_movies_final[neg_movies_index], emb_movies_final[neg_movies_index]

    train_loss = bpr_loss(emb_users_final, emb_users, emb_pos_movies_final, emb_pos_movies, emb_neg_movies_final, emb_neg_movies)
    
    train_loss.backward()
    optimizer.step()
    
  if epoch % 5 == 0:
    model.eval()
    val_loss, recall, ndcg = test(model, val_edge_index, [train_edge_index])
    print(f"Epoch {epoch} | Train loss: {train_loss.item():.5f} | Val loss: {val_loss:.5f} | Val recall@{K}:{recall:.5f} | Val ndcg@{K}: {ndcg:.5f}")

Epoch 5 | Train loss: -17.36334 | Val loss: -6.74925 | Val recall@20:0.10559 | Val ndcg@20: 0.10656
Epoch 10 | Train loss: -66.57384 | Val loss: -26.11737 | Val recall@20:0.10667 | Val ndcg@20: 0.11183
Epoch 15 | Train loss: -143.30582 | Val loss: -55.93231 | Val recall@20:0.10721 | Val ndcg@20: 0.11212
Epoch 20 | Train loss: -263.75146 | Val loss: -95.26365 | Val recall@20:0.11006 | Val ndcg@20: 0.11351
Epoch 25 | Train loss: -368.48038 | Val loss: -145.20323 | Val recall@20:0.11110 | Val ndcg@20: 0.11379
Epoch 30 | Train loss: -477.84818 | Val loss: -200.85176 | Val recall@20:0.11053 | Val ndcg@20: 0.11375


In [None]:
# Load movie metadata
movies_df = pd.read_csv('./dataset/ml-latest-small/movies.csv')

# Reverse mapping to go from movie index to movieId
index_to_movie_id = {index: movie_id for movie_id, index in movie_map.items()}

In [None]:
# Recommendation function
def recommend_movies(model, user_id, train_edge_index, top_k=10):
    model.eval()
    
    with torch.no_grad():
        emb_users_final, _, emb_movies_final, _ = model.forward(train_edge_index)
        
        user_idx = user_map[user_id]
        user_embedding = emb_users_final[user_idx]
        
        scores = torch.matmul(user_embedding, emb_movies_final.T)

        rated_movie_indices = train_edge_index[1][train_edge_index[0] == user_idx]
        scores[rated_movie_indices] = -1024

        top_scores, top_movie_indices = torch.topk(scores, top_k)

        top_movie_ids = [index_to_movie_id[idx.item()] for idx in top_movie_indices]
        top_scores_list = top_scores.cpu().numpy().tolist()

        recommendations = movies_df[movies_df['movieId'].isin(top_movie_ids)].copy()
        
        score_map = dict(zip(top_movie_ids, top_scores_list))
        recommendations["score"] = recommendations["movieId"].map(score_map)

        recommendations = recommendations.sort_values(by="score", ascending=False)

        return recommendations[['movieId', 'title', 'genres', 'score']]

In [None]:
test_user_id = 1
recommendations = recommend_movies(model, test_user_id, train_edge_index, top_k=10)
print(recommendations)