In [2]:
!pip install torch_geometric



In [82]:
from torch_geometric.nn.models.lightgcn import LightGCN
import pandas as pd
import os
from tqdm import tqdm
import torch
import numpy as np

## Load Data
We can begin by loading in the user review data. For each user, we have a subset of the movies that they reviewed. We'll load each of the CSVs as dataframes, and store a dict of user IDs corresponding to their dataframes.

In [83]:
# for now we will use the first 10k rows of the data, set to None to use all data
AMOUNT_TO_LOAD = 1000

In [84]:
user_reviews_dir = 'user_reviews'
user_review_data = dict()

for filename in tqdm(os.listdir(user_reviews_dir)):
    if AMOUNT_TO_LOAD is not None and len(user_review_data) >= AMOUNT_TO_LOAD:
        break
    try:
        user_review_data[filename] = pd.read_csv(os.path.join(user_reviews_dir, filename), encoding='unicode_escape')
    except pd.errors.EmptyDataError:
        print(f'Empty file: {filename}')
        pass

  2%|▏         | 1000/63111 [00:02<02:59, 345.62it/s]


Now let's split the data into training, validation, and test sets. Since this is a recommender, we're gonna split by removing some of the user's reviews.

For every user, so long as the user has more than 5 reviews, remove one review for the validation set and one review for the test set.

In [85]:
print(list(user_review_data.keys())[0])

asel82_reviews.csv


In [86]:
# remove all values with nan in the review column
for key in tqdm(user_review_data.keys()):
    user_review_data[key] = user_review_data[key].dropna(subset=['movie_rating'])

100%|██████████| 1000/1000 [00:01<00:00, 884.26it/s]


In [87]:
train_reviews = []
validation_reviews = []
test_reviews = []
for user_id, reviews in tqdm(user_review_data.items()):
    if len(reviews) > 50:
        validation_review_data_df = reviews.sample(15, replace=False)
        validation_review_data = validation_review_data_df.to_dict('records')
        for review in validation_review_data:
            review['user_id'] = user_id
        validation_reviews.extend(validation_review_data)
        # remove the validation reviews from the training data
        reviews = reviews.drop(validation_review_data_df.index)
        test_review_data_df = reviews.sample(15, replace=False)
        test_review_data = test_review_data_df.to_dict('records')
        for review in test_review_data:
            review['user_id'] = user_id
        test_reviews.extend(test_review_data)
        # remove the test reviews from the training data
        reviews = reviews.drop(test_review_data_df.index)
        train_review_data = reviews.to_dict('records')
        for review in train_review_data:
            review['user_id'] = user_id
        train_reviews.extend(train_review_data)
    else:
        # if the user has less than 5 reviews, we will use all of them for training
        train_review_data = reviews.to_dict('records')
        for review in train_review_data:
            review['user_id'] = user_id
        train_reviews.extend(train_review_data)

print(f'Train reviews: {len(train_reviews)}')
print(f'Validation reviews: {len(validation_reviews)}')
print(f'Test reviews: {len(test_reviews)}')

100%|██████████| 1000/1000 [00:02<00:00, 340.22it/s]

Train reviews: 396059
Validation reviews: 12300
Test reviews: 12300





## Build the Model
Now that we have the training data, let's construct the model to train.

In [104]:
num_train_users = len(set([review['user_id'] for review in train_reviews]))
num_train_items = len(set([review['movie_id'] for review in train_reviews]))
num_total_items = len(set([review['movie_id'] for review in train_reviews + validation_reviews + test_reviews]))
num_nodes = num_train_users + num_total_items
print(f'Number of train users: {num_train_users}')
print(f'Number of train items: {num_train_items}')
print(f'Number of nodes: {num_nodes}')

Number of train users: 1000
Number of train items: 43225
Number of nodes: 45001


In [105]:
num_val_users = len(set([review['user_id'] for review in validation_reviews]))
num_val_items = len(set([review['movie_id'] for review in validation_reviews]))
num_val_nodes = num_val_users + num_val_items

In [106]:
# Let's map users to ids
movie_id_to_movie_name = dict()
for review in train_reviews + validation_reviews + test_reviews:
    movie_id_to_movie_name[review['movie_id']] = review['movie_title']

user_to_id = dict()
for i, user_id in enumerate(set([review['user_id'] for review in train_reviews + validation_reviews + test_reviews])):
    user_to_id[user_id] = i

# Let's map movies to ids
movie_to_id = dict()
for i, movie_id in enumerate(set([review['movie_id'] for review in train_reviews + validation_reviews + test_reviews])):
    movie_to_id[movie_id] = i + num_train_users

# Let's map ids to users
id_to_user = dict()
for user_id, index in user_to_id.items():
    id_to_user[index] = user_id

# Let's map ids to movies
id_to_movie = dict()
for movie_id, index in movie_to_id.items():
    id_to_movie[index] = movie_id

# Let's map movie names to movie ids
movie_name_to_movie_id = dict()
for movie_id, movie_name in movie_id_to_movie_name.items():
    movie_name_to_movie_id[movie_name] = movie_id

In [107]:
import random

def convert_review_to_edge(review):
    user_id = user_to_id[review['user_id']]
    movie_id = movie_to_id[review['movie_id']]
    edge_weight = review['movie_rating']
    if (edge_weight < 3.5 and edge_weight > 2.5):
        return None, None
    edge = (user_id, movie_id)
    edge_weight = review['movie_rating']
    return edge, edge_weight

def shuffle_edges_and_edge_weights(edges, edge_weights):
    c = list(zip(edges, edge_weights))
    random.shuffle(c)
    return zip(*c)

def convert_reviews_to_edges(reviews):
    edges = []
    edge_weights = []
    for review in tqdm(reviews):
        edge, edge_weight = convert_review_to_edge(review)
        if edge is not None:
            edges.append(edge)
            edge_weights.append(edge_weight)
    
    # Reformat the edges to be a tensor
    edges = torch.tensor(edges, dtype=torch.long).t().contiguous()
    return edges, edge_weights

In [108]:
# Now let's create the edges between users and movies.
# The id of the user will be the index of the user in the user_to_id dict
# The id of the movie will be the index of the movie in the movie_to_id dict + the number of users

train_edges, train_edge_weights = convert_reviews_to_edges(train_reviews)
validation_edges, validation_edge_weights = convert_reviews_to_edges(validation_reviews)

print(f'Train edges: {train_edges.shape[1]}')
print(f'Validation edges: {validation_edges.shape[1]}')

100%|██████████| 396059/396059 [00:00<00:00, 704428.25it/s]
100%|██████████| 12300/12300 [00:00<00:00, 605224.47it/s]

Train edges: 318166
Validation edges: 10260





In [109]:
import torch_geometric.data as data

# create the graph
train_graph = data.Data(
    edge_index=train_edges,
    edge_attr=torch.tensor(train_edge_weights),
    num_nodes=num_nodes
)

validation_graph = data.Data(
    edge_index=validation_edges,
    edge_attr=torch.tensor(validation_edge_weights),
    num_nodes=num_nodes
)

In [110]:
train_graph.validate(raise_on_error=True)
validation_graph.validate(raise_on_error=True)

True

In [81]:
# Let's create some negative edges
def resample_edges_for_user(user_positive_edges, user_negative_edges):
    num_negative_edges_to_add = user_positive_edges.shape[1] * 3 - user_negative_edges.shape[1]
    if (num_negative_edges_to_add <= 0):
        num_negative_edges_to_remove = -num_negative_edges_to_add
        # choose the negative edges to keep
        negative_edges_to_keep = torch.randint(user_negative_edges.shape[1], (user_negative_edges.shape[1] - num_negative_edges_to_remove,))
        # remove all the negative edges for this user
        user_negative_edges = user_negative_edges[:, negative_edges_to_keep]
    else:
        # Create new negative edges
        negative_edges_to_add = torch.tensor([[user_id] * num_negative_edges_to_add, torch.randint(num_train_users, num_train_items, (num_negative_edges_to_add,))], dtype=torch.long)
        # Add the negative edges to the negative edges for this user
        user_negative_edges = torch.cat([user_negative_edges, negative_edges_to_add], dim=1)
    return user_positive_edges, user_negative_edges
        

In [43]:
# let's compute ndcg
def compute_ndcg_at_k(relevances, k=5):
    relevances = relevances[:k]
    dcg = 0
    for i, relevance in enumerate(relevances):
        dcg += (2 ** relevance - 1) / np.log2(i + 2)
    idcg = 0
    for i, relevance in enumerate(sorted(relevances, reverse=True)):
        idcg += (2 ** relevance - 1) / np.log2(i + 2)
    return dcg / idcg

In [44]:
def get_user_positive_items(edge_index):
    """Generates dictionary of positive items for each user

    Args:
        edge_index (torch.Tensor): 2 by N list of edges

    Returns:
        dict: dictionary of positive items for each user
    """
    user_pos_items = {}
    for i in range(edge_index.shape[1]):
        user = edge_index[0][i].item()
        item = edge_index[1][i].item()
        if user not in user_pos_items:
            user_pos_items[user] = []
        user_pos_items[user].append(item)
    return user_pos_items

In [45]:
import time
def compute_recall_at_k(validation_graph, model, K):
    # get positive edges in validation set
    positive_edges = validation_graph.edge_index[:, validation_graph.edge_attr > 3.5]

    # map users to positive edges
    user_pos_items = get_user_positive_items(positive_edges)

    # get users
    users = positive_edges[0].unique()

    users = users[torch.randint(users.shape[0], (min(200, len(users)),))]
    # filter the validation edges to only the users we want to evaluate
    user_validation_edges = []
    for user in users:
        user_validation_edges.append(validation_graph.edge_index[:, validation_graph.edge_index[0] == user])
    user_validation_edges = torch.cat(user_validation_edges, dim=1)
    print(user_validation_edges.shape)

    first_user_id = users[0].item()
    user_name = id_to_user[first_user_id]
    print(f'User: {user_name}')

    # get movies
    movie_indices = torch.LongTensor([_ for _ in range(len(users) + 1, validation_graph.num_nodes)]).to(device)

    # Get positive items for each user in validation set
    truth_items = [set(user_pos_items[user.item()]) for user in users]

    first_user_truth_items = truth_items[0]
    first_user_truth_items = [id_to_movie[item] for item in first_user_truth_items]
    first_user_truth_items = [movie_id_to_movie_name[item] for item in first_user_truth_items]
    print(first_user_truth_items)

    training_edges = train_graph.edge_index

    # Get top-K recommended items for each user in validation set
    total_recall = 0
    print("Computing recommendations for {} users".format(len(users)))
    for user_index, user_id in tqdm(enumerate(users), total=len(users)):
        tick = time.time()
        all_edges = torch.tensor([(user_id, item_id) for item_id in range(num_train_users, num_train_items)], dtype=torch.long).t().contiguous()
        recommendations = model.recommend(all_edges.to(device), src_index=torch.tensor([user_id]), dst_index=torch.tensor([x for x in range(num_train_users + 1, num_train_items)]), k=3 * K)[0]
        tock = time.time()
        train_edges_for_user = training_edges[:, training_edges[0] == user_id]
        # remove all the recommendations that are in the training set
        recommendations = recommendations[~torch.isin(recommendations, train_edges_for_user[1])][:K]
        if (len(recommendations) < K):
            print("Not enough recommendations for user {}".format(user_id))
        if (user_id == first_user_id):
            first_user_recommended_items = recommendations
            first_user_recommended_items = [id_to_movie[item.item()] for item in first_user_recommended_items if item.item() > num_train_users]
            first_user_recommended_items = [movie_id_to_movie_name[item] for item in first_user_recommended_items if item in movie_id_to_movie_name]
            print(first_user_recommended_items)
        # num_intersect = 0
        truth_items_for_user = truth_items[user_index]
        # for item in recommendations:
        #     item = item.item()
        #     if item in truth_items_for_user:
        #         num_intersect += 1
        # print(num_intersect)
        num_intersect = len(set([item.item() for item in recommendations]).intersection(truth_items[user_index]))
        recall = num_intersect / len(truth_items_for_user)
        total_recall += recall
    return total_recall / len(users)



In [46]:
from typing import Optional, Union

import torch
import torch.nn.functional as F
from torch import Tensor
from torch.nn import Embedding, ModuleList
from torch.nn.modules.loss import _Loss

from torch_geometric.nn.conv import LGConv
from torch_geometric.typing import Adj, OptTensor, SparseTensor

In [47]:
"""Adapted from https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/models/lightgcn.html"""
class CustomLightGCN(torch.nn.Module):
    """From the <https://arxiv.org/abs/2002.02126>` paper.

    Args:
        num_nodes (int): The number of nodes in the graph.
        embedding_dim (int): The dimensionality of node embeddings.
        num_layers (int): The number of layers.
    """
    def __init__(
        self,
        num_nodes: int,
        embedding_dim: int,
        num_layers: int
    ):
        super().__init__()

        self.num_nodes = num_nodes
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.embedding = Embedding(num_nodes, embedding_dim)
        self.alpha = torch.tensor([1. / (num_layers + 1)] * (num_layers + 1))
        self.convs = ModuleList([GATConv(embedding_dim, embedding_dim, heads=8, dropout=0.6) for _ in range(num_layers)])
        self.linears = ModuleList([Linear(embedding_dim * 8, embedding_dim) for _ in range(num_layers)])
        torch.nn.init.xavier_uniform_(self.embedding.weight)

    def get_embedding(self, edge_index):
        x = self.embedding.weight
        out = x * self.alpha[0]

        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            x = self.linears[i](x.view(-1, self.embedding_dim * 8))
            out = out + x * self.alpha[i + 1]

        return out


    def forward(self, edge_index):
        edge_label_index = edge_index
        out = self.get_embedding(edge_index)
        user = out[edge_label_index[0]]
        movie = out[edge_label_index[1]]
        return (user * movie).sum(dim=-1)


    def predict_link(self, edge_index, edge_label_index):
        "Predict links between nodes specified in edge_label_index."""
        pred = self(edge_index, edge_label_index).sigmoid()
        return pred.round()


    def recommend(self, edge_index, k):
        """Get top-k recommendations for nodes in src_index."""
        out_user = self.get_embedding(edge_index)
        out_movie = self.get_embedding(edge_index)
        pred = out_user @ out_movie.t()
        top_index = pred.topk(k, dim=-1).indices
        return top_index


    def link_pred_loss(self, pred, edge_label):
        """Computes the model loss for a link prediction using torch.nn.BCEWithLogitsLoss.
        
        Args:
            pred (torch.Tensor): The predictions.
            edge_label (torch.Tensor): The ground-truth edge labels.
        """
        loss_fn = torch.nn.BCEWithLogitsLoss()
        return loss_fn(pred, edge_label.to(pred.dtype))


    def recommendation_loss(self, pos_edge_rank, neg_edge_rank,
                            lambda_reg: float = 1e-4):
        """Computes the model loss for a ranking objective via the Bayesian
        Personalized Ranking (BPR) loss.

        Args:
            pos_edge_rank (torch.Tensor): Positive edge rankings.
            neg_edge_rank (torch.Tensor): Negative edge rankings.
            lambda_reg (int, optional): The L2 regularization strength
                of the Bayesian Personalized Ranking (BPR) loss.
        """
        loss_fn = BPRLoss(lambda_reg)
        return loss_fn(pos_edge_rank, neg_edge_rank, self.embedding.weight)

In [48]:
""" This is verbatim from https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/models/lightgcn.html. """
class BPRLoss(_Loss):
    """The Bayesian Personalized Ranking (BPR) loss."""
    __constants__ = ['lambda_reg']
    lambda_reg: float

    def __init__(self, lambda_reg: float = 0, **kwargs):
        super().__init__(None, None, "sum", **kwargs)
        self.lambda_reg = 0

    def forward(self, positives: Tensor, negatives: Tensor,
                parameters: Tensor = None) -> Tensor:
        """Compute the mean Bayesian Personalized Ranking (BPR) loss.

        Args:
            positives (Tensor): The vector of positive-pair rankings.
            negatives (Tensor): The vector of negative-pair rankings.
            parameters (Tensor, optional): The tensor of parameters which
                should be used for :math:`L_2` regularization
                (default: :obj:`None`).
        """
        n_pairs = positives.size(0)
        log_prob = F.logsigmoid(positives - negatives).mean()
        regularization = 0

        if self.lambda_reg != 0:
            regularization = self.lambda_reg * parameters.norm(p=2).pow(2)

        return (-log_prob + regularization) / n_pairs

In [165]:
def resample_hard_negative_edges_for_user(user_positive_edges, user_negative_edges, model, num_train_items, epoch):
    # Select hard negative edges based on current model parameters
    user_positive_items = user_positive_edges[1, :]
    
    # get the rankings for this user
    all_edges = torch.tensor([(user_id, item_id) for item_id in range(num_train_users, num_train_items)], dtype=torch.long).t().contiguous()
    with torch.no_grad():
        user_rankings = model.forward(all_edges) # this is of shape (42263) -- each index is the prediction for that index's movie
    mask = torch.ones(num_train_items - num_train_users, dtype=torch.bool) # gets indices of all the movies
    pos_items_mask = user_positive_items < num_train_items - num_train_users
    filtered_pos_items = user_positive_items[pos_items_mask]
    mask[filtered_pos_items] = True

    # get the rankings for negative items
    negative_rankings = user_rankings[mask]
    
    # select the top k negative items for this user
    if epoch != 0:
        k = min(epoch - 1, negative_rankings.shape[0])
    else:
        k = 0

    _, topk_items = torch.topk(negative_rankings, k)
    negative_items = torch.nonzero(mask).flatten()[topk_items]

    # create the new negative edges
    negative_edges_to_add = torch.tensor([[user_id] * k, negative_items], dtype=torch.long)
    new_negative_edges = torch.cat([user_negative_edges, negative_edges_to_add], dim=1)

    return user_positive_edges, new_negative_edges

In [168]:
import numpy as np
import math
import matplotlib.pyplot as plt

NUM_LAYERS = 1
LR = 5e-5
BATCH_SIZE = min(128, len(user_review_data))
EMBEDDING_DIM = 64
K = 10
model = LightGCN(num_nodes=num_nodes, embedding_dim=EMBEDDING_DIM, num_layers=NUM_LAYERS)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

print("Running on device: {}".format(device))
print(EMBEDDING_DIM)

optim = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.95)
# scheduler = torch.optim.lr_scheduler.MultiStepLR(optim, milestones=[100, 200, 300, 400], gamma=0.5)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optim, T_0=100)

train_positive_edges = train_graph.edge_index[:, train_graph.edge_attr >= 3.5]
train_negative_edges = train_graph.edge_index[:, train_graph.edge_attr <= 2.5]

validation_df = pd.DataFrame.from_dict(validation_reviews)
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(comment=f'LightGCN_{EMBEDDING_DIM}_layers_{NUM_LAYERS}_batch_size_{BATCH_SIZE}_lr_{LR}_num_train_users_{num_train_users}_num_train_items_{num_train_items}_recall_{K}')

for epoch in range(10):
    # we are using BPR so we go by user
    average_loss = 0
    # We'll proceed in batches of users
    for start_idx in tqdm(range(0, num_train_users, BATCH_SIZE)):
        model.train()
        loss = torch.tensor(0.0, requires_grad=True)
        # randomly select a batch of users
        users_in_batch = torch.randperm(num_train_users)[start_idx:start_idx + BATCH_SIZE]
        for user_id in users_in_batch:
            # get all the edges specific to this user
            user_positive_edges = train_positive_edges[:, train_positive_edges[0] == user_id]
            user_negative_edges = train_negative_edges[:, train_negative_edges[0] == user_id]
            if (user_positive_edges.shape[1] == 0 or user_negative_edges.shape[1] == 0):
                continue
            # limit the number of positive edges to 5000
            if (user_positive_edges.shape[1] > 5000):
                user_positive_edges = user_positive_edges[:, :5000]
            # Get at most 15000 negative edges
            if (user_negative_edges.shape[1] > 15000):
                user_negative_edges = user_negative_edges[:, :15000]
            # sample hard negative edges
            user_positive_edges, user_negative_edges = resample_hard_negative_edges_for_user(user_positive_edges, user_negative_edges, model, num_train_items, epoch)
            # resample negative edges to make sure we have enough
            user_positive_edges, user_negative_edges = resample_edges_for_user(user_positive_edges, user_negative_edges)
            # concatenate the positive and negative edges
            user_edges = torch.cat([user_positive_edges, user_negative_edges], dim=1)
            # get the rankings for this user
            user_edges = user_edges.to(device)
            user_rankings = model(user_edges)
            # divide the rankings into positive and negative rankings
            user_positive_rankings = user_rankings[:user_positive_edges.shape[1]]
            user_negative_rankings = user_rankings[user_positive_edges.shape[1]:]
            # create all pairs of positive and negative rankings
            user_positive_rankings = user_positive_rankings.unsqueeze(1).repeat(1, user_negative_rankings.shape[0])
            user_negative_rankings = user_negative_rankings.unsqueeze(0).repeat(user_positive_rankings.shape[0], 1)
            # get the user loss
            user_loss = model.recommendation_loss(user_positive_rankings, user_negative_rankings, 1e-4)
            # add the user loss to the total loss
            loss = loss + user_loss
        # divide the loss by the number of users
        loss = loss / BATCH_SIZE
        # log the loss
        # backprop
        optim.zero_grad()
        loss.backward()
        optim.step()
        writer.add_scalar("Loss/train", loss, epoch * (num_train_users // BATCH_SIZE) + start_idx // BATCH_SIZE)
        print(epoch * BATCH_SIZE + start_idx // BATCH_SIZE)
        average_loss = 0
        if (epoch * BATCH_SIZE + start_idx // BATCH_SIZE) % 100 == 0:
            # evaluate the model
            model.eval()
            # iterate over all users in the validation set
            validation_users = list(set([int(x) for x in validation_edges[0, :]]))
            # randomly select 1000 of the users
            validation_users = random.sample(validation_users, min(len(validation_users), 500))
            mean_ndcg = 0
            ndcg_scores = []
            for user in tqdm(validation_users):
                user_id = id_to_user[user]
                relevant_reviews = validation_df[validation_df['user_id'] == user_id]
                user_validation_edges = validation_edges[:, validation_edges[0] == user]
                user_validation_edges = user_validation_edges.to(device)
                user_rankings = model(user_validation_edges)
                edges_sorted = list(user_validation_edges[1, user_rankings.argsort(descending=True)])
                # use validation_df to get the relevances via the movie_id column and the movie_rating column
                relevances = []
                for edge in edges_sorted:
                    movie_id = id_to_movie[int(edge)]
                    if (movie_id in relevant_reviews['movie_id'].values):
                        relevances.append(relevant_reviews[relevant_reviews['movie_id'] == movie_id]['movie_rating'].values[0])
                    else:
                        relevances.append(0)
                # calculate the ndcg
                ndcg = compute_ndcg_at_k(relevances)
                if (math.isnan(ndcg)):
                    print(relevant_reviews)
                    input()
                mean_ndcg += ndcg
                ndcg_scores.append(ndcg)
            mean_ndcg = mean_ndcg / len(validation_users)
            print("Standard Deviation: {}".format(np.std(ndcg_scores)))
            # create a histogram of the ndcg scores, make bins for each 0.1
            ndcg_scores = np.array(ndcg_scores).squeeze()
            writer.add_histogram("hist_NDCG/val", ndcg_scores, epoch)
            # also make a histogram in matplotlib and save as png
            plt.hist(ndcg_scores, bins=np.arange(0, 1.1, 0.1))
            plt.suptitle("Validation NDCG Histogram")
            # write information about the model to the histogram
            plt.title(f"Model: LightGCN, Embedding Dim: {EMBEDDING_DIM}, Num Layers: {NUM_LAYERS}, Batch Size: {BATCH_SIZE}, LR: {LR}, Num Train Users: {num_train_users}, Num Train Items: {num_train_items}", fontsize=8, wrap=True)
            plt.xlabel("NDCG")
            plt.ylabel("Frequency")
            # save the figure in the hist_NDCG folder, with the title having the model information and the epoch number
            plt.savefig(f"hist_NDCG/val_{EMBEDDING_DIM}_{NUM_LAYERS}_{BATCH_SIZE}_{LR}_{num_train_users}_{num_train_items}_{epoch}.png")
            plt.close()
            # Also save the raw NDCG scores to a csv file, with the model information in the title, and the epoch number
            np.savetxt(f"hist_NDCG/val_{EMBEDDING_DIM}_{NUM_LAYERS}_{BATCH_SIZE}_{LR}_{num_train_users}_{num_train_items}_{epoch}.csv", ndcg_scores, delimiter=",")
            print(mean_ndcg)
            writer.add_scalar("NDCG/val", mean_ndcg, epoch * (num_train_users // BATCH_SIZE) + start_idx // BATCH_SIZE)
            recall_at_k = compute_recall_at_k(validation_graph, model, K)
            writer.add_scalar("Recall@K/val", recall_at_k, epoch * (num_train_users // BATCH_SIZE) + start_idx // BATCH_SIZE)
            print("Epoch: {}, NDCG: {}, Recall@{}: {}".format(epoch, mean_ndcg, K, recall_at_k))
            average_number_of_matches = 0
            for user_id in validation_users:
                all_edges = torch.tensor([(user_id, item_id) for item_id in range(num_train_users, num_train_items)], dtype=torch.long).t().contiguous()
                recommendations = model.recommend(all_edges.to(device), src_index=torch.tensor([user_id]), dst_index=torch.tensor([x for x in range(num_train_users + 1, num_train_items)]), k=10)[0]
                movie_names = [movie_id_to_movie_name[id_to_movie[int(recommendation)]] for recommendation in recommendations]
                true_user_reviews = user_review_data[id_to_user[user_id]]
                matches = 0
                for movie_name in movie_names:
                    if movie_name in true_user_reviews['movie_title'].values:
                        matches += 1
                average_number_of_matches += matches
            average_number_of_matches = average_number_of_matches / len(validation_users)
            print("Average number of matches: {}".format(average_number_of_matches))
            writer.add_scalar("Average number of matches", average_number_of_matches, epoch * (num_train_users // BATCH_SIZE) + start_idx // BATCH_SIZE)
            print("=====================================")

Running on device: cpu
64


  0%|          | 0/8 [00:05<?, ?it/s]


KeyboardInterrupt: 

In [None]:
validation_users = list(set([int(x) for x in validation_edges[0, :]]))
validation_df[validation_df.user_id == id_to_user[0]]

In [None]:
validation_edges[:, validation_edges[0] == 0]

In [None]:
def get_user_positive_items(edge_index):
    """Generates dictionary of positive items for each user

    Args:
        edge_index (torch.Tensor): 2 by N list of edges

    Returns:
        dict: dictionary of positive items for each user
    """
    user_pos_items = {}
    for i in range(edge_index.shape[1]):
        user = edge_index[0][i].item()
        item = edge_index[1][i].item()
        if user not in user_pos_items:
            user_pos_items[user] = []
        user_pos_items[user].append(item)
    return user_pos_items

In [None]:
print()