In [8]:
!pip install torch-sparse torch-scatter -f https://data.pyg.org/whl/torch-2.0.0%2Bcu117.html

Looking in links: https://data.pyg.org/whl/torch-2.0.0%2Bcu117.html
Collecting torch-scatter
  Using cached https://data.pyg.org/whl/torch-2.0.0%2Bcu117/torch_scatter-2.1.1%2Bpt20cu117-cp38-cp38-win_amd64.whl (3.6 MB)
Installing collected packages: torch-scatter
Successfully installed torch-scatter-2.1.1+pt20cu117


In [37]:
!pip install pandas matplotlib tensorboard

Collecting tensorboard
  Using cached tensorboard-2.12.0-py3-none-any.whl (5.6 MB)
Collecting markdown>=2.6.8
  Using cached Markdown-3.4.1-py3-none-any.whl (93 kB)
Collecting google-auth-oauthlib<0.5,>=0.4.1
  Using cached google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)
Collecting werkzeug>=1.0.1
  Using cached Werkzeug-2.2.3-py3-none-any.whl (233 kB)
Collecting absl-py>=0.4
  Using cached absl_py-1.4.0-py3-none-any.whl (126 kB)
Collecting grpcio>=1.48.2
  Downloading grpcio-1.51.3-cp38-cp38-win_amd64.whl (3.7 MB)
     ---------------------------------------- 3.7/3.7 MB 12.0 MB/s eta 0:00:00
Collecting google-auth<3,>=1.6.3
  Downloading google_auth-2.16.2-py2.py3-none-any.whl (177 kB)
     ---------------------------------------- 177.2/177.2 kB ? eta 0:00:00
Collecting protobuf>=3.19.6
  Downloading protobuf-4.22.1-cp38-cp38-win_amd64.whl (420 kB)
     ------------------------------------- 420.6/420.6 kB 13.2 MB/s eta 0:00:00
Collecting tensorboard-plugin-wit>=1.6.0
  Using c

In [1]:
from torch_geometric.nn.models.lightgcn import LightGCN
import pandas as pd
import os
from tqdm import tqdm
import torch
import numpy as np

## Load Data
We begin by loading the user review data. Each user has one CSV file containing the subset of movies that they reviewed. We load every CSV as a dataframe and store the dataframes in a dict keyed by the file name (e.g. `0001kidd_reviews.csv`), which doubles as the user ID for the rest of the notebook.

In [2]:
# For now we only load the first 5,000 non-empty user review files; set to None to load all of them.
AMOUNT_TO_LOAD = 5000

In [3]:
user_reviews_dir = 'user_reviews'
user_review_data = dict()

# os.listdir() returns entries in arbitrary order. Sorting them makes the subset
# selected by AMOUNT_TO_LOAD identical on every run and on every machine.
for filename in tqdm(sorted(os.listdir(user_reviews_dir))):
    # Stop once enough non-empty files were loaded (None means "load everything").
    if AMOUNT_TO_LOAD is not None and len(user_review_data) >= AMOUNT_TO_LOAD:
        break
    try:
        user_review_data[filename] = pd.read_csv(
            os.path.join(user_reviews_dir, filename), encoding='unicode_escape')
    except pd.errors.EmptyDataError:
        # An empty CSV carries no reviews: report it and move on. It is not
        # stored, so it does not count towards AMOUNT_TO_LOAD.
        print(f'Empty file: {filename}')

  1%|          | 334/63111 [00:02<05:57, 175.76it/s]

Empty file: 468889434_reviews.csv


  4%|▍         | 2654/63111 [00:15<05:25, 185.63it/s]

Empty file: alinetta_reviews.csv


  8%|▊         | 5002/63111 [00:27<05:21, 180.96it/s]


Now let's split the data into training, validation, and test sets. Since this is a recommender, we split within each user: some of that user's reviews are held out, and the rest stay in the training set.

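For every user with more than 80 rated reviews, we hold out 40 randomly chosen reviews for the validation set and a further 20 for the test set; the remaining reviews go to the training set. Users with 80 or fewer reviews are used for training only.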

In [4]:
# Peek at the first key to see what a user identifier looks like.
print(list(user_review_data)[0])

0001kidd_reviews.csv


In [5]:
# Discard every review row that has no rating (NaN in the 'movie_rating' column).
for key, frame in tqdm(user_review_data.items()):
    user_review_data[key] = frame.dropna(subset=['movie_rating'])

100%|██████████| 5000/5000 [00:04<00:00, 1179.76it/s]


In [6]:
# Hold-out configuration. Only users with more than MIN_REVIEWS_FOR_HOLDOUT rated
# reviews contribute to the validation and test sets; everyone else is train-only.
MIN_REVIEWS_FOR_HOLDOUT = 80
NUM_VALIDATION_REVIEWS = 40
NUM_TEST_REVIEWS = 20
SPLIT_SEED = 42


def _reviews_to_records(reviews_df, user_id):
    """Return the rows of ``reviews_df`` as dicts, each tagged with ``user_id``."""
    records = reviews_df.to_dict('records')
    for record in records:
        record['user_id'] = user_id
    return records


# One seeded generator shared by every sample() call: the split is reproducible
# across runs, yet different users still receive independent draws.
split_rng = np.random.RandomState(SPLIT_SEED)

train_reviews = []
validation_reviews = []
test_reviews = []
for user_id, reviews in tqdm(user_review_data.items()):
    if len(reviews) > MIN_REVIEWS_FOR_HOLDOUT:
        # Hold out the validation reviews first, then draw the test reviews from
        # what is left so the two held-out sets can never overlap.
        validation_df = reviews.sample(NUM_VALIDATION_REVIEWS, replace=False, random_state=split_rng)
        reviews = reviews.drop(validation_df.index)
        test_df = reviews.sample(NUM_TEST_REVIEWS, replace=False, random_state=split_rng)
        reviews = reviews.drop(test_df.index)
        validation_reviews.extend(_reviews_to_records(validation_df, user_id))
        test_reviews.extend(_reviews_to_records(test_df, user_id))
    # Whatever was not held out (all reviews for users at or below the threshold)
    # is used for training.
    train_reviews.extend(_reviews_to_records(reviews, user_id))

print(f'Train reviews: {len(train_reviews)}')
print(f'Validation reviews: {len(validation_reviews)}')
print(f'Test reviews: {len(test_reviews)}')

100%|██████████| 5000/5000 [00:13<00:00, 378.44it/s]

Train reviews: 1748693
Validation reviews: 141560
Test reviews: 70780





## Build the Model
Now that we have the training data, let's construct the model to train.

In [7]:
# Distinct users/movies seen in training, and distinct movies seen in any split.
train_user_ids = {review['user_id'] for review in train_reviews}
train_item_ids = {review['movie_id'] for review in train_reviews}
heldout_item_ids = {review['movie_id'] for review in validation_reviews + test_reviews}

num_train_users = len(train_user_ids)
num_train_items = len(train_item_ids)
num_total_items = len(train_item_ids | heldout_item_ids)
# The graph has one node per training user plus one node per movie from any split.
num_nodes = num_train_users + num_total_items
print(f'Number of train users: {num_train_users}')
print(f'Number of train items: {num_train_items}')
print(f'Number of nodes: {num_nodes}')

Number of train users: 5000
Number of train items: 87735
Number of nodes: 95299


In [8]:
# Distinct users and movies that appear in the validation split.
num_val_users = len({review['user_id'] for review in validation_reviews})
num_val_items = len({review['movie_id'] for review in validation_reviews})
num_val_nodes = num_val_users + num_val_items

# Distinct users and movies that appear in the test split.
num_test_users = len({review['user_id'] for review in test_reviews})
num_test_items = len({review['movie_id'] for review in test_reviews})
num_test_nodes = num_test_users + num_test_items

In [9]:
all_reviews = train_reviews + validation_reviews + test_reviews

# Movie id -> movie title. If an id occurs with several titles, the last one wins.
movie_id_to_movie_name = {review['movie_id']: review['movie_title'] for review in all_reviews}

# Users -> node ids 0 .. num_users - 1.
# Iterating a bare set of strings yields a different order in every Python
# process (string hashing is randomised per process), which would silently give
# users different node ids after a kernel restart and make saved checkpoints
# meaningless. Sorting pins the assignment; key=str keeps the sort well defined
# even if the ids are of mixed types.
user_to_id = {
    user_id: index
    for index, user_id in enumerate(sorted({review['user_id'] for review in all_reviews}, key=str))
}

# Movie node ids start at num_train_users, so user ids must fill exactly the
# range below that offset or the two id ranges would collide.
assert len(user_to_id) == num_train_users, 'user and movie node id ranges would overlap'

# Movies -> node ids num_train_users .. num_train_users + num_movies - 1.
movie_to_id = {
    movie_id: index + num_train_users
    for index, movie_id in enumerate(sorted({review['movie_id'] for review in all_reviews}, key=str))
}

# Node id -> user, and node id -> movie (inverse lookups).
id_to_user = {index: user_id for user_id, index in user_to_id.items()}
id_to_movie = {index: movie_id for movie_id, index in movie_to_id.items()}

# Movie title -> movie id. Titles are not guaranteed to be unique; when two
# movies share a title, the one iterated last wins.
movie_name_to_movie_id = {
    movie_name: movie_id for movie_id, movie_name in movie_id_to_movie_name.items()
}

In [10]:
import random

def convert_review_to_edge(review):
    """Map one review record to a ``(user_node, movie_node)`` edge and its rating.

    Args:
        review (dict): Record with the keys ``'user_id'``, ``'movie_id'`` and
            ``'movie_rating'``.

    Returns:
        tuple: ``((user_node, movie_node), rating)``, or ``(None, None)`` when
        the rating lies strictly between 2.5 and 3.5, i.e. the review is
        treated as neither positive nor negative.
    """
    source = user_to_id[review['user_id']]
    target = movie_to_id[review['movie_id']]
    rating = review['movie_rating']
    # Neutral reviews are dropped from the graph entirely.
    if 2.5 < rating < 3.5:
        return None, None
    return (source, target), rating

def convert_reviews_to_edges(reviews):
    """Convert review records into an edge-index tensor plus a list of ratings.

    Reviews for which ``convert_review_to_edge`` returns ``(None, None)`` are
    skipped.

    Args:
        reviews (list[dict]): Review records accepted by
            ``convert_review_to_edge``.

    Returns:
        tuple: ``(edges, edge_weights)`` where ``edges`` is a ``torch.long``
        tensor of shape ``(2, num_edges)`` (row 0 = user node, row 1 = movie
        node) and ``edge_weights`` is the list of the matching ratings.
    """
    edges = []
    edge_weights = []
    for review in tqdm(reviews):
        edge, edge_weight = convert_review_to_edge(review)
        if edge is not None:
            edges.append(edge)
            edge_weights.append(edge_weight)

    # The explicit reshape pins the layout to (num_edges, 2) before transposing,
    # so an empty edge list produces a (2, 0) tensor instead of a 1-D tensor
    # that would break every later `edges.shape[1]` access.
    edges = torch.tensor(edges, dtype=torch.long).reshape(len(edges), 2).t().contiguous()
    return edges, edge_weights

In [11]:
# Turn every split's reviews into user -> movie edges.
# A user's node id is its value in user_to_id; a movie's node id is its value in
# movie_to_id, which is already offset by the number of users.

train_edges, train_edge_weights = convert_reviews_to_edges(train_reviews)
validation_edges, validation_edge_weights = convert_reviews_to_edges(validation_reviews)
test_edges, test_edge_weights = convert_reviews_to_edges(test_reviews)

# Dimension 1 of an edge tensor is the number of edges.
print(f'Train edges: {train_edges.size(1)}')
print(f'Validation edges: {validation_edges.size(1)}')

100%|██████████| 1748693/1748693 [00:01<00:00, 1106817.88it/s]
100%|██████████| 141560/141560 [00:00<00:00, 1047565.88it/s]
100%|██████████| 70780/70780 [00:00<00:00, 994955.52it/s]

Train edges: 1407492
Validation edges: 116433





In [12]:
import torch_geometric.data as data

# Build one graph per split; all three share the same node id space.
def _ratings_graph(edge_index, ratings):
    """Wrap one split's edges and ratings in a ``Data`` object with ``num_nodes`` nodes."""
    return data.Data(
        edge_index=edge_index,
        edge_attr=torch.tensor(ratings),
        num_nodes=num_nodes,
    )


train_graph = _ratings_graph(train_edges, train_edge_weights)
validation_graph = _ratings_graph(validation_edges, validation_edge_weights)
test_graph = _ratings_graph(test_edges, test_edge_weights)

In [13]:
# Sanity-check every split, including the test graph, so a malformed graph
# fails here rather than somewhere inside training or evaluation.
train_graph.validate(raise_on_error=True)
validation_graph.validate(raise_on_error=True)
test_graph.validate(raise_on_error=True)

True

In [14]:
# Let's create some negative edges
def resample_edges_for_user(user_positive_edges, user_negative_edges):
    """Resample one user's negative edges so there are 3 per positive edge.

    If the user has more negatives than needed, a random subset is kept
    (without replacement). If the user has too few, additional negatives are
    created by pairing the user with movie nodes drawn uniformly from
    ``[num_train_users, num_nodes)``. Sampled movies are not checked against
    the user's positive edges.

    Reads the notebook globals ``num_train_users`` and ``num_nodes``.

    Args:
        user_positive_edges (torch.Tensor): ``(2, P)`` positive edges of a
            single user (row 0 = user node, row 1 = movie node).
        user_negative_edges (torch.Tensor): ``(2, N)`` negative edges of the
            same user.

    Returns:
        tuple: ``(user_positive_edges, user_negative_edges)`` where the
        positives are returned unchanged and the negatives have ``3 * P``
        columns.
    """
    num_target_negatives = user_positive_edges.shape[1] * 3
    num_current_negatives = user_negative_edges.shape[1]
    num_negative_edges_to_add = num_target_negatives - num_current_negatives

    if num_negative_edges_to_add <= 0:
        # Too many negatives: keep a random subset. randperm draws without
        # replacement, so no negative edge is duplicated while others are lost.
        negative_edges_to_keep = torch.randperm(
            num_current_negatives, device=user_negative_edges.device)[:num_target_negatives]
        user_negative_edges = user_negative_edges[:, negative_edges_to_keep]
    else:
        # This branch needs at least one positive edge (3 * P > N >= 0), so the
        # user id can be read from the edges instead of a leaked global.
        user_id = int(user_positive_edges[0, 0])
        # Movie node ids span [num_train_users, num_nodes).
        sampled_items = torch.randint(
            num_train_users, num_nodes, (num_negative_edges_to_add,), dtype=torch.long)
        negative_edges_to_add = torch.stack(
            [torch.full_like(sampled_items, user_id), sampled_items]
        ).to(user_negative_edges.device)
        user_negative_edges = torch.cat([user_negative_edges, negative_edges_to_add], dim=1)
    return user_positive_edges, user_negative_edges
        

In [15]:
# let's compute ndcg
def compute_ndcg_at_k(relevances, k=5):
    """Compute NDCG@k for a ranked list of relevance scores.

    Args:
        relevances (iterable): Relevance of each recommended item, in the
            order the items were ranked.
        k (int, optional): Only the first ``k`` positions contribute.
            (default: 5)

    Returns:
        float: DCG of the given order divided by the DCG of the ideal
        (descending) order, or ``0.0`` when the ideal DCG is zero (empty
        input or no relevant item at all).
    """
    relevances = list(relevances)

    def _dcg_at_k(ordered):
        # Position i (0-based) is discounted by log2(i + 2).
        total = 0.0
        for position, relevance in enumerate(ordered):
            if position == k:
                break
            total += relevance / np.log2(position + 2)
        return total

    dcg = _dcg_at_k(relevances)
    idcg = _dcg_at_k(sorted(relevances, reverse=True))
    # Without this guard an empty or all-zero list divides by zero.
    if idcg == 0:
        return 0.0
    return dcg / idcg

In [16]:
def get_user_positive_items(edge_index):
    """Group the target nodes of an edge list by their source node.

    Args:
        edge_index (torch.Tensor): 2 by N tensor of edges; row 0 holds the
            users and row 1 the items.

    Returns:
        dict: maps each user to the list of its items, in edge order.
    """
    sources = edge_index[0].tolist()
    targets = edge_index[1].tolist()
    user_pos_items = {}
    for user, item in zip(sources, targets):
        user_pos_items.setdefault(user, []).append(item)
    return user_pos_items

In [17]:
import time
def compute_recall_at_k(validation_graph, model, K):
    """Estimate the mean recall@K of ``model`` on a sample of validation users.

    Up to 200 distinct users with at least one positive validation edge
    (rating strictly above 3.5) are sampled. For each of them the model ranks
    all movie nodes, movies the user already has a training edge to are
    removed, and the top ``K`` remaining movies are compared with the user's
    positive validation movies.

    Embeddings are propagated over the training graph
    (``train_graph.edge_index``), which is what the ``edge_index`` argument of
    ``model.recommend`` is documented to be.

    Reads the notebook globals ``train_graph``, ``device``,
    ``num_train_users``, ``id_to_user``, ``id_to_movie`` and
    ``movie_id_to_movie_name``.

    Args:
        validation_graph: Graph whose ``edge_index`` holds the held-out
            user -> movie edges, ``edge_attr`` their ratings and
            ``num_nodes`` the size of the shared node id space.
        model: Model exposing ``recommend(edge_index, src_index, dst_index, k)``
            as well as ``training``, ``eval()`` and ``train(mode)``.
        K (int): Number of recommendations scored per user.

    Returns:
        float: Recall@K averaged over the users that received ``K``
        recommendations; ``0.0`` if no user could be evaluated.
    """
    max_users_to_evaluate = 200

    # A validation edge counts as positive when its rating is strictly above 3.5.
    positive_edges = validation_graph.edge_index[:, validation_graph.edge_attr > 3.5]
    user_pos_items = get_user_positive_items(positive_edges)

    users = positive_edges[0].unique()
    if users.numel() == 0:
        return 0.0
    # randperm samples without replacement, so no user is evaluated twice.
    users = users[torch.randperm(users.shape[0], device=users.device)[:max_users_to_evaluate]]
    user_ids = users.tolist()

    # Positive validation movies of each sampled user.
    truth_items = [set(user_pos_items[user_id]) for user_id in user_ids]

    # Show the first user and their held-out movies as a qualitative check.
    print(f'User: {id_to_user[user_ids[0]]}')
    print([movie_id_to_movie_name[id_to_movie[item]] for item in truth_items[0]])

    # Movies each user already interacted with in training; these must not be
    # recommended again.
    training_edges = train_graph.edge_index
    seen_items = [
        training_edges[1, training_edges[0] == user_id].to(device) for user_id in user_ids
    ]

    # Movie node ids occupy [num_train_users, num_nodes).
    candidate_items = torch.arange(num_train_users, validation_graph.num_nodes, device=device)
    # Fetching K + (largest number of seen movies) guarantees K unseen movies
    # remain for every user whenever that many candidates exist.
    num_to_fetch = min(K + max(len(items) for items in seen_items), candidate_items.numel())

    print("Computing recommendations for {} users".format(len(user_ids)))
    # Evaluate deterministically: switch dropout off and skip gradient tracking,
    # then restore whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            ranked_items = model.recommend(
                training_edges.to(device),
                src_index=users.to(device),
                dst_index=candidate_items,
                k=num_to_fetch,
            )
    finally:
        model.train(was_training)

    total_recall = 0.0
    num_evaluated = 0
    for row, user_id in enumerate(user_ids):
        recommendations = ranked_items[row]
        # Remove all the recommendations that are in the training set.
        recommendations = recommendations[~torch.isin(recommendations, seen_items[row])][:K]
        if len(recommendations) < K:
            print("Not enough recommendations for user {}".format(user_id))
            continue
        recommended = recommendations.tolist()
        if row == 0:
            print([
                movie_id_to_movie_name[id_to_movie[item]]
                for item in recommended
                if id_to_movie.get(item) in movie_id_to_movie_name
            ])
        total_recall += len(set(recommended) & truth_items[row]) / len(truth_items[row])
        num_evaluated += 1

    # Average only over users that were actually scored.
    return total_recall / num_evaluated if num_evaluated else 0.0



In [18]:
from typing import Optional, Union

import torch
import torch.nn.functional as F
from torch import Tensor
from torch.nn import Embedding, ModuleList, Linear
from torch.nn.modules.loss import _Loss

from torch_geometric.nn.conv import LGConv, GATv2Conv
from torch_geometric.typing import Adj, OptTensor, SparseTensor

In [19]:
class CustomLightGCN(torch.nn.Module):
    r"""A LightGCN-style recommender whose propagation layers use attention.

    The interface mirrors :class:`torch_geometric.nn.models.LightGCN` from the
    `"LightGCN: Simplifying and Powering Graph Convolution Network for
    Recommendation" <https://arxiv.org/abs/2002.02126>`_ paper, but each
    propagation step is a :class:`~torch_geometric.nn.conv.GATv2Conv` with two
    attention heads followed by a :class:`~torch.nn.Linear` projection back to
    :obj:`embedding_dim`, rather than a
    :class:`~torch_geometric.nn.conv.LGConv`.

    The final embedding of a node is the weighted sum of the embeddings
    produced at all layers

    .. math::
        \textbf{x}_i = \sum_{l=0}^{L} \alpha_l \textbf{x}^{(l)}_i,

    where :math:`\textbf{x}^{(0)}` is the learnable embedding table.

    Two prediction heads and training objectives are provided:
    **link prediction** (via :meth:`link_pred_loss` and :meth:`predict_link`)
    and **recommendation** (via :meth:`recommendation_loss` and
    :meth:`recommend`).

    .. note::
        Embeddings are propagated according to the graph connectivity specified
        by :obj:`edge_index` while rankings or link probabilities are computed
        according to the edges specified by :obj:`edge_label_index`.

    Args:
        num_nodes (int): The number of nodes in the graph.
        embedding_dim (int): The dimensionality of node embeddings.
        num_layers (int): The number of propagation layers.
        alpha (float or torch.Tensor, optional): The scalar or vector
            specifying the re-weighting coefficients for aggregating the final
            embedding. If set to :obj:`None`, the uniform initialization of
            :obj:`1 / (num_layers + 1)` is used. (default: :obj:`None`)
        **kwargs (optional): Accepted for signature compatibility with
            :class:`~torch_geometric.nn.models.LightGCN` and ignored; they are
            not forwarded to the attention layers.
    """
    def __init__(
        self,
        num_nodes: int,
        embedding_dim: int,
        num_layers: int,
        alpha: Optional[Union[float, Tensor]] = None,
        **kwargs,
    ):
        super().__init__()

        self.num_nodes = num_nodes
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        if alpha is None:
            alpha = 1. / (num_layers + 1)

        if isinstance(alpha, Tensor):
            assert alpha.size(0) == num_layers + 1
        else:
            alpha = torch.tensor([alpha] * (num_layers + 1))
        # A buffer, not a parameter: alpha follows the module across devices
        # but is never updated by the optimizer.
        self.register_buffer('alpha', alpha)

        self.embedding = Embedding(num_nodes, embedding_dim)
        self.num_heads = 2
        # Each attention layer emits num_heads * embedding_dim features per node;
        # the matching Linear layer projects them back to embedding_dim.
        self.convs = ModuleList([GATv2Conv(embedding_dim, embedding_dim, heads=self.num_heads, dropout=0.5) for _ in range(num_layers)])
        self.linears = ModuleList([Linear(embedding_dim * self.num_heads, embedding_dim) for _ in range(num_layers)])
        self.reset_parameters()

    def reset_parameters(self):
        r"""Resets all learnable parameters of the module."""
        torch.nn.init.xavier_uniform_(self.embedding.weight)
        for conv in self.convs:
            conv.reset_parameters()
        # The projection layers are learnable as well and must be reset too.
        for linear in self.linears:
            linear.reset_parameters()

    def get_embedding(self, edge_index: Adj) -> Tensor:
        r"""Returns the embedding of nodes in the graph.

        Args:
            edge_index (torch.Tensor or SparseTensor): Edge tensor specifying
                the connectivity over which embeddings are propagated.
        """
        x = self.embedding.weight
        out = x * self.alpha[0]

        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            x = self.linears[i](x.view(-1, self.embedding_dim * self.num_heads))
            out = out + x * self.alpha[i + 1]

        return out

    def forward(self, edge_index: Adj,
                edge_label_index: OptTensor = None) -> Tensor:
        r"""Computes rankings for pairs of nodes.

        Args:
            edge_index (torch.Tensor or SparseTensor): Edge tensor specifying
                the connectivity of the graph.
            edge_label_index (torch.Tensor, optional): Edge tensor specifying
                the node pairs for which to compute rankings or probabilities.
                If :obj:`edge_label_index` is set to :obj:`None`, all edges in
                :obj:`edge_index` will be used instead. (default: :obj:`None`)

        Returns:
            torch.Tensor: One score per node pair, the dot product of the two
            node embeddings.
        """
        if edge_label_index is None:
            if isinstance(edge_index, SparseTensor):
                edge_label_index = torch.stack(edge_index.coo()[:2], dim=0)
            else:
                edge_label_index = edge_index

        out = self.get_embedding(edge_index)

        out_src = out[edge_label_index[0]]
        out_dst = out[edge_label_index[1]]
        return (out_src * out_dst).sum(dim=-1)

    def predict_link(self, edge_index: Adj, edge_label_index: OptTensor = None,
                     prob: bool = False) -> Tensor:
        r"""Predict links between nodes specified in :obj:`edge_label_index`.

        Args:
            edge_index (torch.Tensor or SparseTensor): Edge tensor specifying
                the connectivity of the graph.
            edge_label_index (torch.Tensor, optional): Node pairs to score;
                see :meth:`forward`. (default: :obj:`None`)
            prob (bool, optional): Whether probabilities should be returned.
                (default: :obj:`False`)
        """
        pred = self(edge_index, edge_label_index).sigmoid()
        return pred if prob else pred.round()

    def recommend(self, edge_index: Adj, src_index: OptTensor = None,
                  dst_index: OptTensor = None, k: int = 1) -> Tensor:
        r"""Get top-:math:`k` recommendations for nodes in :obj:`src_index`.

        Args:
            edge_index (torch.Tensor or SparseTensor): Edge tensor specifying
                the connectivity over which embeddings are propagated.
            src_index (torch.Tensor, optional): Node indices for which
                recommendations should be generated.
                If set to :obj:`None`, all nodes will be used.
                (default: :obj:`None`)
            dst_index (torch.Tensor, optional): Node indices which represent
                the possible recommendation choices.
                If set to :obj:`None`, all nodes will be used.
                (default: :obj:`None`)
            k (int, optional): Number of recommendations. Values larger than
                the number of candidates are clamped to it.
                (default: :obj:`1`)

        Returns:
            torch.Tensor: Node indices of the top-ranked candidates, one row
            per source node.
        """
        out_src = out_dst = self.get_embedding(edge_index)

        if src_index is not None:
            out_src = out_src[src_index]

        if dst_index is not None:
            out_dst = out_dst[dst_index]

        pred = out_src @ out_dst.t()
        # topk raises when k exceeds the number of candidates, so clamp it.
        top_index = pred.topk(min(k, pred.size(-1)), dim=-1).indices

        if dst_index is not None:  # Map local top-indices to original indices.
            top_index = dst_index[top_index.view(-1)].view(*top_index.size())

        return top_index

    def link_pred_loss(self, pred: Tensor, edge_label: Tensor,
                       **kwargs) -> Tensor:
        r"""Computes the model loss for a link prediction objective via the
        :class:`torch.nn.BCEWithLogitsLoss`.

        Args:
            pred (torch.Tensor): The predictions.
            edge_label (torch.Tensor): The ground-truth edge labels.
            **kwargs (optional): Additional arguments of the underlying
                :class:`torch.nn.BCEWithLogitsLoss` loss function.
        """
        loss_fn = torch.nn.BCEWithLogitsLoss(**kwargs)
        return loss_fn(pred, edge_label.to(pred.dtype))

    def recommendation_loss(self, pos_edge_rank: Tensor, neg_edge_rank: Tensor,
                            lambda_reg: float = 1e-4, **kwargs) -> Tensor:
        r"""Computes the model loss for a ranking objective via the Bayesian
        Personalized Ranking (BPR) loss.

        .. note::
            The i-th entry in the :obj:`pos_edge_rank` vector and i-th entry
            in the :obj:`neg_edge_rank` entry must correspond to ranks of
            positive and negative edges of the same entity (*e.g.*, user).

        Args:
            pos_edge_rank (torch.Tensor): Positive edge rankings.
            neg_edge_rank (torch.Tensor): Negative edge rankings.
            lambda_reg (float, optional): The :math:`L_2` regularization
                strength of the Bayesian Personalized Ranking (BPR) loss.
                (default: :obj:`1e-4`)
            **kwargs (optional): Additional arguments passed to the
                ``BPRLoss`` constructor.
        """
        # BPRLoss is looked up when this method runs, so the most recently
        # executed definition of that class in the notebook is the one used.
        loss_fn = BPRLoss(lambda_reg, **kwargs)
        return loss_fn(pos_edge_rank, neg_edge_rank, self.embedding.weight)

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}({self.num_nodes}, '
                f'{self.embedding_dim}, num_layers={self.num_layers})')


class BPRLoss(_Loss):
    r"""The Bayesian Personalized Ranking (BPR) loss.

    The BPR loss is a pairwise loss that encourages the prediction of an
    observed entry to be higher than its unobserved counterparts
    (see `here <https://arxiv.org/abs/2002.02126>`__).

    .. math::
        L_{\text{BPR}} = - \sum_{u=1}^{M} \sum_{i \in \mathcal{N}_u}
        \sum_{j \not\in \mathcal{N}_u} \ln \sigma(\hat{y}_{ui} - \hat{y}_{uj})
        + \lambda \vert\vert \textbf{x}^{(0)} \vert\vert^2

    where :math:`\lambda` controls the :math:`L_2` regularization strength.
    We compute the mean BPR loss for simplicity.

    Args:
        lambda_reg (float, optional): The :math:`L_2` regularization strength
            (default: 0).
        **kwargs (optional): Additional arguments of the underlying
            :class:`torch.nn.modules.loss._Loss` class.
    """
    __constants__ = ['lambda_reg']
    lambda_reg: float

    def __init__(self, lambda_reg: float = 0, **kwargs):
        super().__init__(None, None, "sum", **kwargs)
        self.lambda_reg = lambda_reg

    def forward(self, positives: Tensor, negatives: Tensor,
                parameters: Tensor = None) -> Tensor:
        r"""Compute the mean Bayesian Personalized Ranking (BPR) loss.

        .. note::
            The i-th entry in the :obj:`positives` vector and i-th entry
            in the :obj:`negatives` entry should correspond to the same
            entity (*e.g.*, user), as the BPR is a personalized ranking loss.

        Args:
            positives (Tensor): The vector of positive-pair rankings.
            negatives (Tensor): The vector of negative-pair rankings.
            parameters (Tensor, optional): The tensor of parameters which
                should be used for :math:`L_2` regularization. Required when
                :obj:`lambda_reg` is non-zero. (default: :obj:`None`)

        Returns:
            Tensor: Scalar loss, the mean negative log-likelihood over the
            pairs plus the regularization term divided by the number of pairs.
        """
        n_pairs = positives.size(0)
        # .mean() already averages over the pairs, so this term must not be
        # divided by n_pairs again.
        log_prob = F.logsigmoid(positives - negatives).mean()
        regularization = 0

        if self.lambda_reg != 0:
            regularization = self.lambda_reg * parameters.norm(p=2).pow(2)
            # Only the (summed) regularization term still needs per-pair scaling.
            regularization = regularization / n_pairs

        return -log_prob + regularization

In [20]:
""" This is verbatim from https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/models/lightgcn.html. """
class BPRLoss(_Loss):
    """The Bayesian Personalized Ranking (BPR) loss.

    Args:
        lambda_reg (float, optional): The L2 regularization strength
            (default: 0, i.e. no regularization).
        **kwargs (optional): Additional arguments of the underlying
            ``torch.nn.modules.loss._Loss`` class.
    """
    __constants__ = ['lambda_reg']
    lambda_reg: float

    def __init__(self, lambda_reg: float = 0, **kwargs):
        super().__init__(None, None, "sum", **kwargs)
        # Honour the caller's regularization strength; pass lambda_reg=0 to
        # switch regularization off.
        self.lambda_reg = lambda_reg

    def forward(self, positives: Tensor, negatives: Tensor,
                parameters: Tensor = None) -> Tensor:
        """Compute the mean Bayesian Personalized Ranking (BPR) loss.

        Args:
            positives (Tensor): The vector of positive-pair rankings.
            negatives (Tensor): The vector of negative-pair rankings.
            parameters (Tensor, optional): The tensor of parameters which
                should be used for :math:`L_2` regularization. Required when
                ``lambda_reg`` is non-zero. (default: :obj:`None`)

        Returns:
            Tensor: Scalar loss, the mean negative log-likelihood over the
            pairs plus the regularization term divided by the number of pairs.
        """
        n_pairs = positives.size(0)
        # .mean() already averages over the pairs, so this term must not be
        # divided by n_pairs again.
        log_prob = F.logsigmoid(positives - negatives).mean()
        regularization = 0

        if self.lambda_reg != 0:
            regularization = self.lambda_reg * parameters.norm(p=2).pow(2)
            # Only the (summed) regularization term still needs per-pair scaling.
            regularization = regularization / n_pairs

        return -log_prob + regularization

In [21]:
def resample_hard_negative_edges_for_user(user_positive_edges, user_negative_edges, model, num_train_items, k, num_users=None):
    """Pick one random positive edge and the ``k`` hardest negative edges of a user.

    Every item node (ids ``num_users .. num_train_items - 1``) is scored for
    the user with the current model. The user's positive items are removed
    from the candidates and the ``k`` highest-scored remaining items are
    returned as "hard" negatives.

    Args:
        user_positive_edges (Tensor): ``2 x P`` edges of a single user; row 0
            holds the user node id, row 1 the item node ids. Must be non-empty.
        user_negative_edges (Tensor): The user's observed negative edges.
            Accepted for call compatibility; not used by the sampling.
        model: Module that maps a ``2 x E`` edge tensor to ``E`` ranking
            scores when called.
        num_train_items (int): Exclusive upper bound of the item node ids.
        k (int): Number of hard negatives requested; clamped to the number of
            available candidates (and to ``0`` from below).
        num_users (int, optional): First item node id (items are numbered
            after the users). Falls back to the notebook-level
            ``num_train_users`` when omitted.

    Returns:
        tuple[Tensor, Tensor]: ``(positive_edge, negative_edges)`` where
        ``positive_edge`` is ``2 x 1`` (on the device of
        ``user_positive_edges``) and ``negative_edges`` is ``2 x k`` of dtype
        long on the model's device, both expressed in node ids.

    Raises:
        ValueError: If ``user_positive_edges`` contains no edge.
    """
    if user_positive_edges.shape[1] == 0:
        raise ValueError("user_positive_edges must contain at least one edge")
    if num_users is None:
        # Notebook-level global; item node ids start right after the users.
        num_users = num_train_users

    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else user_positive_edges.device

    # All edges passed in belong to one user, so the id can be read from them
    # instead of relying on a global loop variable.
    user = int(user_positive_edges[0, 0])

    # randomly select a positive edge
    positive_edge = user_positive_edges[:, torch.randint(0, user_positive_edges.shape[1], (1,))]

    # Score every item for this user; position i corresponds to node id num_users + i.
    item_ids = torch.arange(num_users, num_train_items, dtype=torch.long, device=device)
    all_edges = torch.stack([torch.full_like(item_ids, user), item_ids])
    with torch.no_grad():
        user_rankings = model(all_edges)

    # Candidate mask over ranking positions: start with every item and knock
    # out the user's positives (converted from node ids to positions).
    candidate_mask = torch.ones(item_ids.numel(), dtype=torch.bool, device=device)
    positive_positions = user_positive_edges[1, :].to(device) - num_users
    in_range = (positive_positions >= 0) & (positive_positions < item_ids.numel())
    candidate_mask[positive_positions[in_range]] = False

    candidate_items = item_ids[candidate_mask]
    candidate_rankings = user_rankings[candidate_mask]

    # torch.topk raises when k exceeds the number of candidates.
    k = max(0, min(int(k), candidate_items.numel()))
    _, topk_positions = torch.topk(candidate_rankings, k)
    negative_items = candidate_items[topk_positions]

    # create the new negative edges (node ids, same layout as the input edges)
    negative_edges_to_add = torch.stack([torch.full_like(negative_items, user), negative_items])
    return positive_edge, negative_edges_to_add


In [24]:
    import math
    import numpy as np
    import matplotlib.pyplot as plt
    from torch.utils.tensorboard import SummaryWriter

    # ---- Hyperparameters --------------------------------------------------
    EMBEDDING_DIM = 32
    NUM_LAYERS = 1
    BATCH_SIZE = 16
    LR = 5e-4
    REG = 1e-3
    K = 20  # cut-off used for the NDCG / recall metrics
    LOAD_CHECKPOINT = False

    # ---- Model ------------------------------------------------------------
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = CustomLightGCN(num_nodes=num_nodes, embedding_dim=EMBEDDING_DIM, num_layers=NUM_LAYERS, normalize=True)
    model = model.to(device)
    if LOAD_CHECKPOINT:
        # Resume from a previously saved state dict (file name encodes the run it came from).
        checkpoint_state = torch.load(f'models/{EMBEDDING_DIM}_{NUM_LAYERS}_{1024}_{1e-3}_{num_train_users}_{143295}.pt', map_location=device)
        model.load_state_dict(checkpoint_state)

    print("Running on device: {}".format(device))
    print(EMBEDDING_DIM)

    # ---- Optimizer and learning-rate schedule -------------------------------
    optim = torch.optim.Adam(model.parameters(), lr=LR)
    # Alternatives that were tried:
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optim, milestones=[100, 200, 300, 400], gamma=0.5)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optim, T_0=100)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.95)

    # ---- Data -------------------------------------------------------------
    # Edges are split by their edge attribute: >= 3.5 counts as positive,
    # <= 2.5 as negative; anything in between is left out.
    is_positive_edge = train_graph.edge_attr >= 3.5
    is_negative_edge = train_graph.edge_attr <= 2.5
    train_positive_edges = train_graph.edge_index[:, is_positive_edge]
    train_negative_edges = train_graph.edge_index[:, is_negative_edge]

    validation_df = pd.DataFrame.from_dict(validation_reviews)

    # ---- Logging ------------------------------------------------------------
    writer = SummaryWriter(comment=f'LightGCN_{EMBEDDING_DIM}_layers_{NUM_LAYERS}_batch_size_{BATCH_SIZE}_lr_{LR}_num_train_users_{num_train_users}_num_train_items_{num_train_items}_recall_{K}')

    # Training loop with periodic validation.
    #
    # Relies on names created earlier in the notebook: model, optim, scheduler,
    # writer, device, the train/validation edge tensors, the id lookup tables
    # and the metric helpers (compute_ndcg_at_k, compute_recall_at_k).
    NEGATIVES_PER_USER = 5       # observed negative edges sampled per user
    VAL_ROUNDS = 3               # validation-loss estimates averaged per evaluation
    VAL_USERS_PER_ROUND = 16     # validation users sampled per evaluation
    EVAL_EVERY_N_BATCHES = 100   # evaluate whenever batch index % this == 0

    # ceil: the last (partial) batch of an epoch gets its own step index, so
    # logging steps of consecutive epochs never collide.
    batches_per_epoch = math.ceil(num_train_users / BATCH_SIZE)
    # plt.savefig / np.savetxt below write into this folder.
    os.makedirs("hist_NDCG", exist_ok=True)

    # Loop-invariant split of the validation edges (same thresholds as training).
    validation_positive_edges = validation_graph.edge_index[:, validation_graph.edge_attr >= 3.5]
    validation_negative_edges = validation_graph.edge_index[:, validation_graph.edge_attr <= 2.5]

    for epoch in range(10001):
        # we are using BPR so we go by user
        # We'll proceed in batches of users
        for start_idx in tqdm(range(0, num_train_users, BATCH_SIZE)):
            batch_idx = start_idx // BATCH_SIZE
            global_step = epoch * batches_per_epoch + batch_idx

            model.train()
            batch_positive_rankings = []
            batch_negative_rankings = []
            # Users are drawn at random for every step; start_idx only counts steps.
            users_in_batch = torch.randperm(num_train_users)[:BATCH_SIZE]
            for user_id in users_in_batch:
                user_positive_edges = train_positive_edges[:, train_positive_edges[0] == user_id]
                user_negative_edges = train_negative_edges[:, train_negative_edges[0] == user_id]
                # BPR needs at least one positive and one negative per user.
                if (user_positive_edges.shape[1] == 0 or user_negative_edges.shape[1] == 0):
                    continue
                # observed negatives, sampled with replacement
                first_negative_edges = user_negative_edges[:, torch.randint(0, user_negative_edges.shape[1], (NEGATIVES_PER_USER,))].to(device)
                # NOTE(review): the epoch index is passed as the number of hard
                # negatives (0 in the first epoch, growing by one per epoch) -
                # confirm this schedule is intended.
                positive_edge, negative_edges = resample_hard_negative_edges_for_user(user_positive_edges, user_negative_edges, model, num_train_items, epoch)
                negative_edges = torch.cat((negative_edges.to(device), first_negative_edges), dim=1)
                positive_edge = positive_edge.to(device)
                # column 0 is the positive edge, the remaining columns are negatives
                user_edges = torch.cat((positive_edge, negative_edges), dim=1)
                user_rankings = model(user_edges)
                del user_edges
                torch.cuda.empty_cache()
                # pair the single positive score with every negative score
                batch_positive_rankings.append(user_rankings[0].unsqueeze(0).repeat(negative_edges.shape[1]))
                batch_negative_rankings.append(user_rankings[1:])

            # Skip the update when no user in the batch had both edge kinds;
            # there is nothing to back-propagate in that case.
            if batch_positive_rankings:
                all_positive_rankings = torch.cat(batch_positive_rankings)
                all_negative_rankings = torch.cat(batch_negative_rankings)
                loss = model.recommendation_loss(all_positive_rankings, all_negative_rankings, REG)
                del all_positive_rankings
                del all_negative_rankings
                torch.cuda.empty_cache()
                optim.zero_grad()
                loss.backward()
                optim.step()
                writer.add_scalar("Loss/train", loss.item(), global_step)

            if batch_idx % EVAL_EVERY_N_BATCHES == 0:
                # evaluate the model; no autograd graph is needed for any of this
                model.eval()
                with torch.no_grad():
                    validation_users = list(set([int(x) for x in validation_edges[0, :]]))
                    val_losses = []
                    for _ in range(VAL_ROUNDS):
                        # The first round picks the user sample; later rounds only
                        # reshuffle it, so the rounds differ in the edges sampled
                        # for the loss, not in the users.
                        validation_users = random.sample(validation_users, min(len(validation_users), VAL_USERS_PER_ROUND))
                        # NDCG is recomputed per round; the last round's scores are reported.
                        ndcg_scores = []
                        val_positive_rankings = []
                        val_negative_rankings = []
                        for user in tqdm(validation_users):
                            user_id = id_to_user[user]
                            relevant_reviews = validation_df[validation_df['user_id'] == user_id]
                            user_validation_edges = validation_edges[:, validation_edges[0] == user]
                            user_validation_edges = user_validation_edges.to(device)
                            user_rankings = model(user_validation_edges).cpu()
                            user_validation_edges = user_validation_edges.cpu()
                            # item node ids ordered from highest to lowest predicted ranking
                            edges_sorted = list(user_validation_edges[1, user_rankings.argsort(descending=True)])
                            # use validation_df to get the relevances via the movie_id column and the movie_rating column
                            relevances = []
                            for edge in edges_sorted:
                                movie_id = id_to_movie[int(edge)]
                                if (movie_id in relevant_reviews['movie_id'].values):
                                    relevances.append(relevant_reviews[relevant_reviews['movie_id'] == movie_id]['movie_rating'].values[0])
                                else:
                                    relevances.append(0)
                            # NDCG@K is only defined here for users with at least K
                            # ranked items; other users contribute no score.
                            if (len(relevances) >= K):
                                ndcg = compute_ndcg_at_k(relevances, k=K)
                                if (math.isnan(ndcg)):
                                    # show the offending reviews, but keep NaN out of the statistics
                                    print(relevant_reviews)
                                else:
                                    ndcg_scores.append(ndcg)

                            # validation BPR loss: one positive vs. sampled negatives
                            user_positive_edges = validation_positive_edges[:, validation_positive_edges[0] == user]
                            user_negative_edges = validation_negative_edges[:, validation_negative_edges[0] == user]
                            if (user_positive_edges.shape[1] == 0 or user_negative_edges.shape[1] == 0):
                                continue
                            positive_edge = user_positive_edges[:, torch.randint(0, user_positive_edges.shape[1], (1,))]
                            negative_edges = user_negative_edges[:, torch.randint(0, user_negative_edges.shape[1], (NEGATIVES_PER_USER,))]
                            all_edges = torch.cat([positive_edge, negative_edges], dim=1)
                            all_rankings = model(all_edges.to(device))
                            del all_edges
                            torch.cuda.empty_cache()
                            val_positive_rankings.append(all_rankings[0].unsqueeze(0).repeat(NEGATIVES_PER_USER))
                            val_negative_rankings.append(all_rankings[1:])

                        # calculate the validation loss of this round (if any pairs were collected)
                        if val_positive_rankings:
                            val_loss = model.recommendation_loss(torch.cat(val_positive_rankings), torch.cat(val_negative_rankings), REG)
                            val_losses.append(float(val_loss))
                        del val_positive_rankings
                        del val_negative_rankings
                        torch.cuda.empty_cache()

                    # average over the rounds that produced a loss
                    if val_losses:
                        writer.add_scalar("Loss/val", sum(val_losses) / len(val_losses), global_step)

                    if ndcg_scores:
                        mean_ndcg = float(sum(ndcg_scores) / len(ndcg_scores))
                        print("Standard Deviation: {}".format(np.std(ndcg_scores)))
                        # create a histogram of the ndcg scores, make bins for each 0.1
                        ndcg_array = np.array(ndcg_scores).squeeze()
                        writer.add_histogram("hist_NDCG/val", ndcg_array, epoch)
                        # also make a histogram in matplotlib and save as png
                        plt.hist(ndcg_array, bins=np.arange(0, 1.1, 0.1))
                        plt.suptitle("Validation NDCG Histogram")
                        # write information about the model to the histogram
                        plt.title(f"Model: LightGCN, Embedding Dim: {EMBEDDING_DIM}, Num Layers: {NUM_LAYERS}, Batch Size: {BATCH_SIZE}, LR: {LR}, Num Train Users: {num_train_users}, Num Train Items: {num_train_items}", fontsize=8, wrap=True)
                        plt.xlabel("NDCG")
                        plt.ylabel("Frequency")
                        # save the figure in the hist_NDCG folder, with the title having the model information and the epoch number
                        plt.savefig(f"hist_NDCG/val_{EMBEDDING_DIM}_{NUM_LAYERS}_{BATCH_SIZE}_{LR}_{num_train_users}_{num_train_items}_{epoch}.png")
                        plt.close()
                        # Also save the raw NDCG scores to a csv file, with the model information in the title, and the epoch number
                        np.savetxt(f"hist_NDCG/val_{EMBEDDING_DIM}_{NUM_LAYERS}_{BATCH_SIZE}_{LR}_{num_train_users}_{num_train_items}_{epoch}.csv", np.atleast_1d(ndcg_array), delimiter=",")
                        print(mean_ndcg)
                        writer.add_scalar("NDCG", mean_ndcg, global_step)
                    else:
                        # no sampled user had at least K ranked items
                        mean_ndcg = float("nan")
                        print(mean_ndcg)

                    recall_at_k = compute_recall_at_k(validation_graph, model, K)
                    print(recall_at_k)
                    writer.add_scalar("Recall@K/val", recall_at_k, global_step)
                    print("Epoch: {}, NDCG: {}, Recall@{}: {}".format(epoch, mean_ndcg, K, recall_at_k))

                    # How many of the top-10 recommended titles appear among the
                    # reviews stored for each sampled validation user.
                    average_number_of_matches = 0
                    item_ids = torch.arange(num_train_users, num_train_items, dtype=torch.long)
                    # NOTE(review): dst_index starts one id later than item_ids, so the
                    # first item node is never recommended - confirm whether that is intended.
                    dst_index = torch.arange(num_train_users + 1, num_train_items).to(device)
                    for user_id in validation_users:
                        all_edges = torch.stack([torch.full_like(item_ids, user_id), item_ids])
                        recommendations = model.recommend(all_edges.to(device), src_index=torch.tensor([user_id]).to(device), dst_index=dst_index, k=10)[0].cpu()
                        del all_edges
                        torch.cuda.empty_cache()
                        movie_names = [movie_id_to_movie_name[id_to_movie[int(recommendation)]] for recommendation in recommendations]
                        true_user_reviews = user_review_data[id_to_user[user_id]]
                        reviewed_titles = true_user_reviews['movie_title'].values
                        average_number_of_matches += sum(1 for movie_name in movie_names if movie_name in reviewed_titles)
                    del dst_index
                    average_number_of_matches = average_number_of_matches / max(len(validation_users), 1)
                    print("Average number of matches: {}".format(average_number_of_matches))
                    writer.add_scalar("Average number of matches", average_number_of_matches, global_step)
                    print("=====================================")

        # decay the learning rate once per epoch
        scheduler.step()

Running on device: cuda
32


  1%|▏         | 4/313 [00:15<20:32,  3.99s/it]


KeyboardInterrupt: 

In [23]:
# Save the model weights; the file name encodes the hyperparameters of the run.
# torch.save does not create missing parent folders, so make sure it exists.
os.makedirs("models", exist_ok=True)
torch.save(model.state_dict(), f"models/{EMBEDDING_DIM}_{NUM_LAYERS}_{BATCH_SIZE}_{LR}_{num_train_users}_{num_train_items}.pt")

In [24]:
# Print the learning rate currently set on each optimizer parameter group.
for group in optim.param_groups:
    current_lr = group['lr']
    print(current_lr)

0.00475


In [None]:
# Distinct source node ids found in the first row of the validation edges.
validation_users = list({int(node_id) for node_id in validation_edges[0, :]})
# Display the validation reviews that belong to the user mapped to node id 0.
validation_df[validation_df.user_id == id_to_user[0]]

In [None]:
# Display every validation edge whose source node id is 0.
source_is_node_zero = validation_edges[0] == 0
validation_edges[:, source_is_node_zero]

In [None]:
def get_user_positive_items(edge_index):
    """Group the target items of an edge list by their source user.

    Args:
        edge_index (torch.Tensor): 2 by N list of edges; row 0 holds the
            users and row 1 the items.

    Returns:
        dict: maps each user to the list of its items, in edge order
    """
    sources = edge_index[0].tolist()
    targets = edge_index[1].tolist()

    items_by_user = {}
    for source, target in zip(sources, targets):
        # setdefault creates the user's list the first time the user is seen
        items_by_user.setdefault(source, []).append(target)
    return items_by_user

In [None]:
# Prints a single empty line; the cell has no other effect.
print()