In [23]:
import os
import pickle
import random
import sys
import uuid
from pathlib import Path

import implicit
import lightgbm as lgb
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, random
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# 最大表示列数の指定（ここでは50列を指定）
pd.set_option("display.max_columns", 50)

sys.path.append(os.pardir)
from hydra import compose, initialize

from utils import load_datasets
from utils.embedding import TextEmbedder

with initialize(config_path="../yamls", version_base=None):
    config = compose(config_name="config.yaml")


train_df = pd.read_csv(Path(config.input_path) / "train.csv")
test_df = pd.read_csv(Path(config.input_path) / "test.csv")

sample_submission_df = pd.read_csv(Path(config.input_path) / "sample_submission.csv")
anime_df = pd.read_csv(Path(config.input_path) / "anime.csv")

# 整形
anime_df["genres"] = anime_df["genres"].str.replace(" ", "")

# Merge the train data with the anime meta data
all_df = pd.concat([train_df, test_df])
all_df = all_df.merge(anime_df, on="anime_id", how="left")

In [28]:
import torch
import torch.nn.functional as F
from torch import Tensor
from torch.nn import Embedding, ModuleList
from torch.nn.modules.loss import _Loss
from torch_geometric.data import Data
from torch_geometric.nn.conv import LGConv
from torch_geometric.typing import Adj, OptTensor
from torch_geometric.utils import is_sparse, to_edge_index

device = "cpu"  # "cuda" if torch.cuda.is_available() else "cpu"

# make Data
all_df = pd.concat([train_df[["user_id", "anime_id"]], test_df[["user_id", "anime_id"]]]).reset_index(drop=True)

all_df["user_label"], user_idx = pd.factorize(all_df["user_id"])
all_df["anime_label"], anime_idx = pd.factorize(all_df["anime_id"])
all_df["is_train"] = True
all_df.loc[len(train_df) :, "is_train"] = False
# userとanimeの番号が別になるようにずらす
all_df["anime_label"] += len(user_idx)
num_nodes = len(user_idx) + len(anime_idx)
edges = all_df[["user_label", "anime_label"]].to_numpy()
edge_index = torch.tensor(edges.T, dtype=torch.long).contiguous()
data = Data(num_nodes=num_nodes, edge_index=edge_index).to(device)
data.edge_weight = torch.ones(len(all_df)).contiguous().to(device)

In [29]:
all_df["user_label"]

0            0
1            0
2            0
3            0
4            0
          ... 
254072    1997
254073    1997
254074    1997
254075    1997
254076    1997
Name: user_label, Length: 254077, dtype: int64

In [30]:
from torch_geometric.transforms import RandomLinkSplit

In [31]:
from typing import Optional, Union

import torch
import torch.nn.functional as F
from torch import Tensor
from torch.nn import Embedding, ModuleList
from torch.nn.modules.loss import _Loss
from torch_geometric.nn.conv import LGConv
from torch_geometric.typing import Adj, OptTensor
from torch_geometric.utils import is_sparse, to_edge_index


class LightGCN(torch.nn.Module):
    def __init__(
        self,
        num_nodes: int,
        embedding_dim: int,
        num_layers: int,
        alpha: Optional[Union[float, Tensor]] = None,
        **kwargs,
    ):
        super().__init__()

        self.num_nodes = num_nodes
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        if alpha is None:
            alpha = 1.0 / (num_layers + 1)

        if isinstance(alpha, Tensor):
            assert alpha.size(0) == num_layers + 1
        else:
            alpha = torch.tensor([alpha] * (num_layers + 1))
        self.register_buffer("alpha", alpha)

        self.embedding = Embedding(num_nodes, embedding_dim)
        self.convs = ModuleList([LGConv(**kwargs) for _ in range(num_layers)])

        self.reset_parameters()

    def reset_parameters(self):
        r"""Resets all learnable parameters of the module."""
        torch.nn.init.xavier_uniform_(self.embedding.weight)
        for conv in self.convs:
            conv.reset_parameters()

    def get_embedding(
        self,
        edge_index: Adj,
        edge_weight: OptTensor = None,
    ) -> Tensor:
        r"""Returns the embedding of nodes in the graph."""
        x = self.embedding.weight
        out = x * self.alpha[0]

        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index, edge_weight)
            out = out + x * self.alpha[i + 1]

        return out

    def forward(
        self,
        edge_index: Adj,
        edge_label_index: OptTensor = None,
        edge_weight: OptTensor = None,
    ) -> Tensor:
        r"""Computes rankings for pairs of nodes.

        Args:
            edge_index (torch.Tensor or SparseTensor): Edge tensor specifying
                the connectivity of the graph.
            edge_label_index (torch.Tensor, optional): Edge tensor specifying
                the node pairs for which to compute rankings or probabilities.
                If :obj:`edge_label_index` is set to :obj:`None`, all edges in
                :obj:`edge_index` will be used instead. (default: :obj:`None`)
            edge_weight (torch.Tensor, optional): The weight of each edge in
                :obj:`edge_index`. (default: :obj:`None`)
        """
        if edge_label_index is None:
            if is_sparse(edge_index):
                edge_label_index, _ = to_edge_index(edge_index)
            else:
                edge_label_index = edge_index

        out = self.get_embedding(edge_index, edge_weight)

        out_src = out[edge_label_index[0]]
        out_dst = out[edge_label_index[1]]

        return (out_src * out_dst).sum(dim=-1)

    def predict_link(
        self,
        edge_index: Adj,
        edge_label_index: OptTensor = None,
        edge_weight: OptTensor = None,
        prob: bool = False,
    ) -> Tensor:
        r"""Predict links between nodes specified in :obj:`edge_label_index`.

        Args:
            prob (bool, optional): Whether probabilities should be returned.
                (default: :obj:`False`)
        """
        pred = self(edge_index, edge_label_index, edge_weight).sigmoid()
        return pred if prob else pred.round()

    def recommend(
        self,
        edge_index: Adj,
        edge_weight: OptTensor = None,
        src_index: OptTensor = None,
        dst_index: OptTensor = None,
        k: int = 1,
    ) -> Tensor:
        r"""Get top-:math:`k` recommendations for nodes in :obj:`src_index`.

        Args:
            src_index (torch.Tensor, optional): Node indices for which
                recommendations should be generated.
                If set to :obj:`None`, all nodes will be used.
                (default: :obj:`None`)
            dst_index (torch.Tensor, optional): Node indices which represent
                the possible recommendation choices.
                If set to :obj:`None`, all nodes will be used.
                (default: :obj:`None`)
            k (int, optional): Number of recommendations. (default: :obj:`1`)
        """
        out_src = out_dst = self.get_embedding(edge_index, edge_weight)

        if src_index is not None:
            out_src = out_src[src_index]

        if dst_index is not None:
            out_dst = out_dst[dst_index]

        pred = out_src @ out_dst.t()
        top_index = pred.topk(k, dim=-1).indices

        if dst_index is not None:  # Map local top-indices to original indices.
            top_index = dst_index[top_index.view(-1)].view(*top_index.size())

        return top_index

    def link_pred_loss(self, pred: Tensor, edge_label: Tensor, **kwargs) -> Tensor:
        r"""Computes the model loss for a link prediction objective via the
        :class:`torch.nn.BCEWithLogitsLoss`.

        Args:
            pred (torch.Tensor): The predictions.
            edge_label (torch.Tensor): The ground-truth edge labels.
            **kwargs (optional): Additional arguments of the underlying
                :class:`torch.nn.BCEWithLogitsLoss` loss function.
        """
        loss_fn = torch.nn.BCEWithLogitsLoss(**kwargs)
        return loss_fn(pred, edge_label.to(pred.dtype))

    def recommendation_loss(
        self,
        pos_edge_rank: Tensor,
        neg_edge_rank: Tensor,
        node_id: Optional[Tensor] = None,
        lambda_reg: float = 1e-4,
        **kwargs,
    ) -> Tensor:
        r"""Computes the model loss for a ranking objective via the Bayesian
        Personalized Ranking (BPR) loss.

        .. note::

            The i-th entry in the :obj:`pos_edge_rank` vector and i-th entry
            in the :obj:`neg_edge_rank` entry must correspond to ranks of
            positive and negative edges of the same entity (*e.g.*, user).

        Args:
            pos_edge_rank (torch.Tensor): Positive edge rankings.
            neg_edge_rank (torch.Tensor): Negative edge rankings.
            node_id (torch.Tensor): The indices of the nodes involved for
                deriving a prediction for both positive and negative edges.
                If set to :obj:`None`, all nodes will be used.
            lambda_reg (int, optional): The :math:`L_2` regularization strength
                of the Bayesian Personalized Ranking (BPR) loss.
                (default: :obj:`1e-4`)
            **kwargs (optional): Additional arguments of the underlying
                :class:`torch_geometric.nn.models.lightgcn.BPRLoss` loss
                function.
        """
        loss_fn = BPRLoss(lambda_reg, **kwargs)
        emb = self.embedding.weight
        emb = emb if node_id is None else emb[node_id]
        return loss_fn(pos_edge_rank, neg_edge_rank, emb)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.num_nodes}, " f"{self.embedding_dim}, num_layers={self.num_layers})"


class BPRLoss(_Loss):
    r"""The Bayesian Personalized Ranking (BPR) loss.

    The BPR loss is a pairwise loss that encourages the prediction of an
    observed entry to be higher than its unobserved counterparts
    (see `here <https://arxiv.org/abs/2002.02126>`__).

    .. math::
        L_{\text{BPR}} = - \sum_{u=1}^{M} \sum_{i \in \mathcal{N}_u}
        \sum_{j \not\in \mathcal{N}_u} \ln \sigma(\hat{y}_{ui} - \hat{y}_{uj})
        + \lambda \vert\vert \textbf{x}^{(0)} \vert\vert^2

    where :math:`lambda` controls the :math:`L_2` regularization strength.
    We compute the mean BPR loss for simplicity.

    Args:
        lambda_reg (float, optional): The :math:`L_2` regularization strength
            (default: 0).
        **kwargs (optional): Additional arguments of the underlying
            :class:`torch.nn.modules.loss._Loss` class.
    """
    __constants__ = ["lambda_reg"]
    lambda_reg: float

    def __init__(self, lambda_reg: float = 0, **kwargs):
        super().__init__(None, None, "sum", **kwargs)
        self.lambda_reg = lambda_reg

    def forward(self, positives: Tensor, negatives: Tensor, parameters: Tensor = None) -> Tensor:
        r"""Compute the mean Bayesian Personalized Ranking (BPR) loss.

        .. note::

            The i-th entry in the :obj:`positives` vector and i-th entry
            in the :obj:`negatives` entry should correspond to the same
            entity (*.e.g*, user), as the BPR is a personalized ranking loss.

        Args:
            positives (Tensor): The vector of positive-pair rankings.
            negatives (Tensor): The vector of negative-pair rankings.
            parameters (Tensor, optional): The tensor of parameters which
                should be used for :math:`L_2` regularization
                (default: :obj:`None`).
        """
        log_prob = F.logsigmoid(positives - negatives).mean()

        regularization = 0
        if self.lambda_reg != 0:
            regularization = self.lambda_reg * parameters.norm(p=2).pow(2)
            regularization = regularization / positives.size(0)

        return -log_prob + regularization

In [32]:
transform = RandomLinkSplit(num_val=0, num_test=0, add_negative_train_samples=True, neg_sampling_ratio=1.0)
train_data, _, _ = transform(data)
train_data

Data(edge_index=[2, 254077], num_nodes=3954, edge_weight=[254077], edge_label=[508154], edge_label_index=[2, 508154])

In [33]:
from tqdm.auto import tqdm

oof_pred = np.zeros(len(train_df))
test_preds = []

model = LightGCN(
    num_nodes=data.num_nodes,
    embedding_dim=64,
    num_layers=4,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

for epoch in tqdm(range(1 if config.debug is False else 6)):
    # train
    pred = model.predict_link(
        train_data.edge_index, train_data.edge_label_index, edge_weight=train_data.edge_weight, prob=True
    )
    loss = model.link_pred_loss(pred, train_data["edge_label"])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 100 == 0:
        print(f"epoch {epoch} : loss {loss.item()}")

  0%|          | 0/1 [00:00<?, ?it/s]

epoch 0 : loss 0.7240769863128662


In [34]:
vectors = model.get_embedding(data.edge_index).detach().cpu().numpy()

In [35]:
vectors.shape

(3954, 128)

In [36]:
vectors /= np.linalg.norm(vectors)
vectors.shape

(3954, 128)

0         1998
1         1999
2         2000
3         2001
4         2002
          ... 
254072    2184
254073    3296
254074    2427
254075    3380
254076    3900
Name: anime_label, Length: 254077, dtype: int64

In [49]:
user_factors = vectors[: len(user_idx)]
item_factors = vectors[len(user_idx) :]
embeddings = np.concatenate(
    (user_factors[all_df["user_label"]], item_factors[(all_df["anime_label"] - len(user_idx))]), axis=1
)

In [52]:
embeddings_df = pd.DataFrame(embeddings)
embeddings_df.columns = [f"lightgcn_user_factor_{i}" for i in range(user_factors.shape[1])] + [
    f"lightgcn_item_factor_{j}" for j in range(item_factors.shape[1])
]
embeddings_df

Unnamed: 0,lightgcn_user_factor_0,lightgcn_user_factor_1,lightgcn_user_factor_2,lightgcn_user_factor_3,lightgcn_user_factor_4,lightgcn_user_factor_5,lightgcn_user_factor_6,lightgcn_user_factor_7,lightgcn_user_factor_8,lightgcn_user_factor_9,lightgcn_user_factor_10,lightgcn_user_factor_11,lightgcn_user_factor_12,lightgcn_user_factor_13,lightgcn_user_factor_14,lightgcn_user_factor_15,lightgcn_user_factor_16,lightgcn_user_factor_17,lightgcn_user_factor_18,lightgcn_user_factor_19,lightgcn_user_factor_20,lightgcn_user_factor_21,lightgcn_user_factor_22,lightgcn_user_factor_23,lightgcn_user_factor_24,...,lightgcn_item_factor_103,lightgcn_item_factor_104,lightgcn_item_factor_105,lightgcn_item_factor_106,lightgcn_item_factor_107,lightgcn_item_factor_108,lightgcn_item_factor_109,lightgcn_item_factor_110,lightgcn_item_factor_111,lightgcn_item_factor_112,lightgcn_item_factor_113,lightgcn_item_factor_114,lightgcn_item_factor_115,lightgcn_item_factor_116,lightgcn_item_factor_117,lightgcn_item_factor_118,lightgcn_item_factor_119,lightgcn_item_factor_120,lightgcn_item_factor_121,lightgcn_item_factor_122,lightgcn_item_factor_123,lightgcn_item_factor_124,lightgcn_item_factor_125,lightgcn_item_factor_126,lightgcn_item_factor_127
0,0.000435,-0.000651,0.000225,0.000100,0.001384,0.000728,-0.001824,-0.000837,-0.001304,0.002275,0.000646,-0.000460,0.001808,0.002445,0.000127,0.002651,0.000657,0.001603,0.001061,0.000752,-0.001758,-0.000323,-0.002247,-0.000240,-0.001351,...,0.001564,-0.000987,0.002079,-0.001746,0.000822,0.001040,-0.001981,0.000092,-0.000286,0.002543,0.001307,-0.000524,-0.000107,0.002027,0.000698,-0.002927,0.001641,-0.001697,-0.000899,-0.002484,0.000198,-0.002671,0.001865,0.001271,-0.000394
1,0.000435,-0.000651,0.000225,0.000100,0.001384,0.000728,-0.001824,-0.000837,-0.001304,0.002275,0.000646,-0.000460,0.001808,0.002445,0.000127,0.002651,0.000657,0.001603,0.001061,0.000752,-0.001758,-0.000323,-0.002247,-0.000240,-0.001351,...,-0.001093,-0.001788,0.001001,-0.000982,0.000895,-0.000173,-0.002590,-0.000009,-0.001075,0.002726,-0.000302,0.000289,0.001854,-0.000094,-0.000118,-0.002113,0.000469,0.000201,-0.000862,0.002007,-0.001404,-0.001314,-0.001505,0.000134,-0.000992
2,0.000435,-0.000651,0.000225,0.000100,0.001384,0.000728,-0.001824,-0.000837,-0.001304,0.002275,0.000646,-0.000460,0.001808,0.002445,0.000127,0.002651,0.000657,0.001603,0.001061,0.000752,-0.001758,-0.000323,-0.002247,-0.000240,-0.001351,...,-0.000284,0.001137,-0.001866,0.002506,0.001189,-0.001736,-0.001171,0.001776,-0.002368,-0.000995,-0.002181,0.000659,0.000050,-0.001567,-0.001753,-0.001991,-0.001260,0.002844,0.002419,0.000079,-0.000103,-0.000979,-0.000213,-0.000641,0.001474
3,0.000435,-0.000651,0.000225,0.000100,0.001384,0.000728,-0.001824,-0.000837,-0.001304,0.002275,0.000646,-0.000460,0.001808,0.002445,0.000127,0.002651,0.000657,0.001603,0.001061,0.000752,-0.001758,-0.000323,-0.002247,-0.000240,-0.001351,...,0.001885,0.001839,0.002522,0.001556,-0.000124,-0.000747,0.002635,0.002480,0.001213,0.000747,0.001014,-0.001287,-0.001980,0.000652,-0.002772,0.001051,-0.001898,-0.001861,0.001989,0.001299,0.000614,-0.001084,-0.001275,0.002092,-0.002343
4,0.000435,-0.000651,0.000225,0.000100,0.001384,0.000728,-0.001824,-0.000837,-0.001304,0.002275,0.000646,-0.000460,0.001808,0.002445,0.000127,0.002651,0.000657,0.001603,0.001061,0.000752,-0.001758,-0.000323,-0.002247,-0.000240,-0.001351,...,-0.000688,-0.003166,0.000797,0.000818,0.000013,0.000786,0.000949,0.001538,-0.002706,0.000055,-0.001768,0.001463,0.000096,0.000274,-0.003396,0.002390,-0.000644,0.001989,0.000202,-0.001335,-0.000213,-0.000332,0.001180,0.000483,-0.001489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254072,-0.000028,-0.000307,-0.002189,0.001443,-0.002923,0.002287,-0.001104,-0.000928,-0.000424,0.000713,0.000073,0.000087,-0.001400,0.000382,-0.001500,0.001736,-0.002523,-0.001768,-0.002022,-0.001503,-0.001620,0.000147,-0.000747,0.000891,-0.000942,...,0.000456,0.000574,0.000420,-0.000430,-0.000417,-0.000715,-0.001669,0.001855,-0.000675,0.001284,0.001795,0.000880,-0.001531,0.001650,0.000222,-0.002569,-0.000526,-0.001276,-0.000707,-0.000494,-0.000513,-0.000002,0.001140,-0.001624,-0.000933
254073,-0.000028,-0.000307,-0.002189,0.001443,-0.002923,0.002287,-0.001104,-0.000928,-0.000424,0.000713,0.000073,0.000087,-0.001400,0.000382,-0.001500,0.001736,-0.002523,-0.001768,-0.002022,-0.001503,-0.001620,0.000147,-0.000747,0.000891,-0.000942,...,-0.001350,-0.002021,0.002350,-0.000780,0.001533,0.002706,0.000072,-0.000910,-0.000178,-0.000218,0.000139,-0.000345,-0.000301,-0.001494,-0.000808,-0.001803,-0.000124,0.001243,-0.000776,0.000104,0.000587,0.000102,-0.001031,-0.000239,-0.000211
254074,-0.000028,-0.000307,-0.002189,0.001443,-0.002923,0.002287,-0.001104,-0.000928,-0.000424,0.000713,0.000073,0.000087,-0.001400,0.000382,-0.001500,0.001736,-0.002523,-0.001768,-0.002022,-0.001503,-0.001620,0.000147,-0.000747,0.000891,-0.000942,...,0.002128,0.002762,0.003372,-0.000576,0.001729,0.002266,-0.002173,0.001618,0.000924,-0.001585,0.000699,0.001050,0.000829,0.001467,-0.000244,0.000501,0.000077,0.001617,-0.001484,-0.002601,0.000154,-0.000751,0.000479,-0.002184,-0.000362
254075,-0.000028,-0.000307,-0.002189,0.001443,-0.002923,0.002287,-0.001104,-0.000928,-0.000424,0.000713,0.000073,0.000087,-0.001400,0.000382,-0.001500,0.001736,-0.002523,-0.001768,-0.002022,-0.001503,-0.001620,0.000147,-0.000747,0.000891,-0.000942,...,0.002247,0.001504,0.002171,-0.000527,-0.001229,-0.002396,-0.001218,-0.000423,-0.001500,0.000716,-0.000331,0.000030,0.001440,0.002151,-0.000232,0.000338,-0.000385,0.001085,-0.000968,0.001189,-0.001084,0.001774,0.000378,-0.000237,-0.000142


In [41]:
user_vectors.shape

(1998, 128)

In [43]:
user_vectors[all_df["user_label"]].shape

(254077, 128)

In [22]:
user_vectors.shape

(1956, 64)

In [17]:
all_df["user_label"]

0            0
1            0
2            0
3            0
4            0
          ... 
254072    1997
254073    1997
254074    1997
254075    1997
254076    1997
Name: user_label, Length: 254077, dtype: int64

In [7]:
torch.cuda.empty_cache()

NameError: name 'user_vectors' is not defined