In [1]:
import os
import random
import sys
import uuid
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch_geometric
import wandb
from torch_geometric.data import Data

In [2]:
from hydra import compose, initialize

# Compose the Hydra config named "config.yaml" from the "../yamls" directory.
# `version_base=None` is passed explicitly to `initialize`.
with initialize(config_path="../yamls", version_base=None):
    config = compose(config_name="config.yaml")


def seed_everything(seed=1234):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running kernel is already fixed.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one. Safe without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False


# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Seed all RNGs with the configured seed.
seed_everything(config.seed)
# Bare expression: displays the composed config as the cell output.
config

{'debug': False, 'input_path': '../input/atmaCup15_dataset', 'seed': 7, 'train': {'lr': 0.01, 'embedding_dim': 1024, 'num_layers': 3, 'num_epochs': 1000, 'early_stopping': 20, 'num_folds': 5}, 'lgb': {'feats': {'num': ['duration', 'genres', 'user_num', 'aired'], 'cat': ['categorical_label_encoded_with_user'], 'models': ['000_baseline/8d9ed97c']}, 'target_name': 'score', 'num_folds': 5, 'params': {'objective': 'regression', 'metric': 'rmse', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.05, 'num_leaves': 64, 'min_child_samples': 160, 'max_depth': -1, 'subsample_freq': 0, 'bagging_seed': 0, 'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'max_bin': 200, 'device': 'gpu'}}, 'combine': {'seen': ['100_lgb/f6a2ff9e'], 'unseen': ['201_stratify/8b996136']}}

In [3]:
# Read the competition tables from the configured input directory.
input_dir = Path(config.input_path)
train_df = pd.read_csv(input_dir / "train.csv")
test_df = pd.read_csv(input_dir / "test.csv")
sample_submission_df = pd.read_csv(input_dir / "sample_submission.csv")
anime_df = pd.read_csv(input_dir / "anime.csv")

# Formatting: remove the spaces inside the comma-separated genre strings.
anime_df["genres"] = anime_df["genres"].str.replace(" ", "")

# Stack train on top of test, then attach the anime metadata to every row.
all_df = pd.concat([train_df, test_df]).merge(anime_df, on="anime_id", how="left")

In [4]:
all_df.head()

Unnamed: 0,user_id,anime_id,score,genres,japanese_name,type,episodes,aired,producers,licensors,studios,source,duration,rating,members,watching,completed,on_hold,dropped,plan_to_watch
0,0008e10fb39e55447333,0669cc0219d468761195,2.0,"Action,Adventure,Shounen",ジョジョの奇妙な冒険 黄金の風,TV,39,"Oct 6, 2018 to Jul 28, 2019","Warner Bros. Japan, KlockWorx, Medicos Enterta...",VIZ Media,David Production,Manga,23 min. per ep.,R - 17+ (violence & profanity),542642,64809,383733,10625,5735,77740
1,0008e10fb39e55447333,111adb8835b8a1a2cf54,10.0,"Action,Dementia,Demons,Horror,Supernatural",DEVILMAN crybaby,ONA,10,"Jan 5, 2018","Aniplex, Dynamic Planning, Netflix",Unknown,Science SARU,Manga,25 min. per ep.,R+ - Mild Nudity,650309,29665,477257,13336,18054,111997
2,0008e10fb39e55447333,1fc8683c393432a2f9c7,1.0,"Hentai,Yaoi",ぼくのぴこ,OVA,1,"Sep 7, 2006",Natural High,Unknown,"Sugar Boy, Blue Cat",Original,34 min.,Rx - Hentai,137560,5153,113190,758,9431,9028
3,0008e10fb39e55447333,2290175205d55e81b197,8.0,"Action,SliceofLife,Comedy,Supernatural",モブサイコ100,TV,12,"Jul 11, 2016 to Sep 27, 2016","Warner Bros. Japan, KlockWorx, BS Fuji, Hakuho...","Funimation, Crunchyroll",Bones,Web manga,24 min. per ep.,PG-13 - Teens 13 or older,1255830,68041,942402,26125,19213,200049
4,0008e10fb39e55447333,28f173b60331d5cabb0d,9.0,"Comedy,Shounen,Sports",はじめの一歩 -Champion Road-,Special,1,"Apr 18, 2003",Unknown,"Discotek Media, Geneon Entertainment USA",Madhouse,Manga,1 hr. 30 min.,PG-13 - Teens 13 or older,97346,1565,82189,502,379,12711


In [7]:
%%time
cols = ["producers", "licensors", "studios"]


def sort_and_join(s):
    """Return the comma-separated items of ``s`` trimmed, sorted and re-joined.

    Args:
        s: String of comma-separated items.

    Returns:
        The items, stripped of surrounding whitespace, in ascending string
        order and joined with ``", "``.
    """
    items = [piece.strip() for piece in s.split(",")]
    items.sort()
    return ", ".join(items)


# Canonicalise each multi-valued column so the same set of names always
# produces the same string; missing values become the literal "Undefined".
for col in cols:
    filled = all_df[col].fillna("Undefined")
    all_df[col] = filled.apply(sort_and_join)

CPU times: user 1.52 s, sys: 35.6 ms, total: 1.55 s
Wall time: 1.55 s


In [30]:
%%time
import cuml

# Standardise the six audience-count columns and project them onto 2 principal components.
# NOTE(review): `.head()` keeps only the first 5 rows, so the PCA is fitted on 5 samples —
# this looks like a debugging leftover; confirm before using Z_pca as a feature.
X = all_df[["members", "watching", "completed", "on_hold", "dropped", "plan_to_watch"]].head().astype(float)
# Column-wise z-score.
X = (X - X.mean(axis=0)) / X.std(axis=0)
# A column with zero standard deviation turns into NaN above; replace those with 0.
X = X.fillna(0)

pca = cuml.PCA(n_components=2)
Z_pca = pca.fit_transform(X)

CPU times: user 12.8 ms, sys: 4.05 ms, total: 16.9 ms
Wall time: 14.1 ms


Unnamed: 0,members,watching,completed,on_hold,dropped,plan_to_watch
0,0.012571,0.977793,-0.046072,0.033665,-0.599639,-0.057768
1,0.241801,-0.132055,0.222875,0.290174,0.930575,0.375736
2,-0.849874,-0.906145,-0.824072,-0.899931,-0.140538,-0.927279
3,1.530994,1.07986,1.560492,1.500244,1.074541,1.489984
4,-0.935492,-1.019454,-0.913222,-0.924153,-1.264939,-0.880673


In [12]:
# Imported here so this cell also runs on a fresh kernel: the notebook's other
# import of `partial_ratio` lives in a cell that appears further down.
from rapidfuzz.fuzz import partial_ratio

# Sanity check: two identical strings should score 100.
partial_ratio("KlockWorx, Medicos Entertainment, Shueisha", "KlockWorx, Medicos Entertainment, Shueisha")

100.0

In [9]:
%%time
from rapidfuzz.distance import DamerauLevenshtein, Postfix, Prefix
from rapidfuzz.fuzz import partial_ratio
from rapidfuzz.process import cdist


def calculate_csim(df, aggway="sum", scorer="prefix"):
    """Aggregate pairwise string similarity between all entries of ``df``.

    Builds the full similarity matrix of the strings against themselves,
    zeroes the diagonal (an entry is not compared with itself) and reduces
    each row to a single number.

    Args:
        df: Sequence of strings (e.g. one group of a pandas Series).
        aggway: Row-wise reduction: ``"sum"``, ``"mean"``, ``"var"`` or
            ``"max"``. Any other value returns the un-reduced matrix.
        scorer: ``"partial"`` uses ``partial_ratio`` (rescaled from 0-100 to
            0-1); ``"prefix"`` and ``"postfix"`` use one minus the matching
            normalized distance; any other value falls back to one minus the
            normalized Damerau-Levenshtein distance.

    Returns:
        A 1-D array with one value per entry of ``df``, or the 2-D similarity
        matrix when ``aggway`` is not one of the supported reductions.
    """
    queries = df
    if scorer == "partial":
        # partial_ratio is already a similarity, on a 0-100 scale.
        result = np.asarray(cdist(queries, queries, scorer=partial_ratio)) / 100
    else:
        if scorer == "prefix":
            scorer_fn = Prefix.normalized_distance
        elif scorer == "postfix":
            scorer_fn = Postfix.normalized_distance
        else:
            scorer_fn = DamerauLevenshtein.normalized_distance
        # These scorers return distances, so flip them into similarities.
        result = 1 - np.asarray(cdist(queries, queries, scorer=scorer_fn))
    # An entry's similarity to itself must not contribute to the aggregate.
    np.fill_diagonal(result, 0.0)

    if aggway == "sum":
        result = np.sum(result, axis=1)
    elif aggway == "mean":
        result = np.mean(result, axis=1)
    elif aggway == "var":
        result = np.var(result, axis=1)
    elif aggway == "max":
        result = np.max(result, axis=1)
    return result


# For each row, the per-user "sum" of partial_ratio similarities between that row's
# title and the other titles of the same user.
# NOTE(review): `.head(100)` limits this to the first 100 rows — looks like a debugging
# subset; confirm. Despite its name, the result of `transform` here is a pandas Series
# aligned with those rows, not a dict.
cdist_dict = all_df.head(100).groupby("user_id")["japanese_name"].transform(calculate_csim, "sum", "partial")

CPU times: user 1min 8s, sys: 0 ns, total: 1min 8s
Wall time: 1min 8s


In [67]:
cdist_dict

0    0.554489
1    0.268775
2    0.507936
3    0.000000
4    0.585859
Name: japanese_name, dtype: float32

In [None]:
# Reduce a pre-computed per-group similarity matrix with the requested metric.
def calculate_metric(df, metric):
    """Look up the pre-computed result for a group and aggregate it row-wise.

    NOTE(review): this indexes the module-level ``cdist_dict`` by the group
    key and reduces along ``axis=1``, i.e. it expects a 2-D matrix per group.
    Verify that ``cdist_dict`` really holds that before relying on this cell.

    Args:
        df: Group object passed by ``groupby(...).transform``; only its
            ``name`` attribute is read.
        metric: One of ``"sum"``, ``"mean"``, ``"var"`` or ``"max"``.

    Returns:
        The row-wise aggregate, or ``None`` for an unsupported ``metric``.
    """
    cdist_result = cdist_dict[df.name]
    if metric not in ("sum", "mean", "var", "max"):
        return None
    # Each supported metric name is also the name of the reduction method.
    return getattr(cdist_result, metric)(axis=1)


# Add one `matched_<metric>` column per aggregation.
# NOTE(review): `anime_df` is read from anime.csv and its displayed columns do not include
# `user_id`, so this groupby looks like it would raise a KeyError; presumably `all_df`
# was intended — confirm. This cell has no execution count, so it may never have been run.
for metric in ["sum", "mean", "var", "max"]:
    anime_df[f"matched_{metric}"] = anime_df.groupby("user_id").transform(calculate_metric, metric)

In [12]:
anime_df.head()

Unnamed: 0,anime_id,genres,japanese_name,type,episodes,aired,producers,licensors,studios,source,duration,rating,members,watching,completed,on_hold,dropped,plan_to_watch
0,000ba7f7e34e107e7544,"Comedy, Sci-Fi, Seinen, Slice of Life, Space",宇宙兄弟,TV,99,"Apr 1, 2012 to Mar 22, 2014","Aniplex, Dentsu, YTV, Trinity Sound",Sentai Filmworks,A-1 Pictures,Manga,24 min. per ep.,PG-13 - Teens 13 or older,150428,16552,37234,13009,6948,76685
1,00427279d72064e7fb69,"Adventure, Slice of Life, Mystery, Historical,...",蟲師,TV,26,"Oct 23, 2005 to Jun 19, 2006","Avex Entertainment, Marvelous, SKY Perfect Wel...",Funimation,Artland,Manga,25 min. per ep.,PG-13 - Teens 13 or older,620736,55482,235371,42786,20017,267080
2,00444b67aaabdf740a68,"Adventure, Slice of Life, Mystery, Historical,...",蟲師 続章,TV,10,"Apr 5, 2014 to Jun 21, 2014","Aniplex, Kodansha, Delfi Sound",Aniplex of America,Artland,Manga,24 min. per ep.,PG-13 - Teens 13 or older,226522,12585,113559,6095,2606,91677
3,00839a3507ab168abe75,"Comedy, Ecchi, Fantasy, School",星刻の竜騎士,TV,12,"Apr 5, 2014 to Jun 21, 2014","Media Factory, AT-X, Sony Music Communications...",Funimation,C-Station,Light novel,24 min. per ep.,R+ - Mild Nudity,170220,8723,118202,3753,8034,31508
4,0192331235e110fe4f76,"Comedy, Harem, Romance, Sci-Fi, Shounen, Space",天地無用！,TV,26,"Apr 2, 1995 to Sep 24, 1995","TV Tokyo, Pioneer LDC","Funimation, Geneon Entertainment USA",AIC,Original,23 min. per ep.,PG-13 - Teens 13 or older,62599,2565,39890,2093,1986,16065


Unnamed: 0,user_id,anime_id,user_label,anime_label,is_train
254072,ffe85a36cd20500faa58,f508b02efeac8ecb8cc0,1997,2184,False
254073,ffe85a36cd20500faa58,f5b8ecea3beea4b82d79,1997,3296,False
254074,ffe85a36cd20500faa58,f6c208226b6b69948053,1997,2427,False
254075,ffe85a36cd20500faa58,fe67592c312fc1e17745,1997,3380,False
254076,ffe85a36cd20500faa58,ff73475b68001c5e533d,1997,3900,False


In [7]:
# One edge per interaction row: (user node, anime node).
# NOTE(review): edges are stored in one direction only (user -> anime); check whether
# the graph convolution's normalisation also needs the reverse edges.
edges = all_df[["user_label", "anime_label"]].to_numpy()
edge_index = torch.tensor(edges.T, dtype=torch.long).contiguous()
data = Data(num_nodes=num_nodes, edge_index=edge_index).to(device)
# Create the weights directly on `device`: an attribute assigned after `.to(device)`
# would otherwise stay on the CPU while the rest of `data` lives on the GPU.
data.edge_weight = torch.ones(len(all_df), device=device)

data

Data(edge_index=[2, 254077], num_nodes=3954, edge_weight=[254077])

In [8]:
from typing import Optional, Union

import torch
import torch.nn.functional as F
from torch import Tensor
from torch.nn import Embedding, ModuleList
from torch.nn.modules.loss import _Loss
from torch_geometric.nn.conv import LGConv
from torch_geometric.typing import Adj, OptTensor
from torch_geometric.utils import is_sparse, to_edge_index


class LightGCN(torch.nn.Module):
    """Embedding-propagation model that scores node pairs by dot product.

    Each node has a learnable embedding. The embeddings are propagated through
    ``num_layers`` ``LGConv`` layers and the per-layer outputs (including the
    raw embedding) are combined as a weighted sum using ``alpha``.

    Args:
        num_nodes: Number of nodes, i.e. rows of the embedding table.
        embedding_dim: Size of each node embedding.
        num_layers: Number of ``LGConv`` propagation layers.
        alpha: Weight of each of the ``num_layers + 1`` layer outputs. A scalar
            is repeated for every layer; a tensor must have ``num_layers + 1``
            entries; ``None`` means uniform ``1 / (num_layers + 1)``.
        **kwargs: Forwarded to every ``LGConv``.
    """

    def __init__(
        self,
        num_nodes: int,
        embedding_dim: int,
        num_layers: int,
        alpha: Optional[Union[float, Tensor]] = None,
        **kwargs,
    ):
        super().__init__()

        self.num_nodes = num_nodes
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        # One weight for the raw embedding plus one per propagation layer.
        num_terms = num_layers + 1
        if isinstance(alpha, Tensor):
            assert alpha.size(0) == num_terms
        else:
            fill_value = 1.0 / num_terms if alpha is None else alpha
            alpha = torch.tensor([fill_value] * num_terms)
        self.register_buffer("alpha", alpha)

        self.embedding = Embedding(num_nodes, embedding_dim)
        self.convs = ModuleList([LGConv(**kwargs) for _ in range(num_layers)])

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the embedding table (Xavier uniform) and every conv layer."""
        torch.nn.init.xavier_uniform_(self.embedding.weight)
        for layer in self.convs:
            layer.reset_parameters()

    def get_embedding(
        self,
        edge_index: Adj,
        edge_weight: OptTensor = None,
    ) -> Tensor:
        """Return the alpha-weighted sum of the embeddings from every layer.

        Args:
            edge_index: Graph connectivity used for propagation.
            edge_weight: Optional per-edge weights passed to each conv layer.
        """
        hidden = self.embedding.weight
        combined = hidden * self.alpha[0]

        for layer, layer_weight in zip(self.convs, self.alpha[1:]):
            hidden = layer(hidden, edge_index, edge_weight)
            combined = combined + hidden * layer_weight

        return combined

    def forward(
        self,
        edge_index: Adj,
        edge_label_index: OptTensor = None,
        edge_weight: OptTensor = None,
    ) -> Tensor:
        """Score node pairs as the dot product of their final embeddings.

        Args:
            edge_index: Graph connectivity used for propagation (dense edge
                tensor or sparse adjacency).
            edge_label_index: Node pairs to score. When ``None``, the edges of
                ``edge_index`` themselves are scored.
            edge_weight: Optional per-edge weights for ``edge_index``.

        Returns:
            One score per pair in ``edge_label_index``.
        """
        if edge_label_index is None:
            # Sparse adjacency inputs are first converted to an edge list.
            edge_label_index = to_edge_index(edge_index)[0] if is_sparse(edge_index) else edge_index

        node_emb = self.get_embedding(edge_index, edge_weight)

        src_emb = node_emb[edge_label_index[0]]
        dst_emb = node_emb[edge_label_index[1]]

        return (src_emb * dst_emb).sum(dim=-1)

In [22]:
import os

import torch
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm


def k_fold(num_fold, all_df, random_state=12345):
    """Build per-fold train / validation / test index tensors.

    The first ``all_df["is_train"].sum()`` rows of ``all_df`` are treated as the
    training rows and are split with a shuffled ``StratifiedKFold`` stratified
    on ``user_id``. All remaining rows form the test set of every fold.

    Args:
        num_fold: Number of folds.
        all_df: Frame with ``is_train`` and ``user_id`` columns, with the
            training rows placed before the test rows.
        random_state: Seed for the fold shuffling (defaults to the previously
            hard-coded value).

    Returns:
        ``(train_indices, val_indices, test_indices)``: three lists of length
        ``num_fold`` holding ``torch.long`` positional index tensors.
    """
    skf = StratifiedKFold(num_fold, shuffle=True, random_state=random_state)
    train_len = int(all_df["is_train"].sum())
    strata = all_df.iloc[:train_len]["user_id"]

    train_indices, val_indices, test_indices = [], [], []
    # The features argument is only used for its length, so a dummy works.
    for fit_idx, holdout_idx in skf.split(torch.zeros(train_len), strata):
        # The splitter's train indices are the sorted complement of the
        # validation indices, which is what the old boolean mask rebuilt.
        train_indices.append(torch.from_numpy(fit_idx).to(torch.long))
        val_indices.append(torch.from_numpy(holdout_idx).to(torch.long))
        test_indices.append(torch.arange(train_len, len(all_df), dtype=torch.long))

    return train_indices, val_indices, test_indices


# Run identifier: stem of the launching script's path plus the first 8 characters of a uuid1.
# NOTE(review): inside a notebook kernel `sys.argv[0]` is presumably the kernel launcher
# rather than this notebook's file name — confirm the resulting name is what you expect.
exp_name = f"{Path(sys.argv[0]).stem}_{str(uuid.uuid1())[:8]}"


# Start the Weights & Biases run; logging is disabled unless `config.debug` is exactly False.
wandb.init(
    project="atmacup-21",
    name=exp_name,  # {file}_{id}
    mode="online" if config.debug is False else "disabled",
    config=config.train,
)

# Out-of-fold predictions for the training rows and one test prediction array per fold.
oof_pred = np.zeros(len(train_df))
test_preds = []

for fold, (train_idx, val_idx, test_idx) in enumerate(zip(*k_fold(config.train.num_folds, all_df))):
    model = LightGCN(
        num_nodes=data.num_nodes,
        embedding_dim=config.train.embedding_dim,
        num_layers=config.train.num_layers,
    ).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.train.lr)
    checkpoint_path = f"model_best_{fold}.pt"
    has_checkpoint = False
    best_val_loss = float("inf")
    early_stopping_counter = 0

    # Edge slices and targets do not change between epochs, so build them once per fold.
    train_edges = data.edge_index[:, train_idx]
    val_edges = data.edge_index[:, val_idx]
    train_target = torch.tensor(train_df.loc[train_idx.numpy(), "score"].to_numpy()).float().to(device)
    val_target = torch.tensor(train_df.loc[val_idx.numpy(), "score"].to_numpy()).float().to(device)

    for epoch in tqdm(range(config.train.num_epochs if config.debug is False else 6), desc=f"Fold-{fold+1}"):
        # Full-batch training step on the RMSE of the predicted scores.
        model.train()
        optimizer.zero_grad()
        pred = model(train_edges)
        loss = F.mse_loss(pred, train_target).sqrt()
        loss.backward()
        optimizer.step()

        # validation
        model.eval()
        with torch.no_grad():
            val_loss = F.mse_loss(model(val_edges), val_target).sqrt().item()

        wandb.log({"epoch": epoch, f"loss/train/fold-{fold}": loss.item(), f"loss/valid/fold-{fold}": val_loss})
        # The early-stopping patience doubles as the console logging interval.
        if epoch % config.train.early_stopping == 0:
            tqdm.write(f"Epoch: {epoch}, Loss: {loss.item()}, Val Loss: {val_loss}")

        # early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            has_checkpoint = True
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= config.train.early_stopping:
                print("Early stopping triggered.")
                break

    # Always predict with the best checkpoint, including when the epoch budget ran out
    # without early stopping. The flag avoids loading a stale file from an earlier run
    # if this fold never improved (e.g. NaN loss).
    if has_checkpoint:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    # testing
    model.eval()
    with torch.no_grad():
        # Out-of-Fold (OOF) prediction
        oof_pred[val_idx.numpy()] = model(val_edges).cpu().numpy()

        test_pred = model(data.edge_index[:, test_idx]).cpu().numpy()
        test_preds.append(test_pred)

# calculate mean of predictions across all folds
mean_test_preds = np.mean(test_preds, axis=0)
wandb.finish()



Fold-1:   0%|          | 0/6 [00:00<?, ?it/s]

Epoch: 0, Loss: 7.924870014190674, Val Loss: 7.922195911407471


Fold-2:   0%|          | 0/6 [00:00<?, ?it/s]

Epoch: 0, Loss: 7.922452926635742, Val Loss: 7.931846618652344


Fold-3:   0%|          | 0/6 [00:00<?, ?it/s]

Epoch: 0, Loss: 7.925309181213379, Val Loss: 7.920421600341797


Fold-4:   0%|          | 0/6 [00:00<?, ?it/s]

Epoch: 0, Loss: 7.922451496124268, Val Loss: 7.931850910186768


Fold-5:   0%|          | 0/6 [00:00<?, ?it/s]

Epoch: 0, Loss: 7.926576614379883, Val Loss: 7.915343284606934


In [27]:
# Put the fold-averaged test predictions into the submission template and save it.
sample_submission_df["score"] = mean_test_preds
submission_path = f"../output/sub_{exp_name}.csv"
sample_submission_df.to_csv(submission_path, index=False)

# Build a frame holding the out-of-fold predictions.
oof_df = pd.DataFrame(data={"score": oof_pred})
# Save it as a CSV file next to the submission.
oof_path = f"../output/oof_{exp_name}.csv"
oof_df.to_csv(oof_path, index=False)