In [10]:
import os
import pickle
import random
import sys
import uuid
from pathlib import Path

import implicit
import lightgbm as lgb
import numpy as np
import pandas as pd
from hydra import compose, initialize

# NOTE: the next line rebinds the name `random` to scipy.sparse.random (a function),
# shadowing the stdlib `random` module imported above.
from scipy.sparse import csr_matrix, random
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)

# Add the parent directory to the import search path; presumably that is where the
# project-local `utils` package imported below lives (TODO confirm).
sys.path.append(os.pardir)

from utils import evaluate_score, load_datasets, load_sample_sub, load_target
from utils.embedding import TextEmbedder

# Compose the Hydra config from config.yaml in the ../yamls directory.
with initialize(config_path="../yamls", version_base=None):
    config = compose(config_name="config.yaml")
# Force debug mode on; the data-loading cell uses this flag to cut every frame down to 100 rows.
config.debug = True


def seed_everything(seed=1234):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    # The import cell runs `from scipy.sparse import csr_matrix, random`, which rebinds the
    # global name `random` to a SciPy function. Import the stdlib module under a private
    # alias so `.seed` resolves to the real module regardless of that shadowing.
    import random as _stdlib_random

    import torch

    _stdlib_random.seed(seed)
    # Only affects interpreters started after this point; hash randomisation of the
    # running process is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    # Needed for repeatable weight initialisation of the torch model trained in this notebook.
    torch.manual_seed(seed)

In [4]:
import datetime
import logging
import os
import time
import warnings
from glob import glob
from typing import Any, List, Optional, Tuple

import numpy as np
import pandas as pd
import torch
from timm.scheduler import CosineLRScheduler
from timm.utils import AverageMeter
from torch import nn
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm


class TransformerModel(nn.Module):
    """Transformer encoder that emits one scalar prediction per sequence position.

    Column 0 of the input is looked up in the user embedding table and all remaining
    columns in the anime embedding table. The concatenated embeddings go through a
    ``nn.TransformerEncoder`` and a two-layer MLP head.
    """

    def __init__(
        self,
        num_layers=2,
        hidden_size: int = 64,
        nhead: int = 4,
        dim_feedforward: int = 1024,
        num_users: int = 2000,
        num_animes: int = 2000,
    ):
        """
        Args:
            num_layers: Number of stacked encoder layers.
            hidden_size: Embedding width and transformer ``d_model``.
            nhead: Number of attention heads.
            dim_feedforward: Width of the feed-forward sub-layer in each encoder layer.
            num_users: Number of rows in the user embedding table (ids must be < num_users).
            num_animes: Number of rows in the anime embedding table (ids must be < num_animes).
        """
        super().__init__()
        self.hidden_size = hidden_size

        # Separate lookup tables for the leading user token and the anime tokens.
        self.user_embedding = nn.Embedding(num_users, hidden_size)
        self.anime_embedding = nn.Embedding(num_animes, hidden_size)

        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=hidden_size,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=0.0,
                batch_first=True,
            ),
            num_layers=num_layers,
        )
        self.fc = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.ReLU(), nn.Linear(hidden_size, 1))

    @staticmethod
    def _to_key_padding_mask(attention_mask: Optional[torch.Tensor], device) -> Optional[torch.Tensor]:
        """Turn a "True = real token" mask into the "True = ignore" mask the encoder expects."""
        if attention_mask is None:
            return None
        valid = attention_mask.to(device=device).bool()
        if valid.dim() == 3:
            # (batch, query, key) -> (batch, key): a key is real if any query may attend to it.
            valid = valid.any(dim=1)
        return ~valid

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Predict a value for every position of every sequence in the batch.

        Args:
            x: Integer ids of shape (batch, seq); column 0 is the user id, the rest anime ids.
            attention_mask: Optional boolean mask, True at real (non-padding) positions,
                either (batch, seq) or (batch, seq, seq). When given, padding positions are
                excluded as attention keys; when omitted, every position is attended to.

        Returns:
            Tensor of shape (batch, seq) with one prediction per position.
        """
        user_x = self.user_embedding(x[:, 0:1])
        anime_x = self.anime_embedding(x[:, 1:])
        tokens = torch.cat([user_x, anime_x], dim=1)
        key_padding_mask = self._to_key_padding_mask(attention_mask, tokens.device)
        encoded = self.transformer_encoder(tokens, src_key_padding_mask=key_padding_mask)
        # fc yields (batch, seq, 1); drop the trailing singleton dimension.
        return self.fc(encoded).squeeze(2)

    def get_losses(
        self,
        input: torch.Tensor,
        target: torch.Tensor,
        mode_tensor: torch.Tensor,
        mode: int = 1,
    ) -> torch.Tensor:
        """Compute RMSE over the positions whose ``mode_tensor`` entry equals ``mode``.

        Args:
            input: Predictions.
            target: Ground-truth values, same shape as ``input``.
            mode_tensor: Per-position code, same shape as ``input``, selecting what to score.
            mode: Code of the positions that contribute to the loss.

        Returns:
            Scalar tensor holding the root of the mean squared error of the selected positions.
        """
        selected = mode_tensor == mode
        mse = nn.MSELoss()(input[selected], target[selected])
        return torch.sqrt(mse)

In [5]:
# Seed the random number generators with the configured seed.
seed_everything(config.seed)
# Use the current working directory as the output location, creating it if necessary.
output_path = Path(".")
output_path.mkdir(parents=True, exist_ok=True)

In [11]:
# Maximum number of rows kept per frame when config.debug is on.
DEBUG_N_ROWS = 100

input_dir = Path(config.input_path)
train_df = pd.read_csv(input_dir / "train.csv")
test_df = pd.read_csv(input_dir / "test.csv")
anime = pd.read_csv(input_dir / "anime.csv")
# Presumably the user_id values aligned row-for-row with train.csv (TODO confirm against utils).
train_user_ids = load_target("user_id")
sub = load_sample_sub()


if config.debug:
    # Never ask for more rows than exist; DataFrame.sample raises ValueError otherwise.
    n_debug = min(DEBUG_N_ROWS, len(train_df))
    # Pin the draw to config.seed so re-running this cell selects the same rows.
    # read_csv produced a default RangeIndex, so these labels are also valid .iloc positions.
    sample_index = train_df.sample(n=n_debug, random_state=config.seed).index
    train_df = train_df.iloc[sample_index].reset_index(drop=True)
    test_df = test_df.head(DEBUG_N_ROWS)
    # Index with the same positions so user ids stay aligned with the sampled train rows.
    train_user_ids = train_user_ids.iloc[sample_index].reset_index(drop=True)
    sub = sub.head(DEBUG_N_ROWS)

In [None]:
# =========================
# Dataset & Model
# =========================
class UserDataset(Dataset):
    """Dataset with one sample per user: the user id followed by that user's anime ids, padded to a fixed length."""

    def __init__(self, merge_df: pd.DataFrame, max_padding: int = 531):
        """
        Args:
            merge_df: All rows combined into one frame. The columns read here are
                ``ordinal_user_id``, ``ordinal_anime_id``, ``ordinal_mode`` and ``score``.
            max_padding: Number of anime slots each sample is padded to. A user with more
                rows than this cannot be represented and raises ``ValueError`` on access.
        """
        self.merge_df = merge_df
        self.max_padding = max_padding
        by_user = merge_df.groupby("ordinal_user_id")
        self.user2anime_dict = by_user["ordinal_anime_id"].apply(list).to_dict()
        self.user2mode_dict = by_user["ordinal_mode"].apply(list).to_dict()
        self.user2score = by_user["score"].apply(list).to_dict()
        # Position -> user id lookup, so sample indices work even when the ids are not exactly 0..N-1.
        self.user_ids = sorted(self.user2anime_dict)

    def __len__(self):
        """Return the number of distinct users."""
        return len(self.user_ids)

    def __getitem__(self, idx):
        """Build the padded tensors for the ``idx``-th user (users ordered by ascending id).

        Returns a dict with:
        - input_tensor: the user id followed by the anime-id sequence, zero-padded.
        - mode_tensor: per-position code telling whether a position is the user id, a train
          anime, a validation anime or a test anime; it selects which positions enter the
          loss. {user_id: 0, train: 1, valid: 2, test: 3}; padding positions are also 0.
        - attention_mask: boolean square matrix, True inside the block of real (non-padding)
          positions, used to tell the transformer where the padding is.
        - score_tensor: label scores; the user slot and padding are filled with 0 and unused.

        Raises:
            ValueError: if the user has more than ``max_padding`` rows.
        """
        user_id = self.user_ids[idx]
        anime_ids = self.user2anime_dict[user_id]
        seq_len = len(anime_ids)
        pad_length = self.max_padding - seq_len
        if pad_length < 0:
            raise ValueError(
                f"user {user_id} has {seq_len} rows, which exceeds max_padding={self.max_padding}"
            )

        # Build integer tensors directly instead of via float32, which cannot hold large ids exactly.
        user_tensor = torch.tensor([user_id], dtype=torch.int32)
        anime_tensor = torch.tensor(anime_ids, dtype=torch.int32)
        mode_tensor = torch.tensor(self.user2mode_dict[user_id], dtype=torch.int32)
        score_tensor = torch.tensor(self.user2score[user_id], dtype=torch.float)

        # +1 accounts for the leading user token.
        total_len = self.max_padding + 1
        attention_mask = torch.zeros([total_len, total_len], dtype=torch.bool)
        attention_mask[: seq_len + 1, : seq_len + 1] = True

        int_pad = torch.zeros(pad_length, dtype=torch.int32)
        input_tensor = torch.cat((user_tensor, anime_tensor, int_pad))
        mode_tensor = torch.cat((torch.zeros(1, dtype=torch.int32), mode_tensor, int_pad))
        score_tensor = torch.cat(
            (
                torch.zeros(1, dtype=torch.float),
                score_tensor,
                torch.zeros(pad_length, dtype=torch.float),
            )
        )
        sample = {
            "input_tensor": input_tensor,
            "mode_tensor": mode_tensor,
            "attention_mask": attention_mask,
            "score_tensor": score_tensor,
        }
        return sample

In [None]:
# Prediction buffer sized from train_df, the frame the fold indices below refer to
# (presumably filled with out-of-fold predictions further down — confirm).
oof_pred = np.zeros(train_df.shape[0])
test_preds = []


# Stratify on score while keeping all rows of one user inside a single fold.
kf = StratifiedGroupKFold(n_splits=config.nn.num_folds, shuffle=True, random_state=config.seed)
for fold, (train_index, valid_index) in enumerate(kf.split(train_df, train_df["score"], train_user_ids)):
    print(f"Fold {fold} start !")

    # Split the users into train and validation sets here.