In [1]:
import sys
import os
import gc
import time
import warnings
warnings.filterwarnings('ignore')
import random
import math
from pathlib import Path

import json
import argparse
from itertools import chain
from functools import partial

from hydra import compose, initialize
from omegaconf import OmegaConf
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
from tokenizers import AddedToken
from datasets import Dataset, features
import numpy as np
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score

from src.seed import seed_everything

with initialize(config_path="config", version_base=None):
    cfg = compose(config_name="config")
    cfg.exp_number = Path().resolve().name

print(OmegaConf.to_yaml(cfg, resolve=True))

seed_everything(cfg.seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


lgb:
  params:
    objective: binary
    metric: auc
    verbose: -1
    boosting_type: gbdt
    learning_rate: 0.01
    max_depth: 5
    num_leaves: 31
    min_data_in_leaf: 50
    bagging_fraction: 0.8
    bagging_freq: 1
    feature_fraction: 0.8
    lambda_l1: 0
    lambda_l2: 1
    seed: 42
  early_stopping_rounds: 100
  log_evaluation: 100
  num_boost_round: 10000000
bert:
  params:
    model_path: microsoft/deberta-v3-small
    max_length: 256
    metric: auc
    steps: 50
    target_col_class_num: 2
exp_number: '003'
run_name: base
data:
  data_root: ../../data
  results_root: ../../results
  train_path: ../../data/train.csv
  clothing_path: ../../data/clothing_master.csv
  test_path: ../../data/test.csv
  sample_submission_path: ../../data/sample_submission.csv
  results_dir: ../../results/003/base
seed: 42
n_splits: 5
target: Recommended IND

cuda


### Import

In [2]:
train_df = pd.read_csv(cfg.data.train_path)
clothing_df = pd.read_csv(cfg.data.clothing_path)
test_df = pd.read_csv(cfg.data.test_path)

train_df = train_df.merge(clothing_df, how="left", on="Clothing ID")
test_df = test_df.merge(clothing_df, how="left", on="Clothing ID")

train_df["prompt"] = train_df["Title"].fillna("") + " " + train_df["Review Text"].fillna("")
test_df["prompt"] = test_df["Title"].fillna("") + " " + test_df["Review Text"].fillna("")
train_df["labels"] = train_df[cfg.target].astype(np.int8)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=cfg.seed)


In [3]:
x_train = train_df["prompt"]
y_train = train_df[cfg.target]

tokenizer = AutoTokenizer.from_pretrained(cfg.bert.params.model_path)

def tokenize(sample):
    return tokenizer(sample['prompt'], max_length=cfg.bert.params.max_length, truncation=True)


def compute_metrics(p):
    preds, labels = p
    preds = torch.softmax(torch.tensor(preds), dim = 1).numpy()
    score = roc_auc_score(labels, preds[:, 1])
    return {'auc': score}


# 実験結果格納用のディレクトリを作成
cfg.run_name = time.strftime("%Y%m%d_%H%M%S")
Path(cfg.data.results_dir).mkdir(exist_ok=True, parents=True)

predictions = np.zeros((len(train_df), cfg.bert.params.target_col_class_num))

for fold, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
    ds_train = Dataset.from_pandas(train_df.iloc[train_index][['prompt', 'labels']].copy())
    ds_eval = Dataset.from_pandas(train_df.iloc[valid_index][['prompt', 'labels']].copy())

    ds_train = ds_train.map(tokenize).remove_columns(['prompt', '__index_level_0__'])
    ds_eval = ds_eval.map(tokenize).remove_columns(['prompt', '__index_level_0__'])

    train_args = TrainingArguments(
        output_dir=Path(cfg.data.results_dir) / f'deberta-large-fold{fold}',
        fp16=True,
        learning_rate=2e-5,
        num_train_epochs=1,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        report_to="none",
        evaluation_strategy="steps",
        do_eval=True,
        eval_steps=cfg.bert.params.steps,
        save_total_limit=3,
        save_strategy="steps",
        save_steps=cfg.bert.params.steps,
        logging_steps=cfg.bert.params.steps,
        lr_scheduler_type='linear',
        metric_for_best_model="auc", # AUCを評価に使用する
        greater_is_better=True,
        warmup_ratio=0.1,
        weight_decay=0.01,
        save_safetensors=True,
        seed=cfg.seed,
        data_seed=cfg.seed,
    )

    config = AutoConfig.from_pretrained(cfg.bert.params.model_path)
    model = AutoModelForSequenceClassification.from_pretrained(cfg.bert.params.model_path, config=config)

    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=ds_train,
        eval_dataset=ds_eval,
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    trainer.save_model(f"deberta-large/deberta-large-fold{fold}")
    tokenizer.save_pretrained(f"deberta-large/deberta-large-fold{fold}")

    predictions[valid_index] = torch.softmax(torch.tensor(trainer.predict(ds_eval).predictions), dim = 1).numpy()


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Auc
50,0.6757,0.5405,0.592282
100,0.4641,0.313781,0.913399
150,0.4342,0.303173,0.930708
200,0.3494,0.248823,0.938685
250,0.2938,0.275491,0.945187
300,0.3076,0.21955,0.950291
350,0.2767,0.297905,0.95106
400,0.215,0.270108,0.955485
450,0.2597,0.239359,0.957297
500,0.2467,0.242852,0.956926


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Auc
50,0.7914,0.581067,0.567341
100,0.4678,0.33518,0.906437
150,0.3196,0.374506,0.907744
200,0.2813,0.228782,0.947416
250,0.2947,0.391593,0.94635
300,0.3764,0.247062,0.948719
350,0.2794,0.228456,0.958141
400,0.2392,0.23995,0.95478
450,0.2389,0.266208,0.955879
500,0.2642,0.223139,0.961546


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Auc
50,0.7858,0.567212,0.609223
100,0.457,0.365641,0.884117
150,0.4605,0.490539,0.630547
200,0.3974,0.264443,0.934226
250,0.2865,0.235285,0.949137
300,0.325,0.261832,0.948493
350,0.2633,0.27654,0.953225
400,0.3371,0.282978,0.953762
450,0.2716,0.225714,0.957154
500,0.2922,0.289981,0.954202


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Auc
50,0.7804,0.517937,0.58107
100,0.4759,0.506193,0.733362
150,0.493,0.396848,0.874056
200,0.3293,0.402779,0.869671
250,0.307,0.283996,0.913886
300,0.3404,0.396085,0.90843
350,0.3437,0.262515,0.927973
400,0.2711,0.305475,0.933648
450,0.2874,0.244051,0.941889
500,0.2199,0.271872,0.939809


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Auc
50,0.7915,0.523117,0.561846
100,0.4623,0.372591,0.889414
150,0.3106,0.313395,0.894737
200,0.3026,0.445335,0.929084
250,0.2989,0.242276,0.949311
300,0.289,0.213846,0.959613
350,0.2391,0.334459,0.961623
400,0.278,0.274472,0.961598
450,0.2169,0.27623,0.959519
500,0.3223,0.236105,0.963157


In [4]:
ds_test = Dataset.from_pandas(test_df[['prompt']].copy())
ds_test = ds_test.map(tokenize).remove_columns(['prompt'])

preds = []
for i in range(5):
    i = 0
    tokenizer = AutoTokenizer.from_pretrained(f"deberta-large/deberta-large-fold{i}")
    model = AutoModelForSequenceClassification.from_pretrained(f"deberta-large/deberta-large-fold{i}")

    test_args = TrainingArguments(
        output_dir=cfg.data.results_dir,
        per_device_eval_batch_size=4,
        do_predict=True,
        dataloader_drop_last=False,
    )
    trainer = Trainer(
        model=model,
        args=test_args,
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
    )
    predictions = torch.softmax(torch.tensor(trainer.predict(ds_test).predictions), dim = 1).numpy()
    preds.append(predictions[:, 1])

pred = np.mean(preds, axis=0)

# 提出用ファイル作成
sub_df = pd.read_csv(cfg.data.sample_submission_path)
sub_df["target"] = pred
sub_df.to_csv(f"{cfg.data.results_dir}/{cfg.run_name}_submission.csv", index=False)
sub_df.head()


Map:   0%|          | 0/11155 [00:00<?, ? examples/s]

Unnamed: 0,target
0,0.998784
1,0.305749
2,0.998914
3,0.038975
4,0.998835
