In [1]:
import time
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from datasets import Dataset
from hydra import compose, initialize
from omegaconf import OmegaConf
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from src.seed import seed_everything
from torch import nn
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")
# Compose the Hydra config named "config" from the local "config" directory.
with initialize(config_path="config", version_base=None):
    cfg = compose(config_name="config")
    # Use the name of the current working directory as the experiment number.
    cfg.exp_number = Path().resolve().name

# Print the fully resolved configuration so it is recorded in the cell output.
print(OmegaConf.to_yaml(cfg, resolve=True))

# Seed through the project helper, then select CUDA when it is available (CPU otherwise).
seed_everything(cfg.seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


lgb:
  params:
    objective: binary
    metric: auc
    verbose: -1
    boosting_type: gbdt
    learning_rate: 0.01
    max_depth: 5
    num_leaves: 31
    min_data_in_leaf: 50
    bagging_fraction: 0.8
    bagging_freq: 1
    feature_fraction: 0.8
    lambda_l1: 0
    lambda_l2: 1
    seed: 42
  early_stopping_rounds: 100
  log_evaluation: 100
  num_boost_round: 10000000
bert:
  params:
    model_path: microsoft/deberta-v3-large
    metric: auc
    target_col_class_num: 2
    max_length: 50
    fp16: true
    learning_rate: 2.0e-05
    epochs: 2
    per_device_train_batch_size: 8
    per_device_eval_batch_size: 32
    steps: 500
    lr_scheduler_type: cosine
    weight_decay: 0.01
exp_number: '013'
run_name: base
data:
  data_root: ../../data
  results_root: ../../results
  train_path: ../../data/train.csv
  clothing_path: ../../data/clothing_master.csv
  test_path: ../../data/test.csv
  sample_submission_path: ../../data/sample_submission.csv
  results_dir: ../../results/013/base
se

### 前処理

In [2]:
# Set to True to train on a 1000-row random sample of the training data.
debug = False

# Read the three input tables from the paths given in the config.
train_df = pd.read_csv(cfg.data.train_path)
clothing_df = pd.read_csv(cfg.data.clothing_path)
test_df = pd.read_csv(cfg.data.test_path)

if debug:
    train_df = train_df.sample(1000, random_state=cfg.seed).reset_index(drop=True)

# Attach the clothing master columns to every review row.
# NOTE(review): a left merge repeats a review row when "Clothing ID" occurs more than once
# in clothing_df; later cells align predictions with test rows by position, so this
# presumably relies on "Clothing ID" being unique there — TODO confirm.
train_df = pd.merge(train_df, clothing_df, on="Clothing ID", how="left")
test_df = pd.merge(test_df, clothing_df, on="Clothing ID", how="left")
# Train and test stacked into one frame (not referenced again in the visible cells).
all_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

def preprocess(df):
    """Add a ``prompt`` text column assembled from review metadata.

    The prompt joins the reviewer's age, the clothing class name, the review
    title (``'none'`` when the title is missing) and the positive feedback
    count, separated by ``[SEP]`` markers.

    The frame is modified in place and the same object is returned.
    """
    age_text = df["Age"].map(str)
    title_text = df["Title"].fillna("none")
    feedback_text = df["Positive Feedback Count"].map(str)

    # Concatenate strictly left to right so missing values propagate as before.
    header = age_text + "-Year-Old's Review of " + df["Class Name"]
    with_title = header + " [SEP] TITLE: " + title_text
    df["prompt"] = with_title + " [SEP] Positive Feedback Count: " + feedback_text
    return df

# Build the prompt column for both splits.
train_df = preprocess(train_df)
test_df = preprocess(test_df)
# Train only on rows that have a Title.
train_df = train_df[~train_df["Title"].isna()].reset_index(drop=True)
# Predict only for test rows that have a Title but no Review Text.
test_df = test_df[(test_df["Review Text"].isna()) & (~test_df["Title"].isna())].reset_index(drop=True)

# Integer copy of the target column under the name "labels".
train_df["labels"] = train_df[cfg.target].astype(np.int8)

# Show one example prompt.
display(train_df["prompt"][0])

# Longest tokenized prompt in each split, printed so it can be compared with
# cfg.bert.params.max_length, which is used for truncation when the datasets are tokenized.
tokenizer = AutoTokenizer.from_pretrained(cfg.bert.params.model_path)
train_max_length = train_df["prompt"].map(lambda x: len(tokenizer(x)["input_ids"])).max()
test_max_length = test_df["prompt"].map(lambda x: len(tokenizer(x)["input_ids"])).max()

print(train_max_length, test_max_length)


"25-Year-Old's Review of Skirts [SEP] TITLE: 3-season skirt! [SEP] Positive Feedback Count: 4"

40 36


### Smooth Focal Loss

In [3]:
class FocalLoss(nn.Module):
    """Focal loss for binary targets, computed from raw logits.

    Each element's binary cross-entropy is scaled by
    ``alpha * (1 - pt) ** gamma`` with ``pt = exp(-bce)``.

    Args:
        reduction: ``"sum"`` or ``"mean"`` aggregate the per-element loss;
            ``"none"`` (and any other value) returns it unreduced.
        alpha: Constant multiplier applied to every element.
        gamma: Exponent of the ``(1 - pt)`` modulating factor.
    """

    def __init__(self, reduction="none", alpha=1, gamma=2):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss of ``inputs`` (logits) against ``targets``."""
        per_element_bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
        prob_of_target = torch.exp(-per_element_bce)
        focal = self.alpha * (1.0 - prob_of_target) ** self.gamma * per_element_bce

        if self.reduction == "sum":
            return focal.sum()
        if self.reduction == "mean":
            return focal.mean()
        # "none" and any unrecognised value: hand back the per-element loss.
        return focal


class SmoothFocalLoss(nn.Module):
    """Focal loss evaluated against label-smoothed binary targets.

    Args:
        reduction: ``"sum"`` or ``"mean"`` aggregate the per-element loss;
            ``"none"`` (and any other value) returns it unreduced.
        alpha: Forwarded to the wrapped ``FocalLoss``.
        gamma: Forwarded to the wrapped ``FocalLoss``.
        smoothing: Smoothing strength in ``[0, 1)``; ``0`` leaves targets untouched.
    """

    def __init__(self, reduction="mean", alpha=1, gamma=2, smoothing=0.0):
        super().__init__()
        self.smoothing = smoothing
        self.reduction = reduction
        # The wrapped loss stays unreduced; the reduction is applied in ``forward``.
        self.focal_loss = FocalLoss(reduction="none", alpha=alpha, gamma=gamma)

    @staticmethod
    def _smooth(targets: torch.Tensor, smoothing=0.0):
        """Move each target towards 0.5: ``t * (1 - smoothing) + 0.5 * smoothing``."""
        assert 0 <= smoothing < 1
        with torch.no_grad():
            smoothed = targets * (1.0 - smoothing) + 0.5 * smoothing
        return smoothed

    def forward(self, inputs, targets):
        """Smooth ``targets``, compute the focal loss on ``inputs`` and reduce it."""
        soft_targets = SmoothFocalLoss._smooth(targets, self.smoothing)
        per_element = self.focal_loss(inputs, soft_targets)

        if self.reduction == "sum":
            return per_element.sum()
        if self.reduction == "mean":
            return per_element.mean()
        # "none" and any unrecognised value: hand back the per-element loss.
        return per_element


class CustomTrainer(Trainer):
    """``Trainer`` that can replace the default loss with a custom binary loss.

    Args:
        loss_fn: Optional callable invoked as ``loss_fn(logits, labels)`` with the
            column-1 logits and float labels. When ``None``, plain cross-entropy
            over all ``num_labels`` logits is used.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.

    NOTE(review): ``loss_fn`` is the first positional parameter, so the regular
    ``Trainer`` arguments have to be passed by keyword.
    """

    def __init__(self, loss_fn=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the training loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; its ``"labels"`` entry is popped before the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted for compatibility with ``Trainer`` versions
                that pass this keyword to ``compute_loss``; it is not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        if self.loss_fn is None:
            loss = F.cross_entropy(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        else:
            # Adapt logits and labels to the shapes the binary loss expects:
            # keep only the positive-class logit and cast the labels to float.
            # NOTE(review): only column 1 of the logits feeds the loss, while
            # compute_metrics and the prediction cell score with a softmax over
            # both columns — confirm this mismatch is intended.
            logits = logits[:, 1]
            labels = labels.float()
            loss = self.loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss


# Create the Smooth Focal Loss instance that is handed to the trainer.
loss_fn = SmoothFocalLoss(alpha=1, gamma=2, smoothing=0)  # in the end only gamma has an effect (alpha=1 and smoothing=0 are neutral)


In [4]:
def compute_metrics(p):
    """Return ``{'auc': ...}``: ROC AUC of the softmax probability of column 1.

    ``p`` unpacks into ``(predictions, labels)``; the predictions are treated
    as per-class logits with the classes along axis 1.
    """
    logits, labels = p
    probabilities = torch.softmax(torch.tensor(logits), dim=1).numpy()
    positive_scores = probabilities[:, 1]
    return {'auc': roc_auc_score(labels, positive_scores)}

# Create the directory that stores this run's results.
# The run name is set to the current timestamp first; cfg.data.results_dir appears to
# interpolate it (the saved paths shown in later output contain the timestamp).
cfg.run_name = time.strftime("%Y%m%d_%H%M%S")
Path(cfg.data.results_dir).mkdir(exist_ok=True, parents=True)

# NOTE(review): y_train and oof are only referenced by commented-out code in the visible cells.
y_train = train_df[cfg.target]
oof = np.zeros(len(train_df))

# Both datasets are built from the full training frame, so the evaluation set
# is the training data itself (no held-out fold).
ds_train = Dataset.from_pandas(train_df[['prompt', 'labels']].copy())
ds_eval = Dataset.from_pandas(train_df[['prompt', 'labels']].copy())

# Load the tokenizer, config and sequence-classification model from the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(cfg.bert.params.model_path)
config = AutoConfig.from_pretrained(cfg.bert.params.model_path)
model = AutoModelForSequenceClassification.from_pretrained(cfg.bert.params.model_path, config=config)

def tokenize(sample):
    """Tokenize ``sample['prompt']``, truncating to the configured maximum length.

    Uses the notebook-level ``tokenizer`` and ``cfg`` at call time.
    """
    max_len = cfg.bert.params.max_length
    return tokenizer(sample["prompt"], truncation=True, max_length=max_len)

# Tokenize every prompt, then drop the raw text column so only model inputs and labels remain.
ds_train = ds_train.map(tokenize).remove_columns(['prompt'])
ds_eval = ds_eval.map(tokenize).remove_columns(['prompt'])

# Checkpoints for this run are written under "<results_dir>/all".
output_dir = f"{cfg.data.results_dir}/all"

# Evaluation, checkpointing and logging all fire every cfg.bert.params.steps steps;
# the checkpoint with the highest value of the configured metric is reloaded at the end.
train_args = TrainingArguments(
    output_dir=output_dir,
    fp16=cfg.bert.params.fp16,
    learning_rate=cfg.bert.params.learning_rate,
    num_train_epochs=cfg.bert.params.epochs,
    per_device_train_batch_size=cfg.bert.params.per_device_train_batch_size,
    per_device_eval_batch_size=cfg.bert.params.per_device_eval_batch_size,
    # Gradients are accumulated over 4 batches before each optimizer step.
    gradient_accumulation_steps=4,
    report_to="none",
    evaluation_strategy="steps",
    do_eval=True,
    eval_steps=cfg.bert.params.steps,
    save_total_limit=1,
    save_strategy="steps",
    save_steps=cfg.bert.params.steps,
    logging_steps=cfg.bert.params.steps,
    load_best_model_at_end=True,
    lr_scheduler_type=cfg.bert.params.lr_scheduler_type,
    metric_for_best_model=cfg.bert.params.metric,
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=cfg.bert.params.weight_decay,
    save_safetensors=True,
    seed=cfg.seed,
    data_seed=cfg.seed,
)

# NOTE(review): eval_dataset holds the same rows as train_dataset, so the AUC reported
# during training (and used to pick the best checkpoint) is not a held-out score.
trainer = CustomTrainer(
    model=model,
    args=train_args,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    data_collator=DataCollatorWithPadding(tokenizer),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    loss_fn=loss_fn,
)

trainer.train()

# pred = torch.softmax(torch.tensor(trainer.predict(ds_eval).predictions), dim=1).numpy()
# oof = pred[:, 1]


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8425 [00:00<?, ? examples/s]

Map:   0%|          | 0/8425 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Auc
500,0.0641,0.037723,0.979078


TrainOutput(global_step=526, training_loss=0.0631503934189394, metrics={'train_runtime': 376.9529, 'train_samples_per_second': 44.701, 'train_steps_per_second': 1.395, 'total_flos': 901971854763084.0, 'train_loss': 0.0631503934189394, 'epoch': 1.9962049335863377})

In [5]:
# Save the trained model and its tokenizer side by side under "<output_dir>/final"
# so the inference cell can reload both from that one directory.
final_output_dir = f"{output_dir}/final"
trainer.save_model(final_output_dir)
tokenizer.save_pretrained(final_output_dir)


('../../results/013/20240831_055421/all/final/tokenizer_config.json',
 '../../results/013/20240831_055421/all/final/special_tokens_map.json',
 '../../results/013/20240831_055421/all/final/spm.model',
 '../../results/013/20240831_055421/all/final/added_tokens.json',
 '../../results/013/20240831_055421/all/final/tokenizer.json')

### oof

In [6]:
# import seaborn as sns
# from matplotlib import pyplot as plt

# oof_df = pd.DataFrame({"oof": oof})
# oof_df.to_csv(f"{cfg.data.results_dir}/oof.csv", index=False)

# best_score = roc_auc_score(y_train, oof)
# with open(f"{cfg.data.results_dir}/log.txt", "w") as log_file:
#     log_file.write("====== CV Score ======\n")
#     log_file.write(f"best_score: {best_score}\n")
#     log_file.write("\n====== params ======\n")
#     log_file.write(OmegaConf.to_yaml(cfg, resolve=True))

# fig, ax = plt.subplots(figsize=(12, 5))
# sns.histplot(y_train, bins=50)
# sns.histplot(oof, bins=50)
# plt.legend(["true", "oof"])
# plt.show()
# fig.savefig(f"{cfg.data.results_dir}/oof_hist.png")


### Submission

In [None]:
import os

def tokenize(sample):
    """Tokenize ``sample['prompt']``, truncating to the configured maximum length.

    Uses the notebook-level ``tokenizer`` and ``cfg`` at call time.
    """
    max_len = cfg.bert.params.max_length
    return tokenizer(sample["prompt"], truncation=True, max_length=max_len)

# Reload the tokenizer and model that were saved to final_output_dir.
tokenizer = AutoTokenizer.from_pretrained(final_output_dir)
model = AutoModelForSequenceClassification.from_pretrained(final_output_dir)

# Tokenize the test prompts.
# NOTE(review): this needs the preprocessed test_df (with a "prompt" column); the later
# submission cells rebind test_df to the raw CSV, so re-running this cell after them fails.
ds_test = Dataset.from_pandas(test_df[['prompt']].copy())
ds_test = ds_test.map(tokenize).remove_columns(['prompt'])

# A plain Trainer is used here only to run batched prediction.
test_args = TrainingArguments(
    output_dir=cfg.data.results_dir,
    per_device_eval_batch_size=cfg.bert.params.per_device_eval_batch_size,
    do_predict=True,
    dataloader_drop_last=False,
)
trainer = Trainer(
    model=model,
    args=test_args,
    data_collator=DataCollatorWithPadding(tokenizer),
    tokenizer=tokenizer,
)
# Softmax over the class axis turns the predicted logits into probabilities.
predictions = torch.softmax(torch.tensor(trainer.predict(ds_test).predictions), dim=1).numpy()

# Keep the probability in column 1 as the submitted score.
pred = predictions[:, 1]


In [10]:
# Re-read the raw test file and locate the rows this model predicts for:
# a Title is present but the Review Text is missing.
test_df = pd.read_csv(cfg.data.test_path)
title_only_mask = test_df["Review Text"].isna() & test_df["Title"].notna()
index = test_df.loc[title_only_mask].index

# Start from the earlier submission and overwrite only those rows with the new scores.
best_sub_df = pd.read_csv("/workspace/20240830_164022_submission.csv")
display(best_sub_df.head(10))
best_sub_df.loc[index, "target"] = pred

submission_path = f"{cfg.data.results_dir}/{cfg.run_name}_submission.csv"
best_sub_df.to_csv(submission_path, index=False)

# Show the first replaced positions and the updated head for a visual check.
display(index[:5])
display(best_sub_df.head(10))


Unnamed: 0,target
0,0.905987
1,0.486497
2,0.907969
3,0.329067
4,0.89087
5,0.899834
6,0.701518
7,0.667197
8,0.296738
9,0.864704


Index([7, 8, 26, 37, 44], dtype='int64')

Unnamed: 0,target
0,0.905987
1,0.486497
2,0.907969
3,0.329067
4,0.89087
5,0.899834
6,0.701518
7,0.525034
8,0.419945
9,0.864704


In [None]:
# NOTE(review): this cell repeats the preceding submission cell statement for statement
# and has no execution count; it rewrites the same submission file when run.
# Re-read the raw test file and find rows with a Title but no Review Text.
test_df = pd.read_csv(cfg.data.test_path)
index = test_df[(test_df["Review Text"].isna()) & (~test_df["Title"].isna())].index

# Load the earlier submission (hard-coded absolute path) and overwrite those rows with pred.
best_sub_df = pd.read_csv("/workspace/20240830_164022_submission.csv")
display(best_sub_df.head(10))
best_sub_df.loc[index, "target"] = pred

best_sub_df.to_csv(f"{cfg.data.results_dir}/{cfg.run_name}_submission.csv", index=False)

display(index[:5])
display(best_sub_df.head(10))


In [32]:
import matplotlib.pyplot as plt

# Load the earlier submission again (hard-coded absolute path).
best_sub_df = pd.read_csv("/workspace/20240830_164022_submission.csv")

# Min-max normalize the earlier submission's scores to [0, 1].
best_sub_df["target"] = (best_sub_df["target"] - best_sub_df["target"].min()) / (best_sub_df["target"].max() - best_sub_df["target"].min())

# Min-max normalize pred as well, independently of the earlier submission.
# NOTE(review): both normalizations divide by (max - min), which is zero when all values are equal.
new_pred = (pred - pred.min()) / (pred.max() - pred.min())

# Re-read the raw test file and find rows with a Title but no Review Text.
test_df = pd.read_csv(cfg.data.test_path)
index = test_df[(test_df["Review Text"].isna()) & (~test_df["Title"].isna())].index

display(best_sub_df.head(10))
# Overwrite only those rows with the normalized new predictions.
best_sub_df.loc[index, "target"] = new_pred

best_sub_df.to_csv(f"{cfg.data.results_dir}/{cfg.run_name}_submission2.csv", index=False)

display(index[:5])
display(best_sub_df.head(10))


Unnamed: 0,target
0,0.984046
1,0.436982
2,0.986631
3,0.231675
4,0.964332
5,0.976022
6,0.717395
7,0.672637
8,0.189515
9,0.930209


Index([7, 8, 26, 37, 44], dtype='int64')

Unnamed: 0,target
0,0.984046
1,0.436982
2,0.986631
3,0.231675
4,0.964332
5,0.976022
6,0.717395
7,0.342407
8,0.174855
9,0.930209
