### ⭐️ What this notebook is about
Huggingface `Trainer` by default optimizes CrossEntropy, which neglects the order of the labels.
This competition, however, uses quadratic weighted kappa as its evaluation metric, which takes the order of the labels into account. <br>
Weighted kappa loss was introduced by [J de la Torre *et al.*, (2018)](https://www.sciencedirect.com/science/article/abs/pii/S0167865517301666). By using this loss, we can directly optimize a model with respect to quadratic weighted kappa score.

### 📊 Result
#### 5-fold StratifiedKFold cv
Deberta-v3-small was fine-tuned using the weighted kappa loss with quadratic weight. <br>
As for the hyperparameters and the training procedure, I used the ones proposed in [the notebook](https://www.kaggle.com/code/cdeotte/deberta-v3-small-starter-cv-0-820-lb-0-800) by @cdeotte and obtained the following result. (Please refer to [version 5](https://www.kaggle.com/code/emiz6413/directly-optimize-quadratic-weighted-kappa-loss?scriptVersionId=174480998) of this notebook to see the full run.)

| Fold | QWK |
| - | - |
| 0 | 0.775 |
| 1 | 0.782 |
| 2 | 0.786 |
| 3 | 0.768 |
| 4 | 0.781 |
| average | 0.778 |

| LB | QWK |
| - | - |
| 5-fold | 0.777 |

### 🚨 Disclaimer
Training with the configuration below did NOT perform as well as ordinary cross-entropy training. The kappa score on the LB was 0.022 lower than that of the cross-entropy models, but I think it is still worth knowing that someone tried it. <br>
I don't have access to the full article of the aforementioned paper. I referred to the [TensorFlow implementation](https://www.tensorflow.org/addons/api_docs/python/tfa/losses/WeightedKappaLoss) of the loss function and ported it to PyTorch.

In [None]:
import os
import copy
from dataclasses import dataclass

import torch
import numpy as np
import pandas as pd
from datasets.arrow_dataset import Dataset
from tokenizers import AddedToken
from transformers.trainer import Trainer
from transformers.modeling_utils import PreTrainedModel
from transformers.trainer_utils import EvalPrediction
from transformers.training_args import TrainingArguments
from transformers.models.deberta_v2 import DebertaV2ForSequenceClassification, DebertaV2TokenizerFast
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

## Config

In [None]:
@dataclass
class Config:
    """Hyperparameters shared by the tokenizer, the model and the Trainer.

    Every attribute is a dataclass field and can therefore be overridden by
    keyword when constructing ``Config``.
    """

    checkpoint: str = "microsoft/deberta-v3-small"
    per_device_train_batch_size: int = 4
    per_device_eval_batch_size: int = 8
    # Targets 8 samples per optimizer step across all visible GPUs.
    # The inner max() avoids a ZeroDivisionError on CPU-only machines and the
    # outer max() keeps the value valid (>= 1) when more than two GPUs are visible.
    gradient_accumulation_steps: int = max(
        1, 8 // max(1, torch.cuda.device_count()) // per_device_train_batch_size
    )
    num_train_epochs: float = 4
    train_max_length: int = 1024
    eval_max_length: int = 2048
    lr: float = 1e-5
    scheduler: str = "linear"
    warmup_ratio: float = 0.0
    amp: bool = True
    n_splits: int = 5
    optim: str = "adamw_torch"
    # Declared last so that adding it as a real field does not shift the
    # positional order of the fields above.
    weight_decay: float = 0.01


config = Config()

In [None]:
# Trainer settings, all sourced from `config` except the fixed run-management options.
args = TrainingArguments(
    # Run management
    output_dir="output",
    report_to="none",
    logging_steps=10,
    # Batching
    per_device_train_batch_size=config.per_device_train_batch_size,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    # Optimization
    optim=config.optim,
    learning_rate=config.lr,
    weight_decay=config.weight_decay,
    lr_scheduler_type=config.scheduler,
    warmup_ratio=config.warmup_ratio,
    num_train_epochs=config.num_train_epochs,
    fp16=config.amp,
    # Evaluate and checkpoint once per epoch; keep and reload the best-QWK checkpoint
    evaluation_strategy='epoch',
    save_strategy="epoch",
    save_total_limit=1,
    metric_for_best_model="qwk",
    greater_is_better=True,
    load_best_model_at_end=True,
)

## Instantiate the model & tokenizer

In [None]:
class ModelInit:
    """Callable that hands back the model reset to its initial weights.

    The checkpoint is loaded once at construction and a deep copy of its
    initial state dict is kept. Every call restores that snapshot into the
    same model object and returns it.
    """

    model_class = DebertaV2ForSequenceClassification

    def __init__(self, checkpoint: str, num_labels: int = 6) -> None:
        pretrained = self.model_class.from_pretrained(checkpoint, num_labels=num_labels)
        # Deep copy so that later updates to the live model cannot alter the snapshot.
        self.state_dict = copy.deepcopy(pretrained.state_dict())
        self.model = pretrained

    def __call__(self) -> model_class:
        """Restore the initial weights and return the (shared) model instance."""
        model = self.model
        model.load_state_dict(self.state_dict)
        return model

model_init = ModelInit(config.checkpoint)
tokenizer = DebertaV2TokenizerFast.from_pretrained(config.checkpoint)
# Register the newline and the double space as additional, non-normalized tokens
# (presumably so paragraph breaks and spacing stay visible to the model).
for extra_token in ("\n", " " * 2):
    tokenizer.add_tokens([AddedToken(extra_token, normalized=False)])

## Instantiate the dataset

In [None]:
# Load the competition training set; later cells read its `full_text` and `score` columns.
df = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv")

#### StratifiedKFold split according to the score

In [None]:
cv = StratifiedKFold(n_splits=config.n_splits, shuffle=True, random_state=42)
# The splitter is given an all-zero feature array; the scores are passed as `y`.
placeholder_features = np.zeros(len(df))
folds = list(cv.split(placeholder_features, y=df["score"].values))

# Map every row index to the fold in which it belongs to the validation part.
idx2fold = {}
for fold, (_, val_idx) in enumerate(folds):
    idx2fold.update(dict.fromkeys(val_idx, fold))

df["fold"] = [idx2fold[i] for i in df.index]
# Persist the assignment, then build the Hugging Face dataset from the same frame.
df.to_csv("train_split.csv", index=False)
ds = Dataset.from_pandas(df)

In [None]:
class Encoder:
    """Batch encoder: tokenizes essays and attaches zero-based labels.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, **encode_kwargs)``;
            its return value must support item assignment.
        **encode_kwargs: Extra keyword arguments forwarded to the tokenizer on
            every call (e.g. ``max_length``, ``truncation``).
    """

    def __init__(self, tokenizer, **encode_kwargs):
        self.tokenizer = tokenizer
        self.kwargs = encode_kwargs

    def __call__(self, batch: dict) -> dict:
        """Encode ``batch["full_text"]`` and add ``labels`` when scores are available.

        Batches without a ``"score"`` key (e.g. unlabeled data at inference
        time) are tokenized as usual and returned without ``labels``.
        """
        encoded = self.tokenizer(batch["full_text"], **self.kwargs)
        if "score" in batch:
            # score is 1~6, shift it to the 0~5 class indices
            encoded["labels"] = [score - 1 for score in batch["score"]]
        return encoded
    
# Both encoders share the tokenizer and truncate; only the maximum length differs.
train_encoder = Encoder(tokenizer, truncation=True, max_length=config.train_max_length)
eval_encoder = Encoder(tokenizer, truncation=True, max_length=config.eval_max_length)

## Compute Metrics

In [None]:
def compute_metrics(eval_pred: EvalPrediction) -> dict:
    """Score the arg-max class predictions against the reference labels.

    Returns a dict with the quadratic weighted kappa (``qwk``), the Matthews
    correlation coefficient (``corr``) and the accuracy (``acc``).
    """
    labels = eval_pred.label_ids
    # The highest-scoring class along the last axis is the predicted label.
    predicted = eval_pred.predictions.argmax(-1)
    return {
        "qwk": cohen_kappa_score(labels, predicted, weights="quadratic"),
        "corr": matthews_corrcoef(labels, predicted),
        "acc": accuracy_score(labels, predicted),
    }

## Custom Trainer with Weighted Kappa Loss

In [None]:
class WeightedKappaLoss(torch.nn.Module):
    """Weighted kappa loss for ordinal classification.

    The loss is ``log(observed / expected + epsilon)``, where ``observed`` is
    the disagreement penalty between the predicted class probabilities and
    the labels, and ``expected`` is the penalty obtained from the product of
    the batch-level label and prediction distributions.

    Args:
        num_classes: Number of ordinal classes.
        weights: ``"quadratic"`` (squared class distance) or ``"linear"``
            (absolute class distance).
        epsilon: Small constant added inside the logarithm.

    Raises:
        ValueError: If ``weights`` is neither ``"quadratic"`` nor ``"linear"``.
    """

    def __init__(
        self, num_classes: int, weights: str = "quadratic", epsilon: float = 1e-6,
    ) -> None:
        super().__init__()
        if weights == 'quadratic':
            self.ops = torch.square
        elif weights == 'linear':
            self.ops = torch.abs
        else:
            raise ValueError(
                f"weights must be 'quadratic' or 'linear', got {weights!r}"
            )
        self.num_classes = num_classes
        self.epsilon = epsilon
        label_vec = torch.arange(0, num_classes).float()
        self.row_label_vec = label_vec.view(1, num_classes)
        self.col_label_vec = label_vec.view(num_classes, 1)
        # weight_mat[i, j] is the penalty for predicting class j when the label is i;
        # broadcasting the column against the row yields the (C, C) difference grid.
        self.weight_mat = self.ops(self.col_label_vec - self.row_label_vec)

    def forward(self, y_true: torch.Tensor, y_pred: torch.Tensor) -> torch.Tensor:
        """Compute the loss for one batch.

        Args:
            y_true: Integer class indices of shape ``(batch,)``.
            y_pred: Class probabilities of shape ``(batch, num_classes)``.

        Returns:
            A scalar tensor with the dtype and device of ``y_pred``.
        """
        bs = y_true.size(0)
        y_true = y_true.to(device=y_pred.device, dtype=torch.long)
        # Run one_hot first: it rejects out-of-range labels before any indexing happens.
        one_hot = torch.nn.functional.one_hot(y_true, self.num_classes)
        one_hot = one_hot.to(dtype=y_pred.dtype)
        # Follow y_pred's dtype as well as its device so that float64 / half
        # predictions do not hit a dtype mismatch in the matmuls below.
        weight_mat = self.weight_mat.to(device=y_pred.device, dtype=y_pred.dtype)
        # Row b holds the penalties of every class with respect to label b.
        weight = weight_mat[y_true]
        numerator = torch.sum(weight * y_pred)
        label_dist = torch.sum(one_hot, dim=0, keepdim=True)
        pred_dist = torch.sum(y_pred, dim=0, keepdim=True)
        w_pred_dist = torch.matmul(weight_mat, pred_dist.T)
        denominator = torch.sum(torch.matmul(label_dist, w_pred_dist)) / bs
        return torch.log(numerator / denominator + self.epsilon)

In [None]:
class QWKLossTrainer(Trainer):
    """Trainer that optimizes the weighted kappa loss instead of the model's own loss."""

    def compute_loss(
        self,
        model: PreTrainedModel,
        inputs: dict,
        return_outputs: bool = False,
        num_items_in_batch=None,
    ):
        """Run the model and replace its loss with the quadratic weighted kappa loss.

        Args:
            model: Model being trained or evaluated.
            inputs: Batch dict; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: Accepted because newer transformers releases pass
                it to ``compute_loss``; unused, since the kappa loss is a ratio
                computed over the whole batch.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # Read the labels first: the parent implementation may pop them from
        # `inputs` (it does so when label smoothing is enabled).
        labels = inputs["labels"]
        # Not forwarding num_items_in_batch keeps the call valid on releases
        # whose compute_loss does not take that argument.
        _, outputs = super().compute_loss(model, inputs, True)
        logits = outputs["logits"]
        # Softmax in float32 so half-precision logits do not degrade the probabilities.
        y_pred = logits.float().softmax(-1)
        loss_fn = WeightedKappaLoss(num_classes=y_pred.size(-1))
        loss = loss_fn(y_true=labels, y_pred=y_pred)
        outputs["loss"] = loss
        return (loss, outputs) if return_outputs else loss

## Train

In [None]:
cv_res = []

# Read the fold column once; selecting rows by iterating over `ds` would decode
# every example (including the essay text) twice per fold.
fold_ids = np.array(ds["fold"])

for fold_idx in sorted(df["fold"].unique()):
    args.output_dir = os.path.join("output", f"fold_{fold_idx}")
    args.run_name = f"{config.checkpoint}_fold-{fold_idx}"

    # Rows of the current fold are held out for evaluation, the rest are trained on.
    is_eval = fold_ids == fold_idx
    train_ds = ds.select(np.flatnonzero(~is_eval).tolist())
    eval_ds = ds.select(np.flatnonzero(is_eval).tolist())
    train_ds = train_ds.map(train_encoder, batched=True)
    eval_ds = eval_ds.map(eval_encoder, batched=True)

    trainer = QWKLossTrainer(
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        model_init=model_init,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Score the held-out fold and plot its confusion matrix.
    preds = trainer.predict(eval_ds).predictions
    y_true = np.array(eval_ds["labels"])
    y_pred = preds.argmax(-1)
    qwk = cohen_kappa_score(y1=y_true, y2=y_pred, weights="quadratic")
    fig, ax = plt.subplots()
    ConfusionMatrixDisplay.from_predictions(y_true=y_true, y_pred=y_pred, ax=ax)
    ax.set_title(f"fold-{fold_idx} qwk: {qwk:.3f}")
    fig.show()
    cv_res.append(qwk)
    break  # delete this line for 5-fold cv

In [None]:
# Per-fold QWK followed by a final row holding the mean over the evaluated folds.
pd.DataFrame(
    {
        "fold": [*range(len(cv_res)), "mean"],
        "qwk": [*cv_res, np.mean(cv_res)],
    }
)