### TL;DR
Many have demonstrated that the regression approach works a lot better than the classification approach in this competition. However, I found that a classification approach using Focal Loss ([Lin *et al.*, (2017)](https://arxiv.org/abs/1708.02002)) works about as well as the regression approach.<br>
I used a focal loss with γ=2 to train deberta-v3-small and got LB=0.797 (cv=0.808; please refer to [version 4](https://www.kaggle.com/code/emiz6413/lb-0-797-deberta-v3-small-focal-loss?scriptVersionId=174316940) of this notebook for the training), which is very close to the LB=0.800 obtained in [the notebook](https://www.kaggle.com/code/cdeotte/deberta-v3-small-starter-cv-0-820-lb-0-800) by @cdeotte. <br>

### 🗒️ Note
I haven't tuned γ at all so far. I'm planning to tune it and will update the notebook once I find a better value. If you find a better γ and are willing to share it, please let me know in the comments🙇

### 👀 How focal loss works and motivation to use this
The formulation of focal loss (FL) is actually quite simple: it is the cross entropy (CE) multiplied by a coefficient.

$$
\textrm{FL}(p) = - (1 - p)^{\gamma} \log{p} = (1 - p)^{\gamma} \textrm{CE}(p)
$$

<img src="https://storage.googleapis.com/aes2-0/focal_loss_diagram.png" width=50%>

We can think of the coefficient $(1 - p)^{\gamma}$ as controlling how much of the cross entropy loss is taken into account. When $\gamma = 0$, it matches the definition of the cross entropy. As $\gamma$ increases, the cross entropy is tweaked in such a way that it becomes increasingly more important to better classify hard (low `p`) examples rather than to predict a higher probability on easy (high `p`) examples.

While it currently achieves a slightly lower cv/LB than the regression approach under the current settings, one advantage of using this over the regression model is that we can compute the softmax probability of each class, which can be used to predict rare classes better with a threshold biased towards the rare classes (e.g. p(score=6) > 0.1).

In [None]:
import os
import copy
from pathlib import Path
from dataclasses import dataclass

import torch
import numpy as np
import pandas as pd
from datasets.arrow_dataset import Dataset
from transformers.trainer import Trainer
from transformers.modeling_utils import PreTrainedModel
from transformers.trainer_utils import EvalPrediction
from transformers.training_args import TrainingArguments
from transformers.models.deberta_v2 import DebertaV2ForSequenceClassification, DebertaV2TokenizerFast
from tokenizers import AddedToken
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

## Config

In [None]:
@dataclass
class Config:
    """Hyper-parameters and run-mode switches used throughout the notebook.

    Every value can be overridden by keyword when instantiating, e.g.
    ``Config(lr=2e-5, inference=False)``.
    """

    # Hugging Face checkpoint the model and tokenizer are loaded from.
    checkpoint: str = "microsoft/deberta-v3-small"
    per_device_train_batch_size: int = 4
    per_device_eval_batch_size: int = 8
    # Accumulate gradients so that
    #   per-device batch * number of devices * accumulation steps ~= 8.
    # The device count is clamped to 1 so a CPU-only session does not divide
    # by zero, and the result is clamped to 1 so it is always a valid,
    # strictly positive integer step count.
    gradient_accumulation_steps: int = max(
        1,
        8 // (max(1, torch.cuda.device_count()) * per_device_train_batch_size),
    )
    num_train_epochs: float = 4
    # Truncation lengths (in tokens) applied when encoding train / eval text.
    train_max_length: int = 1024
    eval_max_length: int = 2048
    lr: float = 1e-5
    scheduler: str = "linear"
    warmup_ratio: float = 0.0
    # Mixed-precision switch; forwarded to TrainingArguments as ``fp16``.
    amp: bool = True
    # Number of stratified cross-validation folds.
    n_splits: int = 5
    # Focusing parameter of the focal loss.
    gamma: float = 2.
    optim: str = "adamw_torch"
    # True: run the inference cell on test.csv. False: run cross-validated training.
    inference: bool = True
    # Directory searched for ``fold*/checkpoint*`` sub-directories at inference.
    inference_checkpoints_dir: str = "/kaggle/input/aes-2-0-deberta-v3-small-focal-gamma2/output"
    # Declared last (and annotated) so that it is a real dataclass field that
    # can be overridden, without shifting the positional order of the others.
    weight_decay: float = 0.01


config = Config()

In [None]:
# Training arguments shared by every cross-validation fold; the per-fold
# training loop later overrides `output_dir` and `run_name` on this object.
args = TrainingArguments(
    # --- bookkeeping ---
    output_dir="output",
    report_to="none",
    logging_steps=10,
    # --- batching ---
    per_device_train_batch_size=config.per_device_train_batch_size,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    # --- optimisation ---
    optim=config.optim,
    learning_rate=config.lr,
    lr_scheduler_type=config.scheduler,
    warmup_ratio=config.warmup_ratio,
    weight_decay=config.weight_decay,
    num_train_epochs=config.num_train_epochs,
    fp16=config.amp,
    # --- evaluation / checkpointing: evaluate and save once per epoch and
    # reload the checkpoint with the highest "qwk" when training finishes ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
)

## Instantiate the model & tokenizer

In [None]:
class ModelInit:
    """Callable model factory that always returns the model at its initial weights.

    The checkpoint is loaded once at construction time and its weights are
    snapshotted; each call restores that snapshot into the same model
    instance, so repeated use starts from identical parameters.
    """

    model_class = DebertaV2ForSequenceClassification

    def __init__(self, checkpoint: str, num_labels: int = 6) -> None:
        """Load ``checkpoint`` with ``num_labels`` outputs and snapshot its weights."""
        pretrained = self.model_class.from_pretrained(checkpoint, num_labels=num_labels)
        self.model = pretrained
        # Deep copy: the snapshot must not share storage with the live
        # parameters, which are modified in place during training.
        self.state_dict = copy.deepcopy(pretrained.state_dict())

    def __call__(self) -> model_class:
        """Restore the snapshotted weights and return the (shared) model instance."""
        model = self.model
        model.load_state_dict(self.state_dict)
        return model

## Instantiate the dataset

In [None]:
# Read the competition CSV that matches the run mode (test set for inference,
# train set otherwise) and wrap the frame in a Hugging Face Dataset.
df = pd.read_csv(
    "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv"
    if config.inference
    else "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv"
)
ds = Dataset.from_pandas(df)

In [None]:
class Encoder:
    """Batch-wise mapping function that tokenizes essays and attaches labels.

    Args:
        tokenizer: callable applied to the list of texts in a batch.
        **encode_kwargs: extra keyword arguments forwarded to every
            ``tokenizer`` call (e.g. ``max_length``, ``truncation``).
    """

    def __init__(self, tokenizer, **encode_kwargs):
        self.tokenizer = tokenizer
        self.kwargs = encode_kwargs

    def __call__(self, batch: dict) -> dict:
        """Tokenize ``batch["full_text"]`` and add zero-based ``labels``.

        Returns whatever the tokenizer returned, with an extra ``"labels"``
        entry holding ``score - 1`` for each item of ``batch["score"]``.
        """
        texts = batch["full_text"]
        features = self.tokenizer(texts, **self.kwargs)
        # Scores run from 1 to 6; shift them to class indices 0 to 5.
        features["labels"] = [score - 1 for score in batch["score"]]
        return features

## Compute Metrics

In [None]:
def compute_metrics(eval_pred: EvalPrediction) -> dict:
    """Score class predictions against the reference labels.

    The predicted class is the argmax over the last axis of
    ``eval_pred.predictions``.

    Returns:
        A dict with ``"qwk"`` (quadratic-weighted Cohen's kappa), ``"corr"``
        (Matthews correlation coefficient) and ``"acc"`` (accuracy).
    """
    labels = eval_pred.label_ids
    predicted = eval_pred.predictions.argmax(-1)
    return {
        "qwk": cohen_kappa_score(labels, predicted, weights="quadratic"),
        "corr": matthews_corrcoef(labels, predicted),
        "acc": accuracy_score(labels, predicted),
    }

## Custom Trainer with Focal Loss

In [None]:
class FocalLoss(torch.nn.Module):
    def __init__(self, weight: torch.Tensor | None = None, gamma: float = 2,) -> None:
        super().__init__()
        self.ce = torch.nn.CrossEntropyLoss(weight=weight)
        self.gamma = gamma

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        ce_loss: torch.Tensor = self.ce(input, target)
        pt = torch.exp(-ce_loss)
        f_loss = (1 - pt) ** self.gamma * ce_loss
        f_loss = torch.mean(f_loss)
        return f_loss
    
    
class FocalLossTrainer(Trainer):
    """Trainer that optimises ``FocalLoss`` (gamma taken from ``config.gamma``)."""

    def compute_loss(
        self,
        model: PreTrainedModel,
        inputs: dict,
        return_outputs: bool = False,
        **kwargs,
    ) -> torch.Tensor | tuple:
        """Run the forward pass and replace the model's loss with the focal loss.

        Args:
            model: the model being trained or evaluated.
            inputs: batch dict; must contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: any additional keyword arguments the calling Trainer
                supplies; they are forwarded untouched to the parent
                implementation, so they are only passed on when the caller
                provided them.

        Returns:
            The focal loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # Grab the labels before delegating, in case the parent removes them
        # from ``inputs`` while computing its own loss.
        labels = inputs["labels"]
        # The parent call is used for its forward pass; its own loss is discarded.
        _, outputs = super().compute_loss(model, inputs, return_outputs=True, **kwargs)
        logits = outputs["logits"]
        loss_fn = FocalLoss(gamma=config.gamma)
        loss = loss_fn(input=logits, target=labels)
        # Keep the outputs consistent with the loss that is actually optimised.
        outputs["loss"] = loss
        return (loss, outputs) if return_outputs else loss

## Train

In [None]:
if not config.inference:
    model_init = ModelInit(config.checkpoint)
    tokenizer = DebertaV2TokenizerFast.from_pretrained(config.checkpoint)
    # NOTE(review): presumably registered so that newlines and double spaces
    # are kept as tokens of their own - confirm.
    tokenizer.add_tokens([AddedToken("\n", normalized=False)])
    tokenizer.add_tokens([AddedToken(" "*2, normalized=False)])

    train_encoder = Encoder(tokenizer, max_length=config.train_max_length, truncation=True)
    eval_encoder = Encoder(tokenizer, max_length=config.eval_max_length, truncation=True)

    # Stratified k-fold cv on the score column.
    cv = StratifiedKFold(n_splits=config.n_splits, shuffle=True, random_state=42)
    folds = list(cv.split(np.zeros(len(df)), y=df["score"].values))

    # Record each row's validation fold and persist the split. The split
    # indices are positional, which matches the default integer index that
    # `read_csv` gives `df`.
    fold_of_row = np.empty(len(df), dtype=int)
    for fold, (_, val_idx) in enumerate(folds):
        fold_of_row[val_idx] = fold
    df["fold"] = fold_of_row
    df.to_csv("train_split.csv", index=False)

    cv_res = []

    # Select rows straight from the split indices: `ds` was created before the
    # "fold" column was added to `df`, so it cannot be filtered on that column
    # (and scanning it row by row would be slow anyway).
    for fold_idx, (train_idx, val_idx) in enumerate(folds):
        args.output_dir = os.path.join("output", f"fold_{fold_idx}")
        args.run_name = f"{config.checkpoint}_fold-{fold_idx}"
        train_ds = ds.select(train_idx.tolist()).map(train_encoder, batched=True)
        eval_ds = ds.select(val_idx.tolist()).map(eval_encoder, batched=True)
        trainer = FocalLossTrainer(
            args=args,
            train_dataset=train_ds,
            eval_dataset=eval_ds,
            tokenizer=tokenizer,
            model_init=model_init,
            compute_metrics=compute_metrics,
        )
        trainer.train()

        # Out-of-fold evaluation of the model held by the trainer after training.
        y_true = np.array(eval_ds["labels"])
        y_pred = trainer.predict(eval_ds).predictions.argmax(-1)
        qwk = cohen_kappa_score(y1=y_true, y2=y_pred, weights="quadratic")
        fig, ax = plt.subplots()
        ConfusionMatrixDisplay.from_predictions(y_true=y_true, y_pred=y_pred, ax=ax)
        ax.set_title(f"fold-{fold_idx} qwk: {qwk:.3f}")
        fig.show()
        cv_res.append(qwk)

    # Build the summary only after every fold has run: inside the loop the
    # "fold" column (all folds + "mean") and the "qwk" column (folds finished
    # so far + mean) have different lengths and DataFrame construction fails.
    res_df = pd.DataFrame(
        {
            "fold": list(range(len(folds))) + ["mean"],
            "qwk": cv_res + [np.mean(cv_res)],
        }
    )
    display(res_df)

## Inference

In [None]:
if config.inference:
    # Every directory matching fold*/checkpoint* is one ensemble member.
    # Sorted so the printed list is deterministic.
    # NOTE(review): a fold directory holding more than one checkpoint
    # contributes more than once to the average - confirm this is intended.
    checkpoints = sorted(Path(config.inference_checkpoints_dir).glob("fold*/checkpoint*"))
    print(checkpoints)
    if not checkpoints:
        # Fail early with a clear message instead of crashing later on
        # `predictions.argmax` when nothing was predicted.
        raise FileNotFoundError(
            f"No 'fold*/checkpoint*' directories found under {config.inference_checkpoints_dir!r}"
        )

    # Identical for every checkpoint, so built once. Reporting is disabled,
    # as it is for the training arguments.
    args = TrainingArguments(
        output_dir=".",
        per_device_eval_batch_size=config.per_device_eval_batch_size,
        fp16=config.amp,
        report_to="none",
    )

    predictions = 0
    for checkpoint in checkpoints:
        tokenizer = DebertaV2TokenizerFast.from_pretrained(checkpoint)
        model = DebertaV2ForSequenceClassification.from_pretrained(checkpoint)
        _ds = ds.map(
            lambda i: tokenizer(i["full_text"], max_length=config.eval_max_length, truncation=True),
            batched=True,
        )
        trainer = Trainer(args=args, model=model, tokenizer=tokenizer)
        preds = trainer.predict(_ds)
        # Running mean of the raw model outputs over all checkpoints.
        predictions += preds.predictions / len(checkpoints)

    predicted_scores = predictions.argmax(-1) + 1  # [0,5] -> [1,6]

    df["score"] = predicted_scores
    df = df[["essay_id", "score"]]
    display(df)
    df.to_csv("submission.csv", index=False)