Inference: [https://www.kaggle.com/code/hashidoyuto/deberta-v3-base-aes2-0-infer](https://www.kaggle.com/code/hashidoyuto/deberta-v3-base-aes2-0-infer)

## Import & Config

In [None]:
import os
import re
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import cohen_kappa_score

warnings.simplefilter('ignore')

In [None]:
class PATHS:
    """Filesystem locations of the notebook inputs."""

    # Pretrained checkpoint directory, loaded by the tokenizer and the model.
    model_path = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-base"
    # Competition training CSV, read with pandas.
    train_path = (
        "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv"
    )

In [None]:
class CFG:
    """Hyperparameters shared by the cells of this notebook."""

    # Reproducibility and cross-validation layout.
    seed = 42
    n_splits = 5

    # Tokenization and classification head.
    max_length = 512  # truncation limit passed to the tokenizer
    num_labels = 6    # output classes of the sequence classifier

    # Optimisation schedule.
    train_epochs = 1
    lr = 2e-5
    weight_decay = 0.01
    warmup_ratio = 0.1

    # Per-device batch sizes.
    train_batch_size = 8
    eval_batch_size = 4

In [None]:
def seed_everything(seed):
    """Seed every random number generator used here and make cuDNN deterministic.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for reproducible kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local imports keep the helper copy-paste friendly between notebooks.
    import os
    import random

    import numpy as np
    import torch

    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running process is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the determinism requested above, so it must stay disabled.
    torch.backends.cudnn.benchmark = False
    
    
# Seed all random number generators once, using the notebook-wide seed from CFG.
seed_everything(seed=CFG.seed)

## Data Tokenization

In [None]:
class Tokenize(object):
    """Turn a train/validation DataFrame pair into tokenized datasets.

    The tokenizer is loaded from ``PATHS.model_path`` on construction.
    Calling the instance converts both frames to ``Dataset`` objects and
    tokenizes their ``full_text`` column.

    Args:
        train: DataFrame with ``essay_id``, ``full_text`` and ``label`` columns.
        valid: DataFrame with the same columns, used for validation.
    """

    def __init__(self, train, valid):
        self.tokenizer = AutoTokenizer.from_pretrained(PATHS.model_path)
        self.train = train
        self.valid = valid

    def get_dataset(self, df):
        """Return a ``Dataset`` holding the id, text and label columns of ``df``."""
        return Dataset.from_dict({
            'essay_id': list(df['essay_id']),
            'full_text': list(df['full_text']),
            'label': list(df['label']),
        })

    def tokenize_function(self, example):
        """Tokenize ``example['full_text']``, truncating to ``CFG.max_length``."""
        return self.tokenizer(
            example['full_text'], truncation=True, max_length=CFG.max_length
        )

    def __call__(self):
        """Tokenize both splits.

        Returns:
            Tuple ``(tokenized_train, tokenized_valid, tokenizer)``.
        """
        # Use the frames given to the constructor; relying on module-level
        # ``train``/``valid`` names would silently pick up unrelated data.
        train_ds = self.get_dataset(self.train)
        valid_ds = self.get_dataset(self.valid)

        tokenized_train = train_ds.map(self.tokenize_function, batched=True)
        tokenized_valid = valid_ds.map(self.tokenize_function, batched=True)

        return tokenized_train, tokenized_valid, self.tokenizer

## Compute metrics

In [None]:
def compute_metrics(eval_pred):
    """Score an evaluation pass with quadratic weighted kappa.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is
            taken as the argmax over the last axis of ``predictions``.

    Returns:
        Dict with the single key ``'qwk'``.
    """
    scores, gold = eval_pred
    predicted_classes = scores.argmax(-1)
    return {
        'qwk': cohen_kappa_score(gold, predicted_classes, weights='quadratic'),
    }

## Load Data & Set Fold

In [None]:
# Load the training essays; labels are the scores shifted to start at 0.
data = pd.read_csv(PATHS.train_path)
data['label'] = data['score'] - 1

# Stratify on the score so each fold sees a similar score distribution.
skf = StratifiedKFold(
    n_splits=CFG.n_splits,
    shuffle=True,
    random_state=CFG.seed,
)

# Tag every row with the index of the fold in which it is held out.
for fold_id, (_, holdout_rows) in enumerate(skf.split(data, data["score"])):
    data.loc[holdout_rows, "fold"] = fold_id

data.head()

## Set Training Args

In [None]:
# Trainer configuration shared by every fold.
training_args = TrainingArguments(
    # Output location and logging.
    output_dir='output',
    report_to='none',
    # Optimiser and schedule.
    optim='adamw_torch',
    learning_rate=CFG.lr,
    weight_decay=CFG.weight_decay,
    warmup_ratio=CFG.warmup_ratio,
    num_train_epochs=CFG.train_epochs,
    # Batching and precision.
    per_device_train_batch_size=CFG.train_batch_size,
    per_device_eval_batch_size=CFG.eval_batch_size,
    fp16=True,
    # Evaluate and checkpoint once per epoch, then reload the best
    # checkpoint as ranked by the 'qwk' metric.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    metric_for_best_model='qwk',
    load_best_model_at_end=True,
)

## Training by fold

In [None]:
# Train, evaluate and export one model per cross-validation fold.
for fold in range(len(data['fold'].unique())):
    # Rows tagged with the current fold are held out; the rest are trained on.
    is_holdout = data['fold'] == fold
    train = data[~is_holdout]
    valid = data[is_holdout]

    tokenize = Tokenize(train, valid)
    tokenized_train, tokenized_valid, tokenizer = tokenize()

    # Start every fold from the pretrained checkpoint.
    model = AutoModelForSequenceClassification.from_pretrained(
        PATHS.model_path, num_labels=CFG.num_labels
    )
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Labels are score - 1, so add 1 to map the argmax back to a score.
    y_true = valid['score'].values
    fold_output = trainer.predict(tokenized_valid)
    predictions = fold_output.predictions.argmax(axis=1) + 1

    # Confusion matrix over the score values 1..6.
    score_values = list(range(1, 7))
    cm = confusion_matrix(y_true, predictions, labels=score_values)
    draw_cm = ConfusionMatrixDisplay(
        confusion_matrix=cm, display_labels=score_values
    )
    draw_cm.plot()
    plt.show()

    # Export the model and tokenizer together, plus the held-out rows.
    export_dir = f'deberta-v3-base_AES2_fold_{fold}'
    trainer.save_model(export_dir)
    tokenizer.save_pretrained(export_dir)

    valid.to_csv(f'valid_df_fold_{fold}.csv', index=False)