Inference notebook: https://www.kaggle.com/code/idv2005/deberta-baseline-inference

In [None]:
import pandas as pd
import numpy as np
import torch

from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding

from datasets import Dataset
from sklearn.metrics import cohen_kappa_score

# Name of the pretrained checkpoint used for both the tokenizer and the model.
MODEL_NAME = 'microsoft/deberta-v3-large'
# Fold held out for evaluation; folds are derived from essay_id modulo 5.
FOLD = 0
# Upper bound on tokens per essay; longer inputs are truncated in tokenize().
MAX_LENGTH = 512

In [None]:
# Tokenizer matching the pretrained checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(sample):
    """Tokenize the essay text of a dataset sample.

    Args:
        sample: Mapping that provides the essay text under 'full_text'.

    Returns:
        The tokenizer output for that text, truncated to MAX_LENGTH tokens.
        No padding argument is passed here.
    """
    essay_text = sample['full_text']
    encoded = tokenizer(essay_text, truncation=True, max_length=MAX_LENGTH)
    return encoded

# Row caps for quick experiments; set either one to None to use the full split.
TRAIN_LIMIT = 3000
EVAL_LIMIT = 200

# Columns that are not model inputs. '__index_level_0__' is the index column that
# Dataset.from_pandas emits because the filtered frame keeps its original index.
DROP_COLUMNS = ['essay_id', 'full_text', 'score', 'fold', '__index_level_0__']


def _build_dataset(frame, limit=None):
    """Turn a slice of the training frame into a tokenized dataset.

    Args:
        frame: DataFrame holding the rows of one split.
        limit: Maximum number of leading rows to keep; None keeps every row.

    Returns:
        A Dataset with the tokenizer outputs and 'labels', with the raw
        text and bookkeeping columns listed in DROP_COLUMNS removed.
    """
    subset = frame.iloc[:limit].copy()
    # batched=True hands lists of essays to the tokenizer instead of one
    # essay per call; the resulting columns are the same.
    tokenized = Dataset.from_pandas(subset).map(tokenize, batched=True)
    return tokenized.remove_columns(DROP_COLUMNS)


df_train = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
# Deterministic 5-fold split: essay_id is parsed as a hexadecimal integer.
df_train['fold'] = df_train['essay_id'].map(lambda essay_id: int(essay_id, base=16) % 5)
# Shift scores down by one so that class labels start at 0.
df_train['labels'] = df_train['score'] - 1

ds_train = _build_dataset(df_train[df_train['fold'] != FOLD], TRAIN_LIMIT)
ds_eval = _build_dataset(df_train[df_train['fold'] == FOLD], EVAL_LIMIT)

In [None]:
def compute_metrics(p):
    """Score an evaluation pass with quadratic weighted kappa.

    Args:
        p: Pair of (predictions, labels); the predicted class of each row
            is the arg-max of the predictions over the last axis.

    Returns:
        A dict with the single key 'qwk' holding the kappa score.
    """
    raw_outputs, targets = p
    predicted_classes = raw_outputs.argmax(-1)
    qwk = cohen_kappa_score(targets, predicted_classes, weights='quadratic')
    return {'qwk': qwk}

# Training configuration for a single fold.
train_args = TrainingArguments(
    # Derived from FOLD so that each fold writes to its own directory.
    output_dir=f'/kaggle/working/deberta-large-fold{FOLD}',
    fp16=True,
    learning_rate=2e-5,
    num_train_epochs=1,
    # 2 sequences per device x 4 accumulation steps = 8 sequences per optimizer step.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    report_to="none",
    # NOTE(review): recent transformers releases rename this to `eval_strategy`;
    # keep the keyword in sync with the installed version.
    evaluation_strategy="steps",
    do_eval=True,
    eval_steps=100,
    save_total_limit=1,
    # Saving and evaluation share the same schedule, which
    # load_best_model_at_end requires.
    save_strategy="steps",
    save_steps=100,
    logging_steps=100,
    lr_scheduler_type='linear',
    # Select the best checkpoint by the 'qwk' value from compute_metrics
    # (higher is better) and restore it when training finishes; without
    # load_best_model_at_end the final weights are simply the last step's.
    metric_for_best_model="qwk",
    greater_is_better=True,
    load_best_model_at_end=True,
    warmup_ratio=0.1,
    weight_decay=0.01,
    save_safetensors=True,
)

In [None]:
# Sequence-classification head with six output classes on top of the
# pretrained encoder.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=6)

# Collator built from the tokenizer; handles padding when batches are assembled.
padding_collator = DataCollatorWithPadding(tokenizer)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    data_collator=padding_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the settings in train_args (evaluation and checkpointing
# happen on the step schedule configured there).
trainer.train()