Training Notebook: https://www.kaggle.com/code/hodinhtrieu/training-fold0-aes-deberta-model-starter

In [None]:
import gc

import pandas as pd
import torch
from datasets import Dataset
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# --- Inference configuration -------------------------------------------------
# CSV with the essays to score.
TEST_DATA_PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv"

# Token cap handed to the tokenizer; longer essays are truncated.
MAX_LENGTH = 1024

# Per-device batch size used when predicting.
EVAL_BATCH_SIZE = 1

# One fine-tuned checkpoint per cross-validation fold, in fold order.
models = [
    '/kaggle/input/training-fold0-aes-deberta-model-starter/deberta-small-fold0/checkpoint-3000/',
    '/kaggle/input/training-fold0-aes-deberta-model-starter/deberta-small-fold1/checkpoint-2500/',
    '/kaggle/input/training-fold0-aes-deberta-model-starter/deberta-small-fold2/checkpoint-2500/',
    '/kaggle/input/training-fold0-aes-deberta-model-starter/deberta-small-fold3/checkpoint-4000/',
]

# Individual names kept for code that refers to a specific fold.
MODEL_FOLD0, MODEL_FOLD1, MODEL_FOLD2, MODEL_FOLD3 = models

In [None]:
# Load the tokenizer from the fold-0 checkpoint directory. This one instance is
# reused by tokenize() and by the data collator for every fold's model.
# NOTE(review): presumably all folds were trained with the same tokenizer — confirm.
tokenizer = AutoTokenizer.from_pretrained(MODEL_FOLD0)

def tokenize(sample):
    """Tokenize the ``full_text`` field of one dataset row.

    The text is truncated to at most ``MAX_LENGTH`` tokens. The tokenizer's
    output is returned unchanged so ``Dataset.map`` can merge it into the row.
    """
    essay_text = sample['full_text']
    encoded = tokenizer(essay_text, truncation=True, max_length=MAX_LENGTH)
    return encoded

# Read the test essays; df_test is kept intact because it later receives the
# predicted scores and is written out as the submission.
df_test = pd.read_csv(TEST_DATA_PATH)

# Build the tokenized dataset step by step, then drop the raw columns so only
# the tokenizer outputs remain.
ds = Dataset.from_pandas(df_test)
ds = ds.map(tokenize)
ds = ds.remove_columns(['essay_id', 'full_text'])

# Trainer settings shared by every fold: output goes to the working directory,
# prediction uses EVAL_BATCH_SIZE per device, and experiment reporting is off.
args = TrainingArguments(
    output_dir=".",
    report_to="none",
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
)

# Run every fold's checkpoint over the test set and collect one array of
# softmax-normalised outputs per fold. `predictions` ends up with
# len(models) entries, in the same order as `models`.
predictions = []
for model_path in models:
    # Keep the checkpoint path and the loaded model under separate names so
    # the path is not shadowed by the model object.
    fold_model = AutoModelForSequenceClassification.from_pretrained(model_path)
    trainer = Trainer(
        model=fold_model,
        args=args,
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
    )

    # Normalise the raw model outputs over the last axis so folds can be
    # averaged on a common scale.
    fold_outputs = trainer.predict(ds).predictions
    predictions.append(softmax(fold_outputs, axis=-1))

    # Release this fold's model before loading the next one. Collect garbage
    # first so that any tensors kept alive only by reference cycles are freed,
    # and only then ask PyTorch to hand its cached blocks back to the device.
    del fold_model, trainer
    gc.collect()
    torch.cuda.empty_cache()
    
# Ensemble the folds: element-wise mean of the per-fold prediction arrays.
predicted_score = sum(predictions) / len(predictions)

In [None]:
# Pick the highest-scoring class along the last axis; argmax is zero-based, so
# add 1 to obtain the score written to the submission.
# NOTE(review): presumably training encoded labels as score - 1 — confirm
# against the training notebook.
best_class_index = predicted_score.argmax(axis=-1)
df_test['score'] = best_class_index + 1
df_test.head()

In [None]:
# Write only the id and predicted score columns, without the DataFrame index.
df_test.to_csv('submission.csv', columns=['essay_id', 'score'], index=False)