Training: [https://www.kaggle.com/code/hashidoyuto/text-clustering-deberta-aes2-0/notebook](https://www.kaggle.com/code/hashidoyuto/text-clustering-deberta-aes2-0/notebook)

# Import & Config

In [None]:
import os
import random
import glob
import warnings
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import Dataset

warnings.simplefilter('ignore')

In [None]:
class PATHS:
    """Filesystem locations of the inference inputs."""

    # CSV holding the essays to score; loaded with pandas.read_csv.
    test_path = (
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv'
    )

    # Directory searched for entries whose name contains "fold"
    # (one fine-tuned checkpoint per match).
    model_dir = '/kaggle/input/groupkfold-deberta-aes2-0/'

In [None]:
class CFG:
    """Inference hyper-parameters shared by tokenization and model loading."""

    # Token limit handed to the tokenizer; longer essays are truncated.
    max_length = 512

    # Number of output classes passed when loading the classifier head.
    num_labels = 6

# Define Tokenization & Load Data

In [None]:
class Tokenize:
    """Build a tokenized `datasets.Dataset` from the test DataFrame.

    The tokenizer is loaded from the same path as the fold checkpoint.
    Calling the instance returns the tokenized dataset together with that
    tokenizer.
    """

    def __init__(self, test, model_path):
        """Store the test frame and load the tokenizer found at `model_path`."""
        self.test = test
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

    def get_dataset(self, df):
        """Wrap the `essay_id` and `full_text` columns of `df` in a Dataset."""
        columns = {
            name: list(df[name]) for name in ('essay_id', 'full_text')
        }
        return Dataset.from_dict(columns)

    def tokenize_function(self, example):
        """Tokenize `example['full_text']`, truncating to CFG.max_length."""
        return self.tokenizer(
            example['full_text'],
            truncation=True,
            max_length=CFG.max_length,
        )

    def __call__(self):
        """Return `(tokenized_dataset, tokenizer)` for the stored test frame."""
        raw_dataset = self.get_dataset(self.test)
        # batched=True hands the tokenizer a list of texts per call.
        encoded = raw_dataset.map(self.tokenize_function, batched=True)
        return encoded, self.tokenizer

In [None]:
# Essays to score.
test = pd.read_csv(PATHS.test_path)

# Every entry under the model directory whose name contains "fold",
# in sorted order so the fold sequence is deterministic.
model_paths = sorted(glob.glob(f'{PATHS.model_dir}*fold*'))

# Inference

In [None]:
# One raw prediction array per fold checkpoint, in `model_paths` order.
predictions = []

# The evaluation settings do not depend on the fold, so build them once.
# Mixed precision is only requested when a CUDA device is present, so the
# cell also runs on CPU-only sessions instead of failing at argument setup.
training_args = TrainingArguments(
    ".",
    per_device_eval_batch_size=1,
    report_to="none",
    fp16=torch.cuda.is_available(),
)

for model_path in model_paths:
    # Each checkpoint gets its own tokenizer, loaded from the same path.
    tokenize = Tokenize(test, model_path)
    tokenized_test, tokenizer = tokenize()

    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=CFG.num_labels
    )
    # Pads each batch dynamically to its longest sequence.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    pre_preds = trainer.predict(tokenized_test).predictions
    predictions.append(pre_preds)

    # Drop this fold's model before the next one is loaded so two
    # checkpoints are never resident at the same time.
    del trainer, model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Hand in Submission

In [None]:
# Fail with a clear message instead of an obscure axis error further down
# when no fold model produced predictions.
if not predictions:
    raise ValueError('No fold predictions to ensemble: `predictions` is empty.')

# Average the per-fold outputs over however many folds were actually run
# (previously a hard-coded divisor of 5). Accumulating in float64 avoids
# precision loss if the outputs arrive in half precision.
final_pred = np.mean(np.stack(predictions), axis=0, dtype=np.float64)

# Pick the highest-scoring class per row and shift the 0-based class index
# to the 1-based score written to the submission.
final_pred = final_pred.argmax(axis=1) + 1

submission = pd.DataFrame({
    'essay_id': test['essay_id'].values,
    'score': final_pred,
})
submission.to_csv('submission.csv', index=False)

In [None]:
# Bare expression: shows the submission frame as the notebook cell's output.
submission