Hi everyone! I took the idea of fine-tuning the Funnel Transformer from the discussion [here](https://www.kaggle.com/competitions/learning-agency-lab-automated-essay-scoring-2/discussion/498571).

This is a small 5-fold baseline for a [U-Net-like transformer model](https://arxiv.org/pdf/2006.03236). Hit the upvote if you find it useful. Note that I've also changed the scheduler type to cosine.

Here is a notebook that helps build a good CV strategy (I'm running it right now to see how it works). Check it out: https://www.kaggle.com/code/emiz6413/predict-the-prompts

There is a lot more to be done here, but unfortunately I'm running out of GPU time:
* Including the Persuade dataset
* Regression instead of classification (done; looks way better than classification)
* Hyperparameter tuning
* A bigger model and **even more** folds
* Experiments with sequence length (no good)

In [None]:
import random
import os
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from tokenizers import AddedToken
from transformers import (AutoTokenizer, FunnelForSequenceClassification, AutoConfig,
                          DataCollatorWithPadding, Trainer, TrainingArguments)
from datasets import Dataset
from sklearn.metrics import cohen_kappa_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold

In [None]:
# ---- Experiment configuration ------------------------------------------
# Task / model selection.
REGRESSION = True                                  # one continuous output instead of 6 classes
MODEL_SIZE = 'large'
MODEL_NAME = f'funnel-transformer/{MODEL_SIZE}'    # checkpoint id passed to from_pretrained
VERSION = 8                                        # run tag, used in the training output directory
SEED = 0

# Optimisation settings.
EPOCHS = 1
LR = 5e-5
MAX_LENGTH = 3072                                  # tokenizer truncation limit, in tokens
# Only the 'small' checkpoint gets a per-device batch of 2; every other size uses 1.
BATCH_SIZE = 1 if MODEL_SIZE != 'small' else 2

# Width of the model head: 6 score classes, or a single regression target.
num_labels = 6 if not REGRESSION else 1

In [None]:
def seed_everything(seed: int) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices)
    and switches cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Setting this at runtime does not re-seed hashing in the running
    # interpreter; it is picked up by Python subprocesses started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU, not only the current one; safe to call without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may select different kernels from run to run, which
    # defeats `deterministic = True`, so it has to be switched off.
    torch.backends.cudnn.benchmark = False
    
# Seed the Python, NumPy and PyTorch RNGs once, before any data handling or training.
seed_everything(SEED)

In [None]:
# Load the competition files plus the per-essay prompt predictions produced
# by the "predict-the-prompts" notebook, in that order.
data, test, sample, prompts = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv',
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv',
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv',
        '/kaggle/input/predict-the-prompts/predicted_prompt.csv',
    )
)

In [None]:
# Attach the prompt information to each essay (inner join on essay_id) and
# discard the 'predicted' column, which is not used further on.
data = (
    data.merge(prompts, on='essay_id')
        .drop(columns='predicted')
)
# Combined "<score>_<prompt_name>" key, used later to stratify the folds.
data['score_prompt'] = data['score'].astype(str) + '_' + data['prompt_name']

In [None]:
# Trim leading/trailing whitespace from every essay.
data['full_text'] = data['full_text'].apply(lambda x: x.strip())
# NOTE(review): the chain below has no effect as written. Its result is never
# assigned back to data['full_text'], and every search string is empty here
# (the original characters appear to have been lost, presumably mis-encoded
# quotes/dashes; TODO confirm). Series.replace without regex=True also only
# matches whole cell values, not substrings. If normalisation is wanted,
# restore the intended characters, use .str.replace and assign the result.
data['full_text'].replace('', "'") \
                 .replace('', '“') \
                 .replace('', '”') \
                 .replace('', '')
# Shift scores down by one so labels start at 0, then cast to the dtype the
# chosen head expects: float32 targets for regression, int32 class ids otherwise.
data['labels'] = (
    data['score']
    .map(lambda score: score - 1)
    .astype('float32' if REGRESSION else 'int32')
)

In [None]:
# 5-fold split stratified on the combined score/prompt key, so each fold gets
# a similar mix of scores and prompts.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
fold_splits = skf.split(data, data["score_prompt"])
for fold_id, (_, holdout_idx) in enumerate(fold_splits):
    # Each row is tagged with the fold in which it is held out for validation.
    data.loc[holdout_idx, "fold"] = fold_id

data.head()

In [None]:
class Tokenize:
    """Turn one train/validation split into tokenized `Dataset` objects.

    Args:
        train: DataFrame with the training rows; must provide the columns
            'essay_id', 'full_text' and 'labels'.
        valid: DataFrame with the validation rows, same columns as `train`.
        tokenizer: Callable tokenizer applied to batches of essay texts.
    """

    def __init__(self, train, valid, tokenizer):
        self.tokenizer = tokenizer
        self.train = train
        self.valid = valid

    def get_dataset(self, df):
        """Wrap the id, text and label columns of `df` in a `Dataset`.

        The 'labels' column of the frame is stored under the key 'label'.
        """
        return Dataset.from_dict({
            'essay_id': list(df['essay_id']),
            'full_text': list(df['full_text']),
            'label': list(df['labels']),
        })

    def tokenize_function(self, sample):
        """Tokenize a batch of essays, truncating each to MAX_LENGTH tokens."""
        return self.tokenizer(
            sample['full_text'], truncation=True, max_length=MAX_LENGTH
        )

    def __call__(self):
        """Build and tokenize both splits.

        Returns:
            Tuple `(tokenized_train, tokenized_valid)`.
        """
        # Use the frames handed to the constructor. Reading module-level
        # `train` / `valid` only works when the caller happens to use exactly
        # those global names.
        train_ds = self.get_dataset(self.train)
        valid_ds = self.get_dataset(self.valid)

        tokenized_train = train_ds.map(self.tokenize_function, batched=True)
        tokenized_valid = valid_ds.map(self.tokenize_function, batched=True)

        return tokenized_train, tokenized_valid

In [None]:
def _build_compute_metrics(to_scores):
    """Create the Trainer metric callback for a given prediction decoder.

    `to_scores` converts the raw model outputs into one label per sample;
    the returned callback reports the quadratic weighted kappa under 'qwk'.
    """
    def compute_metrics(p):
        predictions, labels = p
        decoded = to_scores(predictions)
        return {'qwk': cohen_kappa_score(labels, decoded, weights='quadratic')}

    return compute_metrics


if REGRESSION:
    # Regression output: clamp to the valid label range [0, 5] and round.
    compute_metrics = _build_compute_metrics(lambda preds: preds.clip(0, 5).round(0))
else:
    # Classification output: pick the highest-scoring class.
    compute_metrics = _build_compute_metrics(lambda preds: preds.argmax(-1))

In [None]:
# Shared Trainer configuration, reused by every fold.
training_args = TrainingArguments(
    output_dir=f'/kaggle/working/funnel-{MODEL_SIZE}-ft_ver{VERSION}', 
    fp16=True,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Gradients are accumulated over 4 batches before each optimizer step.
    gradient_accumulation_steps=4,
    report_to="none",
    # Evaluate and checkpoint on the same 100-step schedule, so every
    # checkpoint has a matching QWK value.
    evaluation_strategy="steps",
    do_eval=True,
    eval_steps=100,
    save_total_limit=1,
    save_strategy="steps",
    save_steps=100,
    logging_steps=100,
    lr_scheduler_type='cosine',
    # Without load_best_model_at_end the best-QWK checkpoint is tracked but
    # never restored, so the weights saved after training would be those of
    # the last step rather than the best evaluation.
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=0.01,
    save_safetensors=True
)

In [None]:
import gc

# The tokenizer and padding collator do not depend on the fold, so they are
# created once instead of being reloaded on every iteration.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
collator = DataCollatorWithPadding(tokenizer)

# Train one model per fold: fit on the other folds, validate on this one,
# plot the validation confusion matrix and save model + tokenizer.
for fold in range(data['fold'].nunique()):
    # Drop the previous fold's model/trainer before allocating new ones so
    # two models are never held in GPU memory at the same time. The last
    # fold's objects remain available after the loop.
    model = trainer = None
    gc.collect()
    torch.cuda.empty_cache()

    train = data[data['fold'] != fold]
    valid = data[data['fold'] == fold]

    tokenize = Tokenize(train, valid, tokenizer)
    tokenized_train, tokenized_valid = tokenize()

    config = AutoConfig.from_pretrained(MODEL_NAME)
    if REGRESSION:
        # Dropout is disabled for the regression setup.
        config.attention_dropout = 0.0  # attention_probs_dropout_prob
        config.hidden_dropout = 0.0  # hidden_dropout_prob

    config.num_labels = num_labels
    config.truncate_seq = False

    model = FunnelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

    trainer = Trainer(
        args=training_args,
        model=model,
        data_collator=collator,
        eval_dataset=tokenized_valid,
        train_dataset=tokenized_train,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    trainer.train()

    y_true = valid['score'].values
    raw_predictions = trainer.predict(tokenized_valid).predictions
    # Decode the raw outputs the same way compute_metrics does, so the plot
    # agrees with the reported QWK: clamp + round for regression, argmax for
    # classification (rounding logits would not yield class ids).
    if REGRESSION:
        predictions = raw_predictions.reshape(-1).clip(0, 5).round(0)
    else:
        predictions = raw_predictions.argmax(-1)
    # Labels are score - 1, so shift back to the original 1..6 score scale.
    predictions = predictions.astype(int) + 1

    score_range = list(range(1, 7))
    cm = confusion_matrix(y_true, predictions, labels=score_range)
    draw_cm = ConfusionMatrixDisplay(confusion_matrix=cm,
                                     display_labels=score_range)
    draw_cm.plot()
    plt.show()

    trainer.save_model(f'f-tfm-{MODEL_SIZE}_AES2_fold_{fold}')
    tokenizer.save_pretrained(f'f-tfm-{MODEL_SIZE}_AES2_fold_{fold}')

    valid.to_csv(f'valid_df_fold_{fold}.csv', index=False)