In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### ~~Idea 1: What if we feed the rubric to a pretrained LLM?~~
A: It's difficult to get working for a number of reasons. The rubric is not very semantically helpful and the LLM's default calibration is quite bad.
### Idea 2: Let's finetune an additional classifier head on top of a lightweight pretrained LLM.

### Preliminary: Exploratory Data Analysis:

In [None]:
# Competition data directory (trailing slash included so file names can be appended directly).
DATA_DIR = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/"
# Read the training and test splits, in that order.
train_df, test_df = (pd.read_csv(f"{DATA_DIR}{split}.csv") for split in ("train", "test"))

train_df.head()

In [None]:
# Examine manual features

def annotate_with_features(df):
    """Return a copy of ``df`` with simple hand-crafted length features added.

    The input frame is left untouched, so re-running the cell that calls this
    function (or calling it on a frame that is displayed elsewhere) never
    changes data behind the reader's back.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``full_text`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with these extra columns:

        * ``char_length`` - number of characters in the essay
        * ``word_length`` - number of pieces after splitting on a single space
        * ``num_sentences`` - number of pieces after splitting on ``'.'``
        * ``mean_sentence_wordlength`` - mean number of space-separated pieces
          per ``'.'``-separated piece
        * ``mean_word_charlength`` - mean character length of the
          space-separated pieces
    """
    def mean(values):
        return sum(values) / len(values)

    essays = df["full_text"]
    annotated = df.copy()  # do not mutate the caller's frame
    annotated["char_length"] = essays.apply(len)
    annotated["word_length"] = essays.apply(lambda essay: len(essay.split(' ')))
    annotated["num_sentences"] = essays.apply(lambda essay: len(essay.split('.')))
    annotated["mean_sentence_wordlength"] = essays.apply(
        lambda essay: mean([len(sentence.split(' ')) for sentence in essay.split('.')])
    )
    annotated["mean_word_charlength"] = essays.apply(
        lambda essay: mean([len(word) for word in essay.split(' ')])
    )
    return annotated

train_df = annotate_with_features(train_df)
test_df = annotate_with_features(test_df)

In [None]:
train_df.head()

In [None]:
test_df.head()

Looks like test.csv is just the first three rows of train.csv. We can discard it for now.

In [None]:
train_df.describe()

Are the variables we extracted correlated with score?

In [None]:
import seaborn as sns
sns.pairplot(train_df.drop("full_text", axis = 1))

Looking at the top row, we can see that essays with more characters, more sentences, and longer words typically have higher scores. It's not that long essays with polysyllabic words are always good - but it's nearly always the case that all good essays have decent length and word complexity.

### Local Evaluation Framework
Let's write a local evaluation framework such that we don't have to upload our results to examine our model's performance. Faster iteration = faster improvement!

The metric used in this competition is quadratic weighted Kappa. To understand what this means, let's construct a dummy submission:

In [None]:
from sklearn.metrics import confusion_matrix
import random

# Build a dummy "submission": one uniformly random score in 1..6 per essay,
# then show how it is confused with the true scores.
labels = train_df["score"]
fake_preds = labels.map(lambda _: random.randint(1, 6))
cmat = confusion_matrix(labels, fake_preds)
sns.heatmap(cmat, annot = True)

In [None]:
import matplotlib.pyplot as plt
# Quadratic penalty between every pair of the six grades: (i - j)^2 / 25,
# so the largest possible miss (5 grades apart) has weight 1.
grade_index = np.arange(6)
weight_matrix = (grade_index[:, None] - grade_index[None, :]) ** 2 / 25
plt.suptitle("Weight penalty (e.g larger mispredictions are penalized more heavily)")
sns.heatmap(weight_matrix, annot = True)

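We've now created the confusion matrix $O$ and the weight matrix $W$. We then calculate the matrix of expected outcomes $E$ as the outer product of the value counts of the labels and predictions. Intuitively, $E$ is the confusion matrix we would expect to see if the predictions were statistically independent of the labels while keeping the same overall distribution of labels and of predictions - in other words, the agreement we would get by chance alone.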

We then use the equation provided in *data* to calculate the quadratic weighted Kappa:

$$ \kappa = 1 - \frac{\sum_{i,j}W_{i, j} O_{i,j}}{\sum_{i,j} W_{i,j} E_{i,j}}$$

In [None]:
# Per-grade counts of the true labels and of the dummy predictions
# (grade g is stored at position g - 1).
label_count_vector, pred_count_vector = np.zeros(6), np.zeros(6)
for grade, count in labels.value_counts().items():
    label_count_vector[grade-1] = count
for grade, count in fake_preds.value_counts().items():
    pred_count_vector[grade-1] = count

# Expected matrix E: outer product of the two count vectors, normalised to sum to 1.
expected_val = np.outer(label_count_vector, pred_count_vector).astype(np.float32)
expected_val /= expected_val.sum()
# Observed matrix O: the confusion matrix, normalised the same way.
cmat = cmat.astype(np.float32)
cmat /= cmat.sum()

# Accumulate the weighted sums cell by cell, in row-major order.
numerator, denominator = 0, 0
for row, col in np.ndindex(6, 6):
    numerator += weight_matrix[row, col] * cmat[row, col]
    denominator += weight_matrix[row, col] * expected_val[row, col]
weighted_kappa = (1 - (numerator/denominator))
weighted_kappa

Wrapping it up in a function:

In [None]:
from sklearn.metrics import confusion_matrix
import random
import matplotlib.pyplot as plt
import torch as t

def quadratic_weighted_kappa(labels, pred):
    """Quadratic weighted kappa between integer ratings ``labels`` and ``pred``.

    Computes ``1 - sum(W * O) / sum(W * E)`` where ``O`` is the normalised
    confusion matrix, ``E`` the outer product of the two marginal rating
    distributions and ``W[i, j] = (i - j) ** 2``.

    The observed and expected matrices are built over the same rating range,
    taken from the data itself, so the result is identical whether ratings
    are given as 1..6 (raw scores) or 0..5 (class indices).

    Parameters
    ----------
    labels, pred : array-like of int
        True and predicted ratings; must have the same number of elements.

    Returns
    -------
    float
        Kappa: 1 for perfect agreement, about 0 for chance-level agreement,
        negative for worse-than-chance. Returns ``nan`` for empty input and
        ``1.0`` when every label and every prediction is the same single
        rating (the ratio is 0/0 there, and agreement is perfect).

    Raises
    ------
    ValueError
        If ``labels`` and ``pred`` differ in length.
    """
    y_true = np.asarray(labels, dtype = int).ravel()
    y_pred = np.asarray(pred, dtype = int).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError("labels and pred must have the same number of elements")
    if y_true.size == 0:
        return float("nan")

    # Shift both arrays by the smallest observed rating so they index the
    # same rows/columns of every matrix below.
    lowest = int(min(y_true.min(), y_pred.min()))
    highest = int(max(y_true.max(), y_pred.max()))
    num_ratings = highest - lowest + 1

    # Observed agreement O (rows: true rating, columns: predicted rating).
    observed = np.zeros((num_ratings, num_ratings), dtype = np.float64)
    np.add.at(observed, (y_true - lowest, y_pred - lowest), 1)
    observed /= observed.sum()

    # Chance agreement E from the marginals of O.
    expected = np.outer(observed.sum(axis = 1), observed.sum(axis = 0))

    # Quadratic weights. A constant normalisation such as 1/25 cancels in the
    # ratio below, so it is omitted.
    rating_index = np.arange(num_ratings)
    weight_matrix = (rating_index[:, None] - rating_index[None, :]) ** 2

    numerator = float((weight_matrix * observed).sum())
    denominator = float((weight_matrix * expected).sum())
    if denominator == 0:
        # Only possible when labels and predictions all share one rating.
        return 1.0
    return 1 - numerator / denominator

def tensor_QWK(y_true: t.Tensor, y_pred: t.Tensor) -> t.Tensor:
    """Quadratic weighted kappa for two equally shaped tensors of ratings.

    Both tensors are moved to the CPU, detached and converted to NumPy
    arrays before being handed to ``quadratic_weighted_kappa``; its result
    is returned unchanged.
    """
    assert y_true.shape == y_pred.shape
    true_np, pred_np = (tensor.cpu().detach().numpy() for tensor in (y_true, y_pred))
    return quadratic_weighted_kappa(true_np, pred_np)

Let's benchmark!

In [None]:
# Sanity-check the metric on three reference "models":
# uniformly random scores, a constant score of 3, and the true scores.
labels = train_df["score"]
random_preds = labels.map(lambda _: random.randint(1, 6))
all_3s = labels.map(lambda _: 3)
perfect_preds = labels
for description, preds in (
    ("Random preds:", random_preds),
    ("All 3s:", all_3s),
    ("Perfect preds:", perfect_preds),
):
    print(description, quadratic_weighted_kappa(labels, preds))

Looks like being off by a large margin is heavily penalized. Let's try to make sure that doesn't happen.

### Flan-T5 series:

In [None]:
import torch as t
import torch.nn.functional as F
import torch.nn as nn

device = t.device("cuda:0" if t.cuda.is_available() else "cpu")

from sklearn.model_selection import train_test_split
# Hold out 20% of the essays for validation. The seed is fixed so that the
# split (and therefore every validation number below) is the same on each run.
train_df, val_df = train_test_split(train_df, test_size = 0.2, random_state = 42)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import transformers

# Alternative checkpoint, kept for reference:
# MODEL_NAME = "pszemraj/flan-t5-large-grammar-synthesis"
# Checkpoint name passed to both the tokenizer and the model loader below.
MODEL_NAME = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
# Load the seq2seq model in float32, placing it on `device` (set in an earlier cell).
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, device_map=device, torch_dtype=t.float32)
# Set the tokenizer's recorded maximum sequence length to 4096 tokens.
# NOTE(review): presumably to avoid length warnings on long essays — confirm.
tokenizer.model_max_length=4096

In [None]:
prompt = """After reading the following essay and completing the analytical rating form, assign a holistic score based on the rubric
below. For the following evaluations you will need to use a grading scale between 1 (minimum) and 50
(maximum). The distance between each grade should be considered equal. Please provide a single numerical score instead of 
a qualitative assessment. Note: 1 should be poor elementary level writing, 50 should be university level writing.
The essay is as follows:\n"""

# Text the decoder is primed with; the logits at the last position of this
# prefix are what the rest of the notebook reads.
decoder_prefix = "The essay is a "
# Tokenise the prefix once and keep it on `device` so every forward pass can reuse it.
# NOTE(review): the tokenizer may append an end-of-sequence token to this prefix —
# confirm that the last position is the one intended for scoring.
decoder_input_ids = tokenizer(decoder_prefix, return_tensors = "pt").input_ids.to(device)

def get_scoring_logits(essay: str):
    """Return the model's logits at the last decoder position for one essay.

    The essay is appended to the module-level ``prompt``, tokenised, moved to
    ``device`` and fed to ``model`` together with the shared
    ``decoder_input_ids`` prefix.
    """
    encoded = tokenizer(prompt + essay, return_tensors="pt")
    encoder_ids = encoded.input_ids.to(device)
    outputs = model(input_ids = encoder_ids, decoder_input_ids = decoder_input_ids)
    return outputs.logits[:, -1, :] # last token

# Id of the first token the tokenizer produces for each of the strings "1" .. "6".
tokenizer_elements = [
    tokenizer(str(score), return_tensors = "pt").input_ids[0][0].item()
    for score in range(1, 7)
]

# Smoke test: score the first training essay and inspect the logits' shape.
sample_essay = train_df["full_text"].iloc[0]
logits = get_scoring_logits(sample_essay)
logits.shape

Let's visualize the distribution of essay lengths (in tokens) such that we can define a good cutoff point to pad/truncate input sequences to.

In [None]:
from tqdm import tqdm, trange

length_distribution = [len(tokenizer(essay).input_ids) for essay in tqdm(train_df["full_text"])]
import seaborn as sns
sns.histplot(length_distribution)

Eyeballing the distribution above, 1250 looks like a good cutoff point; it'll include the majority of our essays while not overly limiting our batch size. Note: Other notebooks have used different token cutoff lengths. While different models use different tokenization schemes, tuning the essay cutoff length may be good to explore.

In [None]:
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange
class EssayDataset(Dataset):
    """Torch dataset that turns rows of an essay DataFrame into model inputs.

    Each item is the module-level ``prompt`` followed by the essay text,
    tokenised with the module-level ``tokenizer``, padded/truncated to
    ``max_length`` tokens and moved to the module-level ``device``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``full_text`` column, plus a ``score`` column when
        ``train`` is true.
    train : bool, default True
        When true, items are ``(input_ids, score - 1)`` pairs (0-indexed
        class labels); otherwise only ``input_ids`` is returned.
    max_length : int, default 1250
        Number of tokens every sequence is padded or truncated to.
    """
    def __init__(self, df, train = True, max_length = 1250):
        self.df = df
        self.train = train
        self.max_length = max_length
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        essay = self.df["full_text"].iloc[idx]
        input_text = prompt + essay
        input_ids = tokenizer(input_text,
            return_tensors="pt",
            padding = "max_length",
            max_length = self.max_length,
            truncation = True).input_ids[0].to(device)
        if self.train:
            return input_ids, self.df["score"].iloc[idx] - 1 # 0-indexed
        return input_ids

class TransformerClassifier(nn.Module):
    """Six-way essay-score classifier on top of a seq2seq language model.

    The wrapped model is run with the module-level ``decoder_input_ids``
    prefix; its vocabulary logits at the last decoder position are mapped to
    six class logits by a linear layer created on the module-level
    ``device``.

    Parameters
    ----------
    model :
        Model exposing ``config.vocab_size`` and returning an object with a
        ``logits`` attribute when called with ``input_ids`` and
        ``decoder_input_ids``.
    """
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.linear = nn.Linear(model.config.vocab_size, 6).to(device).to(t.float32)
    def forward(self, input_ids):
        """Return class logits of shape ``(batch, 6)`` for a batch of token ids."""
        batch_size = input_ids.shape[0]
        # Inputs are padded to a fixed length, so tell the model which
        # positions are padding; otherwise it attends to the pad tokens.
        pad_token_id = getattr(self.model.config, "pad_token_id", None)
        attention_mask = None if pad_token_id is None else (input_ids != pad_token_id).long()
        x = self.model(input_ids = input_ids,
                       attention_mask = attention_mask,
                       decoder_input_ids = decoder_input_ids.repeat(batch_size, 1),
                       output_hidden_states = False, output_attentions = False).logits[:, -1]
        return self.linear(x)
    def get_score(self, input_ids):
        """Predict the 0-indexed class for ``input_ids`` without tracking gradients.

        Returns an ``int`` for a single-example batch and a list of ``int``
        (one per example) for larger batches.
        """
        with t.no_grad():
            # argmax over the class dimension, not over the flattened tensor.
            predictions = t.argmax(self.forward(input_ids), dim = -1)
        if predictions.numel() == 1:
            return predictions.item()
        return predictions.tolist()

# Freeze every weight of the pretrained transformer; only the classifier's
# own linear head (created below) stays trainable.
for backbone_param in model.parameters():
    backbone_param.requires_grad = False
clf = TransformerClassifier(model)
# for param in model.get_submodule("lm_head").parameters():
#     param.requires_grad = True

class EMAMetric:
    """Exponential moving average of a scalar metric.

    Parameters
    ----------
    value : float or None
        Initial value; ``None`` means the first ``set`` call initialises it.
    gamma : float
        Weight kept from the previous value on each update.
    sigfigs : int
        Number of decimal places passed to ``round`` when the metric is
        rendered with ``repr``.
    """
    def __init__(self, value = None, gamma = 0.95, sigfigs = 3):
        self.value = value
        self.gamma = gamma
        self.sigfigs = sigfigs
    def set(self, update):
        """Fold ``update`` into the running average and return the new value."""
        if self.value is None:
            self.value = update
        else:
            self.value = self.gamma * self.value + (1 - self.gamma) * update
        return self.value
    def __repr__(self):
        # A metric that has never been updated has nothing to round.
        if self.value is None:
            return "n/a"
        return str(round(self.value, self.sigfigs))

import collections
class History:
    """Per-step record of training and validation metrics.

    Metrics whose keyword name contains ``"val"`` are stored in
    ``val_metrics``; all others go to ``train_metrics``. Both map a metric
    name to the list of values recorded so far.
    """
    def __init__(self):
        self.train_metrics = collections.defaultdict(list)
        self.val_metrics = collections.defaultdict(list)
    def append(self, **kwargs):
        """Record the current ``.value`` of each ``EMAMetric`` keyword argument."""
        assert type(kwargs) == dict
        assert all(isinstance(metric, EMAMetric) for metric in kwargs.values()), "All values must be EMAMetric objects"
        for name, metric in kwargs.items():
            bucket = self.val_metrics if "val" in name else self.train_metrics
            bucket[name].append(metric.value)
    def plot(self):
        """Draw training metrics on the upper axes and validation metrics on the lower."""
        fig, axes = plt.subplots(2, figsize = (10, 10))
        sns.set()
        for ax, metrics in zip(axes, (self.train_metrics, self.val_metrics)):
            for name, series in metrics.items():
                sns.lineplot(series, label = name, ax = ax)
        plt.legend()
        plt.show()

def _batch_metrics(num_logits, target, loss_fn):
    """Return (loss tensor, accuracy, QWK) for one batch of class logits."""
    predictions = t.argmax(num_logits, dim = -1)
    loss = loss_fn(num_logits, target)
    acc = (predictions == target).to(t.float32).mean().item()
    q_weighted_kappa = tensor_QWK(target, predictions)
    return loss, acc, q_weighted_kappa


def train(train_df, val_df, optimizer = None, BATCH_SIZE = 32, EPOCHS = 1):
    """Train the module-level ``clf`` and evaluate it after every epoch.

    Parameters
    ----------
    train_df, val_df : pandas.DataFrame
        Frames with ``full_text`` and ``score`` columns, wrapped in
        ``EssayDataset``.
    optimizer : torch.optim.Optimizer or None
        Optimizer to step. When ``None`` a fresh
        ``AdamW(clf.parameters(), lr = 1e-5)`` is created for this call, so
        optimizer state is never shared between calls.
    BATCH_SIZE : int
        Batch size for both loaders.
    EPOCHS : int
        Number of passes over the training data.

    Returns
    -------
    History
        One entry per batch for each training metric (``loss``, ``acc``,
        ``QWK``) and each validation metric (``val_loss``, ``val_acc``,
        ``val_QWK``).

    Notes
    -----
    All reported numbers are exponential moving averages over batches that
    keep running across epochs; they are not exact per-epoch aggregates.
    """
    if optimizer is None:
        # Built here rather than in the signature: a default argument is
        # evaluated once, when the function is defined.
        optimizer = t.optim.AdamW(clf.parameters(), lr = 1e-5)

    train_metrics = {
        "loss": EMAMetric(), "acc": EMAMetric(), "QWK": EMAMetric(),
    }
    val_metrics = {
        "val_loss": EMAMetric(), "val_acc": EMAMetric(), "val_QWK": EMAMetric()
    }
    history = History()

    train_loader = DataLoader(EssayDataset(train_df), batch_size = BATCH_SIZE, shuffle = True)
    val_loader = DataLoader(EssayDataset(val_df), batch_size = BATCH_SIZE, shuffle = False)
    CELoss = nn.CrossEntropyLoss()

    for epoch in range(EPOCHS):
        print(f"Epoch {epoch + 1}")
        with tqdm(train_loader) as pbar:
            for input_ids, target in pbar:
                target = target.to(device)
                num_logits = clf(input_ids)
                loss, acc, q_weighted_kappa = _batch_metrics(num_logits, target, CELoss)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                train_metrics["loss"].set(loss.item())
                train_metrics["acc"].set(acc)
                train_metrics["QWK"].set(q_weighted_kappa)
                history.append(**train_metrics)
                desc = f"Loss: {train_metrics['loss']}, Accuracy: {train_metrics['acc']}, QWK: {train_metrics['QWK']}"
                pbar.set_description(desc)
        # Validation pass: no gradients needed.
        with t.no_grad():
            with tqdm(val_loader) as pbar:
                for input_ids, target in pbar:
                    target = target.to(device)
                    num_logits = clf(input_ids)
                    loss, acc, q_weighted_kappa = _batch_metrics(num_logits, target, CELoss)
                    val_metrics["val_loss"].set(loss.item())
                    val_metrics["val_acc"].set(acc)
                    val_metrics["val_QWK"].set(q_weighted_kappa)
                    history.append(**val_metrics)
                    desc = f"Val Loss: {val_metrics['val_loss']}, Val Accuracy: {val_metrics['val_acc']}, Val QWK: {val_metrics['val_QWK']}"
                    pbar.set_description(desc)

    return history

### Training the classifier head -- all other weights frozen.

In [None]:
history = train(train_df, val_df, EPOCHS = 5, BATCH_SIZE = 32)
history.plot()

### Finetuning the whole model, including the transformer weights.

In [None]:
# Unfreeze the transformer so its weights are updated together with the head.
for backbone_param in model.parameters():
    backbone_param.requires_grad = True

# Fresh optimizer over all classifier parameters, with a much smaller
# learning rate than the head-only stage.
finetune_optimizer = t.optim.AdamW(clf.parameters(), lr = 5e-7)
history = train(train_df, val_df, optimizer = finetune_optimizer, BATCH_SIZE = 8, EPOCHS = 5)
history.plot()

In [None]:
t.save(clf, "transformer_classifier_May_1.pt")

### Creating a submission file

In [None]:
# Score every test essay one at a time and write the submission file.
test_ds = EssayDataset(test_df, train = False)
scores = []
# Inference only: disable autograd so no computation graph is built.
with t.no_grad():
    # Index explicitly instead of relying on the dataset raising IndexError
    # to terminate iteration.
    for idx in range(len(test_ds)):
        input_ids = test_ds[idx]
        # Add a batch dimension; predictions are 0-indexed, scores start at 1.
        scores.append(clf.get_score(input_ids.unsqueeze(0)) + 1)
test_df["score"] = scores
test_df[["essay_id", "score"]].to_csv("submission.csv", index = False)