In [1]:
from utils import load_asap_dataset, load_toefl_dataset, get_score_range

In [2]:
import numpy as np
import torch
import json

import polars as pl
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction
from sklearn.metrics import mean_squared_error, mean_absolute_error, cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset # Rename to avoid conflict

In [3]:
# Dataset selector: the loading cell branches on this value and it is also passed to
# get_score_range. Swap the active line to switch datasets.
TASK = "ASAP"
# TASK = "TOEFL11"

# Prompt id: splits are filtered to rows whose essay_set equals this value, and the
# score range used for normalisation is looked up for this prompt.
PROMPT = 2

In [4]:
# Load the raw essays for the selected task.
if TASK == "ASAP":
    # NOTE(review): presumably stratify=True returns the held-out test sample and
    # stratify=False the full corpus -- confirm against utils.load_asap_dataset.
    df_test = load_asap_dataset('datasets/ASAP', stratify=True)
    # Training pool = every essay whose essay_id does not appear in the test split.
    df_train = load_asap_dataset('datasets/ASAP', stratify=False).filter(
        ~pl.col("essay_id").is_in(df_test['essay_id'])
    )
elif TASK == "TOEFL11":
    # NOTE(review): this branch only defines `df`; the following cells read
    # df_train / df_test, so TOEFL11 still needs a train/test split here.
    df = load_toefl_dataset('datasets/TOEFL11')
else:
    # Fail loudly on a mistyped TASK instead of silently leaving the frames undefined.
    raise ValueError(f"Unsupported TASK: {TASK!r} (expected 'ASAP' or 'TOEFL11')")

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training pool as a validation split; the fixed random_state
# keeps the partition identical across runs.
# NOTE(review): df_train is both input and output here, so re-running this cell
# without reloading the data splits the already-reduced training frame again.
df_train, df_val = train_test_split(
    df_train,
    random_state=12,
    test_size=0.2,
)

In [6]:
# Restrict every split to the essays written for the selected prompt.
df_train, df_val, df_test = (
    split.filter(pl.col("essay_set") == PROMPT)
    for split in (df_train, df_val, df_test)
)

In [7]:
# Min-max normalise the raw score of every split with the prompt's score range, so that
# min_score maps to 0 and max_score maps to 1.
min_score, max_score = get_score_range(TASK, PROMPT)
score_span = max_score - min_score
if score_span == 0:
    # A zero-width range would turn the division below into a divide-by-zero.
    raise ValueError(
        f"Score range for {TASK} prompt {PROMPT} is degenerate: min == max == {min_score}"
    )

# Build the expression once so all three splits are guaranteed to use the same formula.
normalized_score_expr = ((pl.col("score") - min_score) / score_span).alias("normalized_score")
df_train = df_train.with_columns(normalized_score_expr)
df_val = df_val.with_columns(normalized_score_expr)
df_test = df_test.with_columns(normalized_score_expr)


In [8]:
# Preview the training split after prompt filtering and score normalisation.
# NOTE(review): the saved output below lists essay_set == 1 although PROMPT is set to 2
# in the config cell, so it appears to come from an earlier run -- re-run to refresh.
df_train

essay_set,essay_id,essay,score,normalized_score
i64,i64,str,i64,f64
1,1628,"""Dear local newspaper, I think …",10,0.8
1,562,"""Dear @CAPS1 @CAPS2 @CAPS3, @CA…",10,0.8
1,1746,"""Computers can take a lot of a …",10,0.8
1,1430,"""Dear Local Newspaper, @CAPS1 y…",8,0.6
1,441,"""Dear Local Newspaper: Computer…",8,0.6
…,…,…,…,…
1,1107,"""Computer are good and bad beca…",4,0.2
1,1300,"""I do not beileve that computer…",7,0.5
1,245,"""I believe computers are a bene…",7,0.5
1,1266,"""Guess what! Do you like to use…",8,0.6


In [9]:
# --- Configuration ---
# Pre-trained checkpoint to fine-tune. Can be changed to "bert-base-uncased",
# "FacebookAI/roberta-base", "microsoft/deberta-v3-large", etc.
model_name = "bert-base-uncased"
num_train_epochs = 10 # Forwarded to TrainingArguments.num_train_epochs (adjust as needed)
batch_size = 32 # Used as both the train and eval per-device batch size (adjust based on GPU memory)
max_length = 512 # Essays are truncated and padded to this many tokens by the tokenizer calls below

# --- 3. Load Tokenizer and Model ---
# Tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model with a single output (num_labels=1), i.e. used for regression.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# --- 4. Tokenize Data ---
# Every split goes through identical settings: truncate to max_length and pad up to it.
train_encodings, dev_encodings, test_encodings = (
    tokenizer(
        frame['essay'].to_list(),
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    for frame in (df_train, df_val, df_test)
)

# --- 5. Create Custom PyTorch Dataset ---
class EssayDatasetTmp(TorchDataset):
    """Map-style dataset pairing pre-tokenized essays with float score labels.

    Args:
        encodings: Mapping from feature name to a per-sample sequence of values,
            as returned by the tokenizer call.
        labels: Indexable, sized collection holding one numeric target per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the target under ``'labels'``."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        # The target is stored as a float tensor.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Wrap each split's encodings together with its normalized scores in the dataset class.
train_dataset, dev_dataset, test_dataset = (
    EssayDatasetTmp(split_encodings, frame['normalized_score'])
    for split_encodings, frame in (
        (train_encodings, df_train),
        (dev_encodings, df_val),
        (test_encodings, df_test),
    )
)

# --- 6. Define Training Arguments ---
# Hyper-parameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    # Directory that receives the checkpoints.
    output_dir='./outputs/prompt-specific',
    # Optimisation schedule.
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    optim="adamw_torch",  # PyTorch AdamW optimizer
    weight_decay=2e-5,  # Strength of weight-decay regularization
    warmup_ratio=0.1,  # Fraction of the training steps used for learning-rate warmup
    fp16=torch.cuda.is_available(),  # Mixed precision only when a CUDA GPU is available
    # Logging / evaluation / checkpoint cadence.
    logging_strategy="steps",  # Log on a step interval (see logging_steps)
    logging_steps=10,  # Log every 10 steps
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save a checkpoint at the end of each epoch
    # Best-checkpoint bookkeeping: QWK is the selection metric and higher is better.
    # The best checkpoint is NOT reloaded after training (load_best_model_at_end=False),
    # so any later evaluation uses the weights from the final epoch.
    metric_for_best_model="eval_qwk",
    greater_is_better=True,
    load_best_model_at_end=False,
    report_to="none",  # Disable external reporting integrations like WandB/TensorBoard
)

# --- 7. Define Compute Metrics Function ---
def prepare_compute_metrics(minscore, maxscore):
    """Build a ``compute_metrics`` callback for a prompt scored on ``[minscore, maxscore]``.

    Args:
        minscore: Lowest valid score of the prompt (integer; used with ``range``).
        maxscore: Highest valid score of the prompt (integer; used with ``range``).

    Returns:
        A function that takes an ``EvalPrediction`` of normalized predictions and
        labels and returns a dict with the keys ``rmse``, ``mae``, ``qwk``, ``lwk``
        and ``corr``.
    """
    score_span = maxscore - minscore
    valid_scores = list(range(minscore, maxscore + 1))

    def compute_metrics(eval_pred: EvalPrediction):
        predictions, labels = eval_pred
        # Predictions may arrive with a trailing singleton axis; flatten it so they
        # line up element-wise with the labels.
        if len(predictions.shape) > 1:
            predictions = predictions.squeeze(-1)

        # Error metrics are reported on the normalized scale the model is trained on.
        rmse = np.sqrt(mean_squared_error(labels, predictions))
        mae = mean_absolute_error(labels, predictions)

        # Map predictions and labels back to the original score scale for the
        # agreement metrics.
        predictions = predictions * score_span + minscore
        labels = labels * score_span + minscore

        # Kappa works on discrete scores. The regression output is unbounded, so a
        # rounded prediction can land outside [minscore, maxscore]; because `labels=`
        # is passed, sklearn would silently drop such samples from the confusion
        # matrix. Clip them to the nearest valid score so every sample is counted.
        pred_scores = np.clip(np.round(predictions), minscore, maxscore).astype(int)
        true_scores = np.round(labels).astype(int)
        qwk = cohen_kappa_score(pred_scores, true_scores, weights="quadratic", labels=valid_scores)
        lwk = cohen_kappa_score(pred_scores, true_scores, weights="linear", labels=valid_scores)

        # Pearson correlation on the continuous (unrounded, unclipped) predictions.
        corr = np.corrcoef(predictions, labels)[0, 1]

        return {"rmse": rmse, "mae": mae, "qwk": qwk, "lwk": lwk, "corr": corr}
    return compute_metrics

# --- 8. Instantiate Trainer ---
# Look up the prompt's score range again so the metrics callback can convert
# normalized values back to raw scores.
min_score, max_score = get_score_range(TASK, PROMPT)
# NOTE(review): the stray "trainer = Trainer(" line in the saved output looks like a warning
# pointing at this call -- possibly the `tokenizer` argument being deprecated in favour of
# `processing_class` in recent transformers releases; confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,  # Not needed for data loading here; passed e.g. for saving
    compute_metrics=prepare_compute_metrics(min_score, max_score),
    eval_dataset=dev_dataset,  # Per-epoch evaluation runs on the validation split
    train_dataset=train_dataset,
)

# --- 9. Train the Model ---
print(f"Starting fine-tuning for {model_name}...")
trainer.train()
print("Fine-tuning completed.")

# --- Evaluate on the Held-out Test Set ---
# training_args sets load_best_model_at_end=False, so the weights evaluated here are
# those from the final epoch, and the dataset passed below is the test split (not the
# validation split used during training).
print("Evaluating the final model on the test set...")
eval_results = trainer.evaluate(eval_dataset=test_dataset)
print("Evaluation results:", eval_results)

# Persist the test metrics as JSON, one file per prompt.
print("Saving metrics...")
with open(f"outputs/prompt-specific/prompt{PROMPT}.json", "w") as metrics_file:
    json.dump(eval_results, metrics_file)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting fine-tuning for bert-base-uncased...


Epoch,Training Loss,Validation Loss,Rmse,Mae,Qwk,Lwk,Corr
1,0.0165,0.015033,0.122609,0.09904,0.653756,0.423783,0.804534
2,0.0102,0.007968,0.089262,0.068933,0.827873,0.620772,0.841496
3,0.0094,0.007433,0.086216,0.065524,0.80746,0.609521,0.854762
4,0.0078,0.007348,0.085721,0.066331,0.823129,0.616575,0.850087
5,0.0063,0.007672,0.087591,0.067754,0.812728,0.599584,0.847692
6,0.0049,0.007666,0.087557,0.067467,0.80983,0.60623,0.84444
7,0.0043,0.009341,0.096649,0.0768,0.784794,0.55937,0.841634
8,0.0034,0.008482,0.092097,0.072645,0.791346,0.566434,0.839802
9,0.0031,0.008358,0.09142,0.071898,0.800701,0.572546,0.838835
10,0.0028,0.00823,0.090719,0.071317,0.798546,0.576075,0.837811


Fine-tuning completed.
Evaluating the best model on the validation set...


Evaluation results: {'eval_loss': 0.00664439145475626, 'eval_rmse': 0.08151313662528992, 'eval_mae': 0.0624455027282238, 'eval_qwk': 0.851268068576551, 'eval_lwk': 0.6590455787840732, 'eval_corr': 0.8718656008644179, 'eval_runtime': 0.4587, 'eval_samples_per_second': 394.557, 'eval_steps_per_second': 13.079, 'epoch': 10.0}
Saving metrics...
