In [1]:
import pandas as pd
import numpy as np
import transformers
import datasets

In [2]:
DATA = "../data/"

In [None]:
from transformers import (
    LongformerTokenizer,
    LongformerForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)

import torch

seed = 42
model_name = "allenai/longformer-base-4096"
tokenizer = LongformerTokenizer.from_pretrained(model_name, max_length=2048)

device = torch.device("cuda")

In [4]:
def tokenize_inputs(example):
    return tokenizer(example["text"])

In [5]:
baa_ds = datasets.load_from_disk("../data/baa_ds.hf").map(tokenize_inputs, batched=True)
control_ds = datasets.load_from_disk("../data/control_ds.hf").map(
    tokenize_inputs, batched=True
)
hl_ds = datasets.load_from_disk("../data/hl_ds.hf").map(tokenize_inputs, batched=True)
w_ds = datasets.load_from_disk("../data/w_ds.hf").map(tokenize_inputs, batched=True)

Loading cached processed dataset at /home/jovyan/active-projects/persuade-bias/data/baa_ds.hf/train/cache-20055cbe7cc9eb65.arrow
Loading cached processed dataset at /home/jovyan/active-projects/persuade-bias/data/baa_ds.hf/valid/cache-d57a96b9850810e1.arrow


Map:   0%|          | 0/3199 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [7]:
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_squared_error,
    mean_absolute_error,
)


def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    smape = (
        1
        / len(labels)
        * np.sum(2 * np.abs(logits - labels) / (np.abs(labels) + np.abs(logits)) * 100)
    )
    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [13]:
learning_rate = 3e-05
batch_size = 8
seed = 42
num_epochs = 3


def model_init():
    return LongformerForSequenceClassification.from_pretrained(
        model_name, num_labels=1
    ).to(device)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=2048)

In [14]:
def train_model(name, ds):
    trainer = Trainer(model_init=model_init)

    training_args = TrainingArguments(
        output_dir=f"./results/" + name + "_checkpoints",
        optim="adamw_torch",
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        learning_rate=learning_rate,
        logging_dir=f"./logs/content",
        save_total_limit=10,
        load_best_model_at_end=True,
        metric_for_best_model="mse",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        greater_is_better=False,
        seed=seed,
        log_level="error",  # took me ages to find these options
        disable_tqdm=False,  # enable output cell scrolling in JupyterLab for even more beautiful output :D
    )

    # Call the Trainer
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        data_collator=data_collator,
        train_dataset=ds["train"],
        eval_dataset=ds["valid"],
        compute_metrics=compute_metrics_for_regression,
        # callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train the model
    trainer.train()

    trainer.save_model("../bin/" + name)

In [15]:
train_model("baa_model", baa_ds)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bi

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,No log,0.424339,0.424339,0.651413,0.518137,0.672631,34483.265
2,0.672600,0.370605,0.370605,0.608774,0.476378,0.714086,31337.315
3,0.326500,0.3616,0.3616,0.601332,0.465458,0.721033,30896.0275




In [16]:
train_model("w_model", w_ds)



Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,No log,0.352101,0.352101,0.593381,0.47631,0.732467,28407.6625
2,0.695700,0.312372,0.312372,0.558902,0.442592,0.762654,28837.3625
3,0.332800,0.343782,0.343782,0.586329,0.453595,0.738787,29531.0475




In [17]:
train_model("hl_model", hl_ds)



Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,No log,0.364114,0.364114,0.603419,0.457996,0.66751,29077.5075
2,0.732100,0.581462,0.581462,0.762536,0.587729,0.469039,31728.81
3,0.320700,0.389924,0.389924,0.624439,0.46612,0.643941,30537.5175




In [None]:
train_model("control_model", control_ds)



Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,No log,0.638159,0.638159,0.798848,0.645488,0.475196,30318.89
2,0.658000,0.407477,0.407477,0.63834,0.500693,0.664902,31002.12


