<a href="https://colab.research.google.com/github/trgscott/LELA60332_Coursework/blob/main/CL2_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Computational Linguistics 2 - NER Assignment**

**Install libraries / prerequisites**

A GPU runtime is also required.

In [None]:
!pip install datasets

In [None]:
from collections import defaultdict, Counter
from urllib import request
import json
import pandas as pd
import random
from random import shuffle
from math import ceil
import torch
import torch.nn as nn
from transformers import AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer, get_scheduler, BitsAndBytesConfig
import datasets
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
from glob import glob
import os

**Set the devices**

In [None]:
# Use the GPU when one is visible and fall back to the CPU otherwise, so that
# all three device settings always agree (two of them used to be hard-coded
# to "cuda"). Set the value to 0 when on the MCR system.
device = "cuda" if torch.cuda.is_available() else "cpu"
encoder_device = device   # device that holds the BERT encoder
clf_head_device = device  # device that holds the classification head

**Fix random seeds for reproducibility**

In [None]:
# One seed shared by every random number generator used in this notebook:
# Python's `random`, NumPy, and PyTorch (CPU and all GPUs).
seed_val = 42

for _seed_fn in (random.seed, np.random.seed,
                 torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

**Code provided with assignment instructions to download the data**

In [None]:
def parse_conllu_using_pandas(block):
    """Parse the text of one sentence from an IOB2 file into a DataFrame.

    Comment lines (starting with '#') and blank lines are skipped; every
    other line is stripped and split on tabs.

    Args:
        block: Text of a single sentence, one tab-separated token per line.

    Returns:
        A DataFrame with the columns ID, FORM, TAG, Misc1 and Misc2 and one
        row per token. A block without token lines gives an empty DataFrame.
    """
    records = [
        line.strip().split('\t')
        for line in block.splitlines()
        # A blank line would otherwise turn into a junk one-field row.
        if line.strip() and not line.startswith('#')
    ]
    return pd.DataFrame.from_records(
        records,
        columns=['ID', 'FORM', 'TAG', 'Misc1', 'Misc2'])

In [None]:
def tokens_to_labels(df):
    """Split a parsed sentence into parallel token and label lists.

    Args:
        df: DataFrame with a FORM (token) and a TAG (label) column.

    Returns:
        A (tokens, labels) tuple of two lists in row order.
    """
    tokens = df['FORM'].tolist()
    tags = df['TAG'].tolist()
    return tokens, tags

In [None]:
# Common start of every download URL (raw files of the UniversalNER GitHub
# organisation).
PREFIX = "https://raw.githubusercontent.com/UniversalNER/"
# Path of each IOB2 file relative to PREFIX, keyed by corpus and then by split.
DATA_URLS = dict(
    en_ewt=dict(
        train="UNER_English-EWT/master/en_ewt-ud-train.iob2",
        dev="UNER_English-EWT/master/en_ewt-ud-dev.iob2",
        test="UNER_English-EWT/master/en_ewt-ud-test.iob2",
    ),
    en_pud=dict(
        test="UNER_English-PUD/master/en_pud-ud-test.iob2",
    ),
)

In [None]:
# Download every split and store it as a list of (tokens, labels) tuples.
# en_ewt is the main train-dev-test split
# en_pud is the OOD test set
data_dict = defaultdict(dict)
for corpus, split_dict in DATA_URLS.items():
    for split, url_suffix in split_dict.items():
        with request.urlopen(PREFIX + url_suffix) as response:
            txt = response.read().decode('utf-8')
        # Sentences are separated by an empty line in the downloaded text.
        data_dict[corpus][split] = [
            tokens_to_labels(parse_conllu_using_pandas(sentence_block))
            for sentence_block in txt.split('\n\n')
        ]

In [None]:
# Write the downloaded data to disk so that it does not have to be
# downloaded again each time.
serialised = json.dumps(data_dict, indent=2, ensure_ascii=False)
with open('ner_data_dict.json', 'w', encoding='utf-8') as out:
    out.write(serialised)

In [None]:
# Each subset of each corpus is a list of tuples where each tuple
# holds a list of tokens and the corresponding list of labels.

# Train on data_dict['en_ewt']['train']; validate on data_dict['en_ewt']['dev']
# and test on data_dict['en_ewt']['test'] and data_dict['en_pud']['test'].
# Display one sentence from the training set and one from the OOD test set.
data_dict['en_ewt']['train'][0], data_dict['en_pud']['test'][6]

((['Where', 'in', 'the', 'world', 'is', 'Iguazu', '?'],
  ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O']),
 (['$', '5,000', 'per', 'person', ',', 'the', 'maximum', 'allowed', '.'],
  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']))

# **Data preparation**

**Pair the tokens with the labels in sentences, with seven labels**

In [None]:
# Pair every token with its label, sentence by sentence
def pair_data(data):
  """Turn (tokens, labels) tuples into sentences of [token, label] pairs.

  As a side effect, prints a Counter with the frequency of every label.

  Args:
    data: Iterable of (tokens, labels) tuples, one per sentence.

  Returns:
    A list of sentences; each sentence is a list of [token, label] lists.
  """
  paired_data = [
      [[token, labels[position]] for position, token in enumerate(tokens)]
      for tokens, labels in data
  ]

  label_counts = Counter(
      label for sentence in paired_data for _, label in sentence)
  print(label_counts)

  return paired_data

In [None]:
# Pair the data. Each dataset ends with an empty sentence, which is dropped
# by slicing to the expected number of sentences.
ewt_splits = data_dict['en_ewt']
paired_data_train = pair_data(ewt_splits['train'][:12543])
paired_data_dev = pair_data(ewt_splits['dev'][:2001])
paired_data_test = pair_data(ewt_splits['test'][:2077])
paired_data_OoD = pair_data(data_dict['en_pud']['test'][:1000])

Counter({'O': 194219, 'B-PER': 2874, 'B-LOC': 2712, 'B-ORG': 1436, 'I-PER': 1294, 'I-ORG': 1167, 'I-LOC': 877})
Counter({'O': 23653, 'B-LOC': 399, 'B-PER': 343, 'B-ORG': 224, 'I-PER': 196, 'I-ORG': 186, 'I-LOC': 148})
Counter({'O': 23418, 'B-PER': 449, 'B-ORG': 322, 'B-LOC': 317, 'I-ORG': 276, 'I-PER': 243, 'I-LOC': 72})
Counter({'O': 19611, 'B-LOC': 425, 'B-PER': 415, 'B-ORG': 235, 'I-LOC': 175, 'I-PER': 160, 'I-ORG': 155})


In [None]:
# The lengths should now match the Universal NER paper / original source of
# the data (12543, 2001, 2077, 1000)
for paired_split in (paired_data_train, paired_data_dev,
                     paired_data_test, paired_data_OoD):
  print(len(paired_split))

12543
2001
2077
1000


In [None]:
# Shuffle the sentences of each split in place. All four calls draw from the
# same seeded generator, so their order must stay train, dev, test, OoD.
for paired_split in (paired_data_train, paired_data_dev,
                     paired_data_test, paired_data_OoD):
  random.shuffle(paired_split)

In [None]:
# Confirm the format and that the random seed gives the expected shuffle:
# the second training sentence should read
# 'It would have been more than one could bear!'
paired_data_train[1:2]

[[['It', 'O'],
  ['would', 'O'],
  ['have', 'O'],
  ['been', 'O'],
  ['more', 'O'],
  ['than', 'O'],
  ['one', 'O'],
  ['could', 'O'],
  ['bear', 'O'],
  ['!', 'O']]]

**Create new versions of the shuffled paired data with three labels - 'B', 'I', 'O'**

In [None]:
### Method to substitute one label ###
def substitute_labels(data, old_label, new_label):
  """Return a copy of `data` in which one label is replaced by another.

  Args:
    data: List of sentences, each a list of [token, label] pairs.
    old_label: Label to look for.
    new_label: Label written in place of `old_label`.

  Returns:
    A new list of sentences built from new [token, label] lists; `data`
    itself is not modified.
  """
  return [
      [[token, new_label if label == old_label else label]
       for token, label in sentence]
      for sentence in data
  ]

In [None]:
#Method to substitute all labels
def substitute_all_labels(data):
  """Collapse the entity-typed labels to the three labels 'B', 'I' and 'O'.

  'B-LOC', 'B-ORG' and 'B-PER' become 'B'; 'I-LOC', 'I-ORG' and 'I-PER'
  become 'I'; every other label (e.g. 'O') is kept as it is. The data is
  converted in a single pass instead of six passes that each copy it.

  Args:
    data: List of sentences, each a list of [token, label] pairs.

  Returns:
    A new list of sentences with the collapsed labels; `data` itself is not
    modified.
  """
  collapsed = {
      "B-LOC": "B", "B-ORG": "B", "B-PER": "B",
      "I-LOC": "I", "I-ORG": "I", "I-PER": "I",
  }
  return [
      [[token, collapsed.get(label, label)] for token, label in sentence]
      for sentence in data
  ]

In [None]:
# Build the three-label version of every split (same order as the
# seven-label versions: train, dev, test, OoD)
three_data_train, three_data_dev, three_data_test, three_data_OoD = (
    substitute_all_labels(paired_split)
    for paired_split in (paired_data_train, paired_data_dev,
                         paired_data_test, paired_data_OoD)
)

In [None]:
# Count the labels of the three-label training data as a sanity check
three_labels = []
for sentence in three_data_train:
  three_labels.extend(label for _, label in sentence)
three_label_counts = Counter(three_labels)
print(f'Training data label counts: {three_label_counts}')

Training data label counts: Counter({'O': 194219, 'B': 7022, 'I': 3338})


**Create relevant labels and number of classes**

In [None]:
def create_labels_and_classes(data):
  """Collect the distinct labels of a dataset.

  Args:
    data: List of sentences, each a list of [token, label] pairs.

  Returns:
    A (labels, n_classes) tuple: the sorted list of distinct labels and how
    many there are.
  """
  unique_labels = {pair[1] for sentence in data for pair in sentence}
  return sorted(unique_labels), len(unique_labels)

In [None]:
# (sorted labels, number of classes) for the seven- and three-label tasks
seven = create_labels_and_classes(paired_data_train)
three = create_labels_and_classes(three_data_train)
print(seven, three, sep='\n')

(['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O'], 7)
(['B', 'I', 'O'], 3)


# **Set the task - three labels or seven labels - reset as needed**

In [None]:
#Change THIS_MANY_LABELS to three or seven depending on the task



THIS_MANY_LABELS = seven


# Do not change the below
# Anything other than `three` selects the seven-label task.
if THIS_MANY_LABELS == three:
  labels, n_classes = three
else:
  labels, n_classes = seven

# **Encoder only - BERT**

In [None]:
#Using cased version of BERT as likely better for NER / proper nouns
model_tag = 'google-bert/bert-base-cased'
# Load the tokeniser and the pretrained encoder for this checkpoint; the
# encoder is moved to encoder_device.
tokeniser = AutoTokenizer.from_pretrained(model_tag)
encoder = AutoModel.from_pretrained(model_tag).to(encoder_device)

In [None]:
# Classification head with one hidden layer, in a similar vein to the
# encoder structure (GELU activation and 0.1 dropout)
class ClassificationHead(nn.Module):
    """Maps word embeddings to class logits through one hidden layer.

    Args:
        model_dim: Size of the input embeddings and of the hidden layer.
        n_classes: Number of output classes; the default is the value of the
            module-level `n_classes` at the time the class is defined.
    """

    def __init__(self, model_dim=768, n_classes=n_classes): # classes defined above
        super().__init__()
        # The attribute names are the keys of the saved state_dict and the
        # creation order fixes the order of random initialisation, so both
        # are kept.
        self.linear = nn.Linear(model_dim, model_dim)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(0.1)
        self.linear2 = nn.Linear(model_dim, n_classes)

    def forward(self, x):
        """Return the class logits for the embeddings in `x`."""
        hidden = self.dropout(self.gelu(self.linear(x)))
        return self.linear2(hidden)

In [None]:
# Create the classification head for the chosen number of classes and move
# it to its device (the trailing ';' suppresses the cell output).
clf_head = ClassificationHead(n_classes=n_classes) # classes defined above
clf_head.to(clf_head_device);

In [None]:
# Experiment with freezing first few layers of the encoder:

#for name, param in encoder.named_parameters():
#    if name.startswith("bert.encoder.layer.0") or name.startswith("bert.encoder.layer.1"):
#        param.requires_grad = False

#optimizer_parameters = [param for name, param in encoder.named_parameters()
#    if not (name.startswith("bert.encoder.layer.0") or name.startswith("bert.encoder.layer.1"))
#] + list(clf_head.parameters())

#https://datascientistsdiary.com/fine-tuning-bert-a-practical-guide/#:~:text=Schedulers%20fix%20this.%20The%20best%20scheduler%20for,for%2010%25%20of%20training%20lr_scheduler%20=%20get_scheduler

In [None]:
def process_sentence(sentence, label_to_i, tokeniser, encoder, clf_head,
                     encoder_device, clf_head_device):
    """Run one sentence through the encoder and the classification head.

    Every word is represented by the embedding of its first subword.

    Args:
        sentence: List of [word, label] pairs.
        label_to_i: Mapping from label string to class index.
        tokeniser: Tokeniser called on the list of words.
        encoder: Model whose output has a `last_hidden_state`.
        clf_head: Callable that maps the word embeddings to logits.
        encoder_device: Device the encoder inputs are moved to.
        clf_head_device: Device the head inputs and gold labels are moved to.

    Returns:
        A (logits, gold_labels) tuple with one row / entry per word.
    """
    words = []
    label_ids = []
    for word, label in sentence:
        words.append(word)
        label_ids.append(label_to_i[label])
    gold_labels = torch.tensor(label_ids).to(clf_head_device)

    tokenisation = tokeniser(words, is_split_into_words=True,
                             return_tensors='pt')
    inputs = {name: tensor.to(encoder_device)
              for name, tensor in tokenisation.items()}

    # The embeddings of the CLS and SEP tokens (first and last position)
    # are not needed.
    hidden_states = encoder(**inputs).last_hidden_state
    outputs = hidden_states[0, 1:-1, :]

    # word_ids() is only available on the original tokeniser output, not on
    # the dictionary of tensors copied to the encoder device. CLS and SEP
    # are dropped here as well, so positions line up with `outputs`.
    word_ids = tokenisation.word_ids()[1:-1]

    # Keep the position of the first subword of every word. The idea is that
    # the encoder is fine-tuned too and can learn to channel the necessary
    # information into first subwords.
    seen_words = set()
    first_subword_positions = []
    for position, word_id in enumerate(word_ids):
        if word_id in seen_words:
            continue
        seen_words.add(word_id)
        first_subword_positions.append(position)
    first_subword_embeddings = [
        outputs[position] for position in first_subword_positions]

    # Words and labels must be aligned one to one.
    assert len(first_subword_embeddings) == gold_labels.size(0)

    # Stack the embeddings into one tensor on the device of the head.
    clf_head_inputs = torch.vstack(
        first_subword_embeddings).to(clf_head_device)

    # Logits and gold labels are returned for subsequent processing.
    return clf_head(clf_head_inputs), gold_labels

In [None]:
def train_epoch(data, label_to_i, tokeniser, encoder, clf_head,
                encoder_device, clf_head_device, loss_fn, optimiser):
    """Train the encoder and the classification head for one epoch.

    One optimiser step is taken per sentence. Note that a later cell of this
    notebook defines another `train_epoch` (for T5) that replaces this one.

    Args:
        data: List of sentences, each a list of [word, label] pairs.
        label_to_i: Mapping from label string to class index.
        tokeniser, encoder, clf_head, encoder_device, clf_head_device:
            Passed on to `process_sentence`.
        loss_fn: Callable taking (logits, gold_labels) and returning the loss.
        optimiser: Optimiser over the parameters to be trained.

    Returns:
        The mean of the per-sentence losses as a float.
    """
    encoder.train()
    # The head has a dropout layer as well, so it is put in training mode
    # explicitly instead of relying on whatever mode it was left in.
    clf_head.train()
    epoch_losses = torch.empty(len(data))
    for step_n, sentence in tqdm(
        enumerate(data),
        total=len(data),
        desc='Train',
        leave=False
    ):
        optimiser.zero_grad()
        logits, gold_labels = process_sentence(
            sentence, label_to_i, tokeniser,
            encoder, clf_head, encoder_device,
            clf_head_device)
        loss = loss_fn(logits, gold_labels)
        loss.backward()
        optimiser.step()
        epoch_losses[step_n] = loss.item()
    return epoch_losses.mean().item()

In [None]:
def validate_epoch(data, label_to_i, tokeniser, encoder, clf_head,
                   encoder_device, clf_head_device):
    """Evaluate the encoder and the classification head on a dataset.

    All scores are computed per token. Note that a later cell of this
    notebook defines another `validate_epoch` (for T5) that replaces this one.

    Args:
        data: List of sentences, each a list of [word, label] pairs.
        label_to_i: Mapping from label string to class index.
        tokeniser, encoder, clf_head, encoder_device, clf_head_device:
            Passed on to `process_sentence`.

    Returns:
        A tuple (accuracy, f1_scores, macro_f1, predicted_labels, micro_f1,
        precision_scores, recall_scores) where
        - accuracy is the mean over sentences of the per-sentence accuracy,
        - f1_scores, precision_scores and recall_scores map each label to its
          one-vs-rest score,
        - macro_f1 and micro_f1 are computed over all tokens,
        - predicted_labels holds the predictions for the last sentence only
          (None if `data` is empty).
    """
    encoder.eval()
    # The head has a dropout layer too: switch it to evaluation mode so the
    # predictions are deterministic, and restore its previous mode afterwards.
    head_was_training = clf_head.training
    clf_head.eval()

    # Built from the argument rather than read from module-level globals.
    i_to_label = {i: label for label, i in label_to_i.items()}

    epoch_accuracies = torch.empty(len(data))
    all_predictions = [] # for the f1 scoring
    all_labels = [] # for the f1 scoring
    predicted_labels = None
    try:
        for step_n, sentence in tqdm(
            enumerate(data),
            total=len(data),
            desc='Eval',
            leave=False
        ):
            with torch.no_grad():
                logits, gold_labels = process_sentence(
                    sentence, label_to_i, tokeniser,
                    encoder, clf_head, encoder_device,
                    clf_head_device)
            predicted_labels = torch.argmax(logits, dim=-1)
            n_correct = (predicted_labels == gold_labels).sum().item()
            epoch_accuracies[step_n] = n_correct / len(sentence)
            # collect the predictions and gold labels for the f1 scoring
            all_predictions.extend(predicted_labels.cpu().numpy())
            all_labels.extend(gold_labels.cpu().numpy())
    finally:
        clf_head.train(head_was_training)

    # Converted once instead of once per score and class.
    gold = np.array(all_labels)
    predicted = np.array(all_predictions)

    f1_scores = {}
    precision_scores = {}
    recall_scores = {}
    # One-vs-rest scores for each class
    for label_index in sorted(i_to_label):
        is_gold = gold == label_index
        is_predicted = predicted == label_index
        label_name = i_to_label[label_index]
        f1_scores[label_name] = f1_score(
            is_gold, is_predicted, average='binary')
        precision_scores[label_name] = precision_score(
            is_gold, is_predicted, average='binary')
        recall_scores[label_name] = recall_score(
            is_gold, is_predicted, average='binary')

    # calculate the f1 macro average
    macro_f1 = f1_score(gold, predicted, average='macro')
    # calculate the f1 micro average
    micro_f1 = f1_score(gold, predicted, average='micro')

    return (epoch_accuracies.mean().item(), f1_scores, macro_f1,
            predicted_labels, micro_f1, precision_scores, recall_scores)

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

In [None]:
#Method to compute span accuracy - correct sentences / total sentences
def span_accuracy_encoder(data):
  """Return the share of sentences in which every token is labelled correctly.

  Uses the module-level `encoder`, `clf_head`, `tokeniser`, `label_to_i`,
  `encoder_device` and `clf_head_device`.

  Args:
    data: List of sentences, each a list of [word, label] pairs.

  Returns:
    Number of fully correct sentences divided by the number of sentences.
  """
  encoder.eval()
  # The head has a dropout layer too: evaluate with dropout switched off and
  # restore the previous mode of the head afterwards.
  head_was_training = clf_head.training
  clf_head.eval()

  correct_sentences = 0
  total_sentences = len(data)
  try:
    for sentence in data:
        with torch.no_grad():
            logits, gold_labels = process_sentence(
                sentence, label_to_i, tokeniser,
                encoder, clf_head, encoder_device,
                clf_head_device
            )
        predicted_labels = torch.argmax(logits, dim=-1)
        if torch.all(predicted_labels == gold_labels):
            correct_sentences += 1
  finally:
    clf_head.train(head_was_training)

  # Named differently from the function to avoid shadowing it.
  sentence_accuracy = correct_sentences / total_sentences
  return sentence_accuracy

In [None]:
# TRAINING #

#Ensure correct data is being used based on label choice made further above
if THIS_MANY_LABELS == three:
    training_data = three_data_train
    dev_data = three_data_dev
    test_data = three_data_test
    OoD_data = three_data_OoD
elif THIS_MANY_LABELS == seven:
    training_data = paired_data_train
    dev_data = paired_data_dev
    test_data = paired_data_test
    OoD_data = paired_data_OoD

print(f'You are running this with {n_classes} labels, are you sure?')
print(f'Your labels are {labels}')

# The models expect class numbers, not strings
# Code is here to make sure this changes depending on the number of labels chosen
label_to_i = {
    label: i
    for i, label in enumerate(sorted(labels))
}
i_to_label = {
    i: label
    for label, i in label_to_i.items()
}

#Hyperparameters
n_epochs = 8
loss_fn = nn.CrossEntropyLoss()

#change to (optimizer_parameters, lr=10**(-5)) if freezing layers
optimiser = torch.optim.AdamW(list(encoder.parameters()) + list(clf_head.parameters()), lr=10**(-5))

#Early stopping set up, based on improving macro_f1
best_f1 = 0
last_epoch_with_dev_improvement = 0
n_epochs_without_improvement = 0
early_stopping_threshold = 3

#Training, validation and early stopping
for epoch_n in tqdm(range(n_epochs)):

    loss = train_epoch(training_data, label_to_i, tokeniser, encoder, clf_head, encoder_device, clf_head_device, loss_fn, optimiser)
    print(f'Epoch {epoch_n+1} training loss: {loss:.2f}')

    accuracy, _, macro_f1, _, micro_f1, _, _ = validate_epoch(dev_data, label_to_i, tokeniser, encoder, clf_head, encoder_device, clf_head_device)
    print(f'Epoch {epoch_n+1} dev accuracy: {accuracy:.2f}')
    print(f'Epoch {epoch_n+1} dev macro f1: {macro_f1:.2f}')
    print(f'Epoch {epoch_n+1} dev micro f1: {micro_f1:.2f}')

    if macro_f1 > best_f1:
        best_f1 = macro_f1
        last_epoch_with_dev_improvement = epoch_n
        # torch.save overwrites the two checkpoint files, so nothing needs
        # to be deleted first (deleting '*.pt' would also remove unrelated
        # checkpoints in the working directory).
        torch.save(encoder.state_dict(), 'best_encoder.pt')
        torch.save(clf_head.state_dict(), 'best_clf_head.pt')
    else:
        n_epochs_without_improvement = epoch_n - last_epoch_with_dev_improvement
        if n_epochs_without_improvement == early_stopping_threshold:
            print(f'{n_epochs_without_improvement} without improvement; early stopping.')
            break

# Continue with the weights of the best epoch rather than those of the last
# one. best_f1 > 0 means both checkpoints were written during this run.
if best_f1 > 0:
    encoder.load_state_dict(torch.load('best_encoder.pt', weights_only=True))
    clf_head.load_state_dict(torch.load('best_clf_head.pt', weights_only=True))
    print(f'Restored the best checkpoint (epoch {last_epoch_with_dev_improvement+1}, dev macro f1 {best_f1:.2f}).')

You are running this with 7 labels, are you sure?
Your labels are ['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']


  0%|          | 0/8 [00:00<?, ?it/s]

Train:   0%|          | 0/12543 [00:00<?, ?it/s]

Epoch 1 training loss: 0.10


Eval:   0%|          | 0/2001 [00:00<?, ?it/s]

Epoch 1 dev accuracy: 0.98
Epoch 1 dev macro f1: 0.81
Epoch 1 dev micro f1: 0.98


Train:   0%|          | 0/12543 [00:00<?, ?it/s]

Epoch 2 training loss: 0.04


Eval:   0%|          | 0/2001 [00:00<?, ?it/s]

Epoch 2 dev accuracy: 0.98
Epoch 2 dev macro f1: 0.83
Epoch 2 dev micro f1: 0.98


Train:   0%|          | 0/12543 [00:00<?, ?it/s]

Epoch 3 training loss: 0.02


Eval:   0%|          | 0/2001 [00:00<?, ?it/s]

Epoch 3 dev accuracy: 0.98
Epoch 3 dev macro f1: 0.84
Epoch 3 dev micro f1: 0.98


Train:   0%|          | 0/12543 [00:00<?, ?it/s]

Epoch 4 training loss: 0.01


Eval:   0%|          | 0/2001 [00:00<?, ?it/s]

Epoch 4 dev accuracy: 0.98
Epoch 4 dev macro f1: 0.83
Epoch 4 dev micro f1: 0.98


Train:   0%|          | 0/12543 [00:00<?, ?it/s]

Epoch 5 training loss: 0.01


Eval:   0%|          | 0/2001 [00:00<?, ?it/s]

Epoch 5 dev accuracy: 0.98
Epoch 5 dev macro f1: 0.82
Epoch 5 dev micro f1: 0.98


Train:   0%|          | 0/12543 [00:00<?, ?it/s]

Epoch 6 training loss: 0.01


Eval:   0%|          | 0/2001 [00:00<?, ?it/s]

Epoch 6 dev accuracy: 0.98
Epoch 6 dev macro f1: 0.84
Epoch 6 dev micro f1: 0.98
3 without improvement; early stopping.


In [None]:
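#Loading the saved model as required. `encoder` and `clf_head` are model
#instances, so the saved weights are loaded into the existing objects:
# encoder.load_state_dict(torch.load('best_encoder.pt', weights_only=True))
# clf_head.load_state_dict(torch.load('best_clf_head.pt', weights_only=True))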

# **TESTING ON THE TEST SETS**

In [None]:
#Test set
# The accuracy returned here is kept, so the value printed below is the
# accuracy on the test set and not the dev accuracy left over from training.
accuracy, f1_scores, macro_f1, _, micro_f1, precision_scores, recall_scores = validate_epoch(test_data, label_to_i, tokeniser, encoder, clf_head, encoder_device, clf_head_device)

print(f'Epoch {epoch_n+1} test accuracy: {accuracy:.2f}')

print(f'Epoch {epoch_n+1} test macro f1: {macro_f1:.2f}')

# Per-label scores, one section per metric
for title, scores in (("F1", f1_scores),
                      ("Precision", precision_scores),
                      ("Recall", recall_scores)):
    print(title)
    for label, score in scores.items():
        print(f"{label}: {score}")

print(f"Span accuracy on test set: {span_accuracy_encoder(test_data)}")

Eval:   0%|          | 0/2077 [00:00<?, ?it/s]

Epoch 6 test accuracy: 0.98
Epoch 6 test macro f1: 0.83
F1
B-LOC: 0.8892508143322475
B-ORG: 0.7221350078492935
B-PER: 0.9082774049217002
I-LOC: 0.6935483870967742
I-ORG: 0.7113594040968343
I-PER: 0.915057915057915
O: 0.9918924685299765
Precision
B-LOC: 0.9191919191919192
B-ORG: 0.7301587301587301
B-PER: 0.9123595505617977
I-LOC: 0.8269230769230769
I-ORG: 0.7318007662835249
I-PER: 0.8618181818181818
O: 0.9911734606856558
Recall
B-LOC: 0.861198738170347
B-ORG: 0.7142857142857143
B-PER: 0.9042316258351893
I-LOC: 0.5972222222222222
I-ORG: 0.6920289855072463
I-PER: 0.9753086419753086
O: 0.9926125202835425
Span accuracy on test set: 0.8791526239768898


In [None]:
#Out of Domain test set
# The accuracy returned here is kept, so the value printed below is the
# accuracy on the OoD set and not a value left over from an earlier cell.
accuracy, f1_scores, macro_f1, _, micro_f1, precision_scores, recall_scores = validate_epoch(OoD_data, label_to_i, tokeniser, encoder, clf_head, encoder_device, clf_head_device)

print(f'Epoch {epoch_n+1} OoD accuracy: {accuracy:.2f}')

print(f'Epoch {epoch_n+1} OoD macro f1: {macro_f1:.2f}')

# Per-label scores, one section per metric
for title, scores in (("F1", f1_scores),
                      ("Precision", precision_scores),
                      ("Recall", recall_scores)):
    print(title)
    for label, score in scores.items():
        print(f"{label}: {score}")

print(f"Span accuracy on OoD set: {span_accuracy_encoder(OoD_data)}")

Eval:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 6 OoD accuracy: 0.98
Epoch 6 OoD macro f1: 0.83
F1
B-LOC: 0.8078175895765473
B-ORG: 0.6130653266331658
B-PER: 0.9391727493917275
I-LOC: 0.7261904761904762
I-ORG: 0.7763157894736842
I-PER: 0.9408284023668639
O: 0.9936533020671373
Precision
B-LOC: 0.75
B-ORG: 0.7484662576687117
B-PER: 0.9484029484029484
I-LOC: 0.7577639751552795
I-ORG: 0.7919463087248322
I-PER: 0.8932584269662921
O: 0.9933747834063806
Recall
B-LOC: 0.8752941176470588
B-ORG: 0.5191489361702127
B-PER: 0.9301204819277108
I-LOC: 0.6971428571428572
I-ORG: 0.7612903225806451
I-PER: 0.99375
O: 0.9939319769517108
Span accuracy on OoD set: 0.779


# **BERT model configuration**

In [None]:
# Show the configuration of the BERT encoder
encoder_config = encoder.config
encoder_config

BertConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

In [None]:
# Show the module structure of the BERT encoder
encoder

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
# Show the module structure of the classification head
clf_head

ClassificationHead(
  (linear): Linear(in_features=768, out_features=768, bias=True)
  (gelu): GELU(approximate='none')
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=768, out_features=7, bias=True)
)

# **Encoder-Decoder - T5-Small**

In [None]:
# NOTE: this cell rebinds `model_tag` and `tokeniser`. `span_accuracy_encoder`
# reads `tokeniser` as a global, so the BERT evaluation cells above cannot be
# re-run after this cell without re-creating the BERT tokeniser first.
model_tag = 'google-t5/t5-small'

# Load the pretrained T5 model (cached under ./hf_cache) onto `device`,
# together with its tokeniser.
model = AutoModelForSeq2SeqLM.from_pretrained(model_tag, cache_dir='./hf_cache').to(device)
tokeniser = AutoTokenizer.from_pretrained(model_tag)

optim = torch.optim.AdamW(model.parameters(), lr=10**(-4)) # T5 paper suggests 10**(-3) for fine-tune lr, but 10**(-4) appears to work better

In [None]:
def process_batch(batch_inputs, batch_labels,
                  tokeniser, model, device,
                  optimiser, max_len=512):
    """Take one optimisation step on a batch of (input text, label text) pairs.

    Args:
        batch_inputs: List of input strings.
        batch_labels: List of target strings, parallel to `batch_inputs`.
        tokeniser: Tokeniser used for both inputs and targets.
        model: Model called with input_ids, attention_mask and labels; its
            output must have a `loss`.
        device: Device the tensors are moved to.
        optimiser: Optimiser that is zeroed before and stepped after the
            backward pass.
        max_len: Maximum length in tokens; longer texts are truncated.

    Returns:
        The loss of the batch as a float.
    """
    optimiser.zero_grad()

    # Inputs and targets are tokenised with the same settings.
    tokeniser_settings = dict(
        return_tensors='pt',
        max_length=max_len,
        padding='longest',
        truncation=True
    )
    encoded_inputs = tokeniser(batch_inputs, **tokeniser_settings)
    target_ids = tokeniser(batch_labels, **tokeniser_settings).input_ids.to(device)
    # Padding positions in the targets are replaced by -100 so that the model
    # is not trained to generate pad tokens.
    target_ids[target_ids == tokeniser.pad_token_id] = -100

    loss = model(
        input_ids=encoded_inputs.input_ids.to(device),
        attention_mask=encoded_inputs.attention_mask.to(device),
        labels=target_ids
    ).loss
    loss.backward()
    optimiser.step()
    return loss.item()

In [None]:
# Each sentence becomes a batch of examples, one per target word, with the
# target word marked by tildes on both sides, e.g.:
# '~ Where ~ is Iguazu ?' -> 'O'
# 'Where ~ is ~ Iguazu ?' -> 'O'
# 'Where is ~ Iguazu ~ ?' -> 'B-LOC'
# etc.
# The batch size therefore controls how many words are analysed at the
# same time.

def prepare_sentence(sentence_array):
    """Build one tilde-marked input string per word of a sentence.

    Args:
        sentence_array: List of [word, label] pairs.

    Returns:
        A (prepared_inputs, labels) tuple: the i-th input is the whole
        sentence with '~' before and after the i-th word, and the i-th label
        is the label of that word.
    """
    words = [word for word, _ in sentence_array]
    labels = [label for _, label in sentence_array]
    prepared_inputs = [
        ' '.join([*words[:i], '~', target, '~', *words[i + 1:]])
        for i, target in enumerate(words)
    ]
    return prepared_inputs, labels

In [None]:
# The output should contain one tilde-marked copy of the sentence per word.
prepare_sentence(paired_data_train[1]) # confirm prepare_sentence works and matches random shuffle above - should read 'It would have been more than one could bear!'

(['~ It ~ would have been more than one could bear !',
  'It ~ would ~ have been more than one could bear !',
  'It would ~ have ~ been more than one could bear !',
  'It would have ~ been ~ more than one could bear !',
  'It would have been ~ more ~ than one could bear !',
  'It would have been more ~ than ~ one could bear !',
  'It would have been more than ~ one ~ could bear !',
  'It would have been more than one ~ could ~ bear !',
  'It would have been more than one could ~ bear ~ !',
  'It would have been more than one could bear ~ ! ~'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])

In [None]:
def train_epoch(train_inputs, batch_size,
                tokeniser, model, device, optimizer):
    """Train the sequence-to-sequence model for one epoch.

    Every sentence is expanded into one example per word by
    `prepare_sentence`; the examples are processed in batches of at most
    `batch_size` by `process_batch`, which takes one optimiser step per batch.

    Args:
        train_inputs: List of sentences, each a list of [word, label] pairs.
        batch_size: Maximum number of examples (words) per batch.
        tokeniser, model, device, optimizer: Passed on to `process_batch`.

    Returns:
        The mean over sentences of the mean batch loss of each sentence.
    """
    model.train()

    n_steps = len(train_inputs)
    epoch_losses = torch.zeros(n_steps)
    for step_n in tqdm(range(n_steps), leave=False, desc='Train'):
        prepared_inputs, labels = prepare_sentence(train_inputs[step_n])
        # Split the sentence in batches if it is long enough
        n_batches = ceil(len(prepared_inputs) / batch_size)
        if n_batches == 0:
            # An empty sentence has nothing to train on; its loss stays 0.
            continue
        # A sentence is repeated once per word, so the losses of its batches
        # are accumulated and then averaged.
        sentence_losses_accum = 0.0
        # The batch counter needs its own name: reusing `step_n` here would
        # overwrite the sentence index and the sentence loss below would be
        # stored in the wrong slot of `epoch_losses`.
        for batch_n in range(n_batches):
            lo = batch_n * batch_size
            hi = lo + batch_size
            batch_texts = prepared_inputs[lo:hi]
            batch_labels = labels[lo:hi]
            loss = process_batch(batch_texts, batch_labels,
                                 tokeniser, model, device,
                                 optimizer)
            sentence_losses_accum += loss
        epoch_losses[step_n] = sentence_losses_accum / n_batches
    return epoch_losses.mean().item()

In [None]:
def get_class_prediction(prompt, tokeniser, model, device, max_len=512):
    """Generate a label for one prompt and return its first word.

    Args:
        prompt: Input string.
        tokeniser: Tokeniser used to encode the prompt and decode the output.
        model: Model with a `generate` method.
        device: Device the input tensors are moved to.
        max_len: Maximum prompt length in tokens; longer prompts are truncated.

    Returns:
        The first whitespace-separated word of the decoded output, or None if
        the decoded output is empty.
    """
    encoded = tokeniser(
        prompt,
        return_tensors='pt',
        max_length=max_len,
        truncation=True
    )
    # Only one label is needed; up to 4 new tokens are allowed just in case
    # of a random special token.
    generated = model.generate(
        input_ids=encoded.input_ids.to(device),
        attention_mask=encoded.attention_mask.to(device),
        max_new_tokens=4
    )
    # squeeze() removes the batch dimension before decoding.
    decoded = tokeniser.decode(
        generated.squeeze(),
        skip_special_tokens=True
    ).strip()
    # Take the first word; an empty output gives None.
    return decoded.split()[0] if decoded else None

In [None]:
def validate_epoch(dev_inputs, tokeniser, model, device, max_len=512):
    """Evaluate the sequence-to-sequence model word by word.

    Note that this definition replaces the earlier `validate_epoch` that was
    defined for the BERT encoder.

    Args:
        dev_inputs: List of sentences, each a list of [word, label] pairs.
        tokeniser, model, device: Passed on to `get_class_prediction`.
        max_len: Maximum prompt length in tokens.

    Returns:
        A tuple (accuracy, f1_scores, macro_f1): the share of words with a
        correct prediction, a dict with the one-vs-rest F1 of every gold
        label, and the macro-averaged F1 over all words.
    """
    model.eval()
    n_steps = len(dev_inputs)
    epoch_hits = []
    all_predictions = []
    all_labels = []
    for step_n in tqdm(range(n_steps), leave=False, desc='Validate'):
        prepared_inputs, labels = prepare_sentence(dev_inputs[step_n])
        with torch.no_grad():
            for input_sentence, gold_label in zip(prepared_inputs, labels):
                predicted_label = get_class_prediction(
                    input_sentence, tokeniser, model, device,
                    max_len=max_len)
                if predicted_label is None:
                    # An empty generation is scored as a (wrong) empty-string
                    # label: the macro F1 below cannot be computed on a mix
                    # of None and string labels.
                    predicted_label = ''
                epoch_hits.append(int(predicted_label == gold_label))
                all_predictions.append(predicted_label)
                all_labels.append(gold_label)

    # Converted once instead of once per label.
    gold = np.array(all_labels)
    predicted = np.array(all_predictions)
    # Sorted so that the order of the returned dict is the same on every run.
    f1_scores = {
        label: f1_score(gold == label, predicted == label, average="binary")
        for label in sorted(set(all_labels))
    }

    macro_f1 = f1_score(all_labels, all_predictions, average="macro")

    return sum(epoch_hits) / len(epoch_hits), f1_scores, macro_f1

In [None]:
#Method to calculate f1 scores for encoder-decoder
def calculate_f1_scores(data, tokeniser, model, device):
    """Compute per-label and macro F1 of the sequence-to-sequence model.

    Args:
        data: List of sentences, each a list of [word, label] pairs.
        tokeniser, model, device: Passed on to `get_class_prediction`.

    Returns:
        A tuple (f1_scores, macro_f1): a dict with the one-vs-rest F1 of
        every gold label, and the macro-averaged F1 over all words.
    """
    model.eval()
    all_predictions = []
    all_labels = []
    for sentence in tqdm(data, leave=False, desc='F1 scores'):
        prepared_inputs, labels = prepare_sentence(sentence)
        with torch.no_grad():
            for input_sentence, gold_label in zip(prepared_inputs, labels):
                predicted_label = get_class_prediction(input_sentence, tokeniser, model, device)
                if predicted_label is None:
                    # An empty generation is scored as a (wrong) empty-string
                    # label: the macro F1 below cannot be computed on a mix
                    # of None and string labels.
                    predicted_label = ''
                all_predictions.append(predicted_label)
                all_labels.append(gold_label)

    # Converted once instead of once per label.
    gold = np.array(all_labels)
    predicted = np.array(all_predictions)
    # Sorted so that the order of the returned dict is the same on every run.
    f1_scores = {
        label: f1_score(gold == label, predicted == label, average="binary")
        for label in sorted(set(all_labels))
    }

    macro_f1 = f1_score(all_labels, all_predictions, average="macro")
    return f1_scores, macro_f1

In [None]:
#Method to calculate the span accuracy for encoder-decoder
def span_accuracy(data):
  """Return the share of sentences in which every word is labelled correctly.

  Uses the module-level `model`, `tokeniser` and `device`.

  Args:
    data: List of sentences, each a list of [word, label] pairs.

  Returns:
    Number of fully correct sentences divided by the number of sentences.
  """
  model.eval()
  correct_sentences = 0
  for sentence in data:
      prepared_inputs, labels = prepare_sentence(sentence)
      with torch.no_grad():
          # all() stops at the first wrong word, so the remaining words of
          # that sentence are not predicted.
          sentence_correct = all(
              get_class_prediction(input_sentence, tokeniser, model, device) == gold_label
              for input_sentence, gold_label in zip(prepared_inputs, labels)
          )
      correct_sentences += int(sentence_correct)

  return correct_sentences / len(data)

In [None]:
# TRAINING #

#Ensure using correct data based on label choice further above
if THIS_MANY_LABELS == three:
    training_data = three_data_train
    dev_data = three_data_dev
    test_data = three_data_test
    OoD_data = three_data_OoD
elif THIS_MANY_LABELS == seven:
    training_data = paired_data_train
    dev_data = paired_data_dev
    test_data = paired_data_test
    OoD_data = paired_data_OoD

print(f'You are running this with {n_classes} labels, are you sure?')
print(f'Your labels are {labels}')

n_epochs = 4
batch_size = 256 # batch is just how many words so can be large

#Early stopping set up, based on improving macro_f1
best_f1 = 0
last_epoch_with_dev_improvement = 0
n_epochs_without_improvement = 0
early_stopping_threshold = 2

for epoch_n in tqdm(range(n_epochs)):
    epoch_loss = train_epoch(training_data, batch_size, tokeniser, model, device, optim)
    print(f'Epoch {epoch_n+1} loss:', round(epoch_loss, 2))

    epoch_dev_accuracy, _, macro_f1 = validate_epoch(dev_data, tokeniser, model, device)
    print(f'Epoch {epoch_n+1} dev accuracy: {epoch_dev_accuracy:.2f}')
    print(f'Epoch {epoch_n+1} dev macro F1: {macro_f1:.2f}')

    if macro_f1 > best_f1:
        best_f1 = macro_f1
        last_epoch_with_dev_improvement = epoch_n
        print('Saving the model.')
        # torch.save overwrites the checkpoint file, so nothing needs to be
        # deleted first (deleting '*.pt' would also remove the BERT
        # checkpoints best_encoder.pt and best_clf_head.pt).
        torch.save(model.state_dict(), 'best_t5_model.pt')
    else:
        n_epochs_without_improvement = epoch_n - last_epoch_with_dev_improvement
        if n_epochs_without_improvement == early_stopping_threshold:
            print(f'{n_epochs_without_improvement} without improvement; early stopping.')
            break

# Continue with the weights of the best epoch rather than those of the last
# one. best_f1 > 0 means the checkpoint was written during this run.
if best_f1 > 0:
    model.load_state_dict(torch.load('best_t5_model.pt', weights_only=True))
    print(f'Restored the best checkpoint (epoch {last_epoch_with_dev_improvement+1}, dev macro F1 {best_f1:.2f}).')

#loss is not a good indicator of how well the model is performing because of the way it is averaging over multiple sentences
#dev accuaracy is not a good indicator either because if it's just predicting O it will get amazing accuracy
#Macro f1 is a better measure to check performance because it solves for the above issues

You are running this with 7 labels, are you sure?
Your labels are ['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']


  0%|          | 0/4 [00:00<?, ?it/s]

Train:   0%|          | 0/12543 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1 loss: 0.0


Validate:   0%|          | 0/2001 [00:00<?, ?it/s]

Epoch 1 dev accuracy: 0.98
Epoch 1 dev macro F1: 0.79
Saving the model.


Train:   0%|          | 0/12543 [00:00<?, ?it/s]

Epoch 2 loss: 0.0


Validate:   0%|          | 0/2001 [00:00<?, ?it/s]

Epoch 2 dev accuracy: 0.98
Epoch 2 dev macro F1: 0.80
Saving the model.


Train:   0%|          | 0/12543 [00:00<?, ?it/s]

Epoch 3 loss: 0.0


Validate:   0%|          | 0/2001 [00:00<?, ?it/s]

Epoch 3 dev accuracy: 0.98
Epoch 3 dev macro F1: 0.80
Saving the model.


Train:   0%|          | 0/12543 [00:00<?, ?it/s]

Epoch 4 loss: 0.0


Validate:   0%|          | 0/2001 [00:00<?, ?it/s]

Epoch 4 dev accuracy: 0.98
Epoch 4 dev macro F1: 0.82
Saving the model.


# **Testing on the test sets**

In [None]:
# In-domain test split: sentence-level exact-match score first, then F1 metrics.
test_span_accuracy = span_accuracy(test_data)
print(f"Span Accuracy on test set: {test_span_accuracy}")

f1_scores, macro_f1 = calculate_f1_scores(test_data, tokeniser, model, device)

# One line per label, in the order the labels appear in the returned dict.
print("F1 scores per label on test data:")
for label in f1_scores:
    score = f1_scores[label]
    print(f"{label}: {score}")
print(f"Macro F1 score on test data: {macro_f1}")

Span Accuracy on test set: 0.8526721232546943


F1 scores:   0%|          | 0/2077 [00:00<?, ?it/s]

F1 scores per label on test data:
B-ORG: 0.6416275430359938
O: 0.9907904015043056
I-PER: 0.9046653144016227
B-LOC: 0.8274809160305343
B-PER: 0.9004237288135594
I-ORG: 0.5971943887775552
I-LOC: 0.49696969696969695
Macro F1 score on test data: 0.7655931413618955


In [None]:
# Out-of-domain split: sentence-level exact-match score first, then F1 metrics.
ood_span_accuracy = span_accuracy(OoD_data)
print(f"Span Accuracy on Out of Domain set: {ood_span_accuracy}")

f1_scores, macro_f1 = calculate_f1_scores(OoD_data, tokeniser, model, device)

# One line per label, in the order the labels appear in the returned dict.
print("F1 scores per label on Out of Domain data:")
for label in f1_scores:
    score = f1_scores[label]
    print(f"{label}: {score}")
print(f"Macro F1 score on Out of Domain data: {macro_f1}")

Span Accuracy on Out of Domain set: 0.733


F1 scores:   0%|          | 0/1000 [00:00<?, ?it/s]

F1 scores per label on Out of Domain data:
B-ORG: 0.6415094339622641
O: 0.9919268305145368
I-PER: 0.8942598187311178
B-LOC: 0.7934065934065934
B-PER: 0.8754208754208754
I-ORG: 0.7133333333333334
I-LOC: 0.7062146892655368
Macro F1 score on Out of Domain data: 0.8022959392334653


In [None]:
# Show the model's configuration object as this cell's output.
model.config

T5Config {
  "_attn_implementation_autoset": true,
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 3

In [None]:
# Show the model's repr (its nested module hierarchy) as this cell's output.
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop