In [None]:
# Following this Course:
# https://huggingface.co/learn/nlp-course/en/chapter7/2

In [None]:
!pip install seqeval

In [None]:
!pip install evaluate

In [None]:
!pip install transformers[torch]

In [None]:
!pip install accelerate -U

In [None]:
# Mount Google Drive at /content/drive (Colab only); the project paths used
# below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys
import os

# Project folder on the mounted Drive; presumably where the local
# nlp_project_functions module lives (it is imported right after this cell).
path = "/content/drive/MyDrive/NLP_Project_New"

# Make the folder importable. The membership check keeps repeated runs of this
# cell from stacking duplicate entries in sys.path.
project_dir = os.path.abspath(path)
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [None]:
import nlp_project_functions as functions

from transformers import BertTokenizerFast
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import pipeline

import torch
import numpy as np
import evaluate
import pandas as pd

from sklearn.model_selection import train_test_split

In [None]:
# Load the data that is later split into a training and a validation part.
# `texts` is used below as pre-split word lists (is_split_into_words=True) and
# `tags` as one tag string per word; the exact file parsing is done by the
# project helper -- TODO confirm its return shape against nlp_project_functions.
texts, tags = functions.read_conll_data('/content/drive/MyDrive/NLP_Project_New/data/train_test_val/train.tsv')

In [None]:
# split data: hold out 20% of the sentences for validation.
# The fixed random_state makes the split -- and therefore every validation
# metric computed on it -- reproducible across kernel restarts.
train_texts, val_texts, train_tags, val_tags = train_test_split(
    texts, tags, test_size=.2, random_state=42
)

In [None]:
# label mappings: the list fixes the id order, the dict is its inverse lookup
label_names_list = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]
label_names_dict = {name: index for index, name in enumerate(label_names_list)}

In [None]:
# Checkpoint identifier shared by the tokenizer and the model loaded below.
model_checkpoint = "dbmdz/bert-base-german-cased"

In [None]:
# tokenize
tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)

# Both splits are encoded with identical settings. The offset mappings are
# requested because encode_tags uses them to locate the first piece of each word.
# NOTE(review): truncation is off, so sequence length is bounded only by the data.
tokenizer_options = dict(
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=False,
)
train_encodings = tokenizer(train_texts, **tokenizer_options)
val_encodings = tokenizer(val_texts, **tokenizer_options)

In [None]:
# Forward (label -> id) and reverse (id -> label) lookups in list order.
unique_labels = label_names_list
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(zip(label2id.values(), label2id.keys()))

In [None]:
def encode_tags(tags, encodings):
    """Spread word-level tags over the token positions of each encoded document.

    Every token position starts out as -100. Positions whose offset pair begins
    at 0 and ends at a non-zero value receive the document's tag ids, in order.

    Args:
        tags: one sequence of tag strings per document; each tag is looked up
            in the module-level ``label2id`` mapping.
        encodings: object exposing ``offset_mapping`` with one sequence of
            (start, end) pairs per document.

    Returns:
        A list holding one list of ints per document.
    """
    tag_ids_per_doc = []
    for doc in tags:
        tag_ids_per_doc.append([label2id[tag] for tag in doc])

    aligned = []
    for tag_ids, offsets in zip(tag_ids_per_doc, encodings.offset_mapping):
        offsets_arr = np.array(offsets)
        starts = offsets_arr[:, 0]
        ends = offsets_arr[:, 1]

        # -100 everywhere, then overwrite the positions selected by the mask
        token_labels = np.full(len(offsets), -100, dtype=int)
        takes_label = (starts == 0) & (ends != 0)
        token_labels[takes_label] = tag_ids

        aligned.append(token_labels.tolist())
    return aligned

# Token-level label ids for both splits; must run while the encodings still
# carry their offset_mapping (it is popped further down).
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [None]:
# create dataset

class SermonDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with per-token label ids."""

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the label lists."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, with 'labels' added last."""
        item = {}
        for key, values in self.encodings.items():
            item[key] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# we don't want to pass this to the model; the None default keeps the cell
# re-runnable after the key has already been removed
train_encodings.pop("offset_mapping", None)
val_encodings.pop("offset_mapping", None)

train_dataset = SermonDataset(train_encodings, train_labels)
val_dataset = SermonDataset(val_encodings, val_labels)

In [None]:
# model evaluation: the seqeval metric object is used by compute_metrics below
metric = evaluate.load("seqeval")

In [None]:
def compute_metrics(eval_preds):
    """Turn raw evaluation output into per-entity and overall scores.

    Args:
        eval_preds: pair ``(logits, labels)``. ``logits`` is arg-maxed over its
            last axis; ``labels`` holds label ids, with -100 marking positions
            that are skipped.

    Returns:
        dict with precision/recall/f1 for PER and LOC, followed by the overall
        precision, recall, f1 and accuracy reported by ``metric``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    results = {}
    # The report may lack an entry for an entity type (presumably when the type
    # shows up in neither references nor predictions). Score such a type as 0.0
    # instead of aborting the training run with a KeyError.
    for entity_type in ("PER", "LOC"):
        entity_scores = all_metrics.get(entity_type, {})
        for score_name in ("precision", "recall", "f1"):
            results[f"{entity_type.lower()}_{score_name}"] = entity_scores.get(score_name, 0.0)

    for overall_key in ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy"):
        results[overall_key] = all_metrics[overall_key]

    return results

In [None]:
# define the model: load the checkpoint and hand it both label mappings
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/NLP_Project_New/models/bert_base_german_cased_finetuned_3ep",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    # evaluation and checkpointing; save_steps (600) is a whole multiple of eval_steps (200)
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=600,
    save_total_limit=5,
    load_best_model_at_end=True,
    # logging
    logging_strategy="steps",
    logging_steps=200,
    logging_dir='/content/drive/MyDrive/NLP_Project_New/logs/bert_base_german_cased_finetuned_3ep',
)

In [None]:
# Wire the model, the training arguments, both datasets and the metric
# callback into one Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
# Run fine-tuning; args asks for an evaluation on val_dataset every 200 steps.
trainer.train()

Step,Training Loss,Validation Loss,Per Precision,Per Recall,Per F1,Loc Precision,Loc Recall,Loc F1,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
200,0.4417,0.052198,0.480631,0.481322,0.480976,0.0,0.0,0.0,0.480631,0.366922,0.416149,0.984861
400,0.0494,0.037054,0.707666,0.603448,0.651415,0.662162,0.451613,0.536986,0.698584,0.56736,0.626171,0.989408
600,0.0411,0.033521,0.707071,0.704023,0.705544,0.627204,0.573733,0.599278,0.689288,0.673056,0.681075,0.990125
800,0.0382,0.03237,0.752294,0.647989,0.696256,0.567347,0.640553,0.601732,0.698638,0.646221,0.671408,0.990078
1000,0.0376,0.029067,0.751799,0.750718,0.751258,0.609091,0.617512,0.613272,0.717486,0.719058,0.718271,0.991059
1200,0.0303,0.029047,0.746971,0.752874,0.749911,0.647355,0.592166,0.618532,0.725,0.714677,0.719801,0.991154
1400,0.026,0.029944,0.694002,0.814655,0.749504,0.570909,0.723502,0.638211,0.663004,0.79299,0.722195,0.9907
1600,0.0301,0.027939,0.739218,0.788075,0.762865,0.61987,0.66129,0.639911,0.710837,0.757941,0.733634,0.99137
1800,0.0222,0.029126,0.700887,0.79454,0.744781,0.614894,0.665899,0.639381,0.681152,0.763965,0.720186,0.990843
2000,0.0237,0.027136,0.774628,0.785201,0.779879,0.557692,0.735023,0.634195,0.712052,0.773275,0.741402,0.991633


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=3516, training_loss=0.04951901616903702, metrics={'train_runtime': 1703.7336, 'train_samples_per_second': 33.019, 'train_steps_per_second': 2.064, 'total_flos': 1.346534636586048e+16, 'train_loss': 0.04951901616903702, 'epoch': 3.0})

In [None]:
# Persist the trainer's model to the same directory that args uses as output_dir.
trainer.save_model("/content/drive/MyDrive/NLP_Project_New/models/bert_base_german_cased_finetuned_3ep")

In [None]:
# Reload the fine-tuned weights from disk; this rebinds `model`, so inference
# below runs on the saved artefact rather than on the in-memory training object.
model = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/NLP_Project_New/models/bert_base_german_cased_finetuned_3ep")

In [None]:
# Inference pipeline around the reloaded model. Its results are consumed below
# through the "start" and "entity_group" fields of each returned entry.
token_classifier = pipeline(
    task="token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="first",
)

In [None]:
# Held-out test split, read with the same project helper as the training file.
# test_texts is joined word-by-word into sentences below, so it is presumably a
# list of word lists -- TODO confirm against nlp_project_functions.
test_texts, test_labels = functions.read_conll_data('/content/drive/MyDrive/NLP_Project_New/data/train_test_val/test.tsv')

In [None]:
# Re-assemble each word list into a single space-separated sentence string.
all_sentences = [" ".join(text) for text in test_texts]

In [None]:
# One pipeline call per sentence; results stay aligned with all_sentences by position.
all_predictions = [token_classifier(sentence) for sentence in all_sentences]

In [None]:
def _entity_covering(predictions: list, position: int):
  """Return the entity group of the first prediction whose span covers ``position``, else "O"."""
  for pred in predictions:
    start = pred.get("start")
    if start is None:
      continue
    end = pred.get("end")
    # An exact start match always counts. With a known end, any position
    # inside the span counts too, which catches the later words of a grouped
    # multi-word entity.
    if start == position or (end is not None and start < position < end):
      return pred.get("entity_group")
  return "O"


def list_transformer_results(sentence: str, predictions: list) -> tuple:
  """Project span-level pipeline predictions back onto the words of a sentence.

  The sentence is split on single spaces and the character offset of each word
  is tracked. A word receives the ``entity_group`` of the first prediction whose
  ``start``/``end`` span contains the word's first character; all other words
  are labelled "O". Matching on containment rather than only on equal start
  offsets keeps every word of a multi-word entity labelled, not just its first.

  Args:
    sentence: words joined by single spaces.
    predictions: dicts with "start", "entity_group" and, when available, "end".

  Returns:
    ``(words, results)``: the split words and one label per word.
  """
  words = sentence.split(" ")

  results = []

  running_char = 0
  for word in words:
    results.append(_entity_covering(predictions, running_char))
    # +1 accounts for the single space that separated the words
    running_char += len(word) + 1

  return words, results

In [None]:
# Flatten words, gold tags and predicted labels into three parallel lists.
tokens, labels, predictions = [], [], []
for sentence_text, sentence_entities, gold_tags in zip(all_sentences, all_predictions, test_labels):
  sentence_words, word_labels = list_transformer_results(sentence_text, sentence_entities)
  tokens += sentence_words
  labels += gold_tags
  predictions += word_labels

In [None]:
# One row per word: the word itself, its gold tag and the model's label.
comparison_columns = {
    "TOKEN": tokens,
    "NER": labels,
    "bert_base_german_cased_finetuned_3ep": predictions,
}
prediction_comparison = pd.DataFrame(comparison_columns)

In [None]:
# Quick look at the first rows of the comparison table.
prediction_comparison.head()

Unnamed: 0,TOKEN,NER,bert_base_german_cased_finetuned_3ep
0,Darum,O,O
1,auch,O,O
2,der,O,O
3,selige,O,O
4,Lutherus,PER,PER


In [None]:
# Collapse BIO tags to their bare entity type ("B-PER"/"I-PER" -> "PER"); "O" stays "O".
simple_ne = {tag: tag.split("-")[-1] for tag in ("B-PER", "I-PER", "B-LOC", "I-LOC", "O")}

In [None]:
# Map the gold BIO tags onto bare entity types. Values without an entry in
# simple_ne (e.g. already-collapsed "PER"/"LOC" when this cell is run a second
# time) pass through unchanged instead of raising KeyError.
prediction_comparison["NER"] = prediction_comparison["NER"].apply(lambda x: simple_ne.get(x, x))

In [None]:
# Show the comparison table after the NER column has been collapsed.
prediction_comparison

Unnamed: 0,TOKEN,NER,bert_base_german_cased_finetuned_3ep
0,Darum,O,O
1,auch,O,O
2,der,O,O
3,selige,O,O
4,Lutherus,PER,PER
...,...,...,...
157045,Verbot,O,O
157046,in,O,O
157047,sich,O,O
157048,halten,O,O
