In [1]:
# This notebook follows the Flair tutorial on training a sequence tagger:
# https://flairnlp.github.io/docs/tutorial-training/how-to-train-sequence-tagger

In [None]:
!pip install flair

In [3]:
# Mount Google Drive at /content/drive so the project folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import sys
import os

# Root folder of the project on the mounted Google Drive; every data, log and
# model location in this notebook is derived from it.
path = "/content/drive/MyDrive/NLP_Project_New"

# Make the project folder importable (presumably where nlp_project_functions lives).
# Only append when missing, so re-running this cell does not stack duplicate entries.
if os.path.abspath(path) not in sys.path:
    sys.path.append(os.path.abspath(path))

In [5]:
import nlp_project_functions as functions
from flair.data import Corpus
from flair.datasets import ColumnCorpus

from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

from flair.data import Sentence

import pandas as pd

from datetime import datetime
import os.path

import logging

# Log every run both to a file inside the project folder and to the notebook output.
logfile = f"{path}/logs/run_model.log"
# FileHandler does not create missing parent folders, so make sure the folder exists.
os.makedirs(os.path.dirname(logfile), exist_ok=True)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

file_handler = logging.FileHandler(logfile)
file_handler.setFormatter(formatter)
file_handler.setLevel(logging.INFO)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
stream_handler.setLevel(logging.INFO)

logger = logging.getLogger("run_model.log")
logger.setLevel(logging.INFO)
# Without this, each record is also passed on to the root logger, which is where
# the second, differently formatted copy of every message in the output comes from.
logger.propagate = False

# getLogger returns the same object on every call, so handlers from an earlier
# execution of this cell must be removed or each message would be emitted repeatedly.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

In [6]:
# define columns: column 0 of the data files is read as the token text, column 1 as the NER tag
columns = {0: 'text', 1: 'ner'}

# this is the folder in which train, test and dev files reside
data_folder = f'{path}/data/train_test_val/'

# init a corpus using column format and data folder. Only the train file is named
# here, so no dev/test file is read from disk; the log below shows Flair carving
# its dev and test splits out of the train split instead.
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='train.tsv')

2024-03-26 09:10:02,527 Reading data from /content/drive/MyDrive/NLP_Project_New/data/train_test_val
2024-03-26 09:10:02,528 Train: /content/drive/MyDrive/NLP_Project_New/data/train_test_val/train.tsv
2024-03-26 09:10:02,529 Dev: None
2024-03-26 09:10:02,531 Test: None
2024-03-26 09:10:10,625 No test split found. Using 0% (i.e. 2344 samples) of the train split as test data
2024-03-26 09:10:10,641 No dev split found. Using 0% (i.e. 2110 samples) of the train split as dev data


In [7]:
# Build the dictionary of NER labels that occur in the corpus, without an extra
# unknown-label entry (add_unk=False), and print it for inspection.
label_type = 'ner'
label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
print(label_dict)

2024-03-26 09:10:12,154 Computing label dictionary. Progress:


10it [00:00, 6108.80it/s]
18986it [00:00, 45525.99it/s]

2024-03-26 09:10:12,634 Dictionary created for label 'ner' with 2 values: PER (seen 5909 times), LOC (seen 1761 times)
Dictionary with 2 tags: PER, LOC





In [None]:
# 5. initialize sequence tagger
# Loads the pretrained tagger "dbmdz/flair-historic-ner-onb"; this same object is
# fine-tuned below and later used for all predictions.
pretrained_model = SequenceTagger.load("dbmdz/flair-historic-ner-onb")

In [9]:
# 6. initialize trainer
# Binds the loaded tagger to the corpus created above.
trainer : ModelTrainer = ModelTrainer(pretrained_model, corpus)

In [10]:
# Fine-tune the loaded tagger for 3 epochs; the first argument is the output
# folder for this training run.
# NOTE(review): save_final_model=False presumably means no final model file is
# written, so the prediction cells below depend on the in-memory
# `pretrained_model` from this kernel session — confirm before restarting.
trainer.fine_tune(f'{path}/models/flair-historic-ner-lft-finetuned-3ep_2e-5',
                  learning_rate = 2e-5,
                  mini_batch_size=16,
                  weight_decay=0.01,
                  max_epochs=3,
                  save_final_model=False)

2024-03-26 09:10:39,331 ----------------------------------------------------------------------------------------------------
2024-03-26 09:10:39,332 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): BytePairEmbeddings(model=0-bpe-de-100000-300)
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.1, inplace=False)
        (encoder): Embedding(134, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=134, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.1, inplace=False)
        (encoder): Embedding(134, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=134, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4696, out_features=4696, bias=True)
  (rnn): LSTM(4696, 512,

100%|██████████| 132/132 [00:36<00:00,  3.58it/s]

2024-03-26 09:16:43,419 DEV : loss 0.03176604583859444 - f1-score (micro avg)  0.6111
2024-03-26 09:16:43,471 ----------------------------------------------------------------------------------------------------





2024-03-26 09:17:15,289 epoch 2 - iter 118/1187 - loss 0.04186360 - time (sec): 31.82 - samples/sec: 1598.81 - lr: 0.000014 - momentum: 0.000000
2024-03-26 09:17:47,364 epoch 2 - iter 236/1187 - loss 0.03972006 - time (sec): 63.89 - samples/sec: 1596.56 - lr: 0.000013 - momentum: 0.000000
2024-03-26 09:18:20,119 epoch 2 - iter 354/1187 - loss 0.03984297 - time (sec): 96.65 - samples/sec: 1586.90 - lr: 0.000013 - momentum: 0.000000
2024-03-26 09:18:52,316 epoch 2 - iter 472/1187 - loss 0.03878738 - time (sec): 128.84 - samples/sec: 1587.65 - lr: 0.000012 - momentum: 0.000000
2024-03-26 09:19:25,004 epoch 2 - iter 590/1187 - loss 0.03845291 - time (sec): 161.53 - samples/sec: 1585.92 - lr: 0.000011 - momentum: 0.000000
2024-03-26 09:19:59,249 epoch 2 - iter 708/1187 - loss 0.03805330 - time (sec): 195.78 - samples/sec: 1575.86 - lr: 0.000010 - momentum: 0.000000
2024-03-26 09:20:31,562 epoch 2 - iter 826/1187 - loss 0.03769214 - time (sec): 228.09 - samples/sec: 1580.61 - lr: 0.000010 - 

100%|██████████| 132/132 [00:37<00:00,  3.50it/s]

2024-03-26 09:22:49,293 DEV : loss 0.029416603967547417 - f1-score (micro avg)  0.6433
2024-03-26 09:22:49,347 ----------------------------------------------------------------------------------------------------





2024-03-26 09:23:20,642 epoch 3 - iter 118/1187 - loss 0.03155002 - time (sec): 31.29 - samples/sec: 1610.69 - lr: 0.000007 - momentum: 0.000000
2024-03-26 09:23:52,289 epoch 3 - iter 236/1187 - loss 0.03324761 - time (sec): 62.94 - samples/sec: 1624.39 - lr: 0.000006 - momentum: 0.000000
2024-03-26 09:24:24,911 epoch 3 - iter 354/1187 - loss 0.03413439 - time (sec): 95.56 - samples/sec: 1609.56 - lr: 0.000005 - momentum: 0.000000
2024-03-26 09:24:58,977 epoch 3 - iter 472/1187 - loss 0.03447898 - time (sec): 129.63 - samples/sec: 1590.93 - lr: 0.000004 - momentum: 0.000000
2024-03-26 09:25:29,078 epoch 3 - iter 590/1187 - loss 0.03481413 - time (sec): 159.73 - samples/sec: 1612.97 - lr: 0.000004 - momentum: 0.000000
2024-03-26 09:26:01,953 epoch 3 - iter 708/1187 - loss 0.03460518 - time (sec): 192.60 - samples/sec: 1606.09 - lr: 0.000003 - momentum: 0.000000
2024-03-26 09:26:34,235 epoch 3 - iter 826/1187 - loss 0.03447727 - time (sec): 224.89 - samples/sec: 1604.69 - lr: 0.000002 - 

100%|██████████| 132/132 [00:38<00:00,  3.42it/s]

2024-03-26 09:28:54,590 DEV : loss 0.02878720499575138 - f1-score (micro avg)  0.6546
2024-03-26 09:28:54,647 ----------------------------------------------------------------------------------------------------
2024-03-26 09:28:54,649 Testing using last state of model ...



100%|██████████| 147/147 [00:38<00:00,  3.81it/s]

2024-03-26 09:29:33,277 
Results:
- F-score (micro) 0.6487
- F-score (macro) 0.6269
- Accuracy 0.4833

By class:
              precision    recall  f1-score   support

         PER     0.7667    0.5913    0.6677       756
         LOC     0.7438    0.4837    0.5862       246

   micro avg     0.7618    0.5649    0.6487      1002
   macro avg     0.7552    0.5375    0.6269      1002
weighted avg     0.7611    0.5649    0.6477      1002

2024-03-26 09:29:33,279 ----------------------------------------------------------------------------------------------------





{'test_score': 0.6487106017191977}

In [23]:
# Identify this evaluation run by model name plus the current date and time.
now = datetime.now()
day = now.strftime("%d-%m")
timestamp = now.strftime("%m-%d-%H-%M-%S")

model_name = "dbmdz/flair-historic-ner-onb_finetuned_3ep_2e-5"
model_checkpoint = model_name

# The attempt string is later used as the column name for this run's predictions.
attempt = f"{model_name}--{timestamp}"

logger.info(f"Initiated model test. Attempt: {attempt}")

2024-03-26 09:37:54,704 - run_model.log - INFO - Initiated model test. Attempt: dbmdz/flair-historic-ner-onb_finetuned_3ep_2e-5--03-26-09-37-54
INFO:run_model.log:Initiated model test. Attempt: dbmdz/flair-historic-ner-onb_finetuned_3ep_2e-5--03-26-09-37-54


In [12]:
def create_checksum(df):
    """Build a lightweight fingerprint from the edge rows of a token table.

    Concatenates the string values found in the "TOKEN" column of the first
    five and the last five rows. Non-string cells (e.g. NaN read back from an
    empty field) are skipped. Two tables with the same fingerprint are treated
    by the caller as describing the same data split.

    Args:
        df: DataFrame with a "TOKEN" column.

    Returns:
        The concatenated tokens as a single string ("" for an empty frame).
    """
    token_column = df["TOKEN"]
    # head/tail clamp to the rows that exist, so frames shorter than five rows
    # no longer raise IndexError or wrap around through negative positions.
    edge_tokens = list(token_column.head(5)) + list(token_column.tail(5))
    return "".join(token for token in edge_tokens if isinstance(token, str))

In [13]:
# Read the test file into parallel collections of texts and labels. The location
# is derived from `path`, like the other project files, instead of repeating the
# absolute Drive path.
test_texts, test_labels = functions.read_conll_data(f'{path}/data/train_test_val/test.tsv')

In [14]:
# Re-join the tokens of every test sentence into one space-separated string.
all_sentences = [" ".join(text) for text in test_texts]

In [15]:
# Tag the test sentences one at a time and keep each predicted span as a dict.
all_predictions = []
total = len(all_sentences)
for idx, item in enumerate(all_sentences):

  # progress report every 500 sentences
  if idx % 500 == 0:
    print(f"On prediction {idx} ({((idx/total)*100):.2f}% done).")

  sentence = Sentence(item)
  pretrained_model.predict(sentence)

  # a sentence without any predicted span contributes an empty list
  all_predictions.append([span.to_dict() for span in sentence.get_spans('ner')])

On prediction 0 (0.00% done).
On prediction 500 (8.53% done).
On prediction 1000 (17.07% done).
On prediction 1500 (25.60% done).
On prediction 2000 (34.14% done).
On prediction 2500 (42.67% done).
On prediction 3000 (51.20% done).
On prediction 3500 (59.74% done).
On prediction 4000 (68.27% done).
On prediction 4500 (76.80% done).
On prediction 5000 (85.34% done).
On prediction 5500 (93.87% done).


In [16]:
def list_flair_results(sentence: str, predictions: list) -> tuple:
    """Project span predictions onto the space-separated words of a sentence.

    Args:
        sentence: Text whose words are separated by single spaces.
        predictions: Span dicts carrying "start_pos", optionally "end_pos"
            (character offsets into ``sentence``) and "labels", a list of
            dicts with a "value" entry.

    Returns:
        A ``(words, results)`` tuple where ``results[i]`` is the label value of
        the first span covering ``words[i]``, or "O" when no span covers it.
    """
    def span_covers(span: dict, offset: int) -> bool:
        # True when a word starting at `offset` belongs to `span`.
        start = span.get("start_pos")
        end = span.get("end_pos")
        if start is None:
            return False
        if start == offset:
            return True
        # Words after the first one of a multi-word span start strictly inside
        # it; matching only on start_pos used to tag them as "O".
        return end is not None and start < offset < end

    words = sentence.split(" ")
    results = []

    running_char = 0  # character offset at which the current word starts

    for word in words:
        found_match = next((d for d in predictions if span_covers(d, running_char)), None)
        span_labels = found_match.get("labels") if found_match else None
        if span_labels:
            results.append(span_labels[0]["value"])
        else:
            # no covering span, or a span without any label
            results.append("O")
        running_char += len(word) + 1  # +1 for the separating space

    return words, results

In [17]:
# Translation table from generic model labels to the tag names used in the
# comparison tables. "_" is the placeholder appended after each sentence and is
# turned into an empty string.
label_transl = {
    "LABEL_0": "PER",
    "LABEL_1": "LOC",
    "O": "O",
    "_": "",
}

In [18]:
# Flatten the per-sentence tokens, gold labels and predictions into three
# parallel lists, inserting a separator entry after every sentence
# ("" for tokens and labels, "_" for predictions).
tokens = []
labels = []
predictions = []
for text, prediction, labels_list in zip(all_sentences, all_predictions, test_labels):
  word_list, prediction_list = list_flair_results(text, prediction)
  tokens.extend(word_list)
  tokens.append("")
  labels.extend(labels_list)
  labels.append("")
  predictions.extend(prediction_list)
  predictions.append("_")

label_vocab = set(predictions)

if "PER" not in label_vocab:
  # Translate generic labels (LABEL_0/LABEL_1) to PER/LOC. Labels that are not in
  # the table are kept as they are: plain indexing raised KeyError whenever the
  # model predicted "LOC" but not a single "PER".
  predictions = [label_transl.get(item, item) for item in predictions]
else:
  # labels are already PER/LOC; only blank out the "_" sentence separators
  predictions = [item.replace('_', '') for item in predictions]


predictions = functions.transform_to_BIO(predictions)

In [19]:
# One row per token (plus the separator rows): the token, its gold label, and
# this run's prediction in a column named after the attempt identifier.
prediction_comparison = pd.DataFrame(
    {"TOKEN": tokens,
     "NER": labels,
     attempt: predictions
     })

In [20]:
# Sanity check: number of rows in the comparison table (tokens plus separator rows).
len(prediction_comparison)

162909

In [21]:
# Fingerprint of this run's table; compared below against the table already on disk.
pred_checksum = create_checksum(prediction_comparison)

In [24]:
# Shared table that collects the test-set predictions of every run, one column per attempt.
comparison_path = f'{path}/model_comparisons/test-predictions_comparison.tsv'

if not os.path.isfile(comparison_path):
    # First run: the freshly built table becomes the shared table.
    prediction_comparison.to_csv(comparison_path, sep='\t', index=False)
    logger.info(f"No predictions table was found at location {comparison_path}, so a new one was created.")
else:
    pred_comp_df = pd.read_csv(comparison_path, sep='\t')
    file_checksum = create_checksum(pred_comp_df)
    if file_checksum != pred_checksum:
        # The stored table describes different tokens; refuse to mix the results.
        mismatch_message = "The checksums for the loaded test data and the predictions table do not match. Maybe the train/test/dev split has changed?"
        logger.error(mismatch_message)
        raise Exception(mismatch_message)
    # Same tokens: add this run's predictions as a new column and write the table back.
    pred_comp_df[attempt] = predictions
    pred_comp_df.to_csv(comparison_path, sep='\t', index=False)
    logger.info("The predictions were added to the predictions table.")

2024-03-26 09:38:10,303 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.


In [25]:
# Repeat the test-set procedure for each of the seven adversarial perturbation
# files: predict, align predictions with tokens, and add the result as a new
# column to that perturbation's comparison table.
for i in range(1,8):
  test_texts, test_labels = functions.read_conll_data(f'{path}/data/advers_tests/perturbation_{i}.tsv')

  # one space-joined string per sentence
  all_sentences = [" ".join(text) for text in test_texts]

  all_predictions = []
  total = len(all_sentences)

  for idx, item in enumerate(all_sentences):

    # progress report every 500 sentences
    if idx % 500 == 0:
      print(f"On prediction {idx} ({((idx/total)*100):.2f}% done).")
    sentence = Sentence(item)
    pretrained_model.predict(sentence)
    # a sentence without any predicted span contributes an empty list
    all_predictions.append([entity.to_dict() for entity in sentence.get_spans('ner')])

  # Flatten into parallel lists with a separator entry after every sentence.
  tokens = []
  labels = []
  predictions = []
  for text, prediction, labels_list in zip(all_sentences, all_predictions, test_labels):
    word_list, prediction_list = list_flair_results(text, prediction)
    tokens.extend(word_list)
    tokens.append("")
    labels.extend(labels_list)
    labels.append("")
    predictions.extend(prediction_list)
    predictions.append("_")

  label_vocab = set(predictions)

  if "PER" not in label_vocab:
    # Labels missing from the table are kept unchanged: plain indexing raised
    # KeyError whenever a file produced "LOC" predictions but no "PER".
    predictions = [label_transl.get(item, item) for item in predictions]
  else:
    # labels are already PER/LOC; only blank out the "_" sentence separators
    predictions = [item.replace('_', '') for item in predictions]


  predictions = functions.transform_to_BIO(predictions)

  prediction_comparison = pd.DataFrame(
    {"TOKEN": tokens,
     "NER": labels,
     attempt: predictions
     })

  # Drop the last row, i.e. the separator appended after the final sentence.
  prediction_comparison = prediction_comparison.iloc[:-1]

  pred_checksum = create_checksum(prediction_comparison)

  comparison_path = f'{path}/advers_model_comparisons/perturb_{i}_comparison.tsv'

  if os.path.isfile(comparison_path):
    pred_comp_df = pd.read_csv(comparison_path, sep='\t')
    file_checksum = create_checksum(pred_comp_df)
    if file_checksum == pred_checksum:
      # predictions[:-1] mirrors the dropped trailing separator row
      pred_comp_df[attempt] = predictions[:-1]
      pred_comp_df.to_csv(comparison_path, sep='\t', index=False)
      logger.info("The predictions were added to the predictions table.")
    else:
      logger.error("The checksums for the loaded test data and the predictions table do not match. Maybe the train/test/dev split has changed?")
      raise Exception("The checksums for the loaded test data and the predictions table do not match. Maybe the train/test/dev split has changed?")
  else:
    prediction_comparison.to_csv(comparison_path, sep='\t', index=False)
    logger.info(f"No predictions table was found at location {comparison_path}, so a new one was created.")

On prediction 0 (0.00% done).
On prediction 500 (25.67% done).
On prediction 1000 (51.33% done).
On prediction 1500 (77.00% done).


2024-03-26 09:41:56,471 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.


On prediction 0 (0.00% done).
On prediction 500 (25.67% done).
On prediction 1000 (51.33% done).
On prediction 1500 (77.00% done).


2024-03-26 09:43:19,240 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.


On prediction 0 (0.00% done).
On prediction 500 (25.67% done).
On prediction 1000 (51.33% done).
On prediction 1500 (77.00% done).


2024-03-26 09:44:42,300 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.


On prediction 0 (0.00% done).
On prediction 500 (25.67% done).
On prediction 1000 (51.33% done).
On prediction 1500 (77.00% done).


2024-03-26 09:46:05,128 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.


On prediction 0 (0.00% done).
On prediction 500 (25.67% done).
On prediction 1000 (51.33% done).
On prediction 1500 (77.00% done).


2024-03-26 09:47:27,652 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.


On prediction 0 (0.00% done).
On prediction 500 (25.67% done).
On prediction 1000 (51.33% done).
On prediction 1500 (77.00% done).


2024-03-26 09:48:50,387 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.


On prediction 0 (0.00% done).
On prediction 500 (25.67% done).
On prediction 1000 (51.33% done).
On prediction 1500 (77.00% done).


2024-03-26 09:50:12,999 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.
