In [None]:
# Based on the following article:
# https://medium.com/@lokaregns/named-entity-recognition-with-hugging-face-transformers-a-beginners-guide-e1ac6085fb3c

In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys
import os

# Project root on the mounted Drive; other cells build log, data and output
# locations relative to this folder.
path = "/content/drive/MyDrive/NLP_Project_New"

# Make modules stored in the project folder importable. The membership check
# keeps the cell idempotent: re-running it no longer appends the same
# directory to sys.path again.
if os.path.abspath(path) not in sys.path:
    sys.path.append(os.path.abspath(path))

In [None]:
import nlp_project_functions as functions

from transformers import BertTokenizerFast
from transformers import AutoModelForTokenClassification
from transformers import pipeline

import pandas as pd
import math

from datetime import datetime
import os.path

import logging

logfile = f"{path}/logs/run_model.log"
# logging.FileHandler does not create missing directories, so make sure the
# log folder exists before opening the file.
os.makedirs(os.path.dirname(logfile), exist_ok=True)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

logger = logging.getLogger("run_model.log")
logger.setLevel(logging.INFO)

# logging.getLogger returns the same logger object for the same name, so
# re-running this cell used to stack another pair of handlers on it and every
# record was emitted several times. Detach and close the old handlers first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Do not forward records to the root logger; otherwise any handler installed
# on the root logger prints each message once more in its own format.
logger.propagate = False

# Persist INFO-and-above records to the run log file.
file_handler = logging.FileHandler(logfile)
file_handler.setFormatter(formatter)
file_handler.setLevel(logging.INFO)

# Mirror the same records to the notebook output.
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
stream_handler.setLevel(logging.INFO)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

In [None]:
# Take a single clock reading so both derived strings describe the same instant.
now = datetime.now()
timestamp = now.strftime("%m-%d-%H-%M-%S")  # month-day-hour-minute-second, no year
day = now.strftime("%d-%m")                 # day-month

# Checkpoint under test.
model_name = "dbmdz/bert-base-german-cased"
model_checkpoint = model_name

# Run identifier "<model name>--<timestamp>"; it is also used as the name of
# the prediction column added to the comparison table.
attempt = f"{model_name}--{timestamp}"

logger.info(f"Initiated model test. Attempt: {attempt}")

2024-03-24 14:56:59,669 - run_model.log - INFO - Initiated model test. Attempt: dbmdz/bert-base-german-cased--03-24-14-56-59
2024-03-24 14:56:59,669 - run_model.log - INFO - Initiated model test. Attempt: dbmdz/bert-base-german-cased--03-24-14-56-59
INFO:run_model.log:Initiated model test. Attempt: dbmdz/bert-base-german-cased--03-24-14-56-59


In [None]:
def create_checksum(df):
    """Build a lightweight fingerprint from the edges of the ``TOKEN`` column.

    The string values among the first five and the last five entries of
    ``df["TOKEN"]`` are concatenated in order. Non-string entries (for
    example NaN) are skipped. Two tables that yield the same fingerprint
    start and end with the same tokens; rows in between are not inspected.

    Frames with fewer than five rows are supported: ``head``/``tail`` simply
    return all available rows (the previous positional indexing raised
    ``IndexError`` in that case). For five or more rows the result is
    unchanged.

    Args:
        df: DataFrame with a ``TOKEN`` column.

    Returns:
        str: Concatenation of the sampled string tokens; ``""`` when the
        frame is empty or holds no string tokens at its edges.

    Raises:
        KeyError: If ``df`` has no ``TOKEN`` column.
    """
    token_column = df["TOKEN"]
    # head/tail clamp to the available rows instead of indexing out of bounds.
    edge_tokens = list(token_column.head(5)) + list(token_column.tail(5))
    return "".join(token for token in edge_tokens if isinstance(token, str))

In [None]:
# Load the checkpoint named by `model_checkpoint` as a token-classification model.
# NOTE(review): `model_checkpoint` is set to the base "dbmdz/bert-base-german-cased"
# checkpoint; presumably its classification head is not fine-tuned for NER — confirm.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

In [None]:
# Load the tokenizer from the same checkpoint as the model. Reusing
# `model_checkpoint` (instead of repeating the checkpoint name as a literal)
# keeps tokenizer and model in sync when the model under test is changed.
tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)

In [None]:
# Bundle model and tokenizer into a token-classification pipeline that uses
# the "first" aggregation strategy.
token_classifier = pipeline(
    task="token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="first",
)

In [None]:
# Read the test split from the project folder. The location is derived from
# `path` so that moving the project only requires changing one variable; the
# resulting string is identical to the previously hard-coded absolute path.
test_texts, test_labels = functions.read_conll_data(f"{path}/data/train_test_val/test.tsv")

In [None]:
# Join the tokens of every entry in `test_texts` with single spaces, giving
# one plain string per entry.
all_sentences = [" ".join(text) for text in test_texts]

In [None]:
# Run the classifier on each sentence string in turn; results stay in the
# same order as `all_sentences`.
all_predictions = [token_classifier(sentence) for sentence in all_sentences]

In [None]:
# Lookup table from raw prediction labels to the tags used in the comparison
# table. "_" (the sentence-separator marker) maps to an empty string.
label_transl = {
    "LABEL_0": "PER",
    "LABEL_1": "LOC",
    "O": "O",
    "_": "",
}

In [None]:
tokens, labels, predictions = [], [], []

# Flatten the per-sentence results into three parallel lists. After every
# sentence a separator entry is appended: "" for the token and the gold label,
# "_" for the prediction.
for sentence_text, sentence_prediction, gold_labels in zip(all_sentences, all_predictions, test_labels):
  words, predicted_tags = functions.list_transformer_results(sentence_text, sentence_prediction)
  tokens += [*words, ""]
  labels += [*gold_labels, ""]
  predictions += [*predicted_tags, "_"]

label_vocab = set(predictions)

# If "PER" already occurs among the predictions, only underscores are
# stripped; otherwise every label is translated through `label_transl`
# (a label missing from that table raises KeyError).
if "PER" in label_vocab:
  predictions = [item.replace('_', '') for item in predictions]
else:
  predictions = [label_transl[item] for item in predictions]

predictions = functions.transform_to_BIO(predictions)

In [None]:
# One row per token (plus separator rows): the token, its gold label, and this
# run's prediction in a column named after `attempt`.
prediction_comparison = pd.DataFrame({
    "TOKEN": tokens,
    "NER": labels,
    attempt: predictions,
})

In [None]:
# Fingerprint of the freshly built table (string TOKEN values from its first
# and last five rows); later compared with the fingerprint of the stored table.
pred_checksum = create_checksum(prediction_comparison)

In [None]:
comparison_path = f'{path}/model_comparisons/test-predictions_comparison.tsv'

if os.path.isfile(comparison_path):
    # Read every cell as a plain string and disable the default NA parsing.
    # Otherwise tokens such as "NA", "null" or "nan" are turned into NaN, which
    # silently erases them from the stored table on the next write and can make
    # the checksum differ from the in-memory one although the data is the same.
    pred_comp_df = pd.read_csv(comparison_path, sep='\t', keep_default_na=False, dtype=str)
    file_checksum = create_checksum(pred_comp_df)

    # The checksum only samples a few tokens at both ends of the table, so
    # additionally require the same number of rows before adding a column.
    same_row_count = len(pred_comp_df) == len(prediction_comparison)

    if same_row_count and file_checksum == pred_checksum:
        # Append this run's predictions as a new column and rewrite the table.
        pred_comp_df[attempt] = predictions
        pred_comp_df.to_csv(comparison_path, sep='\t', index=False)
        logger.info("The predictions were added to the predictions table.")
    else:
        mismatch_message = "The checksums for the loaded test data and the predictions table do not match. Maybe the train/test/dev split has changed?"
        logger.error(mismatch_message)
        # ValueError is a subclass of Exception, so existing handlers still catch it.
        raise ValueError(mismatch_message)
else:
    # First run: create the output folder if necessary and start a new table
    # from the freshly built comparison frame.
    os.makedirs(os.path.dirname(comparison_path), exist_ok=True)
    prediction_comparison.to_csv(comparison_path, sep='\t', index=False)
    logger.info(f"No predictions table was found at location {comparison_path}, so a new one was created.")

2024-03-24 15:17:51,471 - run_model.log - INFO - The predictions were added to the predictions table.
2024-03-24 15:17:51,471 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.
