In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive so
# the project folder stored on Drive can be reached through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys
import os

# Root folder of the project on the mounted Google Drive. The other cells build
# their model, data, log and comparison-table locations from this value.
path = "/content/drive/MyDrive/NLP_Project_New"

# Make the project folder importable. The membership check keeps the cell
# idempotent: re-running it does not stack duplicate entries onto sys.path.
project_dir = os.path.abspath(path)
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [None]:
import nlp_project_functions as functions

from transformers import BertTokenizerFast
from transformers import AutoModelForTokenClassification
from transformers import pipeline

import pandas as pd
import math

from datetime import datetime
import os.path

import logging

# Logging setup: every record of level INFO or above goes both to a log file in
# the project folder and to the notebook output, using one shared format.
logfile = f"{path}/logs/run_model.log"
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

logger = logging.getLogger("run_model.log")
logger.setLevel(logging.INFO)

# Do not hand records on to the root logger. When the root logger has its own
# handler, every message would otherwise show up a second time in the output
# (as "INFO:run_model.log:...").
logger.propagate = False

# logging.getLogger returns the same object on every call, so re-running this
# cell would keep adding handlers and multiply each log line. Drop (and close)
# whatever an earlier run attached before adding fresh handlers.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

file_handler = logging.FileHandler(logfile)
file_handler.setFormatter(formatter)
file_handler.setLevel(logging.INFO)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
stream_handler.setLevel(logging.INFO)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

In [None]:
def create_checksum(df):
    """Build a cheap fingerprint of a token table from its first and last rows.

    The fingerprint is the concatenation of the string values found in the
    "TOKEN" column of the first five rows followed by those of the last five
    rows. Non-string entries (for example the NaN that pandas produces for
    empty cells when a table is read back from disk) are skipped.

    Tables with fewer than five rows are handled gracefully: the available
    rows are used for both the head and the tail part, and an empty table
    yields an empty string.

    Args:
        df: pandas DataFrame with a "TOKEN" column.

    Returns:
        str: the concatenated edge tokens.

    Raises:
        KeyError: if ``df`` has no "TOKEN" column.
    """
    token_column = df["TOKEN"]
    # head/tail clamp to the table length, so short tables cannot run out of
    # bounds the way positional row lookups would.
    edge_tokens = list(token_column.head(5)) + list(token_column.tail(5))
    return "".join(token for token in edge_tokens if isinstance(token, str))

In [None]:
# Translation table applied to raw predictions in the evaluation loop when the
# label "PER" does not occur among them: generic model labels become entity
# tags, "O" is kept, and the "_" sentence-separator placeholder becomes an
# empty string.
label_transl = {
    "LABEL_0": "PER",
    "LABEL_1": "LOC",
    "O": "O",
    "_": "",
}

In [None]:
# TSV table that collects the predictions of all evaluated models side by side.
# The evaluation loop creates it if it does not exist yet and otherwise adds one
# column per attempt to the existing file.
comparison_path = f'{path}/model_comparisons/smaller_test-predictions_comparison_new.tsv'

In [None]:
# Capture the current date and time once, so that every model evaluated in this
# run gets the same suffix in its attempt name.
now = datetime.now()
timestamp = now.strftime("%m-%d-%H-%M-%S")  # month-day-hour-minute-second
day = now.strftime("%d-%m")  # day-month; NOTE(review): not used in the cells visible here

In [None]:
# Load the fast BERT tokenizer of the "dbmdz/bert-base-german-cased" checkpoint
# from the Hugging Face Hub. The same tokenizer instance is shared by all models
# in the evaluation loop.
# NOTE(review): presumably this is the base checkpoint the fine-tuned models
# were trained from -- confirm.
tokenizer = BertTokenizerFast.from_pretrained("dbmdz/bert-base-german-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

In [None]:
# Load the held-out test split. The location is derived from `path` instead of
# repeating the absolute Drive folder, so it stays correct if the project folder
# is moved.
# NOTE(review): assumes read_conll_data returns two parallel lists, one list of
# tokens and one list of labels per sentence -- confirm in nlp_project_functions.
test_texts, test_labels = functions.read_conll_data(f"{path}/data/train_test_val/test.tsv")

In [None]:
# Evaluate every fine-tuned checkpoint on the test set and collect the
# predictions of all of them in one shared comparison table (one column per
# attempt, next to the tokens and the gold labels).

# The input sentences depend only on the test set, not on the model, so they are
# built once instead of once per checkpoint.
all_sentences = [" ".join(text) for text in test_texts]

checksum_error = "The checksums for the loaded test data and the predictions table do not match. Maybe the train/test/dev split has changed?"

for i in [2, 4, 8, 16, 32, 64]:
  print(f"Working on slice {i}")
  model_checkpoint = f"{path}/models/bert_finetuned_{i}_part"

  # Column name under which this model's predictions are stored.
  attempt = model_checkpoint + '--' + timestamp

  logger.info(f"Initiated model test. Attempt: {attempt}")

  model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

  token_classifier = pipeline(
    "token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="first"
  )

  all_predictions = [token_classifier(sentence) for sentence in all_sentences]

  # Flatten the per-sentence results into three parallel columns. After each
  # sentence a separator row is added: empty token, empty gold label and the
  # placeholder "_" as prediction.
  tokens = []
  labels = []
  predictions = []
  for text, prediction, labels_list in zip(all_sentences, all_predictions, test_labels):
    word_list, prediction_list = functions.list_transformer_results(text, prediction)
    tokens.extend(word_list)
    tokens.append("")
    labels.extend(labels_list)
    labels.append("")
    predictions.extend(prediction_list)
    predictions.append("_")

  label_vocab = set(predictions)

  if "PER" not in label_vocab:
    # Translate the raw labels. Labels without an entry in the table (for
    # example an already translated "LOC" from a model that never predicted
    # "PER") are passed through unchanged instead of raising a KeyError.
    predictions = [label_transl.get(item, item) for item in predictions]
  else:
    predictions = [item.replace('_', '') for item in predictions]

  predictions = functions.transform_to_BIO(predictions)

  prediction_comparison = pd.DataFrame(
    {"TOKEN": tokens,
     "NER": labels,
     attempt: predictions
     })

  pred_checksum = create_checksum(prediction_comparison)

  if os.path.isfile(comparison_path):
    # keep_default_na=False: by default pandas reads cells such as "NA", "null"
    # or "None" as missing values. Such tokens would be blanked when the table
    # is written back and could make the checksum comparison fail. With this
    # option every cell, including empty ones, is kept as the string that is
    # stored in the file.
    pred_comp_df = pd.read_csv(comparison_path, sep='\t', keep_default_na=False)
    file_checksum = create_checksum(pred_comp_df)
    if file_checksum == pred_checksum:
      pred_comp_df[attempt] = predictions
      pred_comp_df.to_csv(comparison_path, sep='\t', index=False)
      logger.info("The predictions were added to the predictions table.")
    else:
      # The stored table belongs to different test data; refuse to mix them.
      logger.error(checksum_error)
      raise Exception(checksum_error)
  else:
    prediction_comparison.to_csv(comparison_path, sep='\t', index=False)
    logger.info(f"No predictions table was found at location {comparison_path}, so a new one was created.")


2024-03-27 17:34:08,199 - run_model.log - INFO - Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_finetuned_2_part--03-27-17-33-57
INFO:run_model.log:Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_finetuned_2_part--03-27-17-33-57


Working on slice 2


2024-03-27 17:43:14,198 - run_model.log - INFO - No predictions table was found at location /content/drive/MyDrive/NLP_Project_New/model_comparisons/smaller_test-predictions_comparison_new.tsv, so a new one was created.
INFO:run_model.log:No predictions table was found at location /content/drive/MyDrive/NLP_Project_New/model_comparisons/smaller_test-predictions_comparison_new.tsv, so a new one was created.
2024-03-27 17:43:14,204 - run_model.log - INFO - Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_finetuned_4_part--03-27-17-33-57
INFO:run_model.log:Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_finetuned_4_part--03-27-17-33-57


Working on slice 4


2024-03-27 17:50:58,329 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.
2024-03-27 17:50:58,332 - run_model.log - INFO - Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_finetuned_8_part--03-27-17-33-57
INFO:run_model.log:Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_finetuned_8_part--03-27-17-33-57


Working on slice 8


2024-03-27 17:58:25,370 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.
2024-03-27 17:58:25,374 - run_model.log - INFO - Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_finetuned_16_part--03-27-17-33-57
INFO:run_model.log:Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_finetuned_16_part--03-27-17-33-57


Working on slice 16


2024-03-27 18:05:29,464 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.
2024-03-27 18:05:29,469 - run_model.log - INFO - Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_finetuned_32_part--03-27-17-33-57
INFO:run_model.log:Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_finetuned_32_part--03-27-17-33-57


Working on slice 32


2024-03-27 18:12:25,694 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.
2024-03-27 18:12:25,698 - run_model.log - INFO - Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_finetuned_64_part--03-27-17-33-57
INFO:run_model.log:Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_finetuned_64_part--03-27-17-33-57


Working on slice 64


2024-03-27 18:19:26,746 - run_model.log - INFO - The predictions were added to the predictions table.
INFO:run_model.log:The predictions were added to the predictions table.
