In [21]:
import spacy
import pandas as pd
import math

from datetime import datetime
import os.path

import nlp_project_functions as functions

import logging

# Configure the notebook's run logger: every record is written both to a log
# file and to the cell output, using one shared format.
logfile = "./logs/run_model.log"
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# FileHandler opens its file immediately and fails if the directory is missing.
os.makedirs(os.path.dirname(logfile), exist_ok=True)

logger = logging.getLogger("run_model.log")
logger.setLevel(logging.INFO)

# logging.getLogger returns the same logger object for the same name, so
# re-running this cell would otherwise stack additional handlers and emit
# every record once per stacked handler. Detach and close the old ones first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

file_handler = logging.FileHandler(logfile)
file_handler.setFormatter(formatter)
file_handler.setLevel(logging.INFO)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
stream_handler.setLevel(logging.INFO)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

In [23]:
# Stamp this run: a single clock reading feeds both the day label and the
# full timestamp, so the two can never disagree.
now = datetime.now()
day = now.strftime("%d-%m")
timestamp = now.strftime("%m-%d-%H-%M-%S")

# The attempt label (model path + start time) is later used as the name of
# the column that holds this run's predictions in the comparison table.
model = "models/bert_tuned_4e"
attempt = "--".join([model, timestamp])

logger.info(f"Initiated model test. Attempt: {attempt}")

2024-03-21 19:33:30,462 - run_model.log - INFO - Initiated model test. Attempt: models/bert_tuned_4e--03-21-19-33-30
2024-03-21 19:33:30,462 - run_model.log - INFO - Initiated model test. Attempt: models/bert_tuned_4e--03-21-19-33-30


In [None]:
# Load the spaCy pipeline under test. A failed load is logged with the model
# name and the traceback, then re-raised: every later cell needs `nlp`, so
# swallowing the error here would only postpone the failure to a NameError.
try:
    nlp = spacy.load(model)
except OSError:
    logger.exception(f"Could not load spaCy model '{model}'.")
    raise

In [5]:
def create_checksum(df, edge_size=5):
    """Build a lightweight fingerprint from the edges of a token table.

    The fingerprint is the concatenation of the string values found in the
    first ``edge_size`` and the last ``edge_size`` rows of the ``TOKEN``
    column. Non-string cells (e.g. NaN rows) are skipped. This is not a
    cryptographic checksum; it only compares the head and tail of two tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``TOKEN`` column.
    edge_size : int, default 5
        Number of rows taken from each end of the table.

    Returns
    -------
    str
        Concatenated edge tokens; ``""`` for an empty table.

    Raises
    ------
    KeyError
        If ``df`` has no ``TOKEN`` column.
    """
    token_column = df["TOKEN"]
    # head/tail clamp to the available rows, so tables shorter than
    # ``edge_size`` no longer raise IndexError (they contribute all of their
    # rows from both ends instead).
    edge_tokens = list(token_column.head(edge_size)) + list(token_column.tail(edge_size))
    return "".join(token for token in edge_tokens if isinstance(token, str))

In [6]:
# Read the test split as a two-column, tab-separated table without a header.
# Blank lines are kept (skip_blank_lines=False), so they show up as NaN rows
# and count towards len(df) — presumably they mark sentence boundaries.
# NOTE(review): pandas' default NA parsing also turns tokens such as "NA" or
# "null" into NaN — confirm this cannot affect the data.
path_to_testfile = './data/train_test_val/test.tsv'
df = pd.read_csv(
    path_to_testfile,
    sep='\t',
    names=["TOKEN", "NER"],
    skip_blank_lines=False,
)

# Fingerprint of the loaded split, compared later against the stored
# predictions table before new predictions are appended to it.
df_checksum = create_checksum(df)

# Hand the token column to the project helper as a plain list.
tokens = df["TOKEN"].to_list()
sentences = functions.make_sentences(tokens)

logger.info(f"Loaded test file from location {path_to_testfile}. Size: {len(df):,} tokens.")

2024-03-21 19:11:29,815 - run_model.log - INFO - Loaded test file from location ./data/train_test_val/test.tsv. Size: 280,975 tokens.


In [7]:
# Run the model sentence by sentence and collect one [token, label] pair per
# output token, with a [nan, nan] pair between sentences.
spacy_predictions = []

for sentence in sentences:
    doc = nlp(sentence)
    for token in doc:
        # Only LOC and PER entities are kept; everything else counts as "O".
        label = token.ent_type_ if token.ent_type_ in ['LOC', 'PER'] else "O"

        if token.text != "." and token.text[-1] == ".":
            # The token carries a trailing period: emit the period as its own
            # token (presumably to match the test file's tokenisation — TODO
            # confirm). The period is always labelled "O"; the original code
            # labelled it "." in the non-entity case, which is not a tag.
            spacy_predictions.append([token.text[:-1], label])
            spacy_predictions.append([".", "O"])
        elif token.text != " ":
            # Single-space tokens are dropped; all other tokens pass through.
            spacy_predictions.append([token.text, label])

    # Separator row after each sentence.
    spacy_predictions.append([math.nan, math.nan])

# Drop the separator that was appended after the final sentence.
spacy_predictions = spacy_predictions[:-1]

In [8]:
# Tabulate the predictions and verify that they line up row by row with the
# test file. The predictions are later stored as a column next to the test
# tokens, so a length mismatch would fail there anyway; stopping here gives a
# clearer error. `spacy_df` is still available for inspection afterwards.
spacy_df = pd.DataFrame(spacy_predictions, columns=["TOKEN", "NE"])
if len(spacy_df) != len(df):
    alignment_error = f"test tokens and spacy predictions don't align (test tokens: {len(df):,}, spacy predictions: {len(spacy_df):,})"
    logger.error(alignment_error)
    raise ValueError(alignment_error)

In [9]:
# Keep only the predicted labels (as a plain list) and pass them to the
# project helper that produces the BIO-tagged sequence.
# Note: from here on `spacy_predictions` holds labels, not [token, label] pairs.
spacy_predictions = spacy_df["NE"].tolist()
spacy_predictions_bio = functions.transform_to_BIO(spacy_predictions)

In [10]:
# Append this run's BIO predictions as a new column (named after `attempt`)
# to the shared comparison table, creating the table on the first run.
path = './model_comparisons/test-predictions_comparison.tsv'

if os.path.isfile(path):
    pred_comp_df = pd.read_csv(path, sep='\t')
    file_checksum = create_checksum(pred_comp_df)
    if file_checksum != df_checksum:
        # The stored table was built from different test data; appending a
        # column would pair predictions with the wrong tokens.
        checksum_error = "The checksums for the loaded test data and the predictions table do not match. Maybe the train/test/dev split has changed?"
        logger.error(checksum_error)
        raise Exception(checksum_error)
    table_status = "The predictions were added to the predictions table."
else:
    # First run: make sure the target directory exists before writing, then
    # start the table from a copy of the test data.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    pred_comp_df = df.copy()
    table_status = f"No predictions table was found at location {path}, so a new one was created."

pred_comp_df[attempt] = spacy_predictions_bio
pred_comp_df.to_csv(path, sep='\t', index=False)
logger.info(table_status)

2024-03-21 19:19:25,247 - run_model.log - INFO - The predictions were added to the predictions table.


In [18]:
# Preview the first rows of the comparison table as a quick visual check.
pred_comp_df.head()

Unnamed: 0,TOKEN,NER,models/sermons_30it--03-11-19-06-36,models/sermons_60it--03-13-15-28-41,de_dep_news_trf--03-21-19-11-10
0,Einweihungs,O,O,O,O
1,Predigt,O,O,O,O
2,Der,O,O,O,O
3,Neuen,O,O,O,O
4,Orgel,O,O,O,O
