In [1]:
# Mount Google Drive inside the Colab runtime; the later cells read and write
# everything (modules, model, data, logs) below /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
import os

# Project folder on the mounted Drive. Later cells reuse `path` to build the
# log file, test data and comparison-table locations.
path = '/content/drive/MyDrive/NLP_Project_New'
# Put the project folder on the import path so modules stored there can be
# imported (presumably where `nlp_project_functions` lives — confirm).
sys.path.append(os.path.abspath(path))

In [3]:
import nlp_project_functions as functions

from transformers import BertTokenizerFast
from transformers import AutoModelForTokenClassification
from transformers import pipeline

import pandas as pd
import math

from datetime import datetime
import os.path

import logging

# Log file for this notebook, kept in the project's logs/ folder.
logfile = f"{path}/logs/run_adv_model.log"
# logging.FileHandler does not create missing parent folders, so make sure
# logs/ exists before opening the file.
os.makedirs(os.path.dirname(logfile), exist_ok=True)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

logger = logging.getLogger("run_adv_model.log")
logger.setLevel(logging.INFO)

# Notebook cells get re-run, and getLogger() hands back the same logger object
# every time. Detach and close handlers left over from an earlier run of this
# cell; otherwise each message would be written once per run.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Persist INFO and above to the log file.
file_handler = logging.FileHandler(logfile)
file_handler.setFormatter(formatter)
file_handler.setLevel(logging.INFO)

# Echo the same records into the cell output.
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
stream_handler.setLevel(logging.INFO)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

# NOTE(review): records still propagate to the root logger, which is presumably
# why each message shows up a second time as "INFO:run_adv_model.log:..." in the
# cell output. Set `logger.propagate = False` if that echo is unwanted.

In [24]:
# Model under test; `model_checkpoint` is the alias handed to from_pretrained.
model_name = "/content/drive/MyDrive/NLP_Project_New/models/bert_base_german_cased_finetuned_3ep"
model_checkpoint = model_name

# Read the clock once so both formatted strings describe the same instant.
now = datetime.now()
day = now.strftime("%d-%m")
timestamp = now.strftime("%m-%d-%H-%M-%S")

# Identifier of this run: model path plus start time. It is also used as the
# name of the prediction column in the comparison tables.
attempt = f"{model_name}--{timestamp}"

logger.info("Initiated model test. Attempt: %s", attempt)

2024-03-26 07:51:24,773 - run_adv_model.log - INFO - Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_base_german_cased_finetuned_3ep--03-26-07-51-24
INFO:run_adv_model.log:Initiated model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/bert_base_german_cased_finetuned_3ep--03-26-07-51-24


In [25]:
def create_checksum(df):
    """Build a cheap fingerprint of a table from its first and last tokens.

    The fingerprint is the concatenation of the string values found in the
    "TOKEN" column of the first five rows followed by those of the last five
    rows. Non-string cells (e.g. NaN read back from an empty field) are
    skipped.

    Frames with fewer than five rows are supported: the available rows are
    used for both the leading and the trailing part, and an empty frame
    yields an empty string. (Indexing rows 0..4 unconditionally would raise
    IndexError on such frames.)

    Args:
        df: pandas DataFrame with a "TOKEN" column.

    Returns:
        str: the concatenated tokens.
    """
    token_column = df["TOKEN"]
    # head/tail clamp to the frame length, so short frames need no special case.
    edge_values = list(token_column.head(5)) + list(token_column.tail(5))
    return "".join(value for value in edge_values if isinstance(value, str))

In [26]:
def ner_label_sentence(pipeline, sentence):
    """Run a token-classification pipeline on a sentence and pair words with labels.

    Args:
        pipeline: callable that takes the sentence and returns an iterable of
            dicts, each with a 'word' key and optionally a label key.
        sentence: the text to label.

    Returns:
        list[tuple[str, str]]: one (word, label) pair per pipeline result, in
        the order returned. The label falls back to 'O' when the result
        carries no usable label.
    """
    word_labels = []
    for token in pipeline(sentence):
        # Non-aggregated pipelines report the tag under 'entity'; pipelines
        # built with an aggregation strategy report it under 'entity_group'.
        # Reading only 'entity' would label every aggregated result as 'O'.
        label = token.get('entity') or token.get('entity_group') or 'O'
        word_labels.append((token['word'], label))

    return word_labels

In [27]:
# Load the token-classification model from `model_checkpoint`, which an earlier
# cell sets to the model directory on Drive.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

In [28]:
# Load the fast BERT tokenizer of the "dbmdz/bert-base-german-cased" checkpoint.
# NOTE(review): the tokenizer is not loaded from `model_checkpoint`; this
# presumes the model was fine-tuned from exactly this base checkpoint (same
# vocabulary) — TODO confirm.
tokenizer = BertTokenizerFast.from_pretrained("dbmdz/bert-base-german-cased")

In [29]:
# Build the inference pipeline from the model and tokenizer loaded above.
# aggregation_strategy="first": per the transformers documentation, sub-word
# pieces are merged back into words and each word takes the tag of its first
# piece; results then carry the tag under 'entity_group'.
token_classifier = pipeline(
    "token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="first"
)

In [30]:
# Lookup table the perturbation loop uses to translate predicted labels.
# "_" is the separator placeholder the loop appends after every sentence; it
# maps to an empty string.
# NOTE(review): LABEL_0 -> PER and LABEL_1 -> LOC presumably mirror the label
# ids used during fine-tuning — confirm against the training configuration.
label_transl = {"LABEL_0": "PER", "LABEL_1": "LOC", "O": "O", "_": ""}

In [31]:
# Run the classifier over every adversarial test file (perturbation_1 ..
# perturbation_7) and store its predictions as a new column, named after this
# run (`attempt`), in that file's comparison table.
for idx in range(1,8):
    print(f"On perturbation {idx}")
    test_texts, test_labels = functions.read_conll_data(f'{path}/data/advers_tests/perturbation_{idx}.tsv')

    # The pipeline takes raw strings, so re-join each token sequence with spaces.
    all_sentences = [" ".join(text) for text in test_texts]
    all_predictions = [token_classifier(sentence) for sentence in all_sentences]

    # Flatten to one entry per token. After every sentence a separator entry is
    # appended: "" for token and gold label, "_" for the prediction.
    tokens = []
    labels = []
    predictions = []
    for text, prediction, labels_list in zip(all_sentences, all_predictions, test_labels):
        word_list, prediction_list = functions.list_transformer_results(text, prediction)
        tokens.extend(word_list)
        tokens.append("")
        labels.extend(labels_list)
        labels.append("")
        predictions.extend(prediction_list)
        predictions.append("_")

    label_vocab = set(predictions)

    # Only generic "LABEL_n" ids need translating through `label_transl`.
    # Detect them by their prefix rather than by the absence of "PER": a file in
    # which a model with named tags predicts no PER at all would otherwise be
    # sent through the lookup table and fail with KeyError on e.g. "LOC".
    if any(item.startswith("LABEL_") for item in label_vocab):
        predictions = [label_transl[item] for item in predictions]
    else:
        # Named tags are kept; stripping "_" turns the separator into "".
        predictions = [item.replace('_', '') for item in predictions]

    # Presumably converts the plain tags into BIO tags (per the helper's name);
    # see nlp_project_functions for the exact scheme.
    predictions = functions.transform_to_BIO(predictions)

    prediction_comparison = pd.DataFrame(
        {"TOKEN": tokens,
        "NER": labels,
        attempt: predictions
        })

    # Drop the separator row that follows the last sentence.
    prediction_comparison = prediction_comparison.iloc[:-1]

    pred_checksum = create_checksum(prediction_comparison)

    comparison_path = f'{path}/advers_model_comparisons/perturb_{idx}_comparison.tsv'

    if os.path.isfile(comparison_path):
        # A table already exists: add this run as a new column, but only if the
        # stored tokens still match the test data that was just loaded.
        pred_comp_df = pd.read_csv(comparison_path, sep='\t')
        file_checksum = create_checksum(pred_comp_df)
        if file_checksum != pred_checksum:
            mismatch_message = "The checksums for the loaded test data and the predictions table do not match. Maybe the train/test/dev split has changed?"
            logger.error(mismatch_message)
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError(mismatch_message)
        # [:-1] skips the trailing separator, matching the row dropped above.
        pred_comp_df[attempt] = predictions[:-1]
        pred_comp_df.to_csv(comparison_path, sep='\t', index=False)
        logger.info("The predictions were added to the predictions table.")
    else:
        # First run for this file: the freshly built table becomes the file.
        prediction_comparison.to_csv(comparison_path, sep='\t', index=False)
        logger.info(f"No predictions table was found at location {comparison_path}, so a new one was created.")

    # Drop the references to the per-file lists. They are rebuilt at the top of
    # every iteration, so this only matters after the last file.
    predictions = []
    tokens = []
    labels = []

On perturbation 1


2024-03-26 07:59:24,570 - run_adv_model.log - INFO - The predictions were added to the predictions table.
INFO:run_adv_model.log:The predictions were added to the predictions table.


On perturbation 2


2024-03-26 08:06:10,061 - run_adv_model.log - INFO - The predictions were added to the predictions table.
INFO:run_adv_model.log:The predictions were added to the predictions table.


On perturbation 3


2024-03-26 08:12:49,574 - run_adv_model.log - INFO - The predictions were added to the predictions table.
INFO:run_adv_model.log:The predictions were added to the predictions table.


On perturbation 4


2024-03-26 08:19:29,276 - run_adv_model.log - INFO - The predictions were added to the predictions table.
INFO:run_adv_model.log:The predictions were added to the predictions table.


On perturbation 5


2024-03-26 08:26:09,201 - run_adv_model.log - INFO - The predictions were added to the predictions table.
INFO:run_adv_model.log:The predictions were added to the predictions table.


On perturbation 6


2024-03-26 08:32:54,021 - run_adv_model.log - INFO - The predictions were added to the predictions table.
INFO:run_adv_model.log:The predictions were added to the predictions table.


On perturbation 7


2024-03-26 08:39:42,036 - run_adv_model.log - INFO - The predictions were added to the predictions table.
INFO:run_adv_model.log:The predictions were added to the predictions table.
