In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
import os

path = "/content/drive/MyDrive/NLP_Project_New"
sys.path.append(os.path.abspath(path))

In [3]:
import spacy
import pandas as pd
import math

from datetime import datetime
import os
import os.path

import nlp_project_functions as functions

import logging

logfile = f"{path}/logs/run_adv_model.log"
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

file_handler = logging.FileHandler(logfile)
file_handler.setFormatter(formatter)
file_handler.setLevel(logging.INFO)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
stream_handler.setLevel(logging.INFO)

logger = logging.getLogger("run_adv_model.log")
logger.setLevel(logging.INFO)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

In [4]:
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [10]:
# get current date, time, and model
now = datetime.now()
timestamp = now.strftime("%m-%d-%H-%M-%S")
day = now.strftime("%d-%m")

#model = "de_core_news_sm"
model = f"{path}/models/sermons_60it"

attempt = model + '--' + timestamp

logger.info(f"Initiated adversarial model test. Attempt: {attempt}")

2024-03-26 06:15:17,321 - run_adv_model.log - INFO - Initiated adversarial model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/sermons_60it--03-26-06-15-17
INFO:run_adv_model.log:Initiated adversarial model test. Attempt: /content/drive/MyDrive/NLP_Project_New/models/sermons_60it--03-26-06-15-17


In [11]:
try:
    nlp = spacy.load(model)
except OSError:
    logger.exception('')



In [12]:
def create_checksum(df):
    test_string = ""
    for i in range(5):
        if isinstance(df.iloc[i]["TOKEN"], str):
            test_string += df.iloc[i]["TOKEN"]

    for i in range(len(df) - 5, len(df)):
        if isinstance(df.iloc[i]["TOKEN"], str):
            test_string += df.iloc[i]["TOKEN"]

    return test_string

In [14]:
for idx in range(1,8):
    file_path = f"{path}/data/advers_tests/perturbation_{idx}.tsv"
    df = pd.read_csv(file_path, sep='\t', skip_blank_lines=False, names=["TOKEN", "NER"])

    df_checksum = create_checksum(df)

    logger.info(f"Loaded test file from location {file_path}. Size: {len(df):,} tokens.")
    print(f"Length of df: {len(df)}")
    tokens = df['TOKEN'].tolist()
    print(f"Length of tokens: {len(tokens)}")
    sentences = functions.make_sentences(tokens)

    spacy_predictions = []

    for sentence in sentences:
        doc = nlp(sentence)
        for token in doc:
            if token.text != "." and token.text[-1] == ".":
                if token.ent_type_ in ['LOC', 'PER']:
                    spacy_predictions.append([token.text[:-1], token.ent_type_])
                    spacy_predictions.append([".", "O"])
                else:
                    spacy_predictions.append([token.text[:-1], "O"])
                    spacy_predictions.append([".", "."])
            elif token.text != " ":
                if token.ent_type_ in ['LOC', 'PER']:
                    spacy_predictions.append([token.text, token.ent_type_])
                else:
                    spacy_predictions.append([token.text, "O"])
        spacy_predictions.append([math.nan, math.nan])

    spacy_predictions = spacy_predictions[:-1]

    spacy_df = pd.DataFrame(spacy_predictions, columns=["TOKEN", "NE"])

    spacy_predictions = spacy_df["NE"].values.tolist()
    spacy_predictions_bio = functions.transform_to_BIO(spacy_predictions)
    print(f"Length of spacy preds: {len(spacy_predictions)}")


    save_path = f'{path}/advers_model_comparisons/perturb_{idx}_comparison.tsv'

    if os.path.isfile(save_path):
        pred_comp_df = pd.read_csv(save_path, sep='\t')
        file_checksum = create_checksum(pred_comp_df)
        if df_checksum == df_checksum:
            pred_comp_df[attempt] = spacy_predictions_bio
            pred_comp_df.to_csv(save_path, sep='\t', index=False)
            logger.info("The predictions were added to the predictions table.")
        else:
            logger.error("The checksums for the loaded test data and the predictions table do not match. Maybe the train/test/dev split has changed?")
            raise Exception("The checksums for the loaded test data and the predictions table do not match. Maybe the train/test/dev split has changed?")
    else:
        pred_comp_df = df.copy()
        pred_comp_df[attempt] = spacy_predictions_bio
        pred_comp_df.to_csv(save_path, sep='\t', index=False)
        logger.info(f"No predictions table was found at location {save_path}, so a new one was created.")

    spacy_predictions = []
    spacy_predictions_bio = []
    sentences = []

2024-03-26 06:19:41,516 - run_adv_model.log - INFO - Loaded test file from location /content/drive/MyDrive/NLP_Project_New/data/advers_tests/perturbation_1.tsv. Size: 54,303 tokens.
INFO:run_adv_model.log:Loaded test file from location /content/drive/MyDrive/NLP_Project_New/data/advers_tests/perturbation_1.tsv. Size: 54,303 tokens.


Length of df: 54303
Length of tokens: 54303


2024-03-26 06:19:52,143 - run_adv_model.log - INFO - The predictions were added to the predictions table.
INFO:run_adv_model.log:The predictions were added to the predictions table.
2024-03-26 06:19:52,191 - run_adv_model.log - INFO - Loaded test file from location /content/drive/MyDrive/NLP_Project_New/data/advers_tests/perturbation_2.tsv. Size: 54,303 tokens.
INFO:run_adv_model.log:Loaded test file from location /content/drive/MyDrive/NLP_Project_New/data/advers_tests/perturbation_2.tsv. Size: 54,303 tokens.


Length of spacy preds: 54303
Length of df: 54303
Length of tokens: 54303


2024-03-26 06:20:02,971 - run_adv_model.log - INFO - The predictions were added to the predictions table.
INFO:run_adv_model.log:The predictions were added to the predictions table.
2024-03-26 06:20:03,010 - run_adv_model.log - INFO - Loaded test file from location /content/drive/MyDrive/NLP_Project_New/data/advers_tests/perturbation_3.tsv. Size: 54,303 tokens.
INFO:run_adv_model.log:Loaded test file from location /content/drive/MyDrive/NLP_Project_New/data/advers_tests/perturbation_3.tsv. Size: 54,303 tokens.


Length of spacy preds: 54303
Length of df: 54303
Length of tokens: 54303
Length of spacy preds: 54303


2024-03-26 06:20:13,159 - run_adv_model.log - INFO - The predictions were added to the predictions table.
INFO:run_adv_model.log:The predictions were added to the predictions table.
2024-03-26 06:20:13,207 - run_adv_model.log - INFO - Loaded test file from location /content/drive/MyDrive/NLP_Project_New/data/advers_tests/perturbation_4.tsv. Size: 54,303 tokens.
INFO:run_adv_model.log:Loaded test file from location /content/drive/MyDrive/NLP_Project_New/data/advers_tests/perturbation_4.tsv. Size: 54,303 tokens.


Length of df: 54303
Length of tokens: 54303


2024-03-26 06:20:23,731 - run_adv_model.log - INFO - The predictions were added to the predictions table.
INFO:run_adv_model.log:The predictions were added to the predictions table.
2024-03-26 06:20:23,770 - run_adv_model.log - INFO - Loaded test file from location /content/drive/MyDrive/NLP_Project_New/data/advers_tests/perturbation_5.tsv. Size: 54,303 tokens.
INFO:run_adv_model.log:Loaded test file from location /content/drive/MyDrive/NLP_Project_New/data/advers_tests/perturbation_5.tsv. Size: 54,303 tokens.


Length of spacy preds: 54303
Length of df: 54303
Length of tokens: 54303


2024-03-26 06:20:34,556 - run_adv_model.log - INFO - The predictions were added to the predictions table.
INFO:run_adv_model.log:The predictions were added to the predictions table.
2024-03-26 06:20:34,595 - run_adv_model.log - INFO - Loaded test file from location /content/drive/MyDrive/NLP_Project_New/data/advers_tests/perturbation_6.tsv. Size: 54,303 tokens.
INFO:run_adv_model.log:Loaded test file from location /content/drive/MyDrive/NLP_Project_New/data/advers_tests/perturbation_6.tsv. Size: 54,303 tokens.


Length of spacy preds: 54303
Length of df: 54303
Length of tokens: 54303


2024-03-26 06:20:45,330 - run_adv_model.log - INFO - The predictions were added to the predictions table.
INFO:run_adv_model.log:The predictions were added to the predictions table.


Length of spacy preds: 54303
