In [10]:
import data
import spacy
import os
import importlib

# change cwd to project root
while os.path.split(os.getcwd())[-1] != "project":
    os.chdir("..")

NER, SENTIMENT_TOPIC = data.load()
NER_ARRAY = data.NER_data_to_sentence_array(NER)[:1]

In [11]:
model_name = "en_core_web_trf"  
if importlib.util.find_spec(model_name) is None:
    spacy.cli.download(model_name)
    
nlp_spacy = spacy.load(model_name)

In [12]:
spacy_bio_to_true_bio_mapping = {
    "PERSON": "PER",
    "NORP": "MISC",
    "FAC": "LOCATION",
    "ORG": "ORG",
    "GPE": "LOCATION",
    "LOC": "LOCATION",
    "PRODUCT": "MISC",
    "EVENT": "MISC",
    "WORK_OF_ART": "MISC",
    "LAW": "MISC",
    "LANGUAGE": "MISC",
    "DATE": "MISC",
    "TIME": "MISC",
    "PERCENT": "MISC",
    "MONEY": "MISC",
    "QUANTITY": "MISC",
    "ORDINAL": "MISC",
    "CARDINAL": "MISC",
}

correct_splits = 0
correct_bio= 0

total_splits = 0 
total_bio = 0

for sentence, metadata_array in NER_ARRAY:
    spacy_doc = nlp_spacy(sentence)
    total_splits += len(metadata_array)


    spacy_doc_offset = 0
    metadata_array_offset = 0
    while True:
        true_token = metadata_array[metadata_array_offset]
        inferred_token = spacy_doc[spacy_doc_offset]

        true_text = true_token[0]
        inferred_text = inferred_token.text
        if true_text != inferred_text:
            print(f"Splitting mismatch: T: {true_text} I: {inferred_text}")
            longest_token = max(true_text, inferred_text)
            if longest_token == true_text:
                # test data made the token longer, thus spacy must've split something
                spacy_doc_offset += 1
            elif longest_token == inferred_text:
                # test data made the token longer, thus spacy must've split something
                metadata_array_offset += 1
        else:
            correct_splits += 1
        
        spacy_doc_offset += 1
        metadata_array_offset += 1

        if spacy_doc_offset >= len(spacy_doc):
            break
        if metadata_array_offset >= len(metadata_array):
            break

        true_type = true_token[1].split("-")[-1]
        inferred_type = inferred_token.ent_type_ if inferred_token.ent_type_ else "O"

        mapped_type = spacy_bio_to_true_bio_mapping.get(inferred_type)
        total_bio += 1
        if inferred_type == true_type or true_type == mapped_type:
            correct_bio += 1
        else:
            print(f"BIO mismatch: T: {true_text}-{true_type} I: {inferred_text}-{inferred_type}")


Splitting mismatch: T: you're I: you
BIO mismatch: T: Louvre-ORG I: Louvre-FAC
BIO mismatch: T: the-O I: the-WORK_OF_ART


In [13]:

print(f"Correct splits: {correct_splits / total_splits}")
print(f"Correct predictions: {correct_bio / total_bio}")

Correct splits: 0.9473684210526315
Correct predictions: 0.8888888888888888
