In [3]:
import spacy
import stanza
from time import perf_counter

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# spaCy pipelines for English and Spanish.
spacy_en = spacy.load("en_core_web_md")
spacy_es = spacy.load("es_core_news_md")

# Stanza pipelines for English and Spanish: tokenizer plus NER, with Stanza
# doing the tokenization itself (tokenize_pretokenized=False).
stanza_en = stanza.Pipeline(
    "en",
    processors="tokenize,ner",
    package={"ner": ["conll03"]},
    tokenize_pretokenized=False,
)
stanza_es = stanza.Pipeline(
    "es",
    processors="tokenize,ner",
    package={"ner": ["conll02"]},
    tokenize_pretokenized=False,
)

2023-08-29 14:16:18 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 809kB/s]                     
2023-08-29 14:16:22 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| ner       | conll03  |

2023-08-29 14:16:22 INFO: Using device: cpu
2023-08-29 14:16:22 INFO: Loading: tokenize
2023-08-29 14:16:22 INFO: Loading: ner
2023-08-29 14:16:23 INFO: Done loading processors!
2023-08-29 14:16:23 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resourc

In [5]:
def load_europarl(filepath):
    """Load the words and labels from a europarl conll02-file

    Only lines that contain at least one tab are used: the first tab-separated
    column is taken as the word and the second one as its label. All other
    lines (for example blank lines) are skipped.

    args: filepath (string, full path of the europarl file)

    return: words (list of all words in the file), labels (list of all labels), text (string of all words joined by single spaces)

    note: the file path depends on the storage location of the file on the computer, and can vary from computer to computer
    """

    words = []
    labels = []

    with open(filepath, "r", encoding="utf-8") as infile:
        for line in infile:
            parts = line.split("\t")

            if len(parts) > 1:
                words.append(parts[0])
                # Remove only the line break. Cutting off the last character
                # unconditionally would truncate the label when the final line
                # has no trailing newline or when further columns follow.
                labels.append(parts[1].rstrip("\n"))

    text = " ".join(words)

    return words, labels, text

In [6]:
def load_subtitles(filepath):
    """Read a movie subtitle txt-file and flatten it into one continuous string

    args: filepath (string, full path of the subtitle file)

    return: text (string; every non-blank line with its line break removed, each followed by a single space)

    note: the file path depends on the storage location of the file on the computer, and can vary from computer to computer
    """

    with open(filepath, "r", encoding="utf-8") as infile:
        # Whitespace-only lines are dropped; kept lines lose only their
        # newline characters, any other surrounding whitespace is preserved.
        kept_lines = [raw_line.strip("\n") for raw_line in infile if raw_line.strip()]

    return "".join(f"{kept} " for kept in kept_lines)

In [87]:
def ner(text, model, lang, pretoken=False):
    """Process the given text, and return the recognized Named Entities

    args: text (string, continuous text), model (string, language model to be used i.e. "spacy" or "stanza"), lang (string, language of the text), pretoken (boolean, only used for Stanza: when True a new Stanza pipeline with tokenize_pretokenized=True is built for this call, otherwise the already loaded stanza_en / stanza_es pipeline is used; defaults to False)

    return: for Stanza a list with one string per token in the form "<token text>\\t<NER tag>\\n"; for spaCy the entities of the processed document (doc.ents)

    raises: ValueError if model is not "spacy"/"stanza" or lang is not "en"/"es"

    note: when specifying the language, please use "en" for English and "es" for Spanish, and please write the names of the language models in lower case letters only
    """

    if lang not in ("en", "es"):
        raise ValueError(f'unsupported language {lang!r}, please use "en" or "es"')

    if model == "spacy":
        nlp = spacy_en if lang == "en" else spacy_es
        # The recognized entities of a spaCy document are stored in doc.ents;
        # a Doc has no attribute called "ner".
        return nlp(text).ents

    if model == "stanza":
        if pretoken:
            ner_package = "conll03" if lang == "en" else "conll02"
            # Keep the new pipeline in its own local name. Assigning to
            # stanza_en / stanza_es here would turn those names into locals of
            # this function and break the pretoken=False case.
            pipeline = stanza.Pipeline(lang, processors="tokenize,ner", package={"ner": [ner_package]}, tokenize_pretokenized=True)
        else:
            pipeline = stanza_en if lang == "en" else stanza_es

        doc = pipeline(text)
        return [f'{token.text}\t{token.ner}\n' for sent in doc.sentences for token in sent.tokens]

    raise ValueError(f'unsupported model {model!r}, please use "spacy" or "stanza"')

In [83]:
def compare_ner(first_ner, second_ner):
    '''Walk through two NER outputs position by position and collect every mismatch.

    args: first_ner (list of entries of the first NER output; its length decides how many positions are checked), second_ner (list of entries of the second NER output, read at the same positions)

    return: differences (list of strings, one per position whose entries are not equal; each names the position and shows the first entry after "test:" and the second entry after "stanza:")
    '''
    return [
        f"line: {position}\ntest: {first_entry}stanza: {second_ner[position]}\n"
        for position, first_entry in enumerate(first_ner)
        if not first_entry == second_ner[position]
    ]

ONLY FOR TEST PURPOSES, TO BE REMOVED LATER

In [9]:
#           Europarl test files: declare each path and load the file, one language at a time

# English (the commented-out line is the same file at another storage location)
path_en = "D:/OperaDownloads/Multilingual_NER-main/Multilingual_NER-main/Europarl Corpus/en-europarl.test.conll02"
##path_en = "C:/Users/Tim.O/Documents/Studium/4. Semester/Advanced Python for NLP/ABSCHLUSSPROJEKT/Europarl Corpus/en-europarl.test.conll02"
w_en, l_en, t_en = load_europarl(path_en)

# Spanish (the commented-out line is the same file at another storage location)
path_es = "D:/OperaDownloads/Multilingual_NER-main/Multilingual_NER-main/Europarl Corpus/es-europarl.test.conll02"
#path_es = "C:/Users/Tim.O/Documents/Studium/4. Semester/Advanced Python for NLP/ABSCHLUSSPROJEKT/Europarl Corpus/es-europarl.test.conll02"
w_es, l_es, t_es = load_europarl(path_es)

In [10]:
#                                           Spacy's NER

#entities_sp = ner(t_es, "spacy", "es")
#print("spaCy: " + str(len(entities_sp)))
#for ent in entities_sp:
#    print(f"{ent.text:<25}{ent.label_:<15}")

In [None]:
#entities_sp = ner(t_en, "spacy", "en")
#for ent in entities_sp:
#    print(f"{ent.text:<25}{ent.label_:<15}")

In [88]:
#                                         Stanza's NER

# t_en is the list of Europarl words joined by spaces, so the pretokenized
# Stanza pipeline is requested.
entities_st = ner(t_en, "stanza", "en", pretoken=True)
#print(*entities_st, sep = '\n')

2023-08-29 14:37:39 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:01, 127kB/s]                     
2023-08-29 14:37:42 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| ner       | conll03  |

2023-08-29 14:37:42 INFO: Using device: cpu
2023-08-29 14:37:42 INFO: Loading: tokenize
2023-08-29 14:37:42 INFO: Loading: ner
2023-08-29 14:37:43 INFO: Done loading processors!


In [None]:
#entities_st = ner(t_es, "stanza", "es")
#print(*entities_st, sep = '\n')

In [89]:
# Number of Europarl words and number of Stanza token entries, one per line.
print(len(w_en), len(entities_st), sep="\n")
# The complete word list followed by the complete Stanza output.
print(w_en, entities_st, sep="\n")

22320
22320


In [71]:
def load_europarl_testpurp(filepath):
    """Load the raw lines of a europarl conll02-file without splitting them into: words, labels, text

    args: filepath (string, full path of the europarl file)

    return: list of strings, one per line of the file in file order; each line keeps its line break, and lines that consist only of a line break are left out

    note: Only here for test purposes to see about comparing the NER outputs, since turning spacy into the BIOES format has yet to be done (will take some more time (I assume)).
    """
    with open(filepath, "r", encoding="utf-8") as infile:
        # The lines are kept exactly as read (including the trailing "\n").
        return [line for line in infile if line != "\n"]

In [90]:
# Load the raw lines of the English Europarl test file; they are compared with the Stanza output further below.
test_path_en = "D:/OperaDownloads/Multilingual_NER-main/Multilingual_NER-main/Europarl Corpus/en-europarl.test.conll02"
test_text = load_europarl_testpurp(test_path_en)
# Optional: show the loaded lines.
#print(*test_text, sep = "\n")

In [91]:
# NOTE(review): compare_ner prints its first argument after "test:" and its
# second after "stanza:". Here the Stanza output is passed first and the file
# lines second, so the two labels in the output are swapped - confirm the
# intended argument order.
ner_differences = compare_ner(entities_st, test_text)
print(*ner_differences)

line: 11
test: European	S-MISC
stanza: European	B-ORG

 line: 12
test: Parliament	O
stanza: Parliament	I-ORG

 line: 55
test: millennium	O
stanza: millennium	B-MISC

 line: 56
test: bug	O
stanza: bug	I-MISC

 line: 147
test: Union	E-ORG
stanza: Union	I-ORG

 line: 163
test: House	S-ORG
stanza: House	B-ORG

 line: 204
test: Lanka	E-LOC
stanza: Lanka	I-LOC

 line: 215
test: Lanka	E-LOC
stanza: Lanka	I-LOC

 line: 219
test: Ponnambalam	E-PER
stanza: Ponnambalam	I-PER

 line: 225
test: European	S-MISC
stanza: European	B-ORG

 line: 226
test: Parliament	O
stanza: Parliament	I-ORG

 line: 250
test: Lankan	E-MISC
stanza: Lankan	I-MISC

 line: 253
test: Parliament	O
stanza: Parliament	B-ORG

 line: 265
test: Lanka	E-LOC
stanza: Lanka	I-LOC

 line: 289
test: Evans	S-PER
stanza: Evans	B-PER

 line: 309
test: House	S-ORG
stanza: House	O

 line: 317
test: Evans	S-PER
stanza: Evans	B-PER

 line: 336
test: Rule	B-MISC
stanza: Rule	O

 line: 337
test: 143	E-MISC
stanza: 143	O

 line: 404
test: Europe

In [32]:
#           These texts are there to help to check if differences can be properly checked.

text2 = "Chris Smith Manning teaches at Stanford University. He lives in the Tauern Tunnel."
text3 = "All the same, we must content ourselves with enacting European law to ensure greater safety."

# ner() also takes the pretoken argument. text3 is a raw sentence rather than
# pre-tokenized text, so Stanza's own tokenizer is requested explicitly.
entities_test_st = ner(text3, "stanza", "en", pretoken=False)
#print(type(entities_st))
print(*entities_test_st)



['All\tO\n', 'the\tO\n', 'same\tO\n', ',\tO\n', 'we\tO\n', 'must\tO\n', 'content\tO\n', 'ourselves\tO\n', 'with\tO\n', 'enacting\tO\n', 'European\tS-MISC\n', 'law\tO\n', 'to\tO\n', 'ensure\tO\n', 'greater\tO\n', 'safety\tO\n', '.\tO\n'] 0.4461886000353843


In [None]:
# Entries of the NER output that do not occur anywhere among the file lines.
# NOTE(review): this uses entities_st, not entities_test_st from the
# sample-sentence cell - confirm which one is intended.
ner_entries = set(entities_st)
file_entries = set(test_text)
difference = list(ner_entries - file_entries)
print(*difference)

Checking for differences with a set difference works, but it only shows which word/label pairs are not included in the file, not where they occur or what exactly is different about them.

Just below is the corresponding extract from the europarl file. Here "law" is labeled (together with "European") as MISC, to be exact as I-MISC.

All	O
 the	O
 same	O
 ,	O
 we	O
 must	O
 content	O
 ourselves	O
 with	O
 enacting	O
 European	B-MISC
 law	I-MISC 
 to	O
 ensure	O
 greater	O
 safety	O

In our NER outcome, "law" isn't labeled as (I-)MISC but as O.

Now, while checking for differences, the method has correctly determined that there is no instance of "European" being labeled with S-MISC in the file. 
However, it doesn't mark "law" labeled with O as a difference, because elsewhere in the file there are instances of "law" being labeled that way.
As a result, a difference whose word/label pair also appears in a different spot of the file slips under the radar of this method.


Suggested solution: 
We go through both NER outcomes one line/word at a time and compare the entries at the same position. Each time a difference is found, it is placed into a list.
The sequence stays correct, and multiple instances of the same difference in different spots are recognized.


In [None]:
# Hand-written entries for the sample sentence, one "word<TAB>label" entry per token.
# "wes" (instead of "we") is a deliberate error.
test_text2 = [
    'All\tO\n',
    'the\tO\n',
    'same\tO\n',
    ',\tO\n',
    'wes\tO\n',
    'must\tO\n',
    'content\tO\n',
    'ourselves\tO\n',
    'with\tO\n',
    'enacting\tO\n',
    'European\tB-MISC\n',
    'law\tI-MISC\n',
    'law\tO\n',
    'ensure\tO\n',
    'greater\tO\n',
    'safety\tO\n',
]

# NOTE(review): this compares against entities_st, not entities_test_st from
# the sample-sentence cell - confirm which one is intended.
sample_differences = compare_ner(test_text2, entities_st)
print(*sample_differences)

In [None]:
#           Paths of the movie subtitle files (used to check that the subtitles can be loaded)

# Spanish subtitles of "El Hoyo".
path_eh = "D:/OperaDownloads/Multilingual_NER-main/Multilingual_NER-main/Movie subtitles/El Hoyo (ES).txt"
##path_bttf = "C:/Users/Tim.O/Documents/Studium/4. Semester/Advanced Python for NLP/ABSCHLUSSPROJEKT/Movie subtitles/Back To The Future (ES).txt"

#print(load_subtitles(path_eh))
# NOTE(review): path_bttf points at the English "El Hoyo" subtitles, not at a
# "Back To The Future" file as the name suggests - confirm this is intended.
path_bttf = "D:/OperaDownloads/Multilingual_NER-main/Multilingual_NER-main/Movie subtitles/El Hoyo (EN).txt"
##path_eh = "C:/Users/Tim.O/Documents/Studium/4. Semester/Advanced Python for NLP/ABSCHLUSSPROJEKT/Movie subtitles/El Hoyo (EN).txt"
#text = load_subtitles(path_bttf)