In [None]:
import spacy
import stanza
from time import perf_counter

In [None]:
# Load every NLP pipeline once, up front; ner() looks these globals up by name on each call.
# spaCy pipelines for English and Spanish.
spacy_en = spacy.load("en_core_web_md")
spacy_es = spacy.load("es_core_news_md")
# Stanza pipelines restricted to the tokenize and ner processors, with the NER package
# chosen explicitly per language ("conll03" for English, "conll02" for Spanish).
stanza_en = stanza.Pipeline("en", processors="tokenize,ner", package={"ner": ["conll03"]})
stanza_es = stanza.Pipeline("es", processors="tokenize,ner", package={"ner": ["conll02"]})

In [None]:
def load_europarl(filepath):
    """Load the data from a europarl conll02-file

    args: filepath (string, full path of the europarl file)

    return: words (list of all words in the file), labels (list of all labels), text (string of continuous text, i.e. all words joined by single spaces)

    note: the file path depends on the storage location of the file on the computer, and can vary from computer to computer

    note: lines that contain no tab (e.g. blank separator lines) are skipped; only the first two tab-separated columns of a line are used
    """

    words = []
    labels = []

    with open(filepath, "r", encoding="utf-8") as infile:
        for line in infile:
            parts = line.split("\t")

            # A line without a tab carries no word/label pair.
            if len(parts) > 1:
                words.append(parts[0])
                # Remove only the line break. Slicing off the last character
                # unconditionally would truncate the label whenever the final
                # line has no trailing newline or the file has extra columns.
                labels.append(parts[1].rstrip("\n"))

    text = " ".join(words)

    return words, labels, text

In [None]:
def load_subtitles(filepath):
    """Read a movie subtitle txt-file into one continuous string

    args: filepath (string, full path of the subtitle file)

    return: text (string; every non-blank line of the file with its line break removed and a single space appended, concatenated in file order)

    note: the file path depends on the storage location of the file on the computer, and can vary from computer to computer
    """

    with open(filepath, "r", encoding="utf-8") as infile:
        # Whitespace-only lines are dropped; kept lines lose only their "\n"
        # characters and each one is followed by a space.
        pieces = [raw.strip("\n") + " " for raw in infile if raw.strip()]

    return "".join(pieces)

In [None]:
def ner(text, model, lang):
    """Process the given text, and return the recognized Named Entities

    args: text (string, continuous text), model (string, NLP tool to be used: "spacy" or "stanza"), lang (string, language of the text: "en" or "es")

    return: depends on the model.
        "spacy":  the `ents` attribute of the processed spaCy document (no timing is returned).
        "stanza": tuple (list with one string per token in the form "<token text>\\t<NER tag>\\n", covering tagged and untagged tokens alike; time in seconds that the Stanza pipeline needed to process the text).

    raises: ValueError if model or lang is not one of the supported values

    note: when specifying the language, please use "en" for English and "es" for Spanish, and please write the names of the language models in lower case letters only
    """

    # Validate first, so that a typo yields a clear message instead of an
    # UnboundLocalError at the final return.
    if model not in ("spacy", "stanza"):
        raise ValueError(f'unsupported model {model!r}: use "spacy" or "stanza"')
    if lang not in ("en", "es"):
        raise ValueError(f'unsupported language {lang!r}: use "en" or "es"')

    if model == "spacy":
        nlp = spacy_en if lang == "en" else spacy_es
        return nlp(text).ents

    nlp = stanza_en if lang == "en" else stanza_es

    # Note regarding the timer: the measured time differs on every run, because
    # it depends on the machine and on background applications, so each value
    # is only an estimate of the general processing time. The module "timeit"
    # would give steadier numbers, but it runs the call several times and
    # returns nothing besides the time, so the pipeline would have to be run
    # again to get the result, which makes the whole program slower.
    start = perf_counter()
    doc = nlp(text)
    execution_time = perf_counter() - start

    tagged_tokens = [f'{token.text}\t{token.ner}\n' for sent in doc.sentences for token in sent.tokens]
    return tagged_tokens, execution_time

In [None]:
# Load the English and Spanish Europarl test files as (words, labels, text).
# The paths are absolute locations on one particular machine; adjust them before running.
path_en = "D:/OperaDownloads/Multilingual_NER-main/Multilingual_NER-main/Europarl Corpus/en-europarl.test.conll02"
w_en, l_en, t_en = load_europarl(path_en)
path_es = "D:/OperaDownloads/Multilingual_NER-main/Multilingual_NER-main/Europarl Corpus/es-europarl.test.conll02"
w_es, l_es, t_es = load_europarl(path_es)

In [None]:
# Run spaCy NER on the Spanish Europarl text and report how many entities were found.
entities_sp = ner(t_es, "spacy", "es")
print("spaCy: " + str(len(entities_sp)))
# Uncomment to list every entity together with its label:
#for ent in entities_sp:
#    print(f"{ent.text:<25}{ent.label_:<15}")

In [None]:
# Run Stanza NER on the Spanish Europarl text; for Stanza, ner() returns the
# "token<TAB>tag" strings together with the processing time.
entities_st, time = ner(t_es, "stanza", "es")
# Every entry already ends in "\n", so sep='\n' puts a blank line between entries.
print(*entities_st, sep = '\n')
#print(time)
#print("Stanza: " + str(len(entities_st)))
# NOTE(review): entities_st holds plain strings here, so the loop below would need the
# Stanza entity objects instead before it can be re-enabled.
#for ent in entities_st:
#    print(f"{ent.text:<25}{ent.type:<15}")

In [None]:
#entities_sp = ner(text, "spacy", "en")
#for ent in entities_sp:
#    print(f"{ent.text:<25}{ent.label_:<15}")

In [None]:
def load_europarl_testpurp(filepath):
    """Load a europarl conll02-file as raw lines, without splitting it into: words, labels, text

    args: filepath (string, full path of the europarl file)

    return: text (list of strings, one entry per line of the file in file order; line breaks are kept)

    note: Only here for test purposes, to compare the NER outputs against the file, since converting the spaCy output into the BIOES format has yet to be done.
    """
    with open(filepath, "r", encoding="utf-8") as infile:
        # Iterating a text file yields its lines unchanged, line break included.
        return list(infile)

In [None]:
# Load the English Europarl test file as raw lines and print them.
# The path is an absolute location on one particular machine; adjust it before running.
test_path_en = "D:/OperaDownloads/Multilingual_NER-main/Multilingual_NER-main/Europarl Corpus/en-europarl.test.conll02"
test_text = load_europarl_testpurp(test_path_en)
# The loaded lines keep their own line breaks, so sep="\n" leaves a blank line between entries.
print(*test_text, sep = "\n")

In [None]:
# Short sample texts used to check whether differences between NER outputs can be detected.

#text2 = "Chris Manning teaches at Stanford University. He lives in the Tauern Tunnel."
text3 = "All the same, we must content ourselves with enacting European law to ensure greater safety"

# Tag the sample sentence with the English Stanza pipeline; ner() also returns the processing time.
entities_st, time = ner(text3, "stanza", "en")
print(time)
print(type(entities_st))
print(*entities_st)



In [None]:
# Entries of the Stanza output that do not occur anywhere in the Europarl file.
# Converting both sides to sets discards the token order and any repeated entries.
difference = list(set(entities_st) - set(test_text))
print(*difference)

Checking for differences this way works, but it only shows which entries of the NER output are missing from the file, not what is different about them.

Below is the corresponding extract from the europarl file. Here "law" is labeled (together with "European") as MISC, to be exact as I-MISC.
All	O
 the	O
 same	O
 ,	O
 we	O
 must	O
 content	O
 ourselves	O
 with	O
 enacting	O
 European	B-MISC
 law	I-MISC     *HERE
 to	O
 ensure	O
 greater	O
 safety	O

In our NER output, "law" is not labeled as (I-)MISC but as O.

When checking for differences, the method correctly determines that the file contains no instance of "European" labeled S-MISC.
However, it does not report "law" labeled O as a difference, because the file contains other instances of "law" with exactly that label.
As a result, a difference slips under the radar of the method whenever the same word/label pair also appears somewhere else in the file.


Suggested solution:
We walk through both NER outputs in parallel and compare them one line/word at a time, adding each mismatch to a list as soon as it is found.
This keeps the sequence intact, and multiple instances of the same difference in different spots are recognized.


In [124]:
# Hand-written reference tagging of the sample sentence, in the same
# "token\ttag\n" form that ner() produces for Stanza.
test_text2 = ['All\tO\n', 'the\tO\n', 'same\tO\n', ',\tO\n', 'wes\tO\n', 'must\tO\n', 'content\tO\n', 'ourselves\tO\n', 'with\tO\n', 'enacting\tO\n', 'European\tB-MISC\n', 'law\tI-MISC\n', 'law\tO\n', 'ensure\tO\n', 'greater\tO\n', 'safety\tO\n']
# Intentional error: "wes" instead of "we".
differences = []
# zip() stops at the shorter list, so a length mismatch is reported explicitly
# instead of raising an IndexError or silently ignoring the surplus entries.
if len(test_text2) != len(entities_st):
    print(f"Length mismatch: {len(test_text2)} test lines vs. {len(entities_st)} stanza lines; only the overlapping part is compared.")
# Compare position by position so that order is respected and repeated
# word/label pairs are each checked on their own.
for expected, predicted in zip(test_text2, entities_st):
    if expected != predicted:
        differences.append(f"test: {expected}stanza: {predicted}\n")
print(*differences)

test: wes	O
stanza: we	O

 test: European	B-MISC
stanza: European	S-MISC

 test: law	I-MISC
stanza: law	O

 test: law	O
stanza: to	O




In [None]:
# Paths of the movie subtitle files; the load_subtitles() calls are currently commented out.

path_eh = "D:/OperaDownloads/Multilingual_NER-main/Multilingual_NER-main/Movie subtitles/El Hoyo (ES).txt"

#print(load_subtitles(path_eh))
# NOTE(review): the variable is called path_bttf but points to the "El Hoyo (EN)" file;
# confirm which movie is intended.
path_bttf = "D:/OperaDownloads/Multilingual_NER-main/Multilingual_NER-main/Movie subtitles/El Hoyo (EN).txt"

#text = load_subtitles(path_bttf)