# **utils.py**
Auxiliary methods for loading the data, performing Named Entity Recognition (NER) and evaluating the results

In [1]:
# install Stanza
!pip install stanza==1.4.0

Collecting stanza==1.4.0
  Downloading stanza-1.4.0-py3-none-any.whl (574 kB)
     574.7/574.7 kB 8.7 MB/s eta 0:00:00
Collecting emoji (from stanza==1.4.0)
  Downloading emoji-2.8.0-py2.py3-none-any.whl (358 kB)
     358.9/358.9 kB 42.1 MB/s eta 0:00:00
Collecting transformers (from stanza==1.4.0)
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
     7.7/7.7 MB 65.6 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers->stanza==1.4.0)
  Downloading huggingface_h… [output truncated]

In [2]:
# download SpaCy language models
!python -m spacy download en_core_web_md    # for English
!python -m spacy download es_core_news_md   # for Spanish

2023-10-12 12:52:19.916522: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-12 12:52:23.180360: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-12 12:52:23.180866: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-

In [3]:
import spacy
import stanza
import re
from spacy.tokens import Doc

# SOURCE: https://stackoverflow.com/questions/65160277/spacy-tokenizer-with-only-whitespace-rule
# by user "Sofie VL"
# START
class WhitespaceTokenizer:
    """Whitespace-only tokenizer that can replace a spaCy pipeline's tokenizer.

    The text is cut with ``str.split()`` and nothing else, so the resulting
    ``Doc`` holds exactly one token per whitespace-separated word. It is used
    because the default spaCy tokenizer produced a doc that was longer than
    the list of gold labels.
    """

    def __init__(self, vocab):
        # vocabulary passed on to every Doc this tokenizer builds
        self.vocab = vocab

    def __call__(self, text):
        tokens = text.split()
        # every token is marked as being followed by one space character
        trailing_space = [True for _ in tokens]
        return Doc(self.vocab, words=tokens, spaces=trailing_space)

# loading the spaCy and Stanza language models for English and Spanish
# Each spaCy pipeline gets its default tokenizer replaced by the
# WhitespaceTokenizer right after loading, so it yields one token per
# whitespace-separated word.
spacy_en = spacy.load("en_core_web_md")
spacy_en.tokenizer = WhitespaceTokenizer(spacy_en.vocab)
spacy_es = spacy.load("es_core_news_md")
spacy_es.tokenizer = WhitespaceTokenizer(spacy_es.vocab)
# END

# Stanza pipelines restricted to tokenization and NER; the NER package is
# conll03 for English and conll02 for Spanish.
# NOTE(review): tokenize_pretokenized=True presumably makes Stanza split on
# whitespace only, so its token count lines up with the spaCy docs above --
# confirm against the Stanza documentation.
stanza_en = stanza.Pipeline("en", processors="tokenize,ner",
                            package={"ner": ["conll03"]},
                            tokenize_pretokenized=True)
stanza_es = stanza.Pipeline("es", processors="tokenize,ner",
                            package={"ner": ["conll02"]},
                            tokenize_pretokenized=True)

def load_europarl(filepath):
    """Load the data from a europarl conll02-file

    Every line with at least two tab-separated columns contributes one word
    (first column) and one label (second column). Lines without a tab, such as
    blank separator lines, are skipped.

    args: filepath (string, full path of the europarl file)

    return: words (list of all words in the file), labels (list of all labels),
    text (string of continuous text, i.e. all words joined by single spaces)

    note: the file path depends on the storage location of the file on the
    computer, and can vary from computer to computer
    """

    words = []
    labels = []

    with open(filepath, "r", encoding="utf-8") as infile:
        for line in infile:
            # Strip the line break from the line itself instead of cutting the
            # last character off the label: the label stays intact when the
            # final line has no trailing newline or when more columns follow.
            columns = line.rstrip("\n").split("\t")

            if len(columns) > 1:
                words.append(columns[0])
                labels.append(columns[1])

    text = " ".join(words)

    return words, labels, text


def load_subtitles(filepath, encoding="latin-1"):
    """Load movie subtitle txt-file, and remove blank lines and line breaks

    args: filepath (string, full path of the subtitle file), encoding (string,
    text encoding of the file; defaults to "latin-1", pass e.g. "utf-8" for
    files stored in another encoding)

    return: words (list of all words in the file), text (string of continuous
    text without blank lines and line breaks, with punctuation marks separated
    from the neighbouring words by spaces)

    note: the file path depends on the storage location of the file on the
    computer, and can vary from computer to computer
    """

    with open(filepath, "r", encoding=encoding) as infile:
        # keep only the lines with visible content; each of them is followed
        # by a single space in place of its line break
        text = "".join(line.strip("\n") + " " for line in infile if line.strip())

    # add whitespaces before/after punctuation marks to facilitate tokenization
    text = re.sub(r'([.,:!?)])', r' \1', text)   # space before closing marks
    text = re.sub(r'([¿¡(])', r'\1 ', text)      # space after opening marks
    text = text.replace("'", " ' ")              # spaces around apostrophes
    text = text.replace("\\", "")                # drop every backslash

    words = text.split()

    return words, text

def ner(text, model, lang):
    """Process the given text, and return the list of predicted labels

    args: text (string, continuous text), model (string, language model to be
    used i.e. "spacy" or "stanza"), lang (string, language of the text)

    return: list of all predicted labels (including recognized Named Entities
    as well as words which are not Named Entities) in the BIO(ES) format

    raises: ValueError if model or lang is not one of the supported values
    (previously such a call failed with an UnboundLocalError)

    note: when specifying the language, please use "en" for English and "es"
    for Spanish, and please write the names of the language models in lower
    case letters only
    """

    if lang not in ("en", "es"):
        raise ValueError(f'unsupported language {lang!r}: use "en" or "es"')

    if model == "spacy":
        nlp = spacy_en if lang == "en" else spacy_es
        # ent_iob_: return the Named Entities in the BIO format, and the
        # non-entities as well ("O")
        # ent_type_: type of the entity according to the SpaCy tag set
        return [token.ent_iob_ + "-" + token.ent_type_ for token in nlp(text)]

    if model == "stanza":
        nlp = stanza_en if lang == "en" else stanza_es
        # token.ner: return the Named Entity tag of the current token
        return [token.ner for sent in nlp(text).sentences for token in sent.tokens]

    raise ValueError(f'unsupported model {model!r}: use "spacy" or "stanza"')


# Mapping from the fine-grained SpaCy entity types to the 4 label format of the
# europarl-data. Types mapped to "O" are treated as non-entities.
_SPACY_TO_EUROPARL = {
    "PERSON": "PER",
    "GPE": "LOC",
    **dict.fromkeys(("NORP", "LANGUAGE", "EVENT", "LAW"), "MISC"),
    **dict.fromkeys(("FAC", "PRODUCT", "WORK_OF_ART", "DATE", "TIME",
                     "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"),
                    "O"),
}

# One pattern for all types. The word boundaries (\b) make sure that only a
# label that is exactly one of the types is replaced, not a longer word that
# merely contains it.
_SPACY_TYPE_PATTERN = re.compile(
    r"\b(?:" + "|".join(_SPACY_TO_EUROPARL) + r")\b")


def postprocess_labels(pred_labels):
    """Transform the fine-grained labels predicted by SpaCy into the 4 label
    format (PER, LOC, ORG, MISC), in which the europarl-data is annotated

    args: pred_labels (list of all predicted labels, fine-grained, each of the
    form "<iob>-<type>", e.g. "B-PERSON" or "O-")

    return: postprocessed (list of all labels, transformed into 4 label format;
    the BIO prefix of the remaining entities is kept)

    note: the transformation is done according to our findings on the europarl-
    data: PER = PERSON; LOC = GPE, LOC; ORG = ORG; MISC = NORP, LANGUAGE, EVENT,
    (ORG), LAW; O = O, FAC, PRODUCT, WORK_OF_ART, DATE, TIME, PERCENT, MONEY,
    QUANTITY, ORDINAL, CARDINAL"""

    # a single pass over the labels: every known fine-grained type is swapped
    # for its coarse counterpart, everything else is left untouched
    renamed = (
        _SPACY_TYPE_PATTERN.sub(
            lambda match: _SPACY_TO_EUROPARL[match.group()], label)
        for label in pred_labels
    )

    # "O-" is a non-entity with a hanging '-', and "<iob>-O" is an entity whose
    # type was mapped to "O": both collapse to the plain label "O"
    postprocessed = [
        "O" if label.startswith("O-") or label.endswith("-O") else label
        for label in renamed
    ]

    return postprocessed


def label_match(label1, label2):
    """Tell whether two labels denote the same entity class

    Two labels match when both are exactly "O", or when both contain the same
    class name (PER, LOC, ORG or MISC); a prefix such as "B-" or "S-" is
    therefore ignored.

    args: label1, label2 (strings)

    return: a truth value"""

    # non-entities only match when both labels are the bare "O"
    if label1 == "O" and label2 == "O":
        return True

    return any(name in label1 and name in label2
               for name in ("PER", "LOC", "ORG", "MISC"))


def eval_europarl(word_list, gold_labels, pred_labels, model):
    """Evaluate the europarl-file: compare predicted labels and the gold
    labels, i.e. give the accuracy and return a list of all words which were
    annotated with a different label as their gold label

    args: word_list (list of all words in the europarl-file), gold_labels
    (list of all gold labels), pred_labels (list of all predicted labels
    [including recognized Named Entities as well as words which are not Named
    Entities] in the BIOES format), model (string, language model to be used
    i.e. spaCy or Stanza)

    return: accuracy (float between 0 and 1, share of pred_labels matching the
    gold_labels; 0.0 for empty input), differences (list of lists, each
    consisting of index, word, gold label and predicted label)

    raises: ValueError if gold_labels and pred_labels differ in length
    (previously the function silently returned None in that case)

    note: please write the names of the language models in lower case letters
    only
    """

    if len(gold_labels) != len(pred_labels):
        raise ValueError(
            f"cannot compare {len(gold_labels)} gold labels with "
            f"{len(pred_labels)} predicted labels")

    # nothing to compare; also avoids the division by zero below
    if not gold_labels:
        return 0.0, []

    # only the SpaCy predictions use the fine-grained tag set
    if model == "spacy":
        pred_labels = postprocess_labels(pred_labels)

    # one entry for every position where prediction and gold label disagree
    differences = [
        [index, word_list[index], gold, pred]
        for index, (gold, pred) in enumerate(zip(gold_labels, pred_labels))
        if not label_match(gold, pred)
    ]

    # everything that is not a difference is a match
    accuracy = (len(gold_labels) - len(differences)) / len(gold_labels)

    return accuracy, differences


def eval_subtitles(word_list, spacy_labels, stanza_labels):
    """Evaluate the subtitle-file: measure the concordance between the labels
    predicted by the SpaCy and Stanza language models, and return a list of
    all words which were annotated differently with the two models

    args: word_list (list of all words in the subtitle-file), spacy_labels
    (list of all labels predicted by SpaCy), stanza_labels (list of all labels
    predicted by Stanza)

    return: concordance (float between 0 and 1, share of positions on which the
    two language models agree; 0.0 for empty input), differences (list of
    lists, each consisting of index, word and the labels predicted by SpaCy and
    Stanza)

    raises: ValueError if spacy_labels and stanza_labels differ in length
    (previously the function silently returned None in that case)"""

    if len(spacy_labels) != len(stanza_labels):
        raise ValueError(
            f"cannot compare {len(spacy_labels)} SpaCy labels with "
            f"{len(stanza_labels)} Stanza labels")

    # nothing to compare; also avoids the division by zero below
    if not spacy_labels:
        return 0.0, []

    # bring the fine-grained SpaCy labels into the 4 label format first
    spacy_labels = postprocess_labels(spacy_labels)

    # one entry for every position where the two models disagree
    differences = [
        [index, word_list[index], spacy_label, stanza_label]
        for index, (spacy_label, stanza_label)
        in enumerate(zip(spacy_labels, stanza_labels))
        if not label_match(spacy_label, stanza_label)
    ]

    # everything that is not a difference is an agreement
    concordance = (len(spacy_labels) - len(differences)) / len(spacy_labels)

    return concordance, differences

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/tokenize/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/ner/conll03.pt:   0%|          …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/forward_charlm/1billion.pt:   0…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/backward_charlm/1billion.pt:   …

INFO:stanza:Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| ner       | conll03  |

INFO:stanza:Use device: gpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …



Downloading https://huggingface.co/stanfordnlp/stanza-es/resolve/v1.4.0/models/tokenize/ancora.pt:   0%|      …

Downloading https://huggingface.co/stanfordnlp/stanza-es/resolve/v1.4.0/models/mwt/ancora.pt:   0%|          |…

Downloading https://huggingface.co/stanfordnlp/stanza-es/resolve/v1.4.0/models/ner/conll02.pt:   0%|          …

Downloading https://huggingface.co/stanfordnlp/stanza-es/resolve/v1.4.0/models/backward_charlm/newswiki.pt:   …

Downloading https://huggingface.co/stanfordnlp/stanza-es/resolve/v1.4.0/models/forward_charlm/newswiki.pt:   0…

INFO:stanza:Loading these models for language: es (Spanish):
| Processor | Package |
-----------------------
| tokenize  | ancora  |
| mwt       | ancora  |
| ner       | conll02 |

INFO:stanza:Use device: gpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


# Europarl

## English

In [4]:
# upload the europarl data files into the Colab runtime via the file picker
# (the cell output lists every file that was saved)
from google.colab import files
uploaded = files.upload()

Saving en-europarl.test.conll02 to en-europarl.test.conll02
Saving es-europarl.test.conll02 to es-europarl.test.conll02


In [5]:
from time import perf_counter
# NOTE(review): os is not used in any of the visible cells -- confirm before
# removing the import
import os

# Benchmark SpaCy and Stanza on the English europarl test file: time each NER
# run and score its predictions against the gold labels.

# Load the English europarl-data (words, gold labels and the continuous text)
path_en = "en-europarl.test.conll02"
words, labels, text = load_europarl(path_en)

"""---------------------------------SPACY-----------------------------------"""

# Perform the Named Entity Recognition with SpaCy and measure the time it takes
# (only the ner() call is timed, not the evaluation)
start_spacy = perf_counter()
entities_spacy = ner(text, "spacy", "en")
stop_spacy = perf_counter()
duration_spacy = stop_spacy - start_spacy

# Evaluate the labels predicted by SpaCy
accuracy_spacy, differences_spacy = eval_europarl(words, labels, entities_spacy, "spacy")

# Print the SpaCy results
print(f"Duration of the SpaCy NER in seconds: {round(duration_spacy, 3)} sec")
print(f"Accuracy of the SpaCy NER in percent: {round(accuracy_spacy * 100, 3)} %")
# Uncomment the following lines to print every mismatch as a table row
# (index, word, gold label, prediction)
#print("Differences:")
#print("Index   |Word                     |Gold Label     |Prediction     \n")
#print("------------------------------------------------------------------\n")
#for i in range(len(differences_spacy)):
#    print(f"{differences_spacy[i][0]:<8}|{differences_spacy[i][1]:<25}|{differences_spacy[i][2]:<15}|{differences_spacy[i][3]:<15}\n")

print("SpaCy DONE!")

"""---------------------------------STANZA----------------------------------"""

# Perform the Named Entity Recognition with Stanza and measure the time it takes
# (only the ner() call is timed, not the evaluation)
start_stanza = perf_counter()
entities_stanza = ner(text, "stanza", "en")
stop_stanza = perf_counter()
duration_stanza = stop_stanza - start_stanza

# Evaluate the labels predicted by Stanza
accuracy_stanza, differences_stanza = eval_europarl(words, labels, entities_stanza, "stanza")

# Print the Stanza results
print(f"Duration of the Stanza NER in seconds: {round(duration_stanza, 3)} sec")
print(f"Accuracy of the Stanza NER in percent: {round(accuracy_stanza * 100, 3)} %")
# Uncomment the following lines to print every mismatch as a table row
# (index, word, gold label, prediction)
#print("Differences:")
#print("Index   |Word                     |Gold Label     |Prediction     \n")
#print("------------------------------------------------------------------\n")
#for i in range(len(differences_stanza)):
#    print(f"{differences_stanza[i][0]:<8}|{differences_stanza[i][1]:<25}|{differences_stanza[i][2]:<15}|{differences_stanza[i][3]:<15}\n")

print("Stanza DONE!")

Duration of the SpaCy NER in seconds: 4.631 sec
Accuracy of the SpaCy NER in percent: 95.909 %
SpaCy DONE!
Duration of the Stanza NER in seconds: 30.32 sec
Accuracy of the Stanza NER in percent: 96.698 %
Stanza DONE!


## Spanish

In [6]:
from time import perf_counter
# NOTE(review): os is not used in any of the visible cells -- confirm before
# removing the import
import os

# Benchmark SpaCy and Stanza on the Spanish europarl test file: time each NER
# run and score its predictions against the gold labels.

# Load the Spanish europarl-data (words, gold labels and the continuous text)
path_es = "es-europarl.test.conll02"
words, labels, text = load_europarl(path_es)

"""---------------------------------SPACY-----------------------------------"""

# Perform the Named Entity Recognition with SpaCy and measure the time it takes
# (only the ner() call is timed, not the evaluation)
start_spacy = perf_counter()
entities_spacy = ner(text, "spacy", "es")
stop_spacy = perf_counter()
duration_spacy = stop_spacy - start_spacy

# Evaluate the labels predicted by SpaCy
accuracy_spacy, differences_spacy = eval_europarl(words, labels, entities_spacy, "spacy")

# Print the SpaCy results
print(f"Duration of the SpaCy NER in seconds: {round(duration_spacy, 3)} sec")
print(f"Accuracy of the SpaCy NER in percent: {round(accuracy_spacy * 100, 3)} %")
# Uncomment the following lines to print every mismatch as a table row
# (index, word, gold label, prediction)
#print("Differences:")
#print("Index   |Word                     |Gold Label     |Prediction     \n")
#print("------------------------------------------------------------------\n")
#for i in range(len(differences_spacy)):
#    print(f"{differences_spacy[i][0]:<8}|{differences_spacy[i][1]:<25}|{differences_spacy[i][2]:<15}|{differences_spacy[i][3]:<15}\n")

print("SpaCy DONE!")

"""---------------------------------STANZA----------------------------------"""

# Perform the Named Entity Recognition with Stanza and measure the time it takes
# (only the ner() call is timed, not the evaluation)
start_stanza = perf_counter()
entities_stanza = ner(text, "stanza", "es")
stop_stanza = perf_counter()
duration_stanza = stop_stanza - start_stanza

# Evaluate the labels predicted by Stanza
accuracy_stanza, differences_stanza = eval_europarl(words, labels, entities_stanza, "stanza")

# Print the Stanza results
print(f"Duration of the Stanza NER in seconds: {round(duration_stanza, 3)} sec")
print(f"Accuracy of the Stanza NER in percent: {round(accuracy_stanza * 100, 3)} %")
# Uncomment the following lines to print every mismatch as a table row
# (index, word, gold label, prediction)
#print("Differences:")
#print("Index   |Word                     |Gold Label     |Prediction     \n")
#print("------------------------------------------------------------------\n")
#for i in range(len(differences_stanza)):
#    print(f"{differences_stanza[i][0]:<8}|{differences_stanza[i][1]:<25}|{differences_stanza[i][2]:<15}|{differences_stanza[i][3]:<15}\n")

print("Stanza DONE!")

Duration of the SpaCy NER in seconds: 4.37 sec
Accuracy of the SpaCy NER in percent: 88.758 %
SpaCy DONE!
Duration of the Stanza NER in seconds: 31.502 sec
Accuracy of the Stanza NER in percent: 95.386 %
Stanza DONE!


# El Hoyo

In [7]:
# upload the subtitle files into the Colab runtime via the file picker
# (the cell output lists every file that was saved)
from google.colab import files
uploaded = files.upload()

Saving Back To The Future (EN).txt to Back To The Future (EN).txt
Saving Back To The Future (ES).txt to Back To The Future (ES).txt
Saving El Hoyo (EN).txt to El Hoyo (EN).txt
Saving El Hoyo (ES).txt to El Hoyo (ES).txt


In [8]:
# Compare SpaCy and Stanza on the English and Spanish subtitles of "El Hoyo".
# There are no gold labels for the subtitles, so the two models are scored
# against each other (concordance) instead of against a reference.
# perf_counter is imported in an earlier cell.
path_en = "El Hoyo (EN).txt"
path_es = "El Hoyo (ES).txt"

# Load both subtitle files (word list and continuous text for each)
words_en, text_en = load_subtitles(path_en)
words_es, text_es = load_subtitles(path_es)

"""--------------------------------ENGLISH----------------------------------"""

# Perform the Named Entity Recognition with SpaCy and measure the time it takes
start_spacy_en = perf_counter()
entities_spacy_en = ner(text_en, "spacy", "en")
stop_spacy_en = perf_counter()
duration_spacy_en = stop_spacy_en - start_spacy_en

# Perform the Named Entity Recognition with Stanza and measure the time it takes
start_stanza_en = perf_counter()
entities_stanza_en = ner(text_en, "stanza", "en")
stop_stanza_en = perf_counter()
duration_stanza_en = stop_stanza_en - start_stanza_en

# Evaluate the labels predicted by SpaCy and Stanza
# (eval_subtitles postprocesses the raw SpaCy labels itself)
concordance_en, differences_en = eval_subtitles(words_en, entities_spacy_en,
                                                entities_stanza_en)

# Print the results for English
print(f"Duration of the SpaCy NER in seconds:  {round(duration_spacy_en, 3)} sec")
print(f"Duration of the Stanza NER in seconds: {round(duration_stanza_en, 3)} sec")
print(f"Concordance of the SpaCy and Stanza in percent: {round(concordance_en * 100, 3)} %")

print("English DONE!")

"""--------------------------------SPANISH----------------------------------"""

# Perform the Named Entity Recognition with SpaCy and measure the time it takes
start_spacy_es = perf_counter()
entities_spacy_es = ner(text_es, "spacy", "es")
stop_spacy_es = perf_counter()
duration_spacy_es = stop_spacy_es - start_spacy_es

# Perform the Named Entity Recognition with Stanza and measure the time it takes
start_stanza_es = perf_counter()
entities_stanza_es = ner(text_es, "stanza", "es")
stop_stanza_es = perf_counter()
duration_stanza_es = stop_stanza_es - start_stanza_es

# Evaluate the labels predicted by SpaCy and Stanza
# (eval_subtitles postprocesses the raw SpaCy labels itself)
concordance_es, differences_es = eval_subtitles(words_es, entities_spacy_es,
                                                entities_stanza_es)

# Print the results for Spanish
print(f"Duration of the SpaCy NER in seconds:  {round(duration_spacy_es, 3)} sec")
print(f"Duration of the Stanza NER in seconds: {round(duration_stanza_es, 3)} sec")
print(f"Concordance of SpaCy and Stanza in percent: {round(concordance_es * 100, 3)} %")

print("Spanish DONE!")

Duration of the SpaCy NER in seconds:  2.078 sec
Duration of the Stanza NER in seconds: 7.948 sec
Concordance of the SpaCy and Stanza in percent: 98.128 %
English DONE!
Duration of the SpaCy NER in seconds:  0.885 sec
Duration of the Stanza NER in seconds: 7.563 sec
Concordance of SpaCy and Stanza in percent: 79.92 %
Spanish DONE!


# Back to the Future

In [9]:
# Compare SpaCy and Stanza on the English and Spanish subtitles of
# "Back to the Future". There are no gold labels for the subtitles, so the two
# models are scored against each other (concordance).
# perf_counter is imported in an earlier cell.
path_en = "Back To The Future (EN).txt"
path_es = "Back To The Future (ES).txt"

# Load both subtitle files (word list and continuous text for each)
words_en, text_en = load_subtitles(path_en)
words_es, text_es = load_subtitles(path_es)

"""--------------------------------ENGLISH----------------------------------"""

# Perform the Named Entity Recognition with SpaCy and measure the time it takes
start_spacy_en = perf_counter()
entities_spacy_en = ner(text_en, "spacy", "en")
stop_spacy_en = perf_counter()
duration_spacy_en = stop_spacy_en - start_spacy_en

# Perform the Named Entity Recognition with Stanza and measure the time it takes
start_stanza_en = perf_counter()
entities_stanza_en = ner(text_en, "stanza", "en")
stop_stanza_en = perf_counter()
duration_stanza_en = stop_stanza_en - start_stanza_en

# Evaluate the labels predicted by SpaCy and Stanza
# (eval_subtitles postprocesses the raw SpaCy labels itself)
concordance_en, differences_en = eval_subtitles(words_en, entities_spacy_en,
                                                entities_stanza_en)

# Print the results for English
print(f"Duration of the SpaCy NER in seconds:  {round(duration_spacy_en, 3)} sec")
print(f"Duration of the Stanza NER in seconds: {round(duration_stanza_en, 3)} sec")
print(f"Concordance of the SpaCy and Stanza in percent: {round(concordance_en * 100, 3)} %")

print("English DONE!")

"""--------------------------------SPANISH----------------------------------"""

# Perform the Named Entity Recognition with SpaCy and measure the time it takes
start_spacy_es = perf_counter()
entities_spacy_es = ner(text_es, "spacy", "es")
stop_spacy_es = perf_counter()
duration_spacy_es = stop_spacy_es - start_spacy_es

# Perform the Named Entity Recognition with Stanza and measure the time it takes
start_stanza_es = perf_counter()
entities_stanza_es = ner(text_es, "stanza", "es")
stop_stanza_es = perf_counter()
duration_stanza_es = stop_stanza_es - start_stanza_es

# Evaluate the labels predicted by SpaCy and Stanza
# (eval_subtitles postprocesses the raw SpaCy labels itself)
concordance_es, differences_es = eval_subtitles(words_es, entities_spacy_es,
                                                entities_stanza_es)

# Print the results for Spanish
print(f"Duration of the SpaCy NER in seconds:  {round(duration_spacy_es, 3)} sec")
print(f"Duration of the Stanza NER in seconds: {round(duration_stanza_es, 3)} sec")
print(f"Concordance of SpaCy and Stanza in percent: {round(concordance_es * 100, 3)} %")

print("Spanish DONE!")

Duration of the SpaCy NER in seconds:  1.74 sec
Duration of the Stanza NER in seconds: 14.468 sec
Concordance of the SpaCy and Stanza in percent: 97.807 %
English DONE!
Duration of the SpaCy NER in seconds:  1.35 sec
Duration of the Stanza NER in seconds: 12.809 sec
Concordance of SpaCy and Stanza in percent: 81.49 %
Spanish DONE!
