# spaCy Performance on Named Entity Recognition with Code-Mixed Data

In this notebook we examine the multi-functional model spaCy's performances on named entity recognition (NER) tasks, when the data are multilingual. More specifically, we will be focusing on code-mixing (code-switched) data, where the vocabulary of two different languages are used interchangeably in one sentence.


In [8]:
import numpy as np
import pandas as pd
import itertools
import spacy

from tqdm import tqdm

import warnings

warnings.filterwarnings("ignore")

In [9]:
# read in file
file_path = "../data/train.conll"

# empty list to store DataFrames for each sentence
corpus = []

# read the CoNLL-U file line by line
with open(file_path, "r", encoding="utf-8") as file:
    lines = file.readlines()

    current_sentence = []
    columns = ["word", "lang", "entity_type"]
    for line in lines:
        if line.startswith("# sent_enum"):
            # if a new sentence begins, process the current one
            if current_sentence:
                df = pd.DataFrame(current_sentence, columns=columns)
                corpus.append(df)
                current_sentence = []
        else:
            # append each line to the current sentence
            current_sentence.append(line.strip().split("\t"))

# last sentence in the file
if current_sentence:
    df = pd.DataFrame(current_sentence, columns=columns)
    corpus.append(df)
# each sentence cann now be called by corpus[idx]

In [10]:
# load spaCy model for both L1 and L2
model_eng = spacy.load("en_core_web_sm")
model_spa = spacy.load("es_core_news_sm")

In [11]:
def tag_eng_sent(model_eng, corpus, sent_idx):
    """tag a sentence with English as L1
    return a dictionary with language tags, gold NE tags and spacy NER results"""
    sent_df = corpus[sent_idx][:-1]  # remove last row resulted by CoNLL-U seperator

    # extract all pre-processed tokens to a list
    gold_tokens = list(sent_df["word"])
    # regularize gold NER tags, save to list
    gold_tags = ["Yes" if tag != "O" else "O" for tag in list(sent_df["entity_type"])]
    # also save language tags
    gold_langs = list(sent_df["lang"])

    sentence_text = sent_df["word"].str.cat(sep=" ")
    doc = model_eng(sentence_text)
    nes = [i.text for i in doc.ents]
    # flat the nes tokens
    nes_tokens = [
        item for sublist in [item.split() for item in nes] for item in sublist
    ]

    if len(nes_tokens) == 0:  # check if spaCy found any NE
        spacy_tags = ["O"] * len(sent_df)
    else:
        spacy_tags = []  # list to store spaCy NER results
        for token in gold_tokens:
            if len(nes_tokens) != 0:
                if token in nes_tokens[0] or nes_tokens[0] in token:
                    spacy_tags.append("Yes")
                    nes_tokens = nes_tokens[1:]
                else:
                    spacy_tags.append("O")
            else:
                spacy_tags.append("O")

    results = {
        "mlang": "eng",
        "lang": gold_langs,
        "true_ne": gold_tags,
        "spacy_ne": spacy_tags,
    }

    return results

In [12]:
def tag_spa_sent(model_spa, corpus, sent_idx):
    """tag a sentence with English as L1
    return a dictionary with language tags, gold NE tags and spacy NER results"""
    sent_df = corpus[sent_idx][:-1]  # remove last row resulted by CoNLL-U seperator

    # extract all pre-processed tokens to a list
    gold_tokens = list(sent_df["word"])
    # regularize gold NER tags, save to list
    gold_tags = ["Yes" if tag != "O" else "O" for tag in list(sent_df["entity_type"])]
    # also save language tags
    gold_langs = list(sent_df["lang"])

    sentence_text = sent_df["word"].str.cat(sep=" ")
    doc = model_spa(sentence_text)
    nes = [i.text for i in doc.ents]
    # flat the nes tokens
    nes_tokens = [
        item for sublist in [item.split() for item in nes] for item in sublist
    ]

    if len(nes_tokens) == 0:  # check if spaCy found any NE
        spacy_tags = ["O"] * len(sent_df)
    else:
        spacy_tags = []  # list to store spaCy NER results
        for token in gold_tokens:
            if len(nes_tokens) != 0:
                if token in nes_tokens[0] or nes_tokens[0] in token:
                    spacy_tags.append("Yes")
                    nes_tokens = nes_tokens[1:]
                else:
                    spacy_tags.append("O")
            else:
                spacy_tags.append("O")

    results = {
        "mlang": "spa",
        "lang": gold_langs,
        "true_ne": gold_tags,
        "spacy_ne": spacy_tags,
    }

    return results

In [13]:
# def most_frequent_element(lst):
#     most_frequent = max(set(lst), key=lst.count)
#
#     return most_frequent
#
#
# ner_results = []
# for i in tqdm(range(len(corpus)), desc="Processing"):
#     lang_tags = list(corpus[i]["lang"])
#     # make sure the sentence is code-mixed
#     if "lang1" in lang_tags and "lang2" in lang_tags:
#         # find the dominant language (lang1=eng, lang2=spa)
#         mlang = most_frequent_element(lang_tags)
#         if mlang == "lang1":
#             ner_results.append(
#                 tag_eng_sent(model_eng=model_eng, corpus=corpus, sent_idx=i)
#             )
#         else:
#             ner_results.append(
#                 tag_spa_sent(model_spa=model_spa, corpus=corpus, sent_idx=i)
#             )

In [16]:
def most_frequent_element(lst):
    most_frequent = max(set(lst), key=lst.count)

    return most_frequent


ner_results_cs = []
ner_results_noncs = []
for i in tqdm(range(len(corpus)), desc="Processing"):
    lang_tags = list(corpus[i]["lang"])
    if "lang1" in lang_tags and "lang2" in lang_tags:
        # find the dominant language (lang1=eng, lang2=spa)
        mlang = most_frequent_element(lang_tags)
        if mlang == "lang1":
            ner_results_cs.append(
                tag_eng_sent(model_eng=model_eng, corpus=corpus, sent_idx=i)
            )
        else:
            ner_results_cs.append(
                tag_spa_sent(model_spa=model_spa, corpus=corpus, sent_idx=i)
            )
    else:
        mlang = most_frequent_element(lang_tags)
        if mlang == "lang1":
            ner_results_noncs.append(
                tag_eng_sent(model_eng=model_eng, corpus=corpus, sent_idx=i)
            )
        else:
            ner_results_noncs.append(
                tag_spa_sent(model_spa=model_spa, corpus=corpus, sent_idx=i)
            )

Processing: 100%|██████████| 33611/33611 [03:53<00:00, 143.97it/s]


In [61]:
from sklearn.metrics import accuracy_score, confusion_matrix

gold = np.array(list(itertools.chain(*[sent["true_ne"] for sent in ner_results_noncs])))
pred = np.array(
    list(itertools.chain(*[sent["spacy_ne"] for sent in ner_results_noncs]))
)

# create the confusion matrix
cfm = confusion_matrix(gold, pred)
# reshape the confusion matrix to a 2x2 matrix
cfm = cfm.reshape((2, 2))

# create a pandas DataFrame from the confusion matrix
df_cfm = pd.DataFrame(
    cfm,
    index=["Actual non-NE", "Actual NE"],
    columns=["Predicted non-NE", "Predicted NE"],
)
print(f"confusion matrix for NER results of monolingual sentences:\n{df_cfm}")

# get predictions accuracy scores
acc = accuracy_score(gold, pred)
print(f"accuracy score for NER results of monolingual sentences: {acc}")

confusion matrix for NER results of monolingual sentences:
               Predicted non-NE  Predicted NE
Actual non-NE            221529         41730
Actual NE                  2198          4470
accuracy score for NER results of monolingual sentences: 0.8372597035494782


In [58]:
gold_l1 = []  # gold NE tags of tokens of matrix language
pred_l1 = []  # predicted NE tags of tokens of matrix language
gold_l2 = []  # gold NE tags of tokens of embedded language
pred_l2 = []  # predicted NE tags of tokens of embedded language

for i in range(len(ner_results_cs)):
    if ner_results_cs[i]["mlang"] == "eng":
        # retrieve indices of tokens of matrix language == English
        idx_l1 = [
            idx for idx, item in enumerate(ner_results_cs[i]["lang"]) if item == "lang1"
        ]
        # get gold and spaCy NE tags and add to overall list
        g_l1 = [ner_results_cs[i]["true_ne"][idx] for idx in idx_l1]
        p_l1 = [ner_results_cs[i]["spacy_ne"][idx] for idx in idx_l1]
        gold_l1.append(g_l1)
        pred_l1.append(p_l1)

        # do the same for tokens of embedded language == Spanish
        idx_l2 = [
            idx for idx, item in enumerate(ner_results_cs[i]["lang"]) if item == "lang2"
        ]
        g_l2 = [ner_results_cs[i]["true_ne"][idx] for idx in idx_l2]
        p_l2 = [ner_results_cs[i]["spacy_ne"][idx] for idx in idx_l2]
        gold_l2.append(g_l2)
        pred_l2.append(p_l2)

    elif ner_results_cs[i]["mlang"] == "spa":
        # retrieve indices of tokens of matrix language == Spanish
        idx_l1 = [
            idx for idx, item in enumerate(ner_results_cs[i]["lang"]) if item == "lang2"
        ]
        # get gold and spaCy NE tags and add to overall list
        g_l1 = [ner_results_cs[i]["true_ne"][idx] for idx in idx_l1]
        p_l1 = [ner_results_cs[i]["spacy_ne"][idx] for idx in idx_l1]
        gold_l1.append(g_l1)
        pred_l1.append(p_l1)

        # do the same for tokens of embedded language == English
        idx_l2 = [
            idx for idx, item in enumerate(ner_results_cs[i]["lang"]) if item == "lang1"
        ]
        g_l2 = [ner_results_cs[i]["true_ne"][idx] for idx in idx_l2]
        p_l2 = [ner_results_cs[i]["spacy_ne"][idx] for idx in idx_l2]
        gold_l2.append(g_l2)
        pred_l2.append(p_l2)

In [59]:
gold_l1 = np.array(list(itertools.chain(*gold_l1)))
pred_l1 = np.array(list(itertools.chain(*pred_l1)))

# create the confusion matrix
cfm = confusion_matrix(gold_l1, pred_l1)
# reshape the confusion matrix to a 2x2 matrix
cfm = cfm.reshape((2, 2))

# create a pandas DataFrame from the confusion matrix
df_cfm = pd.DataFrame(
    cfm,
    index=["Actual non-NE", "Actual NE"],
    columns=["Predicted non-NE", "Predicted NE"],
)
print(f"confusion matrix for NER results of L1 tokens in code-mixed sentences:\n{df_cfm}")

# get predictions accuracy scores
acc = accuracy_score(gold_l1, pred_l1)
print(f"accuracy score for NER results of L1 tokens in code-mixed sentences: {acc}")

confusion matrix for NER results of L1 tokens in code-mixed sentences:
               Predicted non-NE  Predicted NE
Actual non-NE             68521          9604
Actual NE                   435           434
accuracy score for NER results of L1 tokens in code-mixed sentences: 0.8729143985619161


In [60]:
gold_l2 = np.array(list(itertools.chain(*gold_l2)))
pred_l2 = np.array(list(itertools.chain(*pred_l2)))

# create the confusion matrix
cfm = confusion_matrix(gold_l2, pred_l2)
# reshape the confusion matrix to a 2x2 matrix
cfm = cfm.reshape((2, 2))

# create a pandas DataFrame from the confusion matrix
df_cfm = pd.DataFrame(
    cfm,
    index=["Actual non-NE", "Actual NE"],
    columns=["Predicted non-NE", "Predicted NE"],
)
print(f"confusion matrix for NER results of L2 tokens in code-mixed sentences:\n{df_cfm}")

# get predictions accuracy scores
acc = accuracy_score(gold_l2, pred_l2)
print(f"accuracy score for NER results of L2 tokens in code-mixed sentences: {acc}")

confusion matrix for NER results of L2 tokens in code-mixed sentences:
               Predicted non-NE  Predicted NE
Actual non-NE             10710          6668
Actual NE                   337           760
accuracy score for NER results of L2 tokens in code-mixed sentences: 0.6208389715832205


### Error Analysis 1

How many inserted normal non-NE L2 words are falsely tagged as named entities?


In [18]:
# target_word_idxs: list of indices of inserted L2 tokens that are not NEs in each sentence
target_word_idxs = []
for result in ner_results_cs:
    if result["mlang"] == "eng":
        # get CS Spanish token index
        cs_idx = [i for i in range(len(result["lang"])) if result["lang"][i] == "lang2"]
        # remove CS Spanish tokens that are actually NEs
        cs_ne_idx = [idx for idx in cs_idx if result["true_ne"][idx] == "O"]
        target_word_idxs.append(cs_ne_idx)
    elif result["mlang"] == "spa":
        # get CS English token index
        cs_idx = [i for i in range(len(result["lang"])) if result["lang"][i] == "lang1"]
        # remove CS English tokens that are actually NEs
        cs_ne_idx = [idx for idx in cs_idx if result["true_ne"][idx] == "O"]
        target_word_idxs.append(cs_ne_idx)

cs_fauxne = []  # [(CS tokens count, CS tokens tagged as NE count) of sent_1, ...]
# get from spaCy falsely tagged inserted L2 tokens
for i in range(len(target_word_idxs)):
    if len(target_word_idxs[i]) > 0:
        cs_count = len(target_word_idxs[i])
        cs_as_ne_count = len(
            [j for j in target_word_idxs[i] if ner_results_cs[i]["spacy_ne"][j] != "O"]
        )
        cs_fauxne.append((cs_count, cs_as_ne_count))

In [19]:
all_cs_count = sum(t[0] for t in cs_fauxne)
all_cs_as_ne_count = sum(t[1] for t in cs_fauxne)

print(all_cs_as_ne_count / all_cs_count)

0.38370353320290024


### Error Analysis 2

How many falsely tagged tokens are actually normal inserted non-NE L2 words?

Namely: How many error are caused by code-switching?


In [20]:
# target_word_idxs: list of indices of falsely tagged tokens by spaCy
target_word_idxs = []
for result in ner_results:
    spacy_wrong_ne_idx = [
        i
        for i, (elem1, elem2) in enumerate(zip(result["spacy_ne"], result["true_ne"]))
        if elem1 != elem2
    ]
    target_word_idxs.append(spacy_wrong_ne_idx)

fauxne_at_cs = (
    []
)  # [(falsely tagged NE count, error on CS position count) of sent_1, ...]
for i in range(len(target_word_idxs)):
    if len(target_word_idxs[i]) > 0:
        fauxne_count = len(target_word_idxs[i])
        sentence = ner_results_cs[i]

        if sentence["mlang"] == "eng":
            fauxne_at_cs_count = len(
                [j for j in target_word_idxs[i] if sentence["lang"][j] == "lang2"]
            )
        elif sentence["mlang"] == "spa":
            fauxne_at_cs_count = len(
                [j for j in target_word_idxs[i] if sentence["lang"][j] == "lang1"]
            )
        fauxne_at_cs.append((fauxne_count, fauxne_at_cs_count))

In [21]:
all_fauxne_count = sum(t[0] for t in fauxne_at_cs)
all_fauxne_at_cs_count = sum(t[1] for t in fauxne_at_cs)

print(all_fauxne_at_cs_count / all_fauxne_count)

0.29552324744494024


### Error Analysis 3

How many inserted L2 tokens that are actually NEs are correctly identified as NE by L1 model?


In [22]:
# target_word_idxs: list of indices of inserted L2 words that are NEs by gold standard
target_word_idxs = []
for result in ner_results_cs:
    if result["mlang"] == "eng":
        # get CS Spanish token index
        cs_idx = [i for i in range(len(result["lang"])) if result["lang"][i] == "lang2"]
        # keep CS Spanish tokens that are actually NEs
        cs_ne_idx = [idx for idx in cs_idx if result["true_ne"][idx] != "O"]
        target_word_idxs.append(cs_ne_idx)
    elif result["mlang"] == "spa":
        # get CS English token index
        cs_idx = [i for i in range(len(result["lang"])) if result["lang"][i] == "lang1"]
        # keep CS English tokens that are actually NEs
        cs_ne_idx = [idx for idx in cs_idx if result["true_ne"][idx] != "O"]
        target_word_idxs.append(cs_ne_idx)

csne_as_ne = []  # [(L2 tokens = NE count, NE-L2 tokens as NE count) of sent_1, ...]
for i in range(len(ner_results_cs)):
    if len(target_word_idxs[i]) > 0:
        l2ne_count = len(target_word_idxs[i])
        l2ne_as_ne_count = len(
            [
                j
                for j in target_word_idxs[i]
                if ner_results_cs[i]["spacy_ne"][j] == "Yes"
            ]
        )

        csne_as_ne.append((l2ne_count, l2ne_as_ne_count))

In [23]:
all_l2ne_count = sum(t[0] for t in csne_as_ne)
all_l2ne_as_ne_count = sum(t[1] for t in csne_as_ne)

print(all_l2ne_as_ne_count / all_l2ne_count)

0.6927985414767548


In [24]:
# how many inserted L2 tokens are also NEs
print(all_l2ne_as_ne_count)

760
