  # Import Libraries

In [1]:
from typing import Dict, List, Optional, Tuple
from pathlib import Path

from itertools import zip_longest, accumulate, filterfalse
from collections import Counter

import re
import unicodedata

import csv
import pickle

# define custom types
Conllu = List[List[str]]
LabelConlluDict = Dict[str, Conllu]
DocuConlluDict = Dict[str, LabelConlluDict]


  # Define some pathes

In [2]:
file_ar1 = "../outputs/conllu/wikibase-item_quantity_time/stanza_ar.conllu"
file_ar2 = "../outputs/conllu/wikibase-item_quantity_time/udp_ar.conllu"

file_en1 = "../outputs/conllu/wikibase-item_quantity_time/stanza_en.conllu"
file_en2 = "../outputs/conllu/wikibase-item_quantity_time/udp_en.conllu"

results_dir = "../outputs/conllu_compare/wikibase-item_quantity_time"


  # Define some functions

In [3]:
def read_text(file):
    with open(file, "r") as f:
        conllu = f.read()
    return conllu



In [4]:
def read_conllu(conllu_file):
    """
    Read a CoNLL-U formatted file and return a dictionary of sentences
    keyed by label IDs and label texts.

    Args:
        conllu_file (str): The file path of the CoNLL-U formatted file.

    Returns:
        dict: A dictionary of sentences keyed by sentence ID and text reference.
    """

    # Define regular expressions to extract sentence ID and text.
    sent_id_regx = re.compile(r"(?<=# sent_id = )\w+")
    text_regex = re.compile(r"(?<=# text = ).+")
    conllu_filter = re.compile(r"#.+\n")

    # Define regular expressions to fix spaces issues in text.
    remove_extra_spaces = re.compile(r'\s\s+')
    add_space_b4r_openbracket = re.compile(r'(?<=\w)\(')
    add_space_b4r_doublequote = re.compile(r'(?<=\w)"')

    conllu = read_text(conllu_file)  # read CoNLL-U file

    # Extract sentence IDs and text from the CoNLL-U formatted file.
    sent_ids = sent_id_regx.findall(conllu)
    texts = text_regex.findall(conllu)
    # Remove comments from the CoNLL-U formatted string.
    sents_conllu = conllu_filter.sub(repl="", string=conllu)

    # Split the CoNLL-U formatted string into sentences and tokens.
    sents_conllu = map(lambda x: x.split("\n"), sents_conllu.split("\n\n"))
    sents_conllu = [[tc.split("\t")[:8] for tc in sc] for sc in sents_conllu]

    # Fix issues with extra spaces in text.
    texts = [remove_extra_spaces.sub(' ', txt) for txt in texts]
    texts = [add_space_b4r_openbracket.sub(' (', txt) for txt in texts]
    texts = [add_space_b4r_doublequote.sub(' "', txt) for txt in texts]
    texts = [unicodedata.normalize("NFKD", txt) for txt in texts]  # /xa0

    # Create a dictionary of sentences keyed by sentence ID and text reference.
    return {
        idx: {
            txt: conllu
        }
        for idx, txt, conllu in zip(sent_ids, texts, sents_conllu)
    }



In [5]:
def extract_conllu_diffs(
    conllu_doc_base: DocuConlluDict, conllu_docu_2compr: DocuConlluDict
) -> List[Tuple[List[str], List[str], List[Conllu]]]:
    """
    Compare two CoNLL-U dictionaries and extract the differences.
    """
    diff_conllu: List[Conllu] = []  # List Conllu with differences
    diff_idx: List[str] = []  # List of labels idx that has different conllus
    diff_text: List[str] = []  # List of labels text that has different conllus

    # Loop over the base CoNLL-U
    for label_idx, label_conllu_base in conllu_doc_base.items():
        for label_text_base, conllu_base in label_conllu_base.items():
            # Get the corresponding CoNLL-U dict {label_text: CoNLLU-U} from
            # the second dictionary to compare with.
            label_conllu_2compr = conllu_docu_2compr.get(label_idx, None)
            if label_conllu_2compr is None:
                # the second dictionary does not have this label
                # TODO: either raise an error or record it as a difference
                pass
            else:
                # get the corresponding CoNLL-U list of list of CoNLL-U fields
                conllu_2compr = label_conllu_2compr.get(label_text_base)
                # Combine the corresponding CoNLL-U fields. If Both CoNLL-Us do
                # not have the same lenghth fill the empty fields with '<none>'
                # [
                #   [(base_token1_idx, 2compr_token1_idx), ...
                #     (base_token1_UPOS, 2compr_token1_UPOS), ...],
                #   [(base_token2_idx, 2compr_token2_idx), ...
                #     (base_token2_UPOS, 2compr_token2_UPOS), ...],
                #   [ (...), (....), ...]
                # ]
                conllu_fields_combined = [
                    list(zip_longest(c1, c2, fillvalue="<none>"))
                    for c1, c2 in zip_longest(
                        conllu_base, conllu_2compr, fillvalue=["<none>"])
                ]
                # Check if there are any differences
                ndiff = all(f1 == f2 for field_tuple in conllu_fields_combined
                            for f1, f2 in field_tuple)
                if not ndiff:
                    combined_fields = []  # type: Conllu # list of list of str
                    # Record the differences between the corresponding
                    # token fields
                    # For each field in conllu if the base conllu field is the
                    # same as the conllu-to-compare field record the base
                    # field. Otherwise, record the differences in the following
                    # format: f'{base-field}[{field_to_compare}]'
                    for field_tuple in conllu_fields_combined:
                        combined_fields.append([
                            f"{field_base}[{field_2compr}]"
                            if field_base != field_2compr else field_base
                            for field_base, field_2compr in field_tuple
                        ])
                    diff_conllu.append(combined_fields)
                    diff_idx.append(label_idx)
                    diff_text.append(label_text_base)

    # Return a list of
    # the labels indeces, labels text, and differing fields
    return diff_idx, diff_text, diff_conllu



In [6]:
def write_conllu_diffs_to_csv(file_path: str, diff_idx: List[str],
                              diff_text: List[str],
                              diff_conllu: List[Conllu]) -> None:
    """
    Write the CoNLL-U formatted sentences from diff_idx, diff_text, and
    diff_conllu to a CSV file.

    Args:
        file_path (str): The file path to write the CSV file to.
        diff_idx (List[str]): A list of labels IDs.
        diff_text (List[str]): A list of labels texts
        diff_conllu (List[List[List[str]]]): A list of CoNLL-U formatted fields.

    """
    with open(file_path, 'w', newline='') as file:
        writer = csv.writer(file)
        match_diff = re.compile(r'\[.+\]')
        for idx, text, conllu in zip(diff_idx, diff_text, diff_conllu):
            # exclude feats field from printing. If the differences are in
            # feats only do not print the conllu.
            print_flag = False
            for fields in conllu:
                # some fields [ID, Form, Lemma, POS, XPOS, head, udlabel]
                fields_list = fields[:5] + fields[6:8]
                if any(bool(match_diff.search(f)) for f in fields_list):
                    print_flag = True
                    break

            # found differences in any of [ID, Form, Lemma, POS, XPOS, head,
            # udlabel]
            if print_flag:
                writer.writerow([f'# {idx}'])
                writer.writerow([f'# {text}'])
                for fields in conllu:
                    fields_list = fields[:5] + fields[6:8]
                    writer.writerow(fields_list)
                writer.writerow([])


def read_diff_report_comments(path: str):
    msg = 'Number of issues do not match with the number of tools in comments'
    with open(path, 'r') as f:
        reader = csv.reader(f, delimiter=';')
        _ = next(reader)
        all_comments = []
        for row in reader:
            # Check if all cells in the first three columns are not empty
            if all(row[:3]):
                # Process the row here
                tools = row[0].split('|')
                issues = row[1].split('|')
                assert len(tools) == len(issues), f'{row[3]}: {msg}'
                all_comments.extend(list(zip(tools, issues)))

    return all_comments


In [7]:


def get_diff_per_field(
    diff_idx: List[str], diff_conllu: List[Conllu]
) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:

    match_diff = re.compile(r'\[.+\]')
    diff_idxs = {'split': [], 'upos': [], 'feat': [], 'head': [], 'udp': []}
    diff_values = {'upos': [], 'feat': [], 'head': [], 'udp': []}

    for idx, conllu in zip(diff_idx, diff_conllu):
        # check that there tokens are identcal. No difference in split. If
        # exists, get the corresponding label id and continue to the next
        # record. If not search for other difference.
        _, t_forms, *_ = zip(*conllu)
        split_diff_bool = bool(match_diff.search(" ".join(t_forms)))
        if split_diff_bool:
            diff_idxs['split'].append(idx)
            continue
        for fields in conllu:
            _, _, _, t_upos, _, t_feat, t_head, t_udp = fields
            # check if there is differences in UPOS. If exists, there is no
            # need to check differences in Feats filed
            upos_diff_bool = bool(match_diff.search(t_upos))
            if upos_diff_bool:
                diff_idxs['upos'].append(idx)
                diff_values['upos'].append(t_upos)
            else:
                feat_diff_bool = bool(match_diff.search(t_feat))
                if feat_diff_bool:
                    diff_idxs['feat'].append(idx)
                    diff_values['feat'].append(t_feat)

            # check if there is differences in UD
            t_head_diff_bool = bool(match_diff.search(t_head))
            if t_head_diff_bool:
                diff_idxs['head'].append(idx)
                diff_values['head'].append(t_head)

            # check if there is differences in head field
            t_udp_diff_bool = bool(match_diff.search(t_udp))
            if t_udp_diff_bool:
                diff_idxs['udp'].append(idx)
                diff_values['udp'].append(t_udp)

    return diff_idxs, diff_values


def examine_field_diff(
        field_diff: List[str],
        threshold: Optional[float] = None
) -> List[Tuple[str, int, float, float]]:

    diff_counts = Counter(field_diff).most_common()

    diff_names, diff_names_count = zip(*diff_counts)
    diff_names: Tuple[str]
    diff_names_count: Tuple[int]

    total_diff_num = sum(diff_names_count)
    diff_name_percent = [dc / total_diff_num for dc in diff_names_count]
    diff_name_percent_cumsum = accumulate(diff_name_percent)
    diff_data = list(
        zip(diff_names, diff_names_count, diff_name_percent,
            diff_name_percent_cumsum))

    if threshold is not None:
        diff_data = list(filterfalse(lambda x: x[-1] > threshold, diff_data))

    return diff_data



In [8]:
def print_diff_report(diff_data: List[Tuple[str, int, float, float]]):
    values, counts, percnts, percent_cumsums = zip(*diff_data)
    v_w = max([len(v) for v in values]) + 2

    header = ['Diff Value', 'Count', '%', "% CumSum"]
    hw = [max(v_w, 11), 9, 9, 9]
    for h, w in zip(header, hw):
        print(f'{h:{w}}', end='')
    print()

    for v, c, p, pcm in diff_data:
        print(f'{v:{hw[0]}}{c:<{hw[1]}}{p:<{hw[2]}.2%}{pcm:<{hw[3]}.2%}')
    print()



  # Compare between Stanza and UDpipe CoNLL-U Outputs

In [9]:
# Extract difference between two CoNLL-U files
diff_idxs_report = []
diff_values_report = []
for file_1, file_2 in [(file_ar1, file_ar2), (file_en1, file_en2)]:
    # Read files
    conllu_dict1 = read_conllu(file_1)  # stanza
    conllu_dict2 = read_conllu(file_2)  # udpipe

    # get differences
    label_ids, label_txts, conllus = extract_conllu_diffs(
        conllu_dict1, conllu_dict2)

    # Count differences per field
    diff_idxs, diff_values = get_diff_per_field(label_ids, conllus)
    diff_idxs_report.append(diff_idxs)
    diff_values_report.append(diff_values)

    # write differences int csv
    f1_name = Path(file_1).stem
    f2_name = Path(file_2).stem
    file_name = f'{results_dir}{f1_name}_vs_{f2_name}'
    with open(f'{file_name}.pkl', "wb") as f:
        pickle.dump([label_ids, label_txts, conllus], f)

    # write csv
    lang = f1_name.split('_')[-1]
    file_name = f'{results_dir}split_diff_conllu_{lang}.csv'
    write_conllu_diffs_to_csv(file_path=file_name,
                              diff_idx=label_ids,
                              diff_text=label_txts,
                              diff_conllu=conllus)

    # print difference percentege
    diff_prnt = len(label_ids) / len(conllu_dict1)
    print(f'Difference between {f1_name} and {f2_name}:')
    print(f'  - Total Percentage:       {diff_prnt:.2%}')
    print(f'  - Total length:           {len(label_ids)}')
    print()


Difference between stanza_ar and udp_ar:
  - Total Percentage:       52.85%
  - Total length:           1037

Difference between stanza_en and udp_en:
  - Total Percentage:       31.06%
  - Total length:           709



In [10]:
for i, lang in enumerate(['Arabic', 'English']):
    split_diff_num = len(diff_idxs_report[i]['split'])
    split_diff_percent = split_diff_num / len(label_ids)
    print(f'Differences in token split in {lang}: {split_diff_num}', end=" ")
    print(f' which represents {split_diff_percent: .2%} of labels.')


Differences in token split in Arabic: 252  which represents  35.54% of labels.
Differences in token split in English: 26  which represents  3.67% of labels.


In [11]:
print('Differences count in UPOS')
for i, lang in enumerate(['Arabic', 'English']):
    print(f'in {lang}:')
    diff_data = examine_field_diff(diff_values_report[i]['upos'], 0.99)
    print_diff_report(diff_data)


Differences count in UPOS
in Arabic:
Diff Value  Count    %        % CumSum 
NOUN[X]     86       31.16%   31.16%   
ADJ[NOUN]   57       20.65%   51.81%   
X[NOUN]     48       17.39%   69.20%   
ADJ[X]      32       11.59%   80.80%   
NOUN[ADJ]   15       5.43%    86.23%   
X[ADJ]      11       3.99%    90.22%   
VERB[NOUN]  7        2.54%    92.75%   
VERB[X]     5        1.81%    94.57%   
NOUN[VERB]  3        1.09%    95.65%   
ADJ[PRON]   2        0.72%    96.38%   
X[ADP]      2        0.72%    97.10%   
SCONJ[X]    1        0.36%    97.46%   
VERB[ADJ]   1        0.36%    97.83%   
X[VERB]     1        0.36%    98.19%   
NOUN[ADP]   1        0.36%    98.55%   
ADJ[CCONJ]  1        0.36%    98.91%   

in English:
Diff Value    Count    %        % CumSum 
VERB[NOUN]    79       27.53%   27.53%   
NOUN[PROPN]   40       13.94%   41.46%   
NOUN[ADJ]     38       13.24%   54.70%   
PROPN[NOUN]   29       10.10%   64.81%   
ADJ[NOUN]     18       6.27%    71.08%   
VERB[ADJ]     13  

In [12]:
print('Differences count in feat')
for i, lang in enumerate(['Arabic', 'English']):
    print(f'in {lang}:')
    diff_data = examine_field_diff(diff_values_report[i]['feat'], 0.99)
    print_diff_report(diff_data)


Differences count in feat
in Arabic:
Diff Value                                                                                                                                                 Count    %        % CumSum 
Case=Nom|Definite=Ind|Number=Sing[Case=Nom|Definite=Cons|Number=Sing]                                                                                      23       10.70%   10.70%   
_[Foreign=Yes]                                                                                                                                             12       5.58%    16.28%   
Case=Nom|Definite=Cons|Number=Sing[_]                                                                                                                      11       5.12%    21.40%   
Case=Gen|Definite=Def|Number=Sing[Case=Gen|Definite=Def|Number=Plur]                                                                                       9        4.19%    25.58%   
Case=Nom|Definite=Def|Gender=Masc|Number=Sing[Ca

In [13]:
print('Differences count in head')
for i, lang in enumerate(['Arabic', 'English']):
    print(f'in {lang}:')
    diff_data = examine_field_diff(diff_values_report[i]['head'], 0.99)
    print_diff_report(diff_data)


Differences count in head
in Arabic:
Diff Value Count    %        % CumSum 
1[2]       46       22.12%   22.12%   
2[1]       45       21.63%   43.75%   
0[2]       18       8.65%    52.40%   
1[0]       18       8.65%    61.06%   
0[1]       14       6.73%    67.79%   
3[1]       12       5.77%    73.56%   
3[0]       10       4.81%    78.37%   
1[3]       8        3.85%    82.21%   
2[3]       7        3.37%    85.58%   
1[4]       7        3.37%    88.94%   
2[0]       6        2.88%    91.83%   
3[4]       3        1.44%    93.27%   
3[2]       3        1.44%    94.71%   
4[2]       2        0.96%    95.67%   
4[3]       1        0.48%    96.15%   
1[5]       1        0.48%    96.63%   
0[5]       1        0.48%    97.12%   
5[2]       1        0.48%    97.60%   
6[5]       1        0.48%    98.08%   
0[3]       1        0.48%    98.56%   

in English:
Diff Value Count    %        % CumSum 
0[2]       76       20.99%   20.99%   
1[0]       74       20.44%   41.44%   
2[3]       26 

In [14]:
print('Differences count in UD')
for i, lang in enumerate(['Arabic', 'English']):
    print(f'in {lang}:')
    diff_data = examine_field_diff(diff_values_report[i]['udp'], 0.99)
    print_diff_report(diff_data)


Differences count in UD
in Arabic:
Diff Value          Count    %        % CumSum 
punct[case]         57       18.94%   18.94%   
amod[nmod]          48       15.95%   34.88%   
nmod[amod]          29       9.63%    44.52%   
root[nmod]          14       4.65%    49.17%   
nmod[root]          14       4.65%    53.82%   
punct[mark]         13       4.32%    58.14%   
root[case]          12       3.99%    62.13%   
nsubj[root]         11       3.65%    65.78%   
obl[nmod]           8        2.66%    68.44%   
nsubj[obj]          7        2.33%    70.76%   
obl:arg[root]       7        2.33%    73.09%   
nsubj[nmod]         6        1.99%    75.08%   
root[amod]          5        1.66%    76.74%   
amod[case]          5        1.66%    78.41%   
nmod[case]          5        1.66%    80.07%   
nmod[obl]           4        1.33%    81.40%   
obl:arg[nmod]       4        1.33%    82.72%   
nmod[flat:foreign]  4        1.33%    84.05%   
nmod[dep]           3        1.00%    85.05%   
dep[c

 # Export Differences data

 ## Ckeck differences in split

In [None]:
langs = ['ar', 'en']
lang = 'en'
lang_dict_path = list(Path(results_dir).glob(f'*{lang}.pkl'))[0]

with open(lang_dict_path, 'rb') as f:
    label_ids, label_txts, conllus = pickle.load(f)

split_label_ids = diff_idxs_report[langs.index(lang)]['split']
idx = [label_ids.index(li) for li in split_label_ids]
split_label_txts = [label_txts[i] for i in idx]
split_conllus = [conllus[i] for i in idx]

# write
file_name = f'{results_dir}split_diff_conllu_{lang}.csv'
write_conllu_diffs_to_csv(file_path=file_name,
                          diff_idx=split_label_ids,
                          diff_text=split_label_txts,
                          diff_conllu=split_conllus)


In [None]:
from pprint import pprint

lang = 'ar'
file_name = f'{results_dir}split_diff_conllu_{lang}_commented.csv'
comments = Counter(read_diff_report_comments(file_name)).most_common()
comments_percent = len(comments) / split_diff_num
print(f'Percentage of commints on split difference: {comments_percent:.2%}')
print('comment on split differences')
pprint(comments)


 ## Notes of split difference

 ### Arabic
 **Stanza detects connected conjunction while UDP detceds connected pronouns.**

 Stanza, unlike UDP, splits the connected Arabic conjunction
 "و", but this comes with high number of positive falses, where the "و" was
 splitted while it is one of the form's letters.

 The same concept with the connected pronouns. UDP has detect the connected
 pronouns but this come with the price of positive error.

 **Stanza detects connected prepositions** unlike UDP, with the price of high
 positive error.

 **UDP split definite words**


 ### English
 The differences are due ti the word/token diffentions when it comes to using
 abbreviations, where dashes, barckets and numbers are used.

 Also, Stanza tend
 to correctly split the 'Apostrophe S'