In [None]:
from typing import Dict, List, Optional, Tuple

from unicodedata import category, normalize
import re
import pickle


 # Functions

 ## Utility Functions

In [None]:


def is_nonspace_mark(x: str) -> bool:
    """Tell whether the single character ``x`` is a non-spacing mark.

    A non-spacing mark is a code point whose Unicode general category is
    ``Mn``; this is the category used here to recognise diacritics.
    """
    general_category = category(x)
    return general_category == "Mn"


def remove_diacritics_ar(text: str) -> str:
    """Return ``text`` with every non-spacing mark (diacritic) dropped."""
    kept_chars = (char for char in text if not is_nonspace_mark(char))
    return ''.join(kept_chars)



In [None]:


def read_text(file):
    """Read a whole text file and return its content as a single string.

    The file is decoded explicitly as UTF-8 (the encoding CoNLL-U files are
    written in) instead of the platform default, so Arabic text is read the
    same way on every operating system.

    Args:
        file: Path of the file to read.

    Returns:
        str: The complete content of the file.
    """
    with open(file, "r", encoding="utf-8") as f:
        return f.read()



 ## Helper Classes

 |     | derived from another word | other words derived from it | Gender |
 |-----|--------------------------|-----------------------|--------|
 |جَامِد static |   no              | nothing         | ?M or F or both |
 |مَصْدَر gerund |   no              | yes             | ?M or F or both |
 |مُشْتَقّ derived|   yes             | yes             | ?M/F (_nized) |

In [None]:


class Ism:
    """Represent an Arabic noun (ism).

    An instance stores the form, lemma, Universal Part of Speech (upos),
    root, gender and known plural forms of a noun. Equality and hashing are
    based on ``form`` and ``lemma`` only, so instances can be used as
    dictionary keys.

    Parameters

    - form (str): the form of the noun

    - lemma (str): the lemma of the noun

    - upos (str): the Universal Part of Speech of the noun

    - root (Optional[str]): the root of the noun (default is empty string)

    - gender (Optional[str]): the gender of the noun (default is empty string)

    - plural (Optional[List[str]]): a list of possible plural forms of the noun
      (default is a new empty list). The list is copied, so later changes to
      the instance never affect the caller's list or other instances.
      """

    def __init__(self,
                 form: str,
                 lemma: str,
                 upos: str,
                 root: Optional[str] = '',
                 gender: Optional[str] = '',
                 plural: Optional[List[str]] = None) -> None:
        self.form = form
        self.lemma = lemma
        self.upos = upos
        self.root = root
        self.gender = gender
        # Always hold a private list: ``None`` and the legacy ``''`` default
        # both mean "no plural known yet", and update_plural needs a list.
        self.plural = list(plural) if plural else []

    def update_plural(self, plurals: List[str]):
        """Add the plural forms that are not stored yet.

        New forms are appended in the order they are given, so the resulting
        list is deterministic; forms already present are skipped.

        Args:
            plurals (List[str]): A list of possible plural forms of the noun.
        """
        for plural_form in plurals:
            if plural_form not in self.plural:
                self.plural.append(plural_form)

    def __eq__(self, other):
        """Compare two instances for equality based on form and lemma.

        Args:
            other: The object to compare to.

        Returns:
            bool: True if ``other`` is an Ism with the same form and lemma,
            False if it is an Ism that differs. For any other type
            ``NotImplemented`` is returned, which lets Python fall back to
            its default comparison instead of raising.
        """
        if not isinstance(other, Ism):
            return NotImplemented
        return (self.form, self.lemma) == (other.form, other.lemma)

    def __hash__(self):
        """Return a hash based on form and lemma.

        Needed to use instances of Ism as dictionary keys; it is consistent
        with ``__eq__``, which compares the same two attributes.

        Returns:
            int: The hash value for the instance.
        """
        return hash((self.form, self.lemma))

    def __repr__(self) -> str:
        # form[lemma][ج](plural1-plural2...)
        plurals = "-".join(self.plural)
        return f'{self.form}[{self.lemma}][ج]({plurals})'

    def __str__(self) -> str:
        # form[ج](plural1-plural2...)
        plurals = "-".join(self.plural)
        return f'{self.form}[ج]({plurals})'



In [None]:
class IsmDict(dict):
    """A dict keyed by Ism objects that merges entries on assignment.

    ``__setitem__`` is overridden. The key must be an instance of Ism. When
    an equal key is already stored, the stored key's plural forms are merged
    with the new key's plural forms and the new value (a list) is
    concatenated to the stored value. Otherwise the key-value pair is added
    as is.

    Only item assignment (``d[key] = value``) is customised here.
    """

    def __setitem__(self, key: Ism, value: list):
        if not isinstance(key, Ism):
            raise TypeError(
                f"Key must be an instance of Ism, got {type(key).__name__}")

        # A hash lookup answers "is an equal key stored?" without walking the
        # whole dictionary; the linear search below is only needed to get
        # hold of the stored key object, i.e. when the key really exists.
        existing_key = None
        if key in self:
            existing_key = next((k for k in self if k == key), None)

        if existing_key is None:
            # Key does not exist yet: store the pair unchanged.
            super().__setitem__(key, value)
            return

        # Key exists: merge plurals into the stored key and extend its value.
        existing_key.update_plural(key.plural)
        super().__setitem__(existing_key, self[existing_key] + value)



 ## Helper Functions

In [None]:
def process_plural(form_plural: str, lemma: str, case: str,
                   definite: str) -> Tuple[str, str]:
    """Determine the plural type of an Arabic noun form.

    The lemma (with diacritics removed) is turned into regular expressions
    that recognise the regular ("salem") plurals built from it. A form that
    matches none of them is reported as a broken plural.

     Args:
    - form_plural (str): The plural form of the Arabic noun to be processed.

    - lemma (str): The singular form of the Arabic noun.

    - case (str): The grammatical case of the noun.

    - definite (str): The definiteness of the noun.

    Returns:
    - A tuple containing:

       1. The plural type if detected, either:
          - 'broken' for the broken plural
          - 'Fem' for the feminine salem plural
          - 'Masc' for the masculine salem plural

          or an error tag when the type cannot be determined:
          - 'empty_lemma' when nothing is left of the lemma once its
            diacritics are removed
          - 'wrong_single_form' when the form is the lemma itself
            (optionally with the definite article)
          - 'wrong_plural_form' when a masculine form ending in ي / و is
            not a genitive construct

       2. The lemma without diacritics for 'Fem' and 'Masc'. Otherwise it is
          an empty string.
    """

    # remove diacritics
    lemma_nodiac = remove_diacritics_ar(lemma)
    # Sometimes the lemma is just a diacritic: nothing is left to build the
    # patterns from, so report it instead of failing on an empty string.
    if not lemma_nodiac:
        return 'empty_lemma', ''

    # Escape the lemma so characters that are special in regular expressions
    # are matched literally.
    stem = re.escape(lemma_nodiac[:-1])
    last_char = re.escape(lemma_nodiac[-1])
    definite_regx = re.compile(rf'(ال)?{stem}{last_char}')
    # The last lemma character is optional before the feminine suffix.
    salm_femn_regx = re.compile(rf'(ال)?{stem}{last_char}?(ات)')
    salm_masc_regx = re.compile(rf'(ال)?{stem}{last_char}(ون|ين|ي|و)')

    # common error: lemma is the same as plural form
    if definite_regx.fullmatch(form_plural):
        return 'wrong_single_form', ''

    # should be broken plural if not salem
    is_salm_femn_plural = bool(salm_femn_regx.fullmatch(form_plural))
    is_salm_masc_plural = bool(salm_masc_regx.fullmatch(form_plural))
    if not (is_salm_femn_plural or is_salm_masc_plural):
        return 'broken', ''

    # TODO - Connected conjunctions and prepositions if not splitted introduce
    # an issue when detecting the salem plural. Because the detection is based
    # on the lemma, if the lemma is not correct the detection will be wrong.
    # جمع مؤنث سالم | Salem feminine plural
    if is_salm_femn_plural:
        return 'Fem', lemma_nodiac

    # جمع مذكر سالم | Salem masculine plural
    if form_plural.endswith(('ي', 'و')):
        # The short suffixes are only accepted for a genitive construct.
        # NOTE(review): a form ending in و is therefore accepted only when
        # tagged Gen - confirm whether Nom should be accepted for و.
        if case == 'Gen' and definite == 'Cons':
            return 'Masc', lemma_nodiac
        return 'wrong_plural_form', ''
    return 'Masc', lemma_nodiac



In [None]:
def extract_sentences_info(conllu: str) -> Tuple[List[str], List[str], str]:
    """Extract sentence IDs and text from CoNLL-U formatted data.

    Args:
    - conllu (str): A string containing CoNLL-U formatted data for one or
    more sentences.

    Returns:
        - A tuple containing:
          - A list of sentence IDs (the value of each ``# sent_id = `` line).
          - A list of sentence texts (the value of each ``# text = `` line).
          - A string containing the CoNLL-U formatted data for all sentences
          with comment lines removed.
    """

    # An ID runs up to the next whitespace, so IDs containing '-' or '.'
    # are kept whole.
    sent_id_regx = re.compile(r"(?<=# sent_id = )\S+")
    text_regex = re.compile(r"(?<=# text = ).+")
    # Only lines that START with '#' are comments. Anchoring the pattern
    # keeps token lines whose form or lemma contains '#' untouched.
    conllu_filter = re.compile(r"^#.*(?:\n|\Z)", re.MULTILINE)

    sent_ids = sent_id_regx.findall(conllu)
    texts = text_regex.findall(conllu)
    # Remove comment lines from the CoNLL-U formatted string.
    sents_conllu = conllu_filter.sub(repl="", string=conllu)

    return sent_ids, texts, sents_conllu



In [None]:
def fix_spaces(texts: List[str]) -> List[str]:
    """Clean up spacing in every string of a list.

    Each string goes through the same steps, in this order:

    1. a run of two or more whitespace characters becomes one space;
    2. a space is inserted before ``(`` when it directly follows a word
       character;
    3. a space is inserted before ``"`` when it directly follows a word
       character (this applies to every such quote, closing ones included);
    4. the result is Unicode-normalised with NFKD.

    Args:
    - texts (List[str]): A list of strings to be processed.

    Returns:
    - A new list with the cleaned strings, in the same order."""

    multi_space = re.compile(r'\s\s+')
    glued_bracket = re.compile(r'(?<=\w)\(')
    glued_quote = re.compile(r'(?<=\w)"')

    def _clean(text: str) -> str:
        # The steps are independent per string, so run them back to back.
        text = multi_space.sub(' ', text)
        text = glued_bracket.sub(' (', text)
        text = glued_quote.sub(' "', text)
        return normalize("NFKD", text)

    return list(map(_clean, texts))



In [None]:
def split_conllu_sentences(sents_conllu: str) -> List[List[List[str]]]:
    """
    Split CoNLL-U formatted data into a list of sentences, each represented as
    a list of word/token information.

    Sentences are separated by blank lines. Blank lines never produce a
    token, and consecutive or trailing blank lines never produce an empty
    sentence, so the result lines up with the sentences actually present.

    Args:
    - sents_conllu (str): A string containing CoNLL-U formatted data for
    one or more sentences, with comment lines already removed.

    Returns:
    - A list of sentences, each represented as a list of word/token
    information. Each word/token information is the list of the first 8
    tab-separated fields of its line (fewer if the line has fewer fields).
    """

    sentences = []
    current = []
    for line in sents_conllu.split("\n"):
        if line.strip():
            current.append(line.split("\t")[:8])
        elif current:
            # A blank line closes the sentence collected so far.
            sentences.append(current)
            current = []
    # Input that does not end with a blank line still yields its last sentence.
    if current:
        sentences.append(current)

    return sentences



In [None]:
def get_feats(feats: str):
    """Pull gender, case, definiteness and plurality out of a CoNLL-U
    feature string.

    Args:
    - feats (str): A string containing CoNLL-U formatted word/token features.

    Returns:
    - A tuple containing, in this order:
        - The value of the first ``Gender=`` feature, or '' if absent.
        - The value of the first ``Case=`` feature, or '' if absent.
        - The value of the first ``Definite=`` feature, or '' if absent.
        - True if the string contains ``Number=Plur``, False otherwise.

    Example usage:
    >>> get_feats('Gender=Masc|Number=Sing|Case=Nom')
    ('Masc', 'Nom', '', False)
        """

    def _first_value(pattern: str) -> str:
        # Text of the first match of ``pattern`` in feats, '' when none.
        hit = re.search(pattern, feats)
        return hit.group(0) if hit else ""

    gender = _first_value(r"(?<=Gender=)[a-zA-Z]+")
    case_ = _first_value(r"(?<=Case=)[a-zA-Z]+")
    definite = _first_value(r"(?<=Definite=)[a-zA-Z]+")
    is_plural = re.search(r"Number=Plur", feats) is not None

    return gender, case_, definite, is_plural



In [None]:


# read Arabic conllu
def get_noun_adj_conllu(
    conllu_file
) -> Tuple[Dict[Ism, List[Tuple[str, str]]], Dict[Ism, List[Tuple[str, str]]],
           Dict[Ism, List[Tuple[str, str]]], Dict[Ism, List[Tuple[str, str]]]]:
    """
    Extract information about nouns, adjectives, and X items from a CoNLL-U
    formatted file.

    Only tokens whose UPOS is NOUN, ADJ or X are considered. For a token
    tagged as plural, process_plural decides whether it is a salem plural:
    if so the token is stored under its singular form with the token form as
    plural; if not, it is recorded in the suspected-plural dictionary under
    the form ``X_<form>``.

    Args:
    - conllu_file (str): The filepath to the CoNLL-U formatted file.

    Returns:
        A tuple of four IsmDict dictionaries. In each one the keys are Ism
        objects and the values are lists of ``(sentence ID, sentence text)``
        tuples, one per occurrence:
        - the nouns (UPOS NOUN),
        - the adjectives (UPOS ADJ),
        - the X items (UPOS X),
        - the suspected plurals: forms tagged as plural for which no singular
          form could be derived.
    """

    conllu = read_text(conllu_file)  # read CoNLL-U file

    # Extract sentence IDs and text from the CoNLL-U formatted file.
    sent_ids, texts, sents_conllu = extract_sentences_info(conllu)
    # Split the CoNLL-U formatted string into sentences and tokens.
    sentences = split_conllu_sentences(sents_conllu)
    # fix spaces issue with label texts
    texts = fix_spaces(texts)

    noun_dict = IsmDict()
    sus_forms_dict = IsmDict()
    x_dict = IsmDict()
    adj_dict = IsmDict()
    # One target dictionary per UPOS of interest.
    dict_by_upos = {'NOUN': noun_dict, 'ADJ': adj_dict, 'X': x_dict}

    for idx, txt, sentence in zip(sent_ids, texts, sentences):
        for token_fields in sentence:
            # Rows too short to hold the FEATS column (e.g. a blank line)
            # cannot be unpacked below; skip them.
            if len(token_fields) < 6:
                continue
            _, form, lemma, upos, _, feats, *_ = token_fields
            target_dict = dict_by_upos.get(upos)
            if target_dict is None:
                continue

            # Extract token feats
            form_gender, form_case, form_definite, is_plural = get_feats(feats)

            if not is_plural:
                form_single = form
                form_plural = []
                form_gender = ''
            else:
                # The form is plural according to the CoNLL-U feats. Check
                # whether it is a salem plural and get its single form.
                gender_inferred, form_single = process_plural(
                    form_plural=form,
                    lemma=lemma,
                    case=form_case,
                    definite=form_definite)
                if form_single:
                    form_gender = gender_inferred
                    form_plural = [form]
                else:
                    # No single form: record it as a suspected plural.
                    form_single = f'X_{form}'
                    # Start from an empty plural list so the entry never
                    # inherits the plural of a previously processed token.
                    form_plural = []
                    ism = Ism(form=form_single,
                              lemma=lemma,
                              upos=upos,
                              plural=[],
                              gender=form_gender)
                    sus_forms_dict[ism] = [(idx, txt)]
                    # NOTE(review): the X_ entry is also stored in the UPOS
                    # dictionary below, as before - confirm this is wanted.

            entry = Ism(form=form_single,
                        lemma=lemma,
                        upos=upos,
                        plural=form_plural,
                        gender=form_gender)
            target_dict[entry] = [(idx, txt)]

    return noun_dict, adj_dict, x_dict, sus_forms_dict



 Extract information from CoNLL-U

In [None]:
# Path of the CoNLL-U file to analyse (passed to get_noun_adj_conllu).
file_ar2 = "../../outputs/conllu/udp_ar edited.conllu"
# Directory prefix used to build the name of the saved results file.
results_dir = "../../outputs/conllu_compare/"


In [None]:
# Build the noun, adjective, X and suspected-plural dictionaries from the file.
nouns, adjs, xs, sus_plurals = get_noun_adj_conllu(file_ar2)


In [None]:
# Report the number of distinct keys held by each dictionary.
print(f'There are:\n - {len(nouns)} nouns\n - {len(adjs)} adjectives')
print(f' - {len(xs)} tokens that cannot be assigned a real UPOS.')
print()
print(f'There are {len(sus_plurals)} word that could be plural.')


In [None]:
def _count_with_plurals(isms) -> int:
    """Count the Ism keys that carry at least one plural form.

    The plural list is read directly from each key instead of being parsed
    back out of its string representation, so a form containing the '[ج]'
    marker cannot break the count.
    """
    return sum(1 for ism in isms if ism.plural)


# Number of entries with a detected plural, per dictionary.
noun_plurals_counter = _count_with_plurals(nouns)
adj_plurals_counter = _count_with_plurals(adjs)
x_plurals_counter = _count_with_plurals(xs)


In [None]:
# Report how many entries of each dictionary have at least one plural form.
print(f'{noun_plurals_counter} plurals are detected in nouns')
print(f'{adj_plurals_counter} plurals are detected in adjetives')
print(f'{x_plurals_counter} plurals are detected in X')


 # Save

In [None]:
# Save the four dictionaries together in one pickle file.
# The base name is a fixed string: interpolating the `nouns` dictionary
# itself would put its whole representation into the file name.
# NOTE(review): 'nouns' is presumed to be the intended base name - confirm.
file_name = f'{results_dir}nouns'
with open(f'{file_name}.pkl', "wb") as f:
    pickle.dump([nouns, adjs, xs, sus_plurals], f)
