# 1.0 Imports

In [1]:
import gzip
import json
from pathlib import Path
from typing import Dict, List
from unicodedata import normalize

import pandas as pd
from pyarabic.araby import DIACRITICS, SHADDA, is_arabicword, name, vocalizedlike

## 3.0 Define Variables

In [2]:
DIACRITICS = {*DIACRITICS}  # Arabic diacritics/short vowels, held as a set for fast membership tests

# 2.0 Define Paths

In [3]:
# Paths for the preprocessed Wiktionary data
wiktionary_dir = Path("../../data/processed/wikidata")
wiki_jsonl_path = wiktionary_dir.joinpath("ar-wiktextract-data.json.gz")
wiki_indices_path = wiktionary_dir.joinpath("ar_reindex.json.gz")

In [4]:
# Arabic GF-WordNet word list (interim data)
data_dir = Path("../../data/interim/")
ar_words_gf_path = data_dir.joinpath("word_unambiguous.csv")

In [5]:
# Output directory (processed data), relative to the notebook's location
output_dir = Path("../../data/processed")

## 4.0 Define Functions

### 4.1 Utility Functions

In [6]:
def reorder_shadda(ar_string: str) -> str:
    """Move each shadda in front of the diacritic that directly precedes it.

    Unicode normalization sorts combining marks by canonical combining class,
    which leaves the shadda *after* a short-vowel mark (e.g. fatha + shadda).
    This helper swaps every such adjacent pair so the shadda comes first.

    Args:
        ar_string: Arabic text, typically already passed through
            ``unicodedata.normalize``.

    Returns:
        A new string with every "diacritic, shadda" pair swapped to
        "shadda, diacritic"; all other characters keep their position.
    """
    chars = [*ar_string]

    # Single left-to-right pass over adjacent pairs; a swap made at one position
    # is visible when the following pair is examined.
    for pos in range(1, len(chars)):
        previous, current = chars[pos - 1], chars[pos]
        if current == SHADDA and previous in DIACRITICS:
            chars[pos - 1], chars[pos] = current, previous

    return "".join(chars)

In [7]:
def normalize_ar(ar_vocalized: str, verbose: bool = False) -> str:
    """Return the NFC form of ``ar_vocalized`` with the shadda order corrected.

    Args:
        ar_vocalized: Vocalized Arabic string to normalize.
        verbose: When true, print the list of ``name(char)`` values for every
            character of the normalized result.

    Returns:
        The NFC-normalized string after ``reorder_shadda`` has been applied.
    """
    normalized = reorder_shadda(normalize("NFC", ar_vocalized))
    if verbose:
        print([name(char) for char in normalized])
    return normalized

## 3.0 Load Files

In [8]:
# 1. Load the Wiktionary dump: one JSON document per line of the gzipped file
with gzip.open(wiki_jsonl_path, mode="rt", encoding="utf-8") as jsonl_file:
    wikitionary = []
    for line_no, raw_line in enumerate(jsonl_file):
        record = dict(json.loads(raw_line))
        wikitionary.append(record)
        # "\r" keeps the progress counter on one continuously rewritten line
        print(f"Reading line-{line_no}", end="\r")

Reading line-876

Reading line-128636

In [9]:
# 2. Load the tab-separated Arabic/English GF word list.
# NOTE(review): pd.eval evaluates every "senses" cell as an expression; if the
# column only holds literals, ast.literal_eval would be the safer converter — confirm.
df_ar_gf = pd.read_csv(
    ar_words_gf_path,
    delimiter="\t",
    index_col=0,
    converters={"senses": pd.eval},
)

In [10]:
# Keep only the rows flagged with select == 1
df_ar_gf = df_ar_gf.loc[df_ar_gf["select"] == 1]

# 4.0 Extract Morphological Features

## 4.1 Gender

In [11]:
def get_gender(row, wiktionary):
    """Infer a word's grammatical gender from its Wiktionary category names.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose ``"wiki_idx"``
            value indexes into ``wiktionary``.
        wiktionary: Indexable collection of entries. Each entry is a dict whose
            values are per-word dicts that may carry a ``"categories"`` list of
            strings.

    Returns:
        ``"N"`` if a single category mentions both "masculine" and "feminine",
        ``"masc"`` or ``"fem"`` for the first category mentioning exactly one of
        them, and ``pd.NA`` when no category of any word in the entry mentions
        a gender (including when the entry is empty).
    """
    wiki_idx: int = row["wiki_idx"]
    entry: Dict[int, dict] = wiktionary[wiki_idx]
    for word_data in entry.values():
        categories: List[str] = word_data.get("categories", [])
        for category in categories:
            masculine = "masculine" in category
            feminine = "feminine" in category
            if masculine and feminine:
                return "N"
            if masculine:
                return "masc"
            if feminine:
                return "fem"
    # Give up only after every word of the entry has been inspected; returning
    # from inside the loop would stop at the first word and yield None for an
    # empty entry.
    return pd.NA

In [12]:
def get_gender_(row, wiktionary):
    """Infer gender from the tags of the Wiktionary form matching the row's word.

    The row's vocalized form is normalized and compared, via ``vocalizedlike``,
    with every normalized form listed in the entry; the first matching form
    whose tags mention a gender decides the result.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with ``"wiki_idx"``
            (index into ``wiktionary``) and ``"vocal_forms"`` (vocalized string).
        wiktionary: Indexable collection of entries. Each entry is a dict whose
            values are per-word dicts that may carry a ``"forms"`` list of dicts
            with a ``"form"`` string and an optional ``"tags"`` list.

    Returns:
        ``"N"`` if the matching form is tagged both "masculine" and "feminine",
        ``"masc"`` or ``"fem"`` if it carries exactly one of the tags, and
        ``pd.NA`` when no matching form carries a gender tag.
    """
    wiki_idx: int = row["wiki_idx"]
    form: str = normalize_ar(row["vocal_forms"])
    entry: Dict[int, dict] = wiktionary[wiki_idx]
    for word_data in entry.values():
        word_forms: List[dict] = word_data.get("forms", [])
        for form_dict in word_forms:
            if not vocalizedlike(form, normalize_ar(form_dict["form"])):
                continue
            tags = form_dict.get("tags", [None])
            masculine = "masculine" in tags
            feminine = "feminine" in tags
            if masculine and feminine:
                return "N"
            if masculine:
                return "masc"
            if feminine:
                return "fem"
    # Explicit missing-value marker, consistent with get_gender/get_plural,
    # instead of implicitly falling off the end and returning None.
    return pd.NA


In [13]:
def get_noun_plural(entry):
    """Find the plural of a noun in a Wiktionary entry.

    For each word of the entry, the listed forms are searched first, then the
    ``"pl"`` argument of its head templates.

    Args:
        entry: Dict whose values are per-word dicts that may carry ``"forms"``
            (list of dicts with ``"form"`` and optional ``"tags"``) and
            ``"head_templates"`` (list of dicts with an optional ``"args"`` dict).

    Returns:
        The first form whose first tag is "plural"; otherwise, for the first
        head template that has a ``"pl"`` argument: the argument itself when it
        is an Arabic word, "غير معدود" ("uncountable") when it is "-", and
        "جمع سالم" ("sound plural") for any other value. ``pd.NA`` when nothing
        is found.
    """
    for word_data in entry.values():
        for form_dict in word_data.get("forms", []):
            # A form may have no tags, or an empty tag list; indexing [0] on an
            # empty list would raise IndexError, so treat both as "untagged".
            tags = form_dict.get("tags") or [None]
            if tags[0] == "plural":
                return form_dict["form"]
        for head_template in word_data.get("head_templates", []):
            plural = head_template.get("args", {}).get("pl")
            if plural is None:
                continue
            if is_arabicword(plural):
                return plural
            if plural == "-":
                return "غير معدود"
            return "جمع سالم"
    return pd.NA

In [14]:
def get_adj_plural(entry):
    """Return the masculine plural form of an adjective from a Wiktionary entry.

    Args:
        entry: Dict whose values are per-word dicts that may carry a ``"forms"``
            list of dicts with a ``"form"`` string and an optional ``"tags"`` list.

    Returns:
        The ``"form"`` of the first listed form whose tags are exactly
        ``["masculine", "plural"]`` (same order, nothing else), or ``pd.NA``
        when no such form exists.
    """
    wanted_tags = ["masculine", "plural"]
    for word_data in entry.values():
        for candidate in word_data.get("forms", []):
            if candidate.get("tags", [None]) == wanted_tags:
                return candidate["form"]
    return pd.NA

In [15]:
def get_plural(row, wikitionary):
    """Look up the plural of the row's word according to its part of speech.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with ``"wiki_idx"``
            (index into ``wikitionary``) and ``"pos"`` (part-of-speech string).
        wikitionary: Indexable collection of Wiktionary entries.

    Returns:
        The result of ``get_noun_plural`` for ``"noun"`` rows, of
        ``get_adj_plural`` for ``"adj"`` rows, and ``pd.NA`` for any other
        part of speech.
    """
    wiki_idx, pos = row["wiki_idx"], row["pos"]
    entry = wikitionary[wiki_idx]
    if pos == "noun":
        return get_noun_plural(entry)
    if pos == "adj":
        return get_adj_plural(entry)
    return pd.NA

In [16]:
# Filter first, then copy: each subset becomes an independent frame, so the
# columns added to it later do not touch df_ar_gf and do not raise
# SettingWithCopyWarning on pandas versions without copy-on-write. This also
# avoids copying the whole frame just to discard most of its rows.
df_ar_gf_nouns = df_ar_gf[df_ar_gf["pos"] == "noun"].copy()
df_ar_gf_adjs = df_ar_gf[df_ar_gf["pos"] == "adj"].copy()

In [17]:
# Gender of each noun, taken from its Wiktionary categories (row-wise apply)
df_ar_gf_nouns["gender"] = df_ar_gf_nouns.apply(get_gender, axis=1, args=(wikitionary,))
df_ar_gf_nouns["gender"].value_counts()

gender
masc    40
fem     18
Name: count, dtype: int64

In [18]:
# Plural of each noun (row-wise apply); the count below shows how many are missing
df_ar_gf_nouns["plural"] = df_ar_gf_nouns.apply(get_plural, axis=1, args=(wikitionary,))
df_ar_gf_nouns["plural"].isna().value_counts()

plural
False    36
True     22
Name: count, dtype: int64

In [19]:
# Gender of each adjective, taken from the tags of its matching Wiktionary form
df_ar_gf_adjs["gender"] = df_ar_gf_adjs.apply(get_gender_, axis=1, args=(wikitionary,))
df_ar_gf_adjs["gender"].value_counts()

gender
masc    18
Name: count, dtype: int64

In [21]:
# Plural of each adjective (row-wise apply); the count below shows how many are missing
df_ar_gf_adjs["plural"] = df_ar_gf_adjs.apply(get_plural, axis=1, args=(wikitionary,))
df_ar_gf_adjs["plural"].isna().value_counts()

plural
False    18
Name: count, dtype: int64