# 1.0 Imports

In [1]:
import gzip
import json
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List
from unicodedata import normalize

import pandas as pd
from pyarabic.araby import DIACRITICS, SHADDA, is_arabicword, name, vocalizedlike

# 2.0 Define Paths

In [2]:
# Path for preproceed Wikitionary
wiktionary_dir = Path("../data/processed/wikidata")
wiki_jsonl_path = wiktionary_dir / "ar-wiktextract-data.json.gz"
wiki_indices_path = wiktionary_dir / "ar_reindex.json.gz"

In [3]:
# Arabic GF-Wordnet
data_dir = Path("../data/interim/unambiguous")
ar_words_gf_path = data_dir / "20240202.1101_Q79_Q34_Q16_word_unambiguous.csv"

In [4]:
# Path to save words status
output_dir = Path("../data/interim/lexicon")
output_dir.mkdir(parents=True, exist_ok=True)

# 3.0 Define Variables

In [5]:
DIACRITICS = set(DIACRITICS)  # Aabic diacritics/short vowels

In [6]:
TIMESTAMP = "_".join(ar_words_gf_path.stem.split("_")[:-2])

## 4.0 Define Functions

### 4.1 Utilities Functions

In [7]:
def reorder_shadda(ar_string: str) -> str:
    """unicodedata.normalize put shadda before diacritics; not correct"""
    list_ar_str = list(ar_string)

    for i in range(len(list_ar_str) - 1):
        char = list_ar_str[i]
        next_char = list_ar_str[i + 1]

        if char in DIACRITICS and next_char == SHADDA:
            list_ar_str[i], list_ar_str[i + 1] = (
                next_char,
                char,
            )  # Swap shadda and diacritic

    return "".join(list_ar_str)

In [8]:
def normalize_ar(ar_vocalized: str, verbose: bool = False) -> str:
    """get the normal form for the Unicode string unistr using NFC then fix the shadda order issue"""
    ar_norm = normalize("NFC", ar_vocalized)
    ar_norm = reorder_shadda(ar_norm)
    if verbose:
        print([name(char) for char in ar_norm])
    return ar_norm

## 4.2 Morphology Features

### 4.2.1 Gender

In [9]:
def get_noun_gender(entry, form):
    for _, word_data in entry.items():
        categories: List[str] = word_data.get("categories", [])
        for category in categories:
            masculine = category.find("masculine") != -1
            feminine = category.find("feminine") != -1
            if masculine and feminine:
                return "N"
            elif masculine:
                return "masc"
            elif feminine:
                return "fem"
        return pd.NA

In [10]:
def get_adj_gender(entry, form):
    for _, word_data in entry.items():
        word_forms: List[dict] = word_data.get("forms", [])
        masculine, feminine = None, None
        for form_dict in word_forms:
            if {'indefinite', 'masculine', 'singular', 'nominative'}.issubset(form_dict.get("tags", [])):
                masculine = normalize_ar(form_dict["form"])
            elif {'indefinite', 'feminine', 'singular', 'nominative'}.issubset(form_dict.get("tags", [])):
                feminine = normalize_ar(form_dict["form"])
            if masculine and feminine:
                break

        # TODO: Confusing = redesign
        if not masculine and not feminine:
            return pd.NA, pd.NA
        elif not masculine or not feminine:
            return "masc", pd.NA if masculine else "fem", pd.NA

        if vocalizedlike(form, masculine):
            return "masc", feminine
        return "fem", masculine


In [11]:
def get_gender(row, wikitionary):
    wiki_idx = int(row["wiki_idx"])
    pos = row["wiki_pos"]
    form: str = normalize_ar(row["vocal_forms"])
    if wiki_idx > -1:
        entry = wikitionary[wiki_idx]
        if pos == "noun":
            return get_noun_gender(entry, None)
        elif pos == "adj":
            return get_adj_gender(entry, form)
    return [pd.NA] * 2

### 4.2.2 Plural

In [12]:
def get_noun_plural(entry):
    for _, word_data in entry.items():
        word_forms = word_data.get("forms", [])
        for form_dict in word_forms:
            if "plural" == form_dict.get("tags", [None])[0]:
                return form_dict["form"]
        inflections = word_data.get("head_templates", [])
        for dict_inflection in inflections:
            plural = dict_inflection.get("args", {}).get("pl", None)
            if plural is not None:
                if is_arabicword(plural):
                    return normalize_ar(plural)
                elif plural == "-":
                    return normalize_ar("غير معدود")
                return normalize_ar("جمع سالم")
    return pd.NA

In [13]:
def get_adj_plural(entry):
    for _, word_data in entry.items():
        word_forms = word_data.get("forms", [])
        masc_pl, fem_pl = None, None
        for form_dict in word_forms:
            if ['masculine', 'plural'] == form_dict.get("tags", [None]):
                masc_pl = form_dict["form"]
            elif ['feminine', 'plural'] == form_dict.get("tags", [None]):
                fem_pl = form_dict["form"]
            if masc_pl and fem_pl:
                break
    masc_pl = normalize_ar(masc_pl) if masc_pl else pd.NA
    fem_pl = normalize_ar(fem_pl) if fem_pl else pd.NA

    return masc_pl, fem_pl

In [14]:
def get_plural(row, wikitionary):
    wiki_idx = int(row["wiki_idx"])
    pos = row["wiki_pos"]
    if wiki_idx > -1:
        entry = wikitionary[wiki_idx]
        if pos == "noun":
            return get_noun_plural(entry)
        elif pos == "adj":
            return get_adj_plural(entry)
    if pos == "noun":
        return pd.NA
    elif pos == "adj":
        return [pd.NA]*2

### 4.2.3 Root

In [15]:
def get_root(row, wikitionary):
    wiki_idx = int(row["wiki_idx"])
    if wiki_idx > -1:
        entry = wikitionary[wiki_idx]
        for _, word_data in entry.items():
            for etymology in word_data.get('etymology_templates', []):
                if etymology.get("name", "").startswith("ar-root"):
                    if len(etymology["args"]) == 3:
                        return normalize_ar("".join(list(etymology["args"].values())))
                    if len(etymology["args"]) == 1:
                        return normalize_ar("".join(etymology["args"]["1"].split()))
        return pd.NA

### 4.2.4 Verb Forms

In [16]:
def get_verb_form(row, wikitionary):
    wiki_idx = int(row["wiki_idx"])
    if wiki_idx > -1:
        entry = wikitionary[wiki_idx]
        form_number = None
        for _, word_data in entry.items():
            form_numbers = []
            categories: List[str] = word_data.get("categories", [])
            for category in categories:
                if match := re.search(r"(?<=form-)\w+(?= verbs)", category):
                    form_numbers.append(match.group())
            if form_numbers:
                form_number = max(form_numbers)

        return f"Form{form_number}" if form_number is not None else pd.NA


In [17]:
def get_imperfect_form(row, wikitionary):
    wiki_idx = int(row["wiki_idx"])
    if wiki_idx > -1:
        entry = wikitionary[wiki_idx]
        for _, word_data in entry.items():
            word_forms: List[dict] = word_data.get("forms", [])
            for form_dict in word_forms:
                if {
                    "active",
                    "indicative",
                    "masculine",
                    "non-past",
                    "imperfective",
                    "singular",
                    "third-person",
                }.issubset(form_dict.get("tags", [])):
                    return normalize_ar(form_dict["form"])

        return pd.NA


# 5.0 Load Files

In [18]:
# 1. load the wikitionary
with gzip.open(wiki_jsonl_path, "rt", encoding="utf-8") as gzip_obj:
    wikitionary = []
    for i, json_line in enumerate(gzip_obj):
        wikitionary.append(dict(json.loads(json_line)))
        print(f"Reading line-{i}", end="\r")

Reading line-1912

Reading line-128636

In [19]:
# 2. load the csv arabic-english_gf words
df_ar_gf = pd.read_csv(ar_words_gf_path, delimiter="\t", index_col=0)

In [20]:
df_ar_gf = df_ar_gf[df_ar_gf["select"] == 1]

In [21]:
# Get nouns, adjectives and verbs
df_ar_gf_nouns = df_ar_gf.copy()[(df_ar_gf["wiki_pos"] == "noun") & (df_ar_gf["select"] == 1)]
df_ar_gf_adjs = df_ar_gf.copy()[(df_ar_gf["wiki_pos"] == "adj") & (df_ar_gf["select"] == 1)]
df_ar_gf_verbs = df_ar_gf.copy()[(df_ar_gf["wiki_pos"] == "verb") & (df_ar_gf["select"] == 1)]

# 6.0 Get Morphological Features

## 6.1 Nouns

In [22]:
df_ar_gf_nouns["gender"] = df_ar_gf_nouns.apply(get_gender, args=(wikitionary,), axis="columns")
df_ar_gf_nouns["gender"].value_counts()

gender
[<NA>, <NA>]    18
masc             9
fem              9
Name: count, dtype: int64

In [23]:
df_ar_gf_nouns["plural"] = df_ar_gf_nouns.apply(get_plural, args=(wikitionary,), axis="columns")
df_ar_gf_nouns["plural"].isna().value_counts()

plural
True     23
False    17
Name: count, dtype: int64

In [24]:
df_ar_gf_nouns["root"] = df_ar_gf_nouns.apply(get_root, args=(wikitionary,), axis="columns")
df_ar_gf_nouns["root"].isna().value_counts()

root
True     27
False    13
Name: count, dtype: int64

## 6.2 Adjectives

In [25]:
df_ar_gf_adjs["gender"], df_ar_gf_adjs["other_gender_form"] = zip(
    *df_ar_gf_adjs.apply(get_gender, args=(wikitionary,), axis="columns")
)
df_ar_gf_adjs["gender"].value_counts()

gender
masc    3
Name: count, dtype: int64

In [26]:
df_ar_gf_adjs["masc_pl"], df_ar_gf_adjs["fem_pl"]  = zip(*df_ar_gf_adjs.apply(get_plural, args=(wikitionary,), axis="columns"))
print(df_ar_gf_adjs["masc_pl"].isna().value_counts())
print(df_ar_gf_adjs["fem_pl"].isna().value_counts())

masc_pl
True     5
False    3
Name: count, dtype: int64
fem_pl
True     5
False    3
Name: count, dtype: int64


In [27]:
df_ar_gf_adjs["root"] = df_ar_gf_adjs.apply(get_root, args=(wikitionary,), axis="columns")
df_ar_gf_adjs["root"].isna().value_counts()

root
True     5
False    3
Name: count, dtype: int64

## 6.3 Verbs

In [None]:
df_ar_gf_verbs["verb_form"] = df_ar_gf_verbs.apply(get_verb_form, args=(wikitionary,), axis="columns")
df_ar_gf_verbs["verb_form"].isna().value_counts()

In [None]:
df_ar_gf_verbs["root"] = df_ar_gf_verbs.apply(get_root, args=(wikitionary,), axis="columns")
df_ar_gf_verbs["root"].isna().value_counts()

root
True    1
Name: count, dtype: int64

In [None]:
df_ar_gf_verbs["imperfect"] = df_ar_gf_verbs.apply(get_imperfect_form, args=(wikitionary,), axis="columns")
df_ar_gf_verbs["imperfect"].isna().value_counts()

imperfect
True    1
Name: count, dtype: int64

# 7.0 Export Data

In [29]:
# df_ar_gf_verbs.drop(columns=["select", "entry_num", "pos"], inplace=True)
df_ar_gf_nouns.drop(columns=["select", "entry_num", "pos"], inplace=True)
df_ar_gf_adjs.drop(columns=["select", "entry_num", "pos"], inplace=True)

In [34]:
# df_ar_gf_verbs.to_csv(output_dir / f"{TIMESTAMP}_verbs_lexicon.csv", decimal=",")
df_ar_gf_nouns.to_csv(output_dir / f"{TIMESTAMP}_nouns_lexicon.csv", decimal=",")
df_ar_gf_adjs.to_csv(output_dir / f"{TIMESTAMP}_adjectives_lexicon.csv", decimal=",")

# 8.0 Visualize Data

In [30]:
df_ar_gf_verbs

Unnamed: 0,wiki_idx,no_en_entry,en_entry,ar,vocal_forms,pos,wiki_pos,select,entry_num,ar_letters,senses,tim_translit,file


In [32]:
df_ar_gf_adjs

Unnamed: 0,wiki_idx,no_en_entry,en_entry,ar,vocal_forms,wiki_pos,ar_letters,senses,tim_translit,file,gender,other_gender_form,masc_pl,fem_pl,root
32,-1,1,demographic_N,سكانية,سُكّانِيَّة,adj,سين-ضمة | كاف-شدة | ألف | نون-كسرة | ياء-شدة-ف...,residential / population+[fem.sg.],suk~Aniy~ap,,,,,,
80,-1,1,life_expectancy_N,متوقع,مُتَوَقَّع,adj,ميم-ضمة | تاء-فتحة | واو-فتحة | قاف-شدة-فتحة |...,,,,,,,,
68,-1,1,percent_MU,مئوية,مِئَوِيَّة,adj,ميم-كسرة | همزة على الياء-فتحة | واو-كسرة | يا...,one-hundred / hundreth / percentage+[fem.sg.],mi}awiy~ap,,,,,,
8,-1,1,vat_1_N,مضافة,مُضافَة,adj,ميم-ضمة | ضاد | ألف | فاء-فتحة | تاء مربوطة,added+[fem.sg.],muDAfap,,,,,,
21,7862,0,average_1_N,متوسط,مُتَوَسِّط,adj,ميم-ضمة | تاء-فتحة | واو-فتحة | سين-شدة-كسرة |...,"['being in the middle, mediating / middle, cen...",mutawas~iT,,masc,مُتَوَسِّطَةٌ,مُتَوَسِّطُونَ,مُتَوَسِّطَات,وسط
4,26328,0,healthcare_2_N,صحية,صِحِّيَّة,adj,صاد-كسرة | حاء-شدة-كسرة | ياء-شدة-فتحة | تاء م...,['feminine singular of صِحِّيّ (ṣiḥḥiyy)'],SiH~iy~ap,,,,,,
49,7862,1,life_expectancy_N,متوسط,مُتَوَسِّط,adj,ميم-ضمة | تاء-فتحة | واو-فتحة | سين-شدة-كسرة |...,"['being in the middle, mediating / middle, cen...",mutawas~iT,,masc,مُتَوَسِّطَةٌ,مُتَوَسِّطُونَ,مُتَوَسِّطَات,وسط
1,3016,0,regime_1_N,حاكم,حَاكِم,adj,حاء-فتحة | ألف | كاف-كسرة | ميم,"['ruling, governing / decisive']",HaAkim,,masc,حَاكِمَةٌ,حَاكِمُون,حَاكِمَات,حكم


In [33]:
df_ar_gf_nouns

Unnamed: 0,wiki_idx,no_en_entry,en_entry,ar,vocal_forms,wiki_pos,ar_letters,senses,tim_translit,file,gender,plural,root
17,-1,0,algonquin_N,ألجونكوين,ألجونكوين,noun,همزة على الألف | لام | جيم | واو | نون | كاف |...,,>ljwnkwyn,,"[<NA>, <NA>]",,
22,-1,0,blackfoot_N,بلاكفوت,بلاكفوت,noun,باء | لام | ألف | كاف | فاء | واو | تاء,,blAkfwt,,"[<NA>, <NA>]",,
23,-1,0,cayuga_N,كايوجا,كايوجا,noun,كاف | ألف | ياء | واو | جيم | ألف,,kAywjA,,"[<NA>, <NA>]",,
10,-1,0,chinese_N,صينىة,صِينِيَّة,noun,صاد-كسرة | ياء | نون-كسرة | ياء-شدة-فتحة | تاء...,Chinese+[fem.sg.],Siyniy~ap,,"[<NA>, <NA>]",,
24,-1,0,chipewyan_N,تشيبويان,تشيبويان,noun,تاء | شين | ياء | باء | واو | ياء | ألف | نون,,t$ybwyAn,,"[<NA>, <NA>]",,
33,-1,1,demographic_N,تركيبة,تَرْكِيبُه,noun,تاء-فتحة | راء-سكون | كاف-كسرة | ياء | باء-ضمة...,installation / assembling+its / his,tarokiybuh,,"[<NA>, <NA>]",,
35,-1,0,english_N,الإنجليزية,الإِنْجلِيزِيَّة,noun,ألف | لام | همزة تحت الألف-كسرة | نون-سكون | ج...,the+English_(language)+[fem.sg.],Al<inojliyziy~ap,,"[<NA>, <NA>]",,
41,-1,0,filipino_2_N,الفلبينية,الفِلِبِّينِيَّة,noun,ألف | لام | فاء-كسرة | لام-كسرة | باء-شدة-كسرة...,the+Philippine / Filipino+[fem.sg.],Alfilib~iyniy~ap,,"[<NA>, <NA>]",,
44,-1,0,finnish_N,الفنلندية,الفِنْلَنْدِيَّة,noun,ألف | لام | فاء-كسرة | نون-سكون | لام-فتحة | ن...,the+Finnish+[fem.sg.],Alfinolanodiy~ap,,"[<NA>, <NA>]",,
45,-1,0,haida_N,هيدا,هيدا,noun,هاء | ياء | دال | ألف,,hydA,,"[<NA>, <NA>]",,
