# 1.0 Imports

In [1]:
import gzip
import json
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List
from unicodedata import normalize

import pandas as pd
from pyarabic.araby import DIACRITICS, SHADDA, is_arabicword, name, vocalizedlike

# 2.0 Define Variables

In [2]:
DIACRITICS = set(DIACRITICS)  # Arabic diacritics / short vowels, kept as a set for membership tests

# 3.0 Define Paths

In [3]:
# Paths to the preprocessed Wiktionary files (gzip-compressed JSON).
# `wiki_indices_path` is not referenced by the cells visible in this notebook section.
wiktionary_dir = Path("../data/processed/wikidata")
wiki_jsonl_path = wiktionary_dir / "ar-wiktextract-data.json.gz"
wiki_indices_path = wiktionary_dir / "ar_reindex.json.gz"

In [4]:
# Arabic GF-WordNet word list (read further down as a tab-separated file)
data_dir = Path("../data/interim/")
ar_words_gf_path = data_dir / "word_unambiguous.csv"

In [5]:
# Directory the lexicon CSV files are written to; created up front (with parents) if missing
output_dir = Path("../data/interim/lexicon")
output_dir.mkdir(parents=True, exist_ok=True)

# 4.0 Define Functions

## 4.1 Utility Functions

In [6]:
def reorder_shadda(ar_string: str) -> str:
    """Move each shadda in front of the diacritic that directly precedes it.

    The string is scanned left to right. Whenever a character contained in
    ``DIACRITICS`` is immediately followed by ``SHADDA`` the two characters are
    swapped in place, so later comparisons see the already swapped characters.
    Meant as a fix-up for the mark order left by ``unicodedata.normalize``.

    Args:
        ar_string: Text to fix.

    Returns:
        A new string with the affected pairs swapped; all other characters
        keep their position.
    """
    chars = list(ar_string)

    for pos in range(1, len(chars)):
        previous, current = chars[pos - 1], chars[pos]
        if current == SHADDA and previous in DIACRITICS:
            # Shadda first, then the diacritic
            chars[pos - 1] = current
            chars[pos] = previous

    return "".join(chars)

In [7]:
def normalize_ar(ar_vocalized: str, verbose: bool = False) -> str:
    """Return the NFC form of ``ar_vocalized`` with the shadda order fixed.

    The text is normalized with ``unicodedata.normalize("NFC", ...)`` and then
    passed through ``reorder_shadda``.

    Args:
        ar_vocalized: Vocalized Arabic text.
        verbose: When true, print the list of ``name(char)`` values for every
            character of the result.

    Returns:
        The normalized string.
    """
    normalized = reorder_shadda(normalize("NFC", ar_vocalized))
    if not verbose:
        return normalized
    print([name(symbol) for symbol in normalized])
    return normalized

## 4.2 Morphology Features

### 4.2.1 Gender

In [8]:
def get_noun_gender(entry, form):
    """Derive a noun's gender from the category names of a Wiktionary entry.

    Every word-data dict in ``entry`` is inspected in order; the first category
    string that mentions a gender decides the result.

    Args:
        entry: Mapping whose values are word-data dicts; each may carry a
            ``"categories"`` list of strings.
        form: Unused; kept so the signature matches ``get_adj_gender``.

    Returns:
        ``"masc"``, ``"fem"``, ``"N"`` (a single category mentions both
        genders), or ``pd.NA`` when no category mentions a gender.
    """
    for word_data in entry.values():
        categories: List[str] = word_data.get("categories", [])
        for category in categories:
            masculine = "masculine" in category
            feminine = "feminine" in category
            if masculine and feminine:
                return "N"
            if masculine:
                return "masc"
            if feminine:
                return "fem"
    # Reached only after *all* word-data dicts were checked (previously the
    # function gave up after the first one and returned None for an empty entry).
    return pd.NA

In [9]:
def get_adj_gender(entry, form):
    """Determine an adjective's gender and the form of the opposite gender.

    For each word-data dict in ``entry`` the indefinite nominative singular
    masculine and feminine forms are looked up in its ``"forms"`` list. The
    first word-data dict that provides at least one of them decides the result.

    Args:
        entry: Mapping whose values are word-data dicts; each may carry a
            ``"forms"`` list of dicts with ``"tags"`` and ``"form"``.
        form: Normalized vocalized form of the adjective being classified.

    Returns:
        Always a 2-tuple ``(gender, other_gender_form)``:
        * ``("masc", feminine_form)`` when ``vocalizedlike(form, masculine_form)``
          holds, otherwise ``("fem", masculine_form)``, if both forms exist;
        * ``("masc", pd.NA)`` / ``("fem", pd.NA)`` if only the masculine /
          only the feminine form is listed;
        * ``(pd.NA, pd.NA)`` if no word-data dict lists either form.
    """
    masc_tags = {"indefinite", "masculine", "singular", "nominative"}
    fem_tags = {"indefinite", "feminine", "singular", "nominative"}

    for word_data in entry.values():
        word_forms: List[dict] = word_data.get("forms", [])
        masculine, feminine = None, None
        for form_dict in word_forms:
            tags = form_dict.get("tags", [])
            if masc_tags.issubset(tags):
                masculine = normalize_ar(form_dict["form"])
            elif fem_tags.issubset(tags):
                feminine = normalize_ar(form_dict["form"])
            if masculine and feminine:
                break

        if not masculine and not feminine:
            # Nothing usable here; try the next word-data dict
            continue
        # Explicit 2-tuples: the former one-line conditional return was parsed
        # as a 3-tuple ("masc", <NA or "fem">, NA) due to operator precedence.
        if not feminine:
            return "masc", pd.NA
        if not masculine:
            return "fem", pd.NA

        if vocalizedlike(form, masculine):
            return "masc", feminine
        return "fem", masculine

    # Empty entry or no gendered forms anywhere: keep the 2-tuple shape
    return pd.NA, pd.NA


In [10]:
def get_gender(row, wikitionary):
    """Look up the gender information for one word row.

    Args:
        row: Row providing ``"wiki_idx"``, ``"pos"`` and ``"vocal_forms"``.
        wikitionary: Sequence of Wiktionary entries indexed by ``wiki_idx``.

    Returns:
        The result of ``get_noun_gender`` for nouns, the result of
        ``get_adj_gender`` for adjectives, and ``pd.NA`` for any other
        part of speech.
    """
    idx, part_of_speech = row["wiki_idx"], row["pos"]
    normalized_form: str = normalize_ar(row["vocal_forms"])
    wiki_entry = wikitionary[idx]

    if part_of_speech == "adj":
        return get_adj_gender(wiki_entry, normalized_form)
    if part_of_speech == "noun":
        return get_noun_gender(wiki_entry, None)
    return pd.NA

### 4.2.2 Plural

In [11]:
def get_noun_plural(entry):
    """Find the plural of a noun in a Wiktionary entry.

    For each word-data dict, first the ``"forms"`` list is searched for a form
    whose first tag is ``"plural"``; failing that, the ``"pl"`` argument of its
    ``"head_templates"`` is used.

    Args:
        entry: Mapping whose values are word-data dicts.

    Returns:
        The normalized plural form; a normalized Arabic label when the ``pl``
        argument is ``"-"`` or is not an Arabic word; ``pd.NA`` when no plural
        information is found.
    """
    for word_data in entry.values():
        for form_dict in word_data.get("forms", []):
            # A missing or empty tag list must not raise (was tags[0] -> IndexError)
            tags = form_dict.get("tags") or []
            if tags and tags[0] == "plural":
                # Normalized like every other value this function returns
                return normalize_ar(form_dict["form"])

        for template in word_data.get("head_templates", []):
            plural = template.get("args", {}).get("pl", None)
            if plural is None:
                continue
            if is_arabicword(plural):
                return normalize_ar(plural)
            if plural == "-":
                return normalize_ar("غير معدود")
            return normalize_ar("جمع سالم")
    return pd.NA

In [12]:
def get_adj_plural(entry):
    """Collect the masculine and feminine plural of an adjective.

    Each word-data dict's ``"forms"`` list is searched for forms tagged exactly
    ``["masculine", "plural"]`` and ``["feminine", "plural"]``. A plural found
    in a later word-data dict replaces one found earlier, but a word-data dict
    that lacks a plural no longer wipes out a value found before it.

    Args:
        entry: Mapping whose values are word-data dicts.

    Returns:
        Tuple ``(masc_pl, fem_pl)``; each item is the normalized form or
        ``pd.NA`` when it was not found. An empty ``entry`` yields
        ``(pd.NA, pd.NA)`` instead of raising.
    """
    # Initialised outside the loop so an empty entry cannot leave them unbound
    masc_pl, fem_pl = None, None
    for word_data in entry.values():
        found_masc, found_fem = None, None
        for form_dict in word_data.get("forms", []):
            tags = form_dict.get("tags", [None])
            if tags == ["masculine", "plural"]:
                found_masc = form_dict["form"]
            elif tags == ["feminine", "plural"]:
                found_fem = form_dict["form"]
            if found_masc and found_fem:
                break
        # Only overwrite with values this word-data dict actually supplied
        masc_pl = found_masc or masc_pl
        fem_pl = found_fem or fem_pl

    masc_pl = normalize_ar(masc_pl) if masc_pl else pd.NA
    fem_pl = normalize_ar(fem_pl) if fem_pl else pd.NA

    return masc_pl, fem_pl

In [13]:
def get_plural(row, wikitionary):
    """Look up the plural information for one word row.

    Args:
        row: Row providing ``"wiki_idx"`` and ``"pos"``.
        wikitionary: Sequence of Wiktionary entries indexed by ``wiki_idx``.

    Returns:
        The result of ``get_noun_plural`` for nouns, the result of
        ``get_adj_plural`` (a 2-tuple) for adjectives, and ``pd.NA`` for any
        other part of speech.
    """
    idx, part_of_speech = row["wiki_idx"], row["pos"]
    wiki_entry = wikitionary[idx]

    if part_of_speech == "adj":
        return get_adj_plural(wiki_entry)
    if part_of_speech == "noun":
        return get_noun_plural(wiki_entry)
    return pd.NA

### 4.2.3 Root

In [14]:
def get_root(row, wikitionary):
    """Extract the root letters of a word from its etymology templates.

    Only templates whose ``"name"`` starts with ``"ar-root"`` are considered.
    With exactly three arguments the argument values are concatenated; with
    exactly one argument the whitespace inside argument ``"1"`` is removed.

    Args:
        row: Row providing ``"wiki_idx"``.
        wikitionary: Sequence of Wiktionary entries indexed by ``wiki_idx``.

    Returns:
        The normalized root string, or ``pd.NA`` when no matching template
        with one or three arguments exists.
    """
    entry = wikitionary[row["wiki_idx"]]
    for word_data in entry.values():
        templates = word_data.get("etymology_templates", [])
        for template in templates:
            if not template.get("name", "").startswith("ar-root"):
                continue
            root_args = template["args"]
            arg_count = len(root_args)
            if arg_count == 3:
                # One letter per argument
                return normalize_ar("".join(root_args.values()))
            if arg_count == 1:
                # All letters in a single, space-separated argument
                return normalize_ar("".join(root_args["1"].split()))
    return pd.NA

### 4.2.4 Verb Forms

In [15]:
def _roman_form_rank(label: str):
    """Sort key for a verb-form label: (value of its leading Roman numeral, label)."""
    values = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
    # Only the leading run of Roman digits counts, so a label such as "Iq" ranks as 1
    numeral = re.match(r"[IVXLCDM]*", label).group()
    total, previous = 0, 0
    for symbol in reversed(numeral):
        value = values[symbol]
        # A smaller digit to the left of a larger one is subtractive (IV, IX)
        total = total - value if value < previous else total + value
        previous = value
    return total, label


def get_verb_form(row, wikitionary):
    """Return the verb-form label found in the entry's category names.

    Categories are matched against ``form-<label> verbs``. When one word-data
    dict yields several labels, the one with the highest Roman-numeral value
    is taken (a plain string ``max`` would rank "V" above "IX"). If several
    word-data dicts yield labels, the last such dict wins.

    Args:
        row: Row providing ``"wiki_idx"``.
        wikitionary: Sequence of Wiktionary entries indexed by ``wiki_idx``.

    Returns:
        The form label (e.g. ``"II"``) or ``pd.NA`` when no category matches.
    """
    entry = wikitionary[row["wiki_idx"]]
    form_number = pd.NA
    for word_data in entry.values():
        categories: List[str] = word_data.get("categories", [])
        form_numbers = [
            match.group()
            for category in categories
            if (match := re.search(r"(?<=form-)\w+(?= verbs)", category))
        ]
        if form_numbers:
            form_number = max(form_numbers, key=_roman_form_rank)

    return form_number


In [16]:
def get_imperfect_form(row, wikitionary):
    """Return the third-person masculine singular active imperfect of a verb.

    The first form whose tags include all of active, indicative, masculine,
    non-past, imperfective, singular and third-person is used.

    Args:
        row: Row providing ``"wiki_idx"``.
        wikitionary: Sequence of Wiktionary entries indexed by ``wiki_idx``.

    Returns:
        The normalized form, or ``pd.NA`` when no form carries all the tags.
    """
    required_tags = {
        "active",
        "indicative",
        "masculine",
        "non-past",
        "imperfective",
        "singular",
        "third-person",
    }
    entry = wikitionary[row["wiki_idx"]]
    for word_data in entry.values():
        candidates: List[dict] = word_data.get("forms", [])
        for candidate in candidates:
            candidate_tags = candidate.get("tags", [])
            if required_tags.issubset(candidate_tags):
                return normalize_ar(candidate["form"])

    return pd.NA


# 5.0 Load Files

In [17]:
# 1. Load the Wiktionary extract: the gzip file is read as text, one JSON object per line
with gzip.open(wiki_jsonl_path, "rt", encoding="utf-8") as gzip_obj:
    wikitionary = []  # position in this list == 0-based line number in the file
    for i, json_line in enumerate(gzip_obj):
        wikitionary.append(dict(json.loads(json_line)))
        # end="\r" rewrites the same output line so the counter acts as a progress indicator
        print(f"Reading line-{i}", end="\r")

Reading line-2262

Reading line-128636

In [18]:
# 2. Load the Arabic-English GF word list (tab-separated; the first column becomes the index)
# NOTE(review): `pd.eval` evaluates the text of every "senses" cell as an expression.
# If that column only holds list literals, `ast.literal_eval` would be a safer parser — confirm.
df_ar_gf = pd.read_csv(ar_words_gf_path, delimiter="\t", index_col=0, converters={"senses": pd.eval})

In [19]:
# Keep only the rows whose "select" flag equals 1
df_ar_gf = df_ar_gf[df_ar_gf["select"] == 1]

In [20]:
# Split the selected words into nouns, adjectives and verbs.
# Each subset is filtered first and copied afterwards, so it is an independent
# frame (new columns are assigned to it below) and only the kept rows are copied.
df_ar_gf_nouns = df_ar_gf[(df_ar_gf["pos"] == "noun") & (df_ar_gf["select"] == 1)].copy()
df_ar_gf_adjs = df_ar_gf[(df_ar_gf["pos"] == "adj") & (df_ar_gf["select"] == 1)].copy()
df_ar_gf_verbs = df_ar_gf[(df_ar_gf["pos"] == "verb") & (df_ar_gf["select"] == 1)].copy()

# 6.0 Get Morphological Features

## 6.1 Nouns

In [21]:
# Gender of every noun, derived row by row from its Wiktionary entry; then show the distribution
df_ar_gf_nouns["gender"] = df_ar_gf_nouns.apply(get_gender, args=(wikitionary,), axis="columns")
df_ar_gf_nouns["gender"].value_counts()

gender
masc    40
fem     18
Name: count, dtype: int64

In [22]:
# Plural of every noun; the count below shows how many nouns have no plural information (True)
df_ar_gf_nouns["plural"] = df_ar_gf_nouns.apply(get_plural, args=(wikitionary,), axis="columns")
df_ar_gf_nouns["plural"].isna().value_counts()

plural
False    36
True     22
Name: count, dtype: int64

In [23]:
# Root of every noun; the count below shows how many nouns have no root information (True)
df_ar_gf_nouns["root"] = df_ar_gf_nouns.apply(get_root, args=(wikitionary,), axis="columns")
df_ar_gf_nouns["root"].isna().value_counts()

root
True     35
False    23
Name: count, dtype: int64

## 6.2 Adjectives

In [24]:
# For adjectives get_gender returns a pair per row; zip(*...) transposes the
# per-row pairs into two sequences, one for each new column.
# NOTE(review): the unpacking needs at least one adjective row and a pair from every row — confirm.
df_ar_gf_adjs["gender"], df_ar_gf_adjs["other_gender_form"] = zip(
    *df_ar_gf_adjs.apply(get_gender, args=(wikitionary,), axis="columns")
)
df_ar_gf_adjs["gender"].value_counts()

gender
masc    18
Name: count, dtype: int64

In [25]:
# get_plural returns a (masculine plural, feminine plural) pair for adjectives;
# zip(*...) transposes the per-row pairs into the two new columns.
df_ar_gf_adjs["masc_pl"], df_ar_gf_adjs["fem_pl"]  = zip(*df_ar_gf_adjs.apply(get_plural, args=(wikitionary,), axis="columns"))
# Missing-value counts for both plural columns (True == missing)
print(df_ar_gf_adjs["masc_pl"].isna().value_counts())
print(df_ar_gf_adjs["fem_pl"].isna().value_counts())

masc_pl
False    18
Name: count, dtype: int64
fem_pl
False    18
Name: count, dtype: int64


In [26]:
# Root of every adjective; the count below shows how many have no root information (True)
df_ar_gf_adjs["root"] = df_ar_gf_adjs.apply(get_root, args=(wikitionary,), axis="columns")
df_ar_gf_adjs["root"].isna().value_counts()

root
True     10
False     8
Name: count, dtype: int64

## 6.3 Verbs

In [27]:
# Verb-form label of every verb (taken from the entry's category names); then count missing values
df_ar_gf_verbs["verb_form"] = df_ar_gf_verbs.apply(get_verb_form, args=(wikitionary,), axis="columns")
df_ar_gf_verbs["verb_form"].isna().value_counts()

verb_form
False    5
Name: count, dtype: int64

In [28]:
# Root of every verb; then count missing values
df_ar_gf_verbs["root"] = df_ar_gf_verbs.apply(get_root, args=(wikitionary,), axis="columns")
df_ar_gf_verbs["root"].isna().value_counts()

root
False    5
Name: count, dtype: int64

In [29]:
# Imperfect (third-person masculine singular active) form of every verb; then count missing values
df_ar_gf_verbs["imperfect"] = df_ar_gf_verbs.apply(get_imperfect_form, args=(wikitionary,), axis="columns")
df_ar_gf_verbs["imperfect"].isna().value_counts()

imperfect
False    5
Name: count, dtype: int64

# 7.0 Export Data

In [35]:
# Remove the helper columns that are not part of the exported lexicon.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# re-runnable: a second run no longer raises KeyError for columns already removed.
helper_columns = ["select", "entry_num", "pos"]
df_ar_gf_verbs = df_ar_gf_verbs.drop(columns=helper_columns, errors="ignore")
df_ar_gf_nouns = df_ar_gf_nouns.drop(columns=helper_columns, errors="ignore")
df_ar_gf_adjs = df_ar_gf_adjs.drop(columns=helper_columns, errors="ignore")

In [36]:
# Timestamp prefix (YYYYMMDD.HHMMSS, local time) so every export run writes new files
RUN_TIME_STAMP = date_time = datetime.fromtimestamp(time.time()).strftime("%Y%m%d.%H%M%S")
# NOTE(review): decimal="," is combined with the default "," field separator — confirm this is intended
df_ar_gf_verbs.to_csv(output_dir / f"{RUN_TIME_STAMP}_verbs_lexicon.csv", decimal=",")
df_ar_gf_nouns.to_csv(output_dir / f"{RUN_TIME_STAMP}_nouns_lexicon.csv", decimal=",")
df_ar_gf_adjs.to_csv(output_dir / f"{RUN_TIME_STAMP}_adjectives_lexicon.csv", decimal=",")

# 8.0 Visualize Data

In [37]:
# Display the verb lexicon as exported
df_ar_gf_verbs

Unnamed: 0,li,wiki_idx,ar,en,vocal_forms,nesba,ar_letters,senses,verb_form,root,imperfect
79,22868,117316,صمم,designate_4_V2,صَمَّمَ,0,صاد-فتحة | ميم-شدة-فتحة | ميم-فتحة,"[to deafen [+accusative], to deafen / to resol...",II,صمم,يُصَمِّمُ
110,38154,698,ملك,have_1_V2,مَلَكَ,0,ميم-فتحة | لام-فتحة | كاف-فتحة,"[to take in possession, to take over, to acqui...",I,ملك,يَمْلِكُ
232,75568,8035,عرض,show_2_V2,عَرَضَ,0,| | | | | | عين-فتحة | راء-فتحة | ضاد-فتحة,"[to appear / to happen, to occur / to show, to...",I,عرض,يَعْرِضُ
249,78616,6248,تكلم,speak_3_V2,تَكَلَّمَ,0,تاء-فتحة | كاف-فتحة | لام-شدة-فتحة | ميم-فتحة,"[to talk, to have a discussion or conversation]",V,كلم,يَتَكَلَّمُ
266,82083,20863,نجح,succeed_V2,نَجَحَ,0,نون-فتحة | جيم-فتحة | حاء-فتحة,"[to succeed, to be successful]",I,نجح,يَنْجَحُ


In [38]:
# Display the adjective lexicon as exported
df_ar_gf_adjs

Unnamed: 0,li,wiki_idx,ar,en,vocal_forms,nesba,ar_letters,senses,gender,other_gender_form,masc_pl,fem_pl,root
1,292,5922,مُطلَق,absolute_3_A,مُطْلَق,0,ميم-ضمة | طاء-سكون | لام-فتحة | قاف,"[absolute, utter, very, unlimited, unrestricte...",masc,مُطْلَقَةٌ,مُطْلَقُونَ,مُطْلَقَات,طلق
7,3970,64,عربي,arabic_A,عَرَبِيّ,1,عين-فتحة | راء-فتحة | باء-كسرة | ياء-شدة,[Arab / Arabic / Arabian],masc,عَرَبِيَّةٌ,عَرَب,عَرَبِيَّات,
48,16805,25071,شُيُوعِيّ,communist_A,شُيُوعِيّ,1,شين-ضمة | ياء-ضمة | واو | عين-كسرة | ياء-شدة,[communist / Communist / communal],masc,شُيُوعِيَّةٌ,شُيُوعِيُّونَ,شُيُوعِيَّات,
50,17986,8330,دستوري,constitutional_2_A,دُسْتُورِيّ,1,دال-ضمة | سين-سكون | تاء-ضمة | واو | راء-كسرة ...,[constitutional],masc,دُسْتُورِيَّةٌ,دُسْتُورِيُّونَ,دُسْتُورِيَّات,
62,20596,2656,جَارٍ,current_A,جَارٍ,0,جيم-فتحة | ألف | راء-كسرتان,[active participle of جَرَى (jarā)],masc,جَارِيَةٌ,جَارُون,جَارِيَات,جري
100,33131,10031,سابق,former_3_A,سَابِق,0,سين-فتحة | ألف | باء-كسرة | قاف,"[preceding, previous / former / active partici...",masc,سَابِقَةٌ,سُبَّاق,سَابِقَات,سبق
102,33517,4339,حُرّ,free_1_A,حُرّ,0,حاء-ضمة | راء-شدة,"[free / unimpeded / set free, freedman / born ...",masc,حُرَّةٌ,أَحْرَار,حُرَّات,حرر
121,39071,12466,عال,high_1_A,عَالٍ,0,عين-فتحة | ألف | لام-كسرتان,[active participle of عَلَا (ʕalā) or active p...,masc,عَالِيَةٌ,عَالُونَ,عَالِيَات,علو
141,42239,7474,شَخْصي,individual_4_A,شَخْصِيّ,1,شين-فتحة | خاء-سكون | صاد-كسرة | ياء-شدة,[own / personal / personal],masc,شَخْصِيَّةٌ,شَخْصِيُّونَ,شَخْصِيَّات,
167,46486,4262,كبِير,large_1_A,كَبِير,0,كاف-فتحة | باء-كسرة | ياء | راء,"[big, large / great, great importance / old (f...",masc,كَبِيرَةٌ,كُبَرَاء,كَبِيرَات,كبر


In [33]:
# Display the noun lexicon.
# NOTE(review): this cell's execution count (33) is lower than that of the export cells (35, 36),
# and its stored output still lists the entry_num / pos / select columns, i.e. it predates the
# column drop. Restart the kernel and run all cells to refresh it.
df_ar_gf_nouns

Unnamed: 0,li,wiki_idx,ar,en,entry_num,vocal_forms,pos,select,nesba,ar_letters,senses,gender,plural,root
3,1558,8230,أفْغانِيّ,afghani_1_N,2,أَفْغَانِيّ,noun,1,0,همزة على الألف-فتحة | فاء-سكون | غين-فتحة | أل...,[Afghan],masc,أَفْغَان,
6,1643,1770,سِنّ,age_1_N,3,سِنّ,noun,1,0,سين-كسرة | نون-شدة,"[tooth, tusk, fang / point or tip / a spearhea...",fem,أَسْنَان,
10,4109,503,مِنْطقة,area_6_N,3,مِنْطَقَة,noun,1,0,ميم-كسرة | نون-سكون | طاء-فتحة | قاف-فتحة | تا...,"[belt, girdle / zone / vicinity, range, distri...",fem,مِنْطَقَات,نطق
11,4109,504,مِنْطقة,area_6_N,3,مَنْطِقَة,noun,1,0,ميم-فتحة | نون-سكون | طاء-كسرة | قاف-فتحة | تا...,"[zone / vicinity, range, district, area, terri...",fem,مَنْطِقَات,نطق
12,4109,505,مِنْطقة,area_6_N,3,مَنْطَقَة,noun,1,0,ميم-فتحة | نون-سكون | طاء-فتحة | قاف-فتحة | تا...,[verbal noun of مَنْطَقَ (manṭaqa) (form Iq)],fem,,نطق
14,4871,115331,آشوري,assyrian_2_N,2,آشُورِيّ,noun,1,0,ألف ممدودة | شين-ضمة | واو | راء-كسرة | ياء-شدة,"[Assyrian, Ashurite]",masc,آشُورِيُّون,
15,6160,6264,منع,ban_2_N,2,مَنْع,noun,1,0,ميم-فتحة | نون-سكون | عين,[verbal noun of مَنَعَ (manaʕa) (form I) / pre...,masc,,منع
18,7316,123391,بيلاروسي,belarusian_N,2,بِيلَارُوسِيّ,noun,1,0,باء-كسرة | ياء | لام-فتحة | ألف | راء-ضمة | وا...,[Belarusian (person)],masc,,
27,9270,274,كِتاب,book_1_N,3,كِتَاب,noun,1,0,كاف-كسرة | تاء-فتحة | ألف | باء,[verbal noun of كَتَبَ (kataba) (form I) / ver...,masc,كُتُب,
35,9394,4496,حدّ,border_1_N,7,حَدّ,noun,1,0,حاء-فتحة | دال-شدة,[verbal noun of حَدَّ (ḥadda) (form I) / limit...,masc,حُدُود,
