# 1.0 Imports

In [36]:
import gzip
import json
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List
from unicodedata import normalize

import pandas as pd
from pyarabic.araby import DIACRITICS, SHADDA, is_arabicword, name, vocalizedlike

# 2.0 Define Paths

In [37]:
# Path for preproceed Wikitionary
wiktionary_dir = Path("../data/processed/wikidata")
wiki_jsonl_path = wiktionary_dir / "ar-wiktextract-data.json.gz"
wiki_indices_path = wiktionary_dir / "ar_reindex.json.gz"

In [38]:
# Arabic GF-Wordnet
data_dir = Path("../data/interim/unambiguous")
ar_words_gf_path = data_dir / "20231201.1559_Q79_Q34_word_unambiguous.csv"

In [39]:
# Path to save words status
output_dir = Path("../data/interim/lexicon")
output_dir.mkdir(parents=True, exist_ok=True)

# 3.0 Define Variables

In [40]:
DIACRITICS = set(DIACRITICS)  # Aabic diacritics/short vowels

In [41]:
TIMESTAMP = "_".join(ar_words_gf_path.stem.split("_")[:-2])

## 4.0 Define Functions

### 4.1 Utilities Functions

In [42]:
def reorder_shadda(ar_string: str) -> str:
    """unicodedata.normalize put shadda before diacritics; not correct"""
    list_ar_str = list(ar_string)

    for i in range(len(list_ar_str) - 1):
        char = list_ar_str[i]
        next_char = list_ar_str[i + 1]

        if char in DIACRITICS and next_char == SHADDA:
            list_ar_str[i], list_ar_str[i + 1] = (
                next_char,
                char,
            )  # Swap shadda and diacritic

    return "".join(list_ar_str)

In [43]:
def normalize_ar(ar_vocalized: str, verbose: bool = False) -> str:
    """get the normal form for the Unicode string unistr using NFC then fix the shadda order issue"""
    ar_norm = normalize("NFC", ar_vocalized)
    ar_norm = reorder_shadda(ar_norm)
    if verbose:
        print([name(char) for char in ar_norm])
    return ar_norm

## 4.2 Morphology Features

### 4.2.1 Gender

In [44]:
def get_noun_gender(entry, form):
    for _, word_data in entry.items():
        categories: List[str] = word_data.get("categories", [])
        for category in categories:
            masculine = category.find("masculine") != -1
            feminine = category.find("feminine") != -1
            if masculine and feminine:
                return "N"
            elif masculine:
                return "masc"
            elif feminine:
                return "fem"
        return pd.NA

In [45]:
def get_adj_gender(entry, form):
    for _, word_data in entry.items():
        word_forms: List[dict] = word_data.get("forms", [])
        masculine, feminine = None, None
        for form_dict in word_forms:
            if {'indefinite', 'masculine', 'singular', 'nominative'}.issubset(form_dict.get("tags", [])):
                masculine = normalize_ar(form_dict["form"])
            elif {'indefinite', 'feminine', 'singular', 'nominative'}.issubset(form_dict.get("tags", [])):
                feminine = normalize_ar(form_dict["form"])
            if masculine and feminine:
                break

        # TODO: Confusing = redesign
        if not masculine and not feminine:
            return pd.NA, pd.NA
        elif not masculine or not feminine:
            return "masc", pd.NA if masculine else "fem", pd.NA

        if vocalizedlike(form, masculine):
            return "masc", feminine
        return "fem", masculine


In [46]:
def get_gender(row, wikitionary):
    wiki_idx = int(row["wiki_idx"])
    pos = row["wiki_pos"]
    form: str = normalize_ar(row["vocal_forms"])
    if wiki_idx > -1:
        entry = wikitionary[wiki_idx]
        if pos == "noun":
            return get_noun_gender(entry, None)
        elif pos == "adj":
            return get_adj_gender(entry, form)
    return [pd.NA] * 2

### 4.2.2 Plural

In [47]:
def get_noun_plural(entry):
    for _, word_data in entry.items():
        word_forms = word_data.get("forms", [])
        for form_dict in word_forms:
            if "plural" == form_dict.get("tags", [None])[0]:
                return form_dict["form"]
        inflections = word_data.get("head_templates", [])
        for dict_inflection in inflections:
            plural = dict_inflection.get("args", {}).get("pl", None)
            if plural is not None:
                if is_arabicword(plural):
                    return normalize_ar(plural)
                elif plural == "-":
                    return normalize_ar("غير معدود")
                return normalize_ar("جمع سالم")
    return pd.NA

In [48]:
def get_adj_plural(entry):
    for _, word_data in entry.items():
        word_forms = word_data.get("forms", [])
        masc_pl, fem_pl = None, None
        for form_dict in word_forms:
            if ['masculine', 'plural'] == form_dict.get("tags", [None]):
                masc_pl = form_dict["form"]
            elif ['feminine', 'plural'] == form_dict.get("tags", [None]):
                fem_pl = form_dict["form"]
            if masc_pl and fem_pl:
                break
    masc_pl = normalize_ar(masc_pl) if masc_pl else pd.NA
    fem_pl = normalize_ar(fem_pl) if fem_pl else pd.NA

    return masc_pl, fem_pl

In [49]:
def get_plural(row, wikitionary):
    wiki_idx = int(row["wiki_idx"])
    pos = row["wiki_pos"]
    if wiki_idx > -1:
        entry = wikitionary[wiki_idx]
        if pos == "noun":
            return get_noun_plural(entry)
        elif pos == "adj":
            return get_adj_plural(entry)
    if pos == "noun":
        return pd.NA
    elif pos == "adj":
        return [pd.NA]*2

### 4.2.3 Root

In [50]:
def get_root(row, wikitionary):
    wiki_idx = int(row["wiki_idx"])
    if wiki_idx > -1:
        entry = wikitionary[wiki_idx]
        for _, word_data in entry.items():
            for etymology in word_data.get('etymology_templates', []):
                if etymology.get("name", "").startswith("ar-root"):
                    if len(etymology["args"]) == 3:
                        return normalize_ar("".join(list(etymology["args"].values())))
                    if len(etymology["args"]) == 1:
                        return normalize_ar("".join(etymology["args"]["1"].split()))
        return pd.NA

### 4.2.4 Verb Forms

In [51]:
def get_verb_form(row, wikitionary):
    wiki_idx = int(row["wiki_idx"])
    if wiki_idx > -1:
        entry = wikitionary[wiki_idx]
        form_number = None
        for _, word_data in entry.items():
            form_numbers = []
            categories: List[str] = word_data.get("categories", [])
            for category in categories:
                if match := re.search(r"(?<=form-)\w+(?= verbs)", category):
                    form_numbers.append(match.group())
            if form_numbers:
                form_number = max(form_numbers)

        return f"Form{form_number}" if form_number is not None else pd.NA


In [52]:
def get_imperfect_form(row, wikitionary):
    wiki_idx = int(row["wiki_idx"])
    if wiki_idx > -1:
        entry = wikitionary[wiki_idx]
        for _, word_data in entry.items():
            word_forms: List[dict] = word_data.get("forms", [])
            for form_dict in word_forms:
                if {
                    "active",
                    "indicative",
                    "masculine",
                    "non-past",
                    "imperfective",
                    "singular",
                    "third-person",
                }.issubset(form_dict.get("tags", [])):
                    return normalize_ar(form_dict["form"])

        return pd.NA


# 5.0 Load Files

In [53]:
# 1. load the wikitionary
with gzip.open(wiki_jsonl_path, "rt", encoding="utf-8") as gzip_obj:
    wikitionary = []
    for i, json_line in enumerate(gzip_obj):
        wikitionary.append(dict(json.loads(json_line)))
        print(f"Reading line-{i}", end="\r")

Reading line-128636

In [54]:
# 2. load the csv arabic-english_gf words
df_ar_gf = pd.read_csv(ar_words_gf_path, delimiter="\t", index_col=0)

In [55]:
df_ar_gf = df_ar_gf[df_ar_gf["select"] == 1]

In [56]:
# Get nouns, adjectives and verbs
df_ar_gf_nouns = df_ar_gf.copy()[(df_ar_gf["wiki_pos"] == "noun") & (df_ar_gf["select"] == 1)]
df_ar_gf_adjs = df_ar_gf.copy()[(df_ar_gf["wiki_pos"] == "adj") & (df_ar_gf["select"] == 1)]
df_ar_gf_verbs = df_ar_gf.copy()[(df_ar_gf["wiki_pos"] == "verb") & (df_ar_gf["select"] == 1)]

# 6.0 Get Morphological Features

## 6.1 Nouns

In [57]:
df_ar_gf_nouns["gender"] = df_ar_gf_nouns.apply(get_gender, args=(wikitionary,), axis="columns")
df_ar_gf_nouns["gender"].value_counts()

gender
masc            43
fem             29
[<NA>, <NA>]    10
Name: count, dtype: int64

In [58]:
df_ar_gf_nouns["plural"] = df_ar_gf_nouns.apply(get_plural, args=(wikitionary,), axis="columns")
df_ar_gf_nouns["plural"].isna().value_counts()

plural
False    61
True     35
Name: count, dtype: int64

In [59]:
df_ar_gf_nouns["root"] = df_ar_gf_nouns.apply(get_root, args=(wikitionary,), axis="columns")
df_ar_gf_nouns["root"].isna().value_counts()

root
True     53
False    43
Name: count, dtype: int64

## 6.2 Adjectives

In [60]:
df_ar_gf_adjs["gender"], df_ar_gf_adjs["other_gender_form"] = zip(
    *df_ar_gf_adjs.apply(get_gender, args=(wikitionary,), axis="columns")
)
df_ar_gf_adjs["gender"].value_counts()

gender
masc    25
Name: count, dtype: int64

In [61]:
df_ar_gf_adjs["masc_pl"], df_ar_gf_adjs["fem_pl"]  = zip(*df_ar_gf_adjs.apply(get_plural, args=(wikitionary,), axis="columns"))
print(df_ar_gf_adjs["masc_pl"].isna().value_counts())
print(df_ar_gf_adjs["fem_pl"].isna().value_counts())

masc_pl
False    25
True      7
Name: count, dtype: int64
fem_pl
False    25
True      7
Name: count, dtype: int64


In [62]:
df_ar_gf_adjs["root"] = df_ar_gf_adjs.apply(get_root, args=(wikitionary,), axis="columns")
df_ar_gf_adjs["root"].isna().value_counts()

root
True     17
False    15
Name: count, dtype: int64

## 6.3 Verbs

In [63]:
df_ar_gf_verbs["verb_form"] = df_ar_gf_verbs.apply(get_verb_form, args=(wikitionary,), axis="columns")
df_ar_gf_verbs["verb_form"].isna().value_counts()

verb_form
False    8
True     1
Name: count, dtype: int64

In [64]:
df_ar_gf_verbs["root"] = df_ar_gf_verbs.apply(get_root, args=(wikitionary,), axis="columns")
df_ar_gf_verbs["root"].isna().value_counts()

root
False    8
True     1
Name: count, dtype: int64

In [65]:
df_ar_gf_verbs["imperfect"] = df_ar_gf_verbs.apply(get_imperfect_form, args=(wikitionary,), axis="columns")
df_ar_gf_verbs["imperfect"].isna().value_counts()

imperfect
False    8
True     1
Name: count, dtype: int64

# 7.0 Export Data

In [66]:
df_ar_gf_verbs.drop(columns=["select", "entry_num", "pos"], inplace=True)
df_ar_gf_nouns.drop(columns=["select", "entry_num", "pos"], inplace=True)
df_ar_gf_adjs.drop(columns=["select", "entry_num", "pos"], inplace=True)

In [67]:
df_ar_gf_verbs.to_csv(output_dir / f"{TIMESTAMP}_verbs_lexicon.csv", decimal=",")
df_ar_gf_nouns.to_csv(output_dir / f"{TIMESTAMP}_nouns_lexicon.csv", decimal=",")
df_ar_gf_adjs.to_csv(output_dir / f"{TIMESTAMP}_adjectives_lexicon.csv", decimal=",")

# 8.0 Visualize Data

In [68]:
df_ar_gf_verbs

Unnamed: 0,wiki_idx,en_entry,ar,vocal_forms,wiki_pos,ar_letters,senses,tim_translit,file,verb_form,root,imperfect
208,8500,birr_1_N,بير,بِيرَ,verb,باء-كسرة | ياء | راء,,biyr,6_20231201.1559_Q79_Q34,,,
194,116328,consider_6_V3,اعتبر,اِعْتَبَر,verb,ألف-كسرة | عين-سكون | تاء-فتحة | باء-فتحة | راء,"['to take example, to take warning, to learn a...",AiEotabar,20_20231122.1302_Q79_Q34,VIII,عبر,يَعْتَبِرُ
317,20863,succeed_V2,نجح,نَجَحَ,verb,نون-فتحة | جيم-فتحة | حاء-فتحة,"['to succeed, to be successful']",najaHa,50_20231201.1559_Q79_Q34,I,نجح,يَنْجَحُ
165,8070,rank_2_V2,صنف,صَنَّفَ,verb,صاد-فتحة | نون-شدة-فتحة | فاء-فتحة,"['to sort, classify, or categorize something /...",San~afa,100_20231122.1302_Q79_Q34,II,صنف,يُصَنِّفُ
236,148,designate_4_V2,عين,عَيَّنَ,verb,عين-فتحة | ياء-شدة-فتحة | نون-فتحة,"['to appoint, to assign, to name, to nominate ...",Eay~ana,18_20231201.1559_Q79_Q34,II,عين,يُعَيِّنُ
190,4743,reach,وصل,وَصَل,verb,واو-فتحة | صاد-فتحة | لام-فتحة,"['(transitive, with إِلَى (ʔilā)) to arrive (“...",waSala,158_20231122.1302_Q79_Q34,I,وصل,يَصِلُ
181,6248,speak_3_V2,تكلم,تَكَلَّم,verb,تاء-فتحة | كاف-فتحة | لام-شدة-فتحة | ميم-فتحة,"['to talk, to have a discussion or conversation']",takal~ama,118_20231122.1302_Q79_Q34,V,كلم,يَتَكَلَّمُ
158,698,have_1_V2,ملك,مَلَك,verb,ميم-فتحة | لام-فتحة | كاف-فتحة,"['to take in possession, to take over, to acqu...",malaka,54_20231122.1302_Q79_Q34,I,ملك,يَمْلِكُ
169,8035,show_2_V2,عرض,عَرَضَ,verb,عين-فتحة | راء-فتحة | ضاد-فتحة,"['to appear / to happen, to occur / to show, t...",EaraDa,113_20231122.1302_Q79_Q34,I,عرض,يَعْرِضُ


In [69]:
df_ar_gf_adjs

Unnamed: 0,wiki_idx,en_entry,ar,vocal_forms,wiki_pos,ar_letters,senses,tim_translit,file,gender,other_gender_form,masc_pl,fem_pl,root
31,-1,added_A,مضاف,مُضاف,adj,ميم-ضمة | ضاد | ألف | فاء,added,muDAf,157_20231122.1302_Q79_Q34,,,,,
1,-1,authoritarian_1_A,استبدادي,اِسْتِبْدادِي,adj,ألف-كسرة | سين-سكون | تاء-كسرة | باء-سكون | دا...,arbitrary / authoritarian,AisotibodAdiy,7_20231122.1302_Q79_Q34,,,,,
14,-1,median_3_A,الوسيط,الوَسِيط,adj,ألف | لام | واو-فتحة | سين-كسرة | ياء | طاء,the+mediator / go-between / intermediary,AlwasiyT,76_20231122.1302_Q79_Q34,,,,,
16,-1,nordic_2_A,الشمال,الشَّمال,adj,,,,84_20231122.1302_Q79_Q34,,,,,
2,4973,domestic_1_A,محلي,مَحَلِّيّ,adj,ميم-فتحة | حاء-فتحة | لام-شدة-كسرة | ياء-شدة,"['local, national']",maHal~iy~,31_20231122.1302_Q79_Q34,masc,مَحَلِّيَّةٌ,مَحَلِّيُّونَ,مَحَلِّيَّات,
3,117599,following_2_A,تالي,تَالِي,adj,تاء-فتحة | ألف | لام-كسرة | ياء,[''],taAliy,41_20231122.1302_Q79_Q34,,,,,
7,10802,full_3_A,ممتلئ,مُمْتَلِئ,adj,ميم-ضمة | ميم-سكون | تاء-فتحة | لام-كسرة | همز...,"['full, filled, filled up, replete']",mumotali},45_20231122.1302_Q79_Q34,masc,مُمْتَلِئَةٌ,مُمْتَلِئُونَ,مُمْتَلِئَات,ملء
8,124714,gross_1_A,إجمالي,إِجْمَالِيّ,adj,همزة تحت الألف-كسرة | جيم-سكون | ميم-فتحة | أل...,"['comprehensive, general, total']",<ijomaAliy~,50_20231122.1302_Q79_Q34,masc,إِجْمَالِيَّةٌ,إِجْمَالِيُّونَ,إِجْمَالِيَّات,جمل
11,124077,individual_4_A,فردي,فَرْدِيّ,adj,فاء-فتحة | راء-سكون | دال-كسرة | ياء-شدة,"['single / individual, personal / odd, uneven']",farodiy~,59_20231122.1302_Q79_Q34,masc,فَرْدِيَّةٌ,فَرْدِيُّونَ,فَرْدِيَّات,فرد
15,115192,moderate_1_A,معتدل,مُعْتَدِل,adj,ميم-ضمة | عين-سكون | تاء-فتحة | دال-كسرة | لام,"['straight, even, proportionate / temperate, m...",muEotadil,79_20231122.1302_Q79_Q34,masc,مُعْتَدِلَةٌ,مُعْتَدِلُونَ,مُعْتَدِلَات,عدل


In [70]:
df_ar_gf_nouns

Unnamed: 0,wiki_idx,en_entry,ar,vocal_forms,wiki_pos,ar_letters,senses,tim_translit,file,gender,plural,root
213,-1,chinese_N,صينىة,صِينِيَّة,noun,صاد-كسرة | ياء | نون-كسرة | ياء-شدة-فتحة | تاء...,Chinese+[fem.sg.],Siyniy~ap,10_20231201.1559_Q79_Q34,"[<NA>, <NA>]",,
231,-1,dari_N,دارية,دارِيَّة,noun,دال | ألف | راء-كسرة | ياء-شدة-فتحة | تاء مربوطة,,dAriy~ap,15_20231201.1559_Q79_Q34,"[<NA>, <NA>]",,
69,-1,development_2_N,تطوير,تَطْوِير,noun,تاء-فتحة | طاء-سكون | واو-كسرة | ياء | راء,development / advancement / promotion,taTowiyr,29_20231122.1302_Q79_Q34,"[<NA>, <NA>]",,
263,-1,italian_N,ايطالية,إِيطالِيَّة,noun,همزة تحت الألف-كسرة | ياء | طاء | ألف | لام-كس...,,<iyTAliy~ap,31_20231201.1559_Q79_Q34,"[<NA>, <NA>]",,
108,-1,krona_1_N,كرون,كَرون,noun,كاف-فتحة | راء | واو | نون,,karwn,69_20231122.1302_Q79_Q34,"[<NA>, <NA>]",,
...,...,...,...,...,...,...,...,...,...,...,...,...
273,3220,mother_1_N,أم,أُمّ,noun,همزة على الألف-ضمة | ميم-شدة,['mother / origin / source'],>um~,37_20231201.1559_Q79_Q34,fem,أُمَّهَات,ءمم
311,3927,starch_1_N,نشاء,نَشَاء,noun,نون-فتحة | شين-فتحة | ألف | همزة,"['starch, cornstarch, farina']",na$aA',49_20231201.1559_Q79_Q34,masc,,نشو
44,2554,age_1_N,عمر,عُمْر,noun,عين-ضمة | ميم-سكون | راء,"['life as a period of time, length of life, li...",Eumor,0_20231122.1302_Q79_Q34,masc,أَعْمَار,عمر
145,5764,west_2_N,غرب,غَرْب,noun,غين-فتحة | راء-سكون | باء,['verbal noun of غَرَبَ (ḡaraba) (form I) / we...,garob,136_20231122.1302_Q79_Q34,masc,,
