## 1.0 Imports

In [1]:
# files handling
import gzip
import json

# regular expressions
import re

# system handling
from pathlib import Path
from typing import Dict, List
from IPython.display import display
from IPython.display import Markdown as md

# string and string encoding
from unicodedata import category, normalize

# data handling
import pandas as pd

# Arabic tools
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.morphology.database import MorphologyDB
from pyarabic.araby import (
    DIACRITICS,
    LETTERS,
    SHADDA,
    is_arabicrange,
    is_vocalizedtext,
    name,
)
from pyarabic.trans import convert as translit

## 2.0 Define Paths

In [2]:
# Input locations: the Arabic word lists and the re-indexed Wiktionary extract.

# Every CSV of Arabic words, in ascending path order.
data_dir = Path("../data/interim/gf_wordnet")
ar_words_gf_path = sorted(data_dir.glob("*.csv"))

# Pre-processed Wiktionary extract and the index that maps a word to its entries.
wiktionary_dir = Path("../data/processed/wikidata")
wiki_jsonl_path = wiktionary_dir.joinpath("ar-wiktextract-data.json.gz")
wiki_indices_path = wiktionary_dir.joinpath("ar_reindex.json.gz")

In [3]:
# Folders that receive the exported word tables (see section 6.0).
output_dir = Path("../data/interim/ambiguous")
output_unambiguous_dir = Path("../data/interim/unambiguous")

In [4]:
# GF source file that section 5.1 parses to collect the entries that already exist.
gf_lin_fun_path = "../data/processed/gf/WordNetAra.gf"

In [5]:
# Show the CSV files that were found.
ar_words_gf_path

[PosixPath('../data/interim/gf_wordnet/20231122.1302_Q79_Q34_ar2en_words_gf.csv'),
 PosixPath('../data/interim/gf_wordnet/20231201.1559_Q79_Q34_ar2en_words_gf.csv'),
 PosixPath('../data/interim/gf_wordnet/20240202.1101_Q79_Q34_Q16_ar2en_words_gf.csv')]

## 3.0 Define Variables

In [6]:
DIACRITICS = set(DIACRITICS)  # Arabic diacritics/short vowels, held as a set for membership tests

In [7]:
# Gather the name parts between the leading timestamp and the last three
# "_"-separated parts of every CSV stem, then join them in descending order.
entity_ids = set()
for csv_path in ar_words_gf_path:
    entity_ids.update(csv_path.stem.split("_")[1:-3])
entites = "_".join(sorted(entity_ids, reverse=True))

# The timestamp is the first "_"-separated part of the last (sorted) file name.
TIMESTAMP = ar_words_gf_path[-1].stem.split("_")[0]

In [8]:
# Show the joined entity ids and the timestamp taken from the last file name.
entites, TIMESTAMP

('Q79_Q34_Q16', '20240202.1101')

In [9]:
# POS tags of the words that are looked up in Wiktionary (used in section 5.1 to build df_ar_gf).
SET_BASE_POS = {"N", "V", "V2", "V3", "A"}
# POS tags routed to the separate "PN" export — presumably proper-name categories; TODO confirm.
SET_PROPN_POS = {"NP", "LN", "GN", "SN", "PN"}

In [10]:
# Regex used to strip the trailing harakat of an Arabic word: it matches any
# diacritic that is followed only by characters outside LETTERS up to the end of the string.
# NOTE(review): every diacritic in that trailing run matches, so `.sub("", ...)`
# removes all of them, not just the last one — confirm this is intended.
NOT_LETTERS_PATTERN = f"[^{LETTERS}]"
DIACRITICS_PATTERN = "".join(DIACRITICS)
LAST_HARAKAT_PATTERN = re.compile(rf"[{DIACRITICS_PATTERN}](?={NOT_LETTERS_PATTERN}*$)", re.UNICODE)

In [11]:
# Do not truncate long cell values (e.g. the gloss strings) when frames are displayed.
pd.set_option("display.max_colwidth", None)

## 4.0 Define Functions

### 4.1 Utilities Functions

In [12]:
def is_nonspace_mark(x: str) -> bool:
    """Return True when the character `x` has the Unicode category "Mn" (non-spacing mark)."""
    unicode_category = category(x)
    return unicode_category == "Mn"

In [13]:
def reorder_shadda(ar_string: str) -> str:
    """Swap every (diacritic, shadda) pair so that the shadda comes first.

    The string is scanned left to right and the swap happens in place, so a
    diacritic that was just moved one step to the right is examined again on
    the next step.
    """
    chars = list(ar_string)
    position = 0
    last_position = len(chars) - 1

    while position < last_position:
        current, following = chars[position], chars[position + 1]
        if following == SHADDA and current in DIACRITICS:
            # Put the shadda in front of the diacritic that preceded it.
            chars[position], chars[position + 1] = following, current
        position += 1

    return "".join(chars)

In [14]:
def remove_diacritics_ar(text: str) -> str:
    """Return `text` without the characters that `is_nonspace_mark` flags."""
    kept_chars = []
    for char in text:
        if is_nonspace_mark(char):
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

In [15]:
def normalize_ar(ar_vocalized: str, verbose: bool = False) -> str:
    """Normalize an Arabic string: keep Arabic-range characters, apply NFC, reorder the shadda.

    Non-string values are returned unchanged. With `verbose=True` the name of
    every character of the result is printed.
    """
    if isinstance(ar_vocalized, str):
        arabic_only = "".join(filter(is_arabicrange, ar_vocalized)).strip()
        normalized = reorder_shadda(normalize("NFC", arabic_only))
        if verbose:
            print([name(char) for char in normalized])
        return normalized
    return ar_vocalized

In [16]:
def join_ar_words(words):
    """Spell out a string as the names of its characters.

    The name of a character found in DIACRITICS is attached to the previous
    name with "-"; any other character starts a new part separated by " | ".

    Args:
        words: The string to spell out, one character at a time.

    Returns:
        The joined names. Non-string values and the empty string are returned
        unchanged; the empty string used to raise an IndexError on `words[0]`.
    """
    if not isinstance(words, str) or not words:
        return words
    parts = [name(words[0])]
    for char in words[1:]:
        separator = "-" if char in DIACRITICS else " | "
        parts.append(separator + name(char))
    return "".join(parts)

### 4.2 Main Functions

In [17]:
def get_word_dictionaries(ar_word: str, wiki_idxs: Dict[str, List[int]], wikitionary: List[dict]) -> List[dict]:
    """Return the Wiktionary entries recorded for a word.

    Args:
        ar_word: The key to look up in `wiki_idxs`.
        wiki_idxs: Maps a word to the positions of its entries in `wikitionary`.
        wikitionary: The loaded Wiktionary entries.

    Returns:
        The entries at the indexed positions, or an empty list when the word
        is not in `wiki_idxs`.
    """
    return [wikitionary[idx] for idx in wiki_idxs.get(ar_word, [])]

In [18]:
def count_entry(ar_word, wiki_idxs, wikitionary):
    """Count how many Wiktionary entries exist for a word once its diacritics are removed."""
    bare_word = remove_diacritics_ar(ar_word)
    return len(get_word_dictionaries(bare_word, wiki_idxs, wikitionary))

In [19]:
def _get_camel_analyzer():
    """Return a shared CAMeL Tools analyzer, building it on first use.

    The analyzer is cached on the function object so the built-in morphology
    database is not loaded again for every word that is missing from Wiktionary.
    """
    analyzer = getattr(_get_camel_analyzer, "_analyzer", None)
    if analyzer is None:
        analyzer = Analyzer(MorphologyDB.builtin_db())
        _get_camel_analyzer._analyzer = analyzer
    return analyzer


def get_canonical_form(word, wiki_idxs, wikitionary):
    """Collect the vocalized forms of a word, from Wiktionary first, else from CAMeL Tools.

    Args:
        word: The word used as key in `wiki_idxs` and passed to the analyzer.
        wiki_idxs: Maps a word to the positions of its entries in `wikitionary`.
        wikitionary: The loaded Wiktionary entries; each one is a dict whose
            values hold the word data ("forms", "pos", "senses").

    Returns:
        A tuple `(form_vocal, pos, word_idxs, glosses)` of parallel lists.
        `word_idxs` holds the Wiktionary position, or -1 for analyzer results.
        When nothing is found the tuple is `(word, pd.NA, -1, pd.NA)`.
    """
    # The vocalized form lives in `'forms': [{'form': WORD_VOCALIZED, 'tags': ['canonical']}]`.
    form_vocal = []
    pos = []
    glosses = []
    word_idxs = []
    for word_idx in wiki_idxs.get(word, []):
        for word_data in wikitionary[word_idx].values():
            for form_dict in word_data.get("forms", []):
                # A missing or empty tag list must not raise; only the first tag is checked.
                tags = form_dict.get("tags") or [None]
                if tags[0] != "canonical":
                    continue
                form_word = form_dict.get("form")
                if form_word and is_vocalizedtext(form_word):
                    form_vocal.append(normalize_ar(form_word))
                    pos.append(word_data["pos"])
                    word_idxs.append(word_idx)
                    # One string per entry: senses joined by " / ", glosses of a sense by ", ".
                    glosses.append(
                        [
                            " / ".join(
                                [", ".join(sense["glosses"]) for sense in word_data.get("senses", []) if "glosses" in sense]
                            )
                        ]
                    )

    if not form_vocal:
        # Not found in Wiktionary: fall back to the CAMeL Tools analyses.
        analyses = _get_camel_analyzer().analyze(word)
        for analysis in analyses:
            # NOTE(review): the raw "diac" is compared with the previously stored form, whose
            # trailing harakat were already stripped, and the analysis is skipped unless BOTH
            # the form and the POS differ — confirm this is the intended de-duplication.
            if not form_vocal or (analysis["diac"] != form_vocal[-1] and analysis["pos"] != pos[-1]):
                form_vocal.append(LAST_HARAKAT_PATTERN.sub("", analysis["diac"]))
                pos.append(analysis["pos"])
                # NOTE(review): a plain string here, while the Wiktionary branch stores a one-item list.
                glosses.append(" / ".join(analysis["gloss"].split(";")))
                word_idxs.append(-1)

    return (form_vocal, pos, word_idxs, glosses) if form_vocal else (word, pd.NA, -1, pd.NA)

## 5.0 Check Wiktionary

- Get the words that are missing from Wiktionary.
- Confirm that each Arabic word has only one entry in the Wiktionary file. If not, the short vowels need to be added or edited in the source
file to remove any ambiguity.
- Export the words with their number of entries so they can be modified manually later.

### 5.1 Load Files

In [20]:
# Read the existing GF file, dropping its first three lines and the last element of the split.
with open(gf_lin_fun_path, encoding="utf-8", mode="rt") as obj_file:
    list_lin_fn_words = obj_file.read().split("\n")[3:-1]

# Each remaining line is split on " = " into an English side and an Arabic side.
en_entries = []
ar_words = []
for lin_line in list_lin_fn_words:
    left_side, right_side = lin_line.split(" = ")
    # Drop the first four characters of the English side; on the Arabic side drop the
    # first and last character, keep the part before the first "_" and remove diacritics.
    en_entries.append(left_side[4:])
    ar_words.append(remove_diacritics_ar(right_side[1:-1].split("_")[0]))

df_ar_gf_old = pd.DataFrame({"en_entry": en_entries, "ar": ar_words})

In [21]:
# Read every tab-separated word list; the first CSV column is turned back into a
# regular "index" column and then dropped.
gf_frames = []
for csv_path in ar_words_gf_path:
    frame = pd.read_csv(csv_path, delimiter="\t", index_col=0).reset_index()
    gf_frames.append(frame.drop("index", axis=1))

# Stack all files into one frame with a fresh index and drop the "en" column.
df_ar_gf_all = pd.concat(gf_frames, ignore_index=True).drop("en", axis=1)

In [22]:
# Preview the entries parsed from the existing GF file.
df_ar_gf_old.head(5)

Unnamed: 0,en_entry,ar
0,dari_N,دارية
1,development_2_N,تطوير
2,italian_N,إيطالية
3,krona_1_N,كرون
4,latvian_N,لاتفية


In [23]:
# Preview the entries read from the CSV word lists.
df_ar_gf_all.head(5)

Unnamed: 0,en_entry,ar,pos,no_en_entry
0,authoritarian_1_A,استبدادي,A,0
1,domestic_1_A,محلي,A,0
2,following_2_A,تالي,A,0
3,free_1_A,حر,A,0
4,full_3_A,ممتلئ,A,0


In [24]:
# Outer-join the CSV entries with the existing GF entries on "en_entry".
# Columns from the GF file get the "_" suffix (e.g. "ar_"), and `indicator=True`
# adds a "_merge" column telling which side(s) each row came from.
df_merged = df_ar_gf_all.merge(df_ar_gf_old, on="en_entry", how="outer", suffixes=["", "_"], indicator=True)

In [25]:
# Preview the merged frame.
df_merged.head(5)

Unnamed: 0,en_entry,ar,pos,no_en_entry,ar_,_merge
0,authoritarian_1_A,استبدادي,A,0,استبدادي,both
1,domestic_1_A,محلي,A,0,محلي,both
2,following_2_A,تالي,A,0,تالي,both
3,free_1_A,حر,A,0,حر,both
4,full_3_A,ممتلئ,A,0,ممتلئ,both


In [26]:
# Count the rows per merge outcome (left_only / both / right_only).
merge_status = df_merged["_merge"].value_counts()
merge_status

_merge
left_only     170
both          136
right_only      0
Name: count, dtype: int64

In [27]:
# "_merge" is categorical, so value_counts() lists "right_only" even when its count is 0.
# Test the count itself rather than index membership, so the table is shown only
# when some entries exist solely in the old GF file.
if merge_status.get("right_only", 0) > 0:
    display(df_merged[df_merged["_merge"] == "right_only"])


Unnamed: 0,en_entry,ar,pos,no_en_entry,ar_,_merge


In [28]:
# Keep the rows found in only one source (drop those present in both), then remove
# the merge helper columns. `columns=` replaces the former `axis=True`, which only
# worked because True compares equal to 1.
df_ar_gf_all = df_merged[df_merged["_merge"] != "both"].drop(columns=["_merge", "ar_"])

In [29]:
# Number of remaining rows and columns.
df_ar_gf_all.shape

(170, 4)

In [30]:
# Distinct POS tags present in the remaining entries.
set_all_pos = set(df_ar_gf_all["pos"].to_list())
set_all_pos

{'A', 'CN', 'GN', 'LN', 'MU', 'N', 'NP', 'PN', 'SN', 'VP'}

In [31]:
# POS tags covered by neither SET_BASE_POS nor SET_PROPN_POS.
set_other_pos = set_all_pos.difference(SET_BASE_POS).difference(SET_PROPN_POS)
set_other_pos

{'CN', 'MU', 'VP'}

In [32]:
# Split the entries into three frames according to their POS tag.
pos_column = df_ar_gf_all["pos"]

is_base_or_mu = pos_column.isin(SET_BASE_POS) | (pos_column == "MU")
is_cn_or_vp = pos_column.str.startswith("CN") | pos_column.str.startswith("VP")
is_propn = pos_column.isin(SET_PROPN_POS)

# Each subset is an independent copy, so later column assignments do not touch df_ar_gf_all.
df_ar_gf = df_ar_gf_all[is_base_or_mu].copy()
df_ar_cn_gf = df_ar_gf_all[is_cn_or_vp].copy()
df_ar_pn_gf = df_ar_gf_all[is_propn].copy()

In [33]:
# Sanity check: the three subsets together should have as many rows as the full frame.
df_ar_gf_all.shape[0], df_ar_gf.shape[0] + df_ar_cn_gf.shape[0] + df_ar_pn_gf.shape[0]

(170, 170)

In [34]:
# Preview the words that will be looked up in Wiktionary.
display(df_ar_gf.head(4))

Unnamed: 0,en_entry,ar,pos,no_en_entry
21,regime_1_N,نظام,N,0
22,regime_1_N,حاكم,A,0
24,healthcare_2_N,رعاية,N,0
25,healthcare_2_N,صحية,N,0


In [35]:
# 2. Load the Wiktionary extract: one JSON object per line of the gzip file.
with gzip.open(wiki_jsonl_path, "rt", encoding="utf-8") as gzip_obj:
    wikitionary = []
    for line_number, raw_line in enumerate(gzip_obj):
        entry = dict(json.loads(raw_line))
        wikitionary.append(entry)
        # end="\r" returns to the line start so each counter overwrites the previous one.
        print(f"Reading line-{line_number}", end="\r")

Reading line-4555

Reading line-128636

In [36]:
# 3. Load the index that maps a word to the positions of its entries in `wikitionary`.
with gzip.open(wiki_indices_path, "rt", encoding="utf-8") as gzip_obj:
    wiki_idxs = json.load(gzip_obj)

### 5.2 Get the Number of Entries for Each Word

In [37]:
# Normalize the words, then count the Wiktionary entries of each one.
df_ar_gf["entry_num"] = df_ar_gf["ar"].apply(normalize_ar).apply(count_entry, args=(wiki_idxs, wikitionary))

In [38]:
# Preview the frame with the new "entry_num" column.
df_ar_gf.head()

Unnamed: 0,en_entry,ar,pos,no_en_entry,entry_num
21,regime_1_N,نظام,N,0,1
22,regime_1_N,حاكم,A,0,2
24,healthcare_2_N,رعاية,N,0,1
25,healthcare_2_N,صحية,N,0,1
27,vat_1_N,ضريبة,N,1,1


In [39]:
# Highest number of Wiktionary entries found for a single word.
max_entrs = df_ar_gf["entry_num"].max()
# The value is the maximum, so no word has "more than" it; report it as an upper bound.
md(f"> Some words have up to {max_entrs} entries!!")

> Some words has more than 8 entries!!

In [40]:
# How many words have each entry count.
pd.DataFrame({"count": df_ar_gf["entry_num"].value_counts()})

Unnamed: 0_level_0,count
entry_num,Unnamed: 1_level_1
0,22
1,19
2,6
3,1
8,1


### 5.3 Add All Vocalized Forms for Each Word

In [41]:
# Look up every word (normalized, diacritics removed); each lookup returns a 4-tuple.
canonical_lookup = (
    df_ar_gf["ar"]
    .apply(normalize_ar)
    .apply(remove_diacritics_ar)
    .apply(get_canonical_form, args=(wiki_idxs, wikitionary))
)

# Transpose the per-row tuples into one sequence per output column.
vocal_forms_col, wiki_pos_col, wiki_idx_col, senses_col = zip(*canonical_lookup)

df_ambiguous_grouped = df_ar_gf.copy()
df_ambiguous_grouped["vocal_forms"] = vocal_forms_col
df_ambiguous_grouped["wiki_pos"] = wiki_pos_col
df_ambiguous_grouped["wiki_idx"] = wiki_idx_col
df_ambiguous_grouped["senses"] = senses_col

In [42]:
# Preview the words together with the lists of candidate forms.
df_ambiguous_grouped.head(30)

Unnamed: 0,en_entry,ar,pos,no_en_entry,entry_num,vocal_forms,wiki_pos,wiki_idx,senses
21,regime_1_N,نظام,N,0,1,[نِظَام],[noun],[661],"[[system / regularity / order / method / rule / regime: perhaps short for نِظَام الْحُكْم (niẓām al-ḥukm, “system of rule”)]]"
22,regime_1_N,حاكم,A,0,2,"[حَاكِم, حَاكِم]","[adj, noun]","[3016, 3017]","[[ruling, governing / decisive], [judge / ruler, sovereign / governor]]"
24,healthcare_2_N,رعاية,N,0,1,[رِعايَة],[noun],[124114],"[[care, support, custody]]"
25,healthcare_2_N,صحية,N,0,1,[صِحِّيَّة],[adj],[26328],[[feminine singular of صِحِّيّ (ṣiḥḥiyy)]]
27,vat_1_N,ضريبة,N,1,1,[ضَرِيبَة],[noun],[117242],"[[tribute, tax, public levy]]"
28,vat_1_N,قيمة,N,1,1,[قِيمَة],[noun],[20995],"[[value, worth / amount, quantity / price]]"
29,vat_1_N,مضافة,A,1,0,"[مَضافَة, مُضافَة]","[noun, adj]","[-1, -1]","[guest_room / hostel+[fem.sg.], added+[fem.sg.]]"
31,prime_minister_2_N,وزير,N,0,1,[وَزِير],[noun],[818],"[[minister, cabinet minister / vizier / helper, assistant / queen]]"
166,chinese_N,صينىة,N,0,0,"[صِينِيَّة, صِينِيَّة, صِينِيَّة]","[noun, adj, noun]","[-1, -1, -1]","[Chinese+[fem.sg.], Chinese+[fem.sg.], porcelain / china+[fem.sg.]]"
185,inequality_N,مساواة,N,1,2,"[مُسَاوَاة, مُسَاوَاة]","[noun, adj]","[13023, 13024]","[[verbal noun of سَاوَى (sāwā) (form III) / equality, equivalence / equal rights / settlement (of a bill)], [feminine singular of مُسَاوًى (musāwan)]]"


In [43]:
# One row per candidate form: explode the four parallel list columns together.
df_ar_gf_ambiguous = df_ambiguous_grouped.copy().explode(["vocal_forms", "wiki_idx", "wiki_pos", "senses"])

In [44]:
# Preview the exploded frame.
df_ar_gf_ambiguous.head(5)

Unnamed: 0,en_entry,ar,pos,no_en_entry,entry_num,vocal_forms,wiki_pos,wiki_idx,senses
21,regime_1_N,نظام,N,0,1,نِظَام,noun,661,"[system / regularity / order / method / rule / regime: perhaps short for نِظَام الْحُكْم (niẓām al-ḥukm, “system of rule”)]"
22,regime_1_N,حاكم,A,0,2,حَاكِم,adj,3016,"[ruling, governing / decisive]"
22,regime_1_N,حاكم,A,0,2,حَاكِم,noun,3017,"[judge / ruler, sovereign / governor]"
24,healthcare_2_N,رعاية,N,0,1,رِعايَة,noun,124114,"[care, support, custody]"
25,healthcare_2_N,صحية,N,0,1,صِحِّيَّة,adj,26328,[feminine singular of صِحِّيّ (ṣiḥḥiyy)]


In [45]:
# Number of candidate rows and columns.
df_ar_gf_ambiguous.shape

(78, 9)

## 6.0 Export Ambiguous Words

- Manually fix the diacritics of the words that have more than one entry.
- Manually check the words that are missing from Wiktionary.

In [46]:
# Sort by entry count, then English entry, Wiktionary index and Arabic word.
# reset_index(drop=False) keeps the previous row labels as an "index" column;
# kind="mergesort" makes the sort stable.
df_ar_gf_ambiguous = (
    df_ar_gf_ambiguous.copy()
    .reset_index(drop=False)
    .sort_values(by=["entry_num", "en_entry", "wiki_idx", "ar"], kind="mergesort")
)

In [47]:
# Spell out each vocalized form with the Arabic names of its letters and diacritics.
df_ar_gf_ambiguous["ar_letters"] = df_ar_gf_ambiguous["vocal_forms"].apply(normalize_ar).apply(join_ar_words)

In [48]:
# Transliterate each vocalized form from "arabic" to the "tim" scheme;
# non-string values are passed through unchanged.
df_ar_gf_ambiguous["tim_translit"] = df_ar_gf_ambiguous["vocal_forms"].apply(
    lambda x: translit(x, "arabic", "tim") if isinstance(x, str) else x
)

In [49]:
# Column initialised to 0 — presumably filled in by hand during the manual review; TODO confirm.
df_ar_gf_ambiguous["select"] = 0

In [50]:
# Rearrange the column order for the export.
# reindex keeps only the listed columns, so the "index" column added by the
# earlier reset_index is dropped here.
# NOTE(review): no visible cell creates a "file" column, so reindex adds it
# empty — confirm whether the source file name was meant to be recorded.
neworder = [
    "wiki_idx",
    "no_en_entry",
    "en_entry",
    "ar",
    "vocal_forms",
    "pos",
    "wiki_pos",
    "select",
    "entry_num",
    "ar_letters",
    "senses",
    "tim_translit",
    "file"
]
df_ar_gf_ambiguous = df_ar_gf_ambiguous.reindex(columns=neworder)

In [51]:
# Show the final table that is about to be exported.
display(df_ar_gf_ambiguous)

Unnamed: 0,wiki_idx,no_en_entry,en_entry,ar,vocal_forms,pos,wiki_pos,select,entry_num,ar_letters,senses,tim_translit,file
17,-1,0,algonquin_N,ألجونكوين,ألجونكوين,N,,0,0,همزة على الألف | لام | جيم | واو | نون | كاف | واو | ياء | نون,,>ljwnkwyn,
22,-1,0,blackfoot_N,بلاكفوت,بلاكفوت,N,,0,0,باء | لام | ألف | كاف | فاء | واو | تاء,,blAkfwt,
23,-1,0,cayuga_N,كايوجا,كايوجا,N,,0,0,كاف | ألف | ياء | واو | جيم | ألف,,kAywjA,
10,-1,0,chinese_N,صينىة,صِينِيَّة,N,noun,0,0,صاد-كسرة | ياء | نون-كسرة | ياء-شدة-فتحة | تاء مربوطة,Chinese+[fem.sg.],Siyniy~ap,
11,-1,0,chinese_N,صينىة,صِينِيَّة,N,adj,0,0,صاد-كسرة | ياء | نون-كسرة | ياء-شدة-فتحة | تاء مربوطة,Chinese+[fem.sg.],Siyniy~ap,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55,2554,1,life_expectancy_N,عمر,عُمُر,N,noun,0,8,عين-ضمة | ميم-ضمة | راء,"[life as a period of time, length of life, lifespan, lifetime / age]",Eumur,
56,2554,1,life_expectancy_N,عمر,عُمْر,N,noun,0,8,عين-ضمة | ميم-سكون | راء,"[life as a period of time, length of life, lifespan, lifetime / age]",Eumor,
57,2555,1,life_expectancy_N,عمر,عَمْر,N,noun,0,8,عين-فتحة | ميم-سكون | راء,[verbal noun of عَمَرَ (ʕamara) (form I)],Eamor,
58,2556,1,life_expectancy_N,عمر,عُمَر,N,noun,0,8,عين-ضمة | ميم-فتحة | راء,[verbal noun of عَمَرَ (ʕamara) (form I)],Eumar,


In [52]:
# Create the export folders first so a fresh checkout does not fail on a missing directory.
output_dir.mkdir(parents=True, exist_ok=True)
output_unambiguous_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): both files receive the same, unfiltered frame — confirm whether the
# "unambiguous" export is meant to hold only a subset of the rows.
df_ar_gf_ambiguous.to_csv(output_dir / f"{TIMESTAMP}_{entites}_word_ambiguous.csv", sep="\t")
df_ar_gf_ambiguous.to_csv(output_unambiguous_dir / f"{TIMESTAMP}_{entites}_word_unambiguous.csv", sep="\t")

> Export Other DataFrames

In [53]:
# Make sure the export folder exists before writing (harmless when it already does).
output_dir.mkdir(parents=True, exist_ok=True)

# Export the CN/VP entries and the proper-name entries as tab-separated files.
df_ar_cn_gf.to_csv(output_dir / f"{TIMESTAMP}_{entites}_CN_ambiguous.csv", sep="\t")
df_ar_pn_gf.to_csv(output_dir / f"{TIMESTAMP}_{entites}_PN_ambiguous.csv", sep="\t")