## 1.0 Imports

In [84]:
# files handling
import gzip
import json

# regular expressions
import re

# system handling
from pathlib import Path
from typing import Dict, List
from IPython.display import display
from IPython.display import Markdown as md

# string and string encoding
from unicodedata import category, normalize

# data handling
import pandas as pd

# Arabic tools
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.morphology.database import MorphologyDB
from pyarabic.araby import (
    DIACRITICS,
    LETTERS,
    SHADDA,
    is_arabicrange,
    is_vocalizedtext,
    name,
)
from pyarabic.trans import convert as translit

## 2.0 Define Paths

In [85]:
# Locations of the Arabic word lists and of the re-indexed Wiktionary dump.

# Arabic word CSV files, sorted by file name (the names start with a timestamp)
data_dir = Path("../data/interim/gf_wordnet")
ar_words_gf_path = sorted(data_dir.glob("*.csv"))

# Pre-processed Wiktionary dump and its companion index file
wiktionary_dir = Path("../data/processed/wikidata")
wiki_jsonl_path = wiktionary_dir / "ar-wiktextract-data.json.gz"
wiki_indices_path = wiktionary_dir / "ar_reindex.json.gz"

In [86]:
# Output directories for the exported word tables (used by the export cells at the end)
output_dir = Path("../data/interim/ambiguous")
output_unambiguous_dir = Path("../data/interim/unambiguous")

In [120]:
# GF source file that is read in section 5.1 to build `dict_lin_words`
gf_lin_fun_path = "../data/processed/gf/WordNetAra.gf"

In [87]:
ar_words_gf_path

[PosixPath('../data/interim/gf_wordnet/20231122.1302_Q79_Q34_ar2en_words_gf.csv'),
 PosixPath('../data/interim/gf_wordnet/20231201.1559_Q79_Q34_ar2en_words_gf.csv'),
 PosixPath('../data/interim/gf_wordnet/20240202.1101_Q79_Q34_Q16_ar2en_words_gf.csv')]

## 3.0 Define Variables

In [88]:
DIACRITICS = set(DIACRITICS)  # Arabic diacritics/short vowels, as a set for membership tests

In [89]:
# Entity IDs sit between the leading timestamp and the fixed three-part suffix of each CSV stem.
entity_ids = {part for csv_path in ar_words_gf_path for part in csv_path.stem.split("_")[1:-3]}
entites = "_".join(sorted(entity_ids, reverse=True))
# Timestamp prefix of the last (i.e. highest-sorted) CSV file.
TIMESTAMP = ar_words_gf_path[-1].stem.split("_")[0]

In [90]:
entites, TIMESTAMP

('Q79_Q34_Q16', '20240202.1101')

In [91]:
# POS tags kept (together with "MU") in `df_ar_gf`, the table checked against Wiktionary
SET_BASE_POS = {"N", "V", "V2", "V3", "A"}
# POS tags routed to the separate `df_ar_pn_gf` table
SET_PROPN_POS = {"NP", "LN", "GN", "SN", "PN"}

In [92]:
# Pattern used to remove the last harakat (diacritics) from an Arabic word
NOT_LETTERS_PATTERN = f"[^{LETTERS}]"  # any character that is not in pyarabic's LETTERS
DIACRITICS_PATTERN = "".join(DIACRITICS)  # the DIACRITICS members, used as a character class below
# Matches a diacritic that is followed only by non-letter characters up to the end of the
# string, i.e. every diacritic located after the last letter.
LAST_HARAKAT_PATTERN = re.compile(rf"[{DIACRITICS_PATTERN}](?={NOT_LETTERS_PATTERN}*$)", re.UNICODE)

In [93]:
# Do not truncate long cell contents when DataFrames are displayed
pd.set_option("display.max_colwidth", None)

## 4.0 Define Functions

### 4.1 Utilities Functions

In [94]:
def is_nonspace_mark(x: str) -> bool:
    """Tell whether the single character ``x`` is a Unicode non-spacing mark.

    Non-spacing marks have the Unicode general category ``Mn``; this is how
    diacritics are detected in this notebook.
    """
    mark_category = "Mn"
    return category(x) == mark_category

In [95]:
def reorder_shadda(ar_string: str) -> str:
    """Swap every (diacritic, shadda) pair so that the shadda comes first.

    Meant as a post-processing step for the mark order left by
    ``unicodedata.normalize``. The string is scanned left to right and
    adjacent characters are swapped in place, so a diacritic can be pushed
    past several consecutive shaddas.
    """
    chars = list(ar_string)

    for left, right in zip(range(len(chars) - 1), range(1, len(chars))):
        # Read the characters at iteration time: an earlier swap may have moved them.
        if chars[right] == SHADDA and chars[left] in DIACRITICS:
            chars[left], chars[right] = chars[right], chars[left]

    return "".join(chars)

In [96]:
def remove_diacritics_ar(text: str) -> str:
    """Remove every non-spacing mark (diacritic) from ``text``.

    Non-string values (e.g. NaN coming from a pandas column) are returned
    unchanged, mirroring ``normalize_ar`` which is applied right before this
    function in the notebook's pipelines.
    """
    if not isinstance(text, str):
        return text
    return "".join(char for char in text if not is_nonspace_mark(char))

In [97]:
def normalize_ar(ar_vocalized: str, verbose: bool = False) -> str:
    """Keep the Arabic-range characters of a string, NFC-normalize them and fix the shadda order.

    Non-string values are returned untouched. With ``verbose=True`` the names
    of the resulting characters are printed.
    """
    if isinstance(ar_vocalized, str):
        arabic_only = "".join(filter(is_arabicrange, ar_vocalized)).strip()
        composed = normalize("NFC", arabic_only)
        ar_norm = reorder_shadda(composed)
        if verbose:
            print([name(char) for char in ar_norm])
        return ar_norm
    return ar_vocalized

In [98]:
def join_ar_words(words):
    """Spell out an Arabic string as the names of its characters.

    Each character is replaced by ``name(char)``. A character found in
    ``DIACRITICS`` is attached to the preceding name with ``-``; any other
    character starts a new group separated by `` | ``.

    Non-string values and the empty string are returned unchanged (the empty
    string used to raise ``IndexError`` on ``words[0]``).
    """
    if not isinstance(words, str) or not words:
        return words

    pieces = [name(words[0])]
    for char in words[1:]:
        separator = "-" if char in DIACRITICS else " | "
        pieces.append(separator + name(char))
    return "".join(pieces)

### 4.2 Main Functions

In [99]:
def get_word_dictionaries(ar_word: str, wiki_idxs: Dict[str, List[int]], wikitionary: List[dict]) -> List[dict]:
    """Return the Wiktionary entries recorded for ``ar_word``.

    ``wiki_idxs`` maps a word to the list of positions of its entries in
    ``wikitionary``. A word missing from the index yields an empty list.
    """
    return [wikitionary[idx] for idx in wiki_idxs.get(ar_word, [])]

In [100]:
def count_entry(ar_word, wiki_idxs, wikitionary):
    """Count the Wiktionary entries found for the undiacritized form of ``ar_word``."""
    bare_word = remove_diacritics_ar(ar_word)
    return len(get_word_dictionaries(bare_word, wiki_idxs, wikitionary))

In [101]:
_CAMEL_ANALYZER = None  # CAMeL Tools analyzer, built on first use and then reused


def _get_camel_analyzer():
    """Return the shared CAMeL Tools analyzer, loading the built-in database only once."""
    global _CAMEL_ANALYZER
    if _CAMEL_ANALYZER is None:
        _CAMEL_ANALYZER = Analyzer(MorphologyDB.builtin_db())
    return _CAMEL_ANALYZER


def get_canonical_form(word, wiki_idxs, wikitionary):
    """Collect the vocalized forms of ``word`` from Wiktionary, falling back to CAMeL Tools.

    Wiktionary stores the vocalized form under
    ``'forms': [{'form': WORD_VOCALIZED, 'tags': ['canonical']}]``.

    Parameters
    ----------
    word : undiacritized Arabic word used as key of ``wiki_idxs``.
    wiki_idxs : mapping from a word to the positions of its entries in ``wikitionary``.
    wikitionary : list of Wiktionary entries; each entry is a dict whose values are
        the per-word data dicts (``forms``, ``pos``, ``senses``).

    Returns
    -------
    ``(form_vocal, pos, word_idxs, glosses)`` as four parallel lists when at least one
    form is found. ``word_idxs`` holds the Wiktionary position, or ``-1`` for forms
    coming from CAMeL Tools. Otherwise ``(word, pd.NA, -1, pd.NA)``; the same
    placeholder is returned for non-string input such as NaN.
    """
    if not isinstance(word, str):
        # Nothing to look up; avoids handing NaN/None to the morphological analyzer.
        return (word, pd.NA, -1, pd.NA)

    form_vocal, pos, glosses, word_idxs = [], [], [], []

    for word_idx in wiki_idxs.get(word, []):
        for word_data in wikitionary[word_idx].values():
            for form_dict in word_data.get("forms", []):
                # `or [None]` also covers an explicitly empty tag list, which
                # previously raised IndexError on `[0]`.
                tags = form_dict.get("tags") or [None]
                if tags[0] != "canonical":
                    continue
                form_word = form_dict.get("form")
                if not (form_word and is_vocalizedtext(form_word)):
                    continue
                form_vocal.append(normalize_ar(form_word))
                pos.append(word_data["pos"])
                word_idxs.append(word_idx)
                # NOTE(review): glosses are wrapped in a one-element list here but are
                # plain strings in the CAMeL branch below; kept as-is because the
                # exported "senses" column depends on it.
                glosses.append(
                    [
                        " / ".join(
                            ", ".join(sense["glosses"]) for sense in word_data.get("senses", []) if "glosses" in sense
                        )
                    ]
                )

    if not form_vocal:
        # Not found in Wiktionary: use CAMeL Tools
        for analysis in _get_camel_analyzer().analyze(word):
            # NOTE(review): this compares the raw "diac" with an already-stripped form and
            # requires BOTH form and POS to differ from the previous analysis; it looks like
            # a de-duplication check that may have been meant as "or". Left unchanged
            # because altering it changes which rows are exported.
            if not form_vocal or (analysis["diac"] != form_vocal[-1] and analysis["pos"] != pos[-1]):
                form_vocal.append(LAST_HARAKAT_PATTERN.sub("", analysis["diac"]))
                pos.append(analysis["pos"])
                glosses.append(" / ".join(analysis["gloss"].split(";")))
                word_idxs.append(-1)

    return (form_vocal, pos, word_idxs, glosses) if form_vocal else (word, pd.NA, -1, pd.NA)

## 5.0 Check Wiktionary

- Get the words that are missing from Wiktionary.
- Confirm that each Arabic word has only one entry in the Wiktionary file. If not, the short vowels need to be added or edited in the source
file to remove any ambiguity.
- Export the words with their number of entries so they can be modified manually later.

### 5.1 Load Files

In [124]:
# Read the GF file; [3:-1] drops the first three lines and the last element of the split
# (presumably the module header and the closing line — TODO confirm against the file).
with open(gf_lin_fun_path, encoding="utf-8", mode="rt") as obj_file:
    list_lin_fn_words = obj_file.read().split("\n")[3:-1]

# Map each English entry to the undiacritized Arabic word taken from its line.
dict_lin_words = {}
for str_lin_fn_words in list_lin_fn_words:
    # The two-name unpacking requires exactly one " = " separator per line.
    en_entry, ar_entry = str_lin_fn_words.split(" = ")
    en_entry = en_entry[4:]  # drop the 4-character prefix (presumably "lin " — TODO confirm)
    # Strip the first and last character, keep the part before the first "_", remove diacritics.
    ar_word = remove_diacritics_ar(ar_entry[1:-1].split("_")[0])
    dict_lin_words[en_entry] = ar_word


In [102]:
# Read every word list and tag each row with "<row index>_<file tag>" so that it can be
# traced back to its source file.
frames_ar_gf = []
for path in ar_words_gf_path:
    # NOTE(review): only the first three "_"-separated parts of the stem are kept, so a stem
    # with three entity IDs (e.g. ..._Q79_Q34_Q16_...) loses the last one — confirm intent.
    file_tag = "_".join(path.stem.split("_")[:3])
    df = pd.read_csv(path, delimiter="\t", index_col=0).reset_index()
    # Vectorized concatenation; replaces a row-wise apply that indexed the row Series by
    # position (x[1], x[0]), which is deprecated for label-indexed Series in pandas 2.x.
    df["file"] = df["index"].astype(str) + "_" + file_tag
    frames_ar_gf.append(df.drop(columns="index"))

df_ar_gf_all = pd.concat(frames_ar_gf, ignore_index=True).drop(columns="en")

In [103]:
df_ar_gf_all.head(5)

Unnamed: 0,en_entry,ar,pos,file
0,authoritarian_1_A,استبدادي,A,7_20231122.1302_Q79_Q34
1,domestic_1_A,محلي,A,31_20231122.1302_Q79_Q34
2,following_2_A,تالي,A,41_20231122.1302_Q79_Q34
3,free_1_A,حر,A,43_20231122.1302_Q79_Q34
4,full_3_A,ممتلئ,A,45_20231122.1302_Q79_Q34


In [104]:
set_all_pos = set(df_ar_gf_all["pos"].to_list())
set_all_pos

{'A', 'CN', 'GN', 'LN', 'MU', 'N', 'NP', 'PN', 'SN', 'V', 'V2', 'V3', 'VP'}

In [105]:
set_other_pos = set_all_pos.difference(SET_BASE_POS).difference(SET_PROPN_POS)
set_other_pos

{'CN', 'MU', 'VP'}

In [106]:
# Split the word list into three independent tables according to the POS tag.
is_base_pos = df_ar_gf_all["pos"].isin(SET_BASE_POS) | (df_ar_gf_all["pos"] == "MU")
is_phrase_pos = df_ar_gf_all["pos"].str.startswith("CN") | df_ar_gf_all["pos"].str.startswith("VP")
is_propn_pos = df_ar_gf_all["pos"].isin(SET_PROPN_POS)

df_ar_gf = df_ar_gf_all[is_base_pos].copy()
df_ar_cn_gf = df_ar_gf_all[is_phrase_pos].copy()
df_ar_pn_gf = df_ar_gf_all[is_propn_pos].copy()

In [107]:
df_ar_gf_all.shape[0], df_ar_gf.shape[0] + df_ar_cn_gf.shape[0] + df_ar_pn_gf.shape[0]

(304, 304)

In [108]:
display(df_ar_gf.head(4))

Unnamed: 0,en_entry,ar,pos,file
0,authoritarian_1_A,استبدادي,A,7_20231122.1302_Q79_Q34
1,domestic_1_A,محلي,A,31_20231122.1302_Q79_Q34
2,following_2_A,تالي,A,41_20231122.1302_Q79_Q34
3,free_1_A,حر,A,43_20231122.1302_Q79_Q34


In [109]:
# 2. Load the Wiktionary dump: one JSON document per line of the gzip file
with gzip.open(wiki_jsonl_path, "rt", encoding="utf-8") as gzip_obj:
    wikitionary = []
    for i, json_line in enumerate(gzip_obj):
        wikitionary.append(dict(json.loads(json_line)))
        # "\r" keeps the progress counter on a single, overwritten line
        print(f"Reading line-{i}", end="\r")

Reading line-3219

Reading line-128636

In [110]:
# 3. Load the indices: a mapping from a word to the positions of its entries in
#    `wikitionary` (see how get_word_dictionaries uses it)
with gzip.open(wiki_indices_path, "rt", encoding="utf-8") as gzip_obj:
    wiki_idxs = json.load(gzip_obj)

### 5.2 Get the Number of Entries for Each Word

In [111]:
# Normalize the words, then count the Wiktionary entries of each one
df_ar_gf["entry_num"] = df_ar_gf["ar"].apply(normalize_ar).apply(count_entry, args=(wiki_idxs, wikitionary))

In [112]:
df_ar_gf.head()

Unnamed: 0,en_entry,ar,pos,file,entry_num
0,authoritarian_1_A,استبدادي,A,7_20231122.1302_Q79_Q34,0
1,domestic_1_A,محلي,A,31_20231122.1302_Q79_Q34,1
2,following_2_A,تالي,A,41_20231122.1302_Q79_Q34,1
3,free_1_A,حر,A,43_20231122.1302_Q79_Q34,3
4,full_3_A,ممتلئ,A,45_20231122.1302_Q79_Q34,1


In [113]:
# `max_entrs` is the largest entry count, so no word has *more* than this many entries.
max_entrs = df_ar_gf["entry_num"].max()
md(f"> Some words have up to {max_entrs} entries!!")

> Some words has more than 12 entries!!

In [114]:
pd.DataFrame({"count": df_ar_gf["entry_num"].value_counts()})

Unnamed: 0_level_0,count
entry_num,Unnamed: 1_level_1
1,62
2,46
0,33
3,24
8,4
6,4
4,4
5,3
10,2
12,1


### Add All Vocalized Forms for each Word

In [115]:
df_ambiguous_grouped = df_ar_gf.copy()

# One (forms, pos, indices, glosses) tuple per word, looked up on the undiacritized word.
canonical_lookups = (
    df_ambiguous_grouped["ar"]
    .apply(normalize_ar)
    .apply(remove_diacritics_ar)
    .apply(get_canonical_form, args=(wiki_idxs, wikitionary))
)

# Transpose the per-word tuples into four per-column sequences.
col_vocal_forms, col_wiki_pos, col_wiki_idx, col_senses = zip(*canonical_lookups)
df_ambiguous_grouped["vocal_forms"] = col_vocal_forms
df_ambiguous_grouped["wiki_pos"] = col_wiki_pos
df_ambiguous_grouped["wiki_idx"] = col_wiki_idx
df_ambiguous_grouped["senses"] = col_senses

In [116]:
df_ambiguous_grouped.head(30)

Unnamed: 0,en_entry,ar,pos,file,entry_num,vocal_forms,wiki_pos,wiki_idx,senses
0,authoritarian_1_A,استبدادي,A,7_20231122.1302_Q79_Q34,0,"[اِسْتِبْدادِي, اِسْتِبْدادِي]","[noun, adj]","[-1, -1]","[despotism / monopolization+my, arbitrary / authoritarian]"
1,domestic_1_A,محلي,A,31_20231122.1302_Q79_Q34,1,[مَحَلِّيّ],[adj],[4973],"[[local, national]]"
2,following_2_A,تالي,A,41_20231122.1302_Q79_Q34,1,[تَالِي],[adj],[117599],[[]]
3,free_1_A,حر,A,43_20231122.1302_Q79_Q34,3,"[حُرّ, حَرّ, حِرّ]","[adj, noun, noun]","[4339, 4340, 4341]","[[free / unimpeded / set free, freedman / born free and noble / virtuous, genuine, true, pure, good / unmixed], [heat / burning of the heart, from pain, wrath, distress, affliction, trouble or fatigue / difficulty or severity of work], [vulva, pudendum of a woman]]"
4,full_3_A,ممتلئ,A,45_20231122.1302_Q79_Q34,1,[مُمْتَلِئ],[adj],[10802],"[[full, filled, filled up, replete]]"
5,gross_1_A,إجمالي,A,50_20231122.1302_Q79_Q34,1,[إِجْمَالِيّ],[adj],[124714],"[[comprehensive, general, total]]"
6,high_1_A,عالي,A,57_20231122.1302_Q79_Q34,2,"[عَالِي, عَالِي]","[adv, adj]","[5210, 5211]","[[over, upon, above], []]"
7,individual_4_A,فردي,A,59_20231122.1302_Q79_Q34,1,[فَرْدِيّ],[adj],[124077],"[[single / individual, personal / odd, uneven]]"
8,large_1_A,كبير,A,72_20231122.1302_Q79_Q34,2,"[كَبِير, كَبِير]","[adj, noun]","[4262, 4263]","[[big, large / great, great importance / old (for a person)], [chief, leader]]"
9,median_3_A,الوسيط,A,76_20231122.1302_Q79_Q34,0,[الوَسِيط],[noun],[-1],[the+mediator / go-between / intermediary]


In [117]:
# One row per (word, vocalized form): the four list columns are exploded together.
list_columns = ["vocal_forms", "wiki_idx", "wiki_pos", "senses"]
df_ar_gf_ambiguous = df_ambiguous_grouped.explode(list_columns)

In [118]:
df_ar_gf_ambiguous.head(5)

Unnamed: 0,en_entry,ar,pos,file,entry_num,vocal_forms,wiki_pos,wiki_idx,senses
0,authoritarian_1_A,استبدادي,A,7_20231122.1302_Q79_Q34,0,اِسْتِبْدادِي,noun,-1,despotism / monopolization+my
0,authoritarian_1_A,استبدادي,A,7_20231122.1302_Q79_Q34,0,اِسْتِبْدادِي,adj,-1,arbitrary / authoritarian
1,domestic_1_A,محلي,A,31_20231122.1302_Q79_Q34,1,مَحَلِّيّ,adj,4973,"[local, national]"
2,following_2_A,تالي,A,41_20231122.1302_Q79_Q34,1,تَالِي,adj,117599,[]
3,free_1_A,حر,A,43_20231122.1302_Q79_Q34,3,حُرّ,adj,4339,"[free / unimpeded / set free, freedman / born free and noble / virtuous, genuine, true, pure, good / unmixed]"


In [119]:
df_ar_gf_ambiguous.shape

(407, 9)

## 6.0 Export Ambiguous Words

- Manually fix the diacritics of the words that have more than one entry.
- Manually check the words that are missing from Wiktionary.

In [34]:
# Order rows by entry count, then English entry, Wiktionary index and Arabic word
# (stable sort, so ties keep their current order).
sort_keys = ["entry_num", "en_entry", "wiki_idx", "ar"]
df_ar_gf_ambiguous = df_ar_gf_ambiguous.reset_index(drop=False).sort_values(by=sort_keys, kind="mergesort")

In [35]:
# Spell out every vocalized form as the Arabic names of its letters and diacritics
normalized_forms = df_ar_gf_ambiguous["vocal_forms"].apply(normalize_ar)
df_ar_gf_ambiguous["ar_letters"] = normalized_forms.apply(join_ar_words)

In [36]:
# Transliterate each vocalized form with pyarabic's "arabic" -> "tim" conversion;
# non-string values are passed through unchanged.
df_ar_gf_ambiguous["tim_translit"] = df_ar_gf_ambiguous["vocal_forms"].apply(
    lambda x: translit(x, "arabic", "tim") if isinstance(x, str) else x
)

In [37]:
# Selection flag, initialised to 0 on every row — presumably set by hand during the
# manual review of the exported file (TODO confirm)
df_ar_gf_ambiguous["select"] = 0

In [38]:
# Column layout of the exported table; columns that are not listed are dropped by reindex
neworder = [
    "wiki_idx", "en_entry", "ar", "vocal_forms",
    "pos", "wiki_pos", "select", "entry_num",
    "ar_letters", "senses", "tim_translit", "file",
]
df_ar_gf_ambiguous = df_ar_gf_ambiguous.reindex(columns=neworder)

In [39]:
display(df_ar_gf_ambiguous)

Unnamed: 0,wiki_idx,en_entry,ar,vocal_forms,pos,wiki_pos,select,entry_num,ar_letters,senses,tim_translit,file
31,-1,added_A,مضاف,مُضاف,A,adj,0,0,ميم-ضمة | ضاد | ألف | فاء,added,muDAf,157_20231122.1302_Q79_Q34
0,-1,authoritarian_1_A,استبدادي,اِسْتِبْدادِي,A,noun,0,0,ألف-كسرة | سين-سكون | تاء-كسرة | باء-سكون | دال | ألف | دال-كسرة | ياء,despotism / monopolization+my,AisotibodAdiy,7_20231122.1302_Q79_Q34
1,-1,authoritarian_1_A,استبدادي,اِسْتِبْدادِي,A,adj,0,0,ألف-كسرة | سين-سكون | تاء-كسرة | باء-سكون | دال | ألف | دال-كسرة | ياء,arbitrary / authoritarian,AisotibodAdiy,7_20231122.1302_Q79_Q34
213,-1,chinese_N,صينىة,صِينِيَّة,N,noun,0,0,صاد-كسرة | ياء | نون-كسرة | ياء-شدة-فتحة | تاء مربوطة,Chinese+[fem.sg.],Siyniy~ap,10_20231201.1559_Q79_Q34
214,-1,chinese_N,صينىة,صِينِيَّة,N,adj,0,0,صاد-كسرة | ياء | نون-كسرة | ياء-شدة-فتحة | تاء مربوطة,Chinese+[fem.sg.],Siyniy~ap,10_20231201.1559_Q79_Q34
...,...,...,...,...,...,...,...,...,...,...,...,...
176,8042,show_2_V2,عرض,عَرْض,V2,noun,0,12,عين-فتحة | راء-سكون | ضاد,[army / locusts],EaroD,113_20231122.1302_Q79_Q34
177,8043,show_2_V2,عرض,عَرْض,V2,noun,0,12,عين-فتحة | راء-سكون | ضاد,"[verbal noun of عُرِضَ (ʕuriḍa) (form I) / compensation, substitute / madness, possession by a jinn / hour]",EaroD,113_20231122.1302_Q79_Q34
178,8044,show_2_V2,عرض,عَرَض,V2,noun,0,12,عين-فتحة | راء-فتحة | ضاد,"[symptom, characteristic, accident as opposed to substance, what exposes another thing to view, manifestation]",EaraD,113_20231122.1302_Q79_Q34
179,8045,show_2_V2,عرض,عِرْض,V2,noun,0,12,عين-كسرة | راء-سكون | ضاد,"[tract, quarter, low land / blame, attribution of vice, reproach, reputation]",EiroD,113_20231122.1302_Q79_Q34


In [40]:
# Create the export folders first: to_csv does not create missing directories.
output_dir.mkdir(parents=True, exist_ok=True)
output_unambiguous_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): both files receive the same table. If the "unambiguous" file is meant to
# be the copy that gets edited by hand this is fine; otherwise it needs a filter — confirm.
df_ar_gf_ambiguous.to_csv(output_dir / f"{TIMESTAMP}_{entites}_word_ambiguous.csv", sep="\t")
df_ar_gf_ambiguous.to_csv(output_unambiguous_dir / f"{TIMESTAMP}_{entites}_word_unambiguous.csv", sep="\t")

> Export Other DataFrames

In [41]:
# Create the export folder first: to_csv does not create missing directories.
output_dir.mkdir(parents=True, exist_ok=True)

df_ar_cn_gf.to_csv(output_dir / f"{TIMESTAMP}_{entites}_CN_ambiguous.csv", sep="\t")
df_ar_pn_gf.to_csv(output_dir / f"{TIMESTAMP}_{entites}_PN_ambiguous.csv", sep="\t")
# FIXME: the PREP export was removed because `df_ar_prep` is not defined anywhere in this
# notebook, so this cell raised NameError on Restart & Run All. Once the cell that builds
# `df_ar_prep` is restored, export it to
# output_dir / f"{TIMESTAMP}_{entites}_PREP_ambiguous.csv" with sep="\t" like the frames above.