## 1.0 Imports

In [1]:
import gzip
import json
from pathlib import Path
from pprint import pprint
from random import choice
from typing import Dict, List
from unicodedata import category, normalize

import pandas as pd
from IPython.display import display
from pyarabic.araby import ALEFAT, DIACRITICS, HAMZAT, SHADDA, is_vocalizedtext, name
from tqdm.notebook import tqdm

## 2.0 Define Paths

In [2]:
# Locations of the Arabic word list and of the re-indexed Wiktionary files.

# Arabic words table (read with a tab delimiter further down)
data_dir = Path("../data/interim/")
ar_words_gf_path = data_dir.joinpath("ar2en_words_gf.csv")

# Preprocessed Wiktionary records and the word -> record-indices lookup
wiktionary_dir = Path("../data/processed/wikidata")
wiki_jsonl_path = wiktionary_dir.joinpath("ar-wiktextract-data.json.gz")
wiki_indices_path = wiktionary_dir.joinpath("ar_reindex.json.gz")

In [3]:
# Directory where the table of ambiguous words is written at the end of the notebook
output_dir = Path("../data/interim/")

## 3.0 Define Variables

In [4]:
# Rebind the imported pyarabic DIACRITICS collection as a set; the helpers below
# only use it for `in` membership checks.
DIACRITICS = set(DIACRITICS)  # Arabic diacritics/short vowels

## 4.0 Define Functions

### 4.1 Utilities Functions

In [5]:
def is_nonspace_mark(x: str) -> bool:
    """Tell whether the single character ``x`` has Unicode category "Mn" (nonspacing mark)."""
    unicode_class = category(x)
    return unicode_class == "Mn"

In [6]:
def reorder_shadda(ar_string: str) -> str:
    """Move each shadda in front of the diacritic that directly precedes it.

    Meant to run on the output of ``unicodedata.normalize``: wherever a
    diacritic is immediately followed by a shadda, the two characters are
    swapped so that the shadda comes first. The scan is a single left-to-right
    pass over adjacent pairs, and every swap is visible to the next pair.
    """
    chars = list(ar_string)

    for following in range(1, len(chars)):
        current = following - 1
        if chars[following] == SHADDA and chars[current] in DIACRITICS:
            # diacritic + shadda  ->  shadda + diacritic
            chars[current], chars[following] = chars[following], chars[current]

    return "".join(chars)

In [7]:
def remove_diacritics_ar(text: str) -> str:
    """Return ``text`` without the characters that ``is_nonspace_mark`` flags."""
    kept_chars = (char for char in text if not is_nonspace_mark(char))
    return "".join(kept_chars)

In [8]:
def normalize_ar(ar_vocalized: str, verbose: bool = False) -> str:
    """NFC-normalize ``ar_vocalized`` and then apply ``reorder_shadda`` to the result.

    With ``verbose=True`` the name of every character of the result is printed
    before it is returned.
    """
    normalized = reorder_shadda(normalize("NFC", ar_vocalized))
    if not verbose:
        return normalized
    print([name(char) for char in normalized])
    return normalized

In [9]:
def join_ar_words(words):
    """Spell out ``words`` as a string of character names.

    Each element is replaced by its ``name(...)``. An element found in
    DIACRITICS is attached to what precedes it with "-"; any other element
    starts a new group separated by " | ".
    """
    pieces = [name(words[0])]
    for symbol in words[1:]:
        # Diacritics stay glued to their letter; everything else opens a new group
        separator = "-" if symbol in DIACRITICS else " | "
        pieces.append(separator + name(symbol))
    return "".join(pieces)

### 4.2 Main Functions

In [10]:
def get_word_dictionaries(
    ar_word: str, wiki_idxs: Dict[str, List[int]], wikitionary: List[dict]
) -> List[dict]:
    """Return the Wiktionary records listed for ``ar_word`` in the index.

    Args:
        ar_word: Word used as the key into ``wiki_idxs``.
        wiki_idxs: Maps a word to the list of positions of its records in
            ``wikitionary``.
        wikitionary: Loaded Wiktionary records, addressed by position.

    Returns:
        The records in index order, or an empty list when the word is not a
        key of ``wiki_idxs``.
    """
    return [wikitionary[idx] for idx in wiki_idxs.get(ar_word, [])]

In [11]:
def count_entry(ar_word, wiki_idxs, wikitionary):
    """Count the Wiktionary records indexed under ``ar_word`` once its diacritics are removed."""
    return len(
        get_word_dictionaries(remove_diacritics_ar(ar_word), wiki_idxs, wikitionary)
    )

In [12]:
def get_canonical_form(word, wiki_idxs, wikitionary):
    """Collect every vocalized canonical form of ``word`` found in Wiktionary.

    The vocalized spelling is read from records shaped like
    ``'forms': [{'form': WORD_VOCALIZED, 'tags': ['canonical']}]``. A form is
    kept only when its first tag is "canonical" and ``is_vocalizedtext``
    accepts it.

    Args:
        word: Key to look up in ``wiki_idxs``.
        wiki_idxs: Maps a word to the positions of its records in ``wikitionary``.
        wikitionary: Loaded Wiktionary records; each record is a dict whose
            values are the per-word data dicts.

    Returns:
        A 4-tuple of parallel lists ``(forms, pos, record_indices, glosses)``.
        Each ``glosses`` item is a one-element list holding a single string:
        the glosses of one sense joined by ", ", and the senses joined by
        " / ". When no form qualifies, the tuple is four ``pd.NA`` values.
    """
    form_vocal = []
    pos = []
    glosses = []
    word_idxs = []
    for word_idx in wiki_idxs.get(word, []):
        for word_data in wikitionary[word_idx].values():
            for form_dict in word_data.get("forms", []):
                # `or [None]` covers a missing AND an explicitly empty tag list;
                # indexing an empty list here used to raise IndexError.
                tags = form_dict.get("tags") or [None]
                if tags[0] != "canonical":
                    continue
                form_word = form_dict.get("form")
                if not (form_word and is_vocalizedtext(form_word)):
                    continue
                form_vocal.append(form_word)
                pos.append(word_data["pos"])
                word_idxs.append(word_idx)
                glosses.append(
                    [
                        " / ".join(
                            ", ".join(sense["glosses"])
                            for sense in word_data.get("senses", [])
                            if "glosses" in sense
                        )
                    ]
                )
    if not form_vocal:
        return (pd.NA, pd.NA, pd.NA, pd.NA)
    return (form_vocal, pos, word_idxs, glosses)

## 5.0 Check Wiktionary

- Find the words that are missing from Wiktionary.
- Confirm that each Arabic word has only one entry in the Wiktionary file. If not, the short vowels need to be added or edited in the source
file to remove any ambiguity.
- Export the words with their number of entries so they can be corrected manually later.

### 5.1 Load Files

In [13]:
# 1. Load the tab-separated Arabic-English GF word list; the first column becomes the index
df_ar_gf = pd.read_csv(ar_words_gf_path, sep="\t", index_col=0)

In [14]:
# Preview the first four rows of the loaded word list
display(df_ar_gf.head(4))

Unnamed: 0_level_0,ar,en
li,Unnamed: 1_level_1,Unnamed: 2_level_1
292,مُطلَق,absolute_3_A
1168,إِدارِيّ,administrative_A
1558,أفْغانِيّ,afghani_1_N
1596,لغة أفريكانية,afrikaans_N


In [15]:
# 2. Load the Wiktionary extract: one JSON document per line of the gzip file
wikitionary = []
with gzip.open(wiki_jsonl_path, "rt", encoding="utf-8") as gzip_obj:
    # A single tqdm progress bar replaces printing one status line per record
    for json_line in tqdm(gzip_obj, desc="Reading Wiktionary", unit=" lines"):
        wikitionary.append(dict(json.loads(json_line)))

Reading line-227

Reading line-128636

In [16]:
# 3. Load the index that maps each word to its records in `wikitionary`
with gzip.open(wiki_indices_path, mode="rt", encoding="utf-8") as gzip_obj:
    wiki_idxs = json.loads(gzip_obj.read())

### 5.2 Get the Number of Entries for Each Word

In [17]:
# Normalize each word, then count its Wiktionary entries
df_ar_gf["entry_num"] = df_ar_gf["ar"].apply(
    lambda word: count_entry(normalize_ar(word), wiki_idxs, wikitionary)
)

In [18]:
# Preview the word list with the new `entry_num` column
df_ar_gf.head()

Unnamed: 0_level_0,ar,en,entry_num
li,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
292,مُطلَق,absolute_3_A,2
1168,إِدارِيّ,administrative_A,1
1558,أفْغانِيّ,afghani_1_N,2
1596,لغة أفريكانية,afrikaans_N,0
1643,سِنّ,age_1_N,3


### Summary of entry counts:
  - 300 words are missing (0 entries)
  - Some words have more than 5 entries (up to 10)

In [19]:
# How many words have 0, 1, 2, ... Wiktionary entries
df_ar_gf["entry_num"].value_counts().to_frame(name="count")

Unnamed: 0_level_0,count
entry_num,Unnamed: 1_level_1
0,300
1,90
2,45
3,22
4,6
6,4
5,4
8,3
7,2
10,2


### Add All Vocalized Forms for each Word

In [20]:
# Work on a copy and attach, per word, the four parallel results of
# `get_canonical_form` (looked up by the normalized, diacritic-free spelling).
df_ambiguous_grouped = df_ar_gf.copy()
(
    df_ambiguous_grouped["vocal_forms"],
    df_ambiguous_grouped["pos"],
    df_ambiguous_grouped["wiki_idx"],
    df_ambiguous_grouped["senses"],
) = zip(
    *df_ambiguous_grouped["ar"].apply(
        lambda word: get_canonical_form(
            remove_diacritics_ar(normalize_ar(word)), wiki_idxs, wikitionary
        )
    )
)

In [29]:
# Preview the first 30 rows, now carrying the list-valued Wiktionary columns
df_ambiguous_grouped.head(30)

Unnamed: 0_level_0,ar,en,entry_num,vocal_forms,pos,wiki_idx,senses
li,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
292,مُطلَق,absolute_3_A,2,"[مُطَلَّق, مُطْلَق]","[adj, adj]","[5921, 5922]","[[divorced], [absolute, utter, very, unlimited..."
1168,إِدارِيّ,administrative_A,1,[إِدَارِيّ],[adj],[126636],"[[administrative, managerial]]"
1558,أفْغانِيّ,afghani_1_N,2,"[أَفْغَانِيّ, أَفْغَانِيّ]","[adj, noun]","[8229, 8230]","[[Afghan], [Afghan]]"
1596,لغة أفريكانية,afrikaans_N,0,,,,
1643,سِنّ,age_1_N,3,"[سَنَّ, سَنّ, سِنّ]","[verb, noun, noun]","[1768, 1769, 1770]","[[to sharpen, to whet, to hone, to grind / to ..."
1973,الأكانية,akan_N,0,,,,
1987,ألاباما,alabama_4_N,1,[أَلَابَامَا],[name],[7593],[[Alabama (a state of the United States)]]
2019,الألبانية,albanian_2_N,0,,,,
2085,الأليوتية,aleut_N,0,,,,
2657,الأمهرية,amharic_N,0,,,,


In [30]:
# One row per vocalized form: expand the four parallel list columns together
df_ar_gf_ambiguous = df_ambiguous_grouped.copy().explode(
    column=["vocal_forms", "wiki_idx", "pos", "senses"]
)

In [31]:
# Preview the exploded table (one row per vocalized form)
df_ar_gf_ambiguous.head(5)

Unnamed: 0_level_0,ar,en,entry_num,vocal_forms,pos,wiki_idx,senses
li,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
292,مُطلَق,absolute_3_A,2,مُطَلَّق,adj,5921,[divorced]
292,مُطلَق,absolute_3_A,2,مُطْلَق,adj,5922,"[absolute, utter, very, unlimited, unrestricte..."
1168,إِدارِيّ,administrative_A,1,إِدَارِيّ,adj,126636,"[administrative, managerial]"
1558,أفْغانِيّ,afghani_1_N,2,أَفْغَانِيّ,adj,8229,[Afghan]
1558,أفْغانِيّ,afghani_1_N,2,أَفْغَانِيّ,noun,8230,[Afghan]


## 6.0 Export Ambiguous Words

- Manually fix the diacritics of the words that have more than one entry.
- Manually check the words that are missing from Wiktionary.

In [32]:
# Keep only words with more than one entry, turn the `li` index into a column,
# then sort by word id, Wiktionary index, entry count and spelling
df_ar_gf_ambiguous = (
    df_ar_gf_ambiguous.copy()
    .loc[lambda frame: frame["entry_num"] > 1]
    .reset_index()
    .sort_values(["li", "wiki_idx", "entry_num", "ar"], kind="mergesort")
)

In [33]:
# Spell out every vocalized form as a sequence of Arabic letter and diacritic names
df_ar_gf_ambiguous["ar_letters"] = df_ar_gf_ambiguous["vocal_forms"].apply(
    lambda form: join_ar_words(normalize_ar(form))
)

In [34]:
# Put the columns in export order: identifiers, source word, Wiktionary details
neworder = [
    "li", "wiki_idx",
    "ar", "en", "entry_num",
    "vocal_forms", "pos", "ar_letters", "senses",
]
df_ar_gf_ambiguous = df_ar_gf_ambiguous.reindex(neworder, axis="columns")

In [35]:
# Show the final table of ambiguous words before exporting it
display(df_ar_gf_ambiguous)

Unnamed: 0,li,wiki_idx,ar,en,entry_num,vocal_forms,pos,ar_letters,senses
0,292,5921,مُطلَق,absolute_3_A,2,مُطَلَّق,adj,ميم-ضمة | طاء-فتحة | لام-شدة-فتحة | قاف,[divorced]
1,292,5922,مُطلَق,absolute_3_A,2,مُطْلَق,adj,ميم-ضمة | طاء-سكون | لام-فتحة | قاف,"[absolute, utter, very, unlimited, unrestricte..."
2,1558,8229,أفْغانِيّ,afghani_1_N,2,أَفْغَانِيّ,adj,همزة على الألف-فتحة | فاء-سكون | غين-فتحة | أل...,[Afghan]
3,1558,8230,أفْغانِيّ,afghani_1_N,2,أَفْغَانِيّ,noun,همزة على الألف-فتحة | فاء-سكون | غين-فتحة | أل...,[Afghan]
4,1643,1768,سِنّ,age_1_N,3,سَنَّ,verb,سين-فتحة | نون-شدة-فتحة,"[to sharpen, to whet, to hone, to grind / to m..."
...,...,...,...,...,...,...,...,...,...
295,95265,371,سنة,year_1_N,3,سَنَة,noun,سين-فتحة | نون-فتحة | تاء مربوطة,[year]
296,95265,372,سنة,year_1_N,3,سِنَة,noun,سين-كسرة | نون-فتحة | تاء مربوطة,[drowsiness / slumber; nap]
297,95265,373,سنة,year_1_N,3,سُنَّة,noun,سين-ضمة | نون-شدة-فتحة | تاء مربوطة,"[a usual, recurrent, continual, determinable, ..."
298,95318,21836,ين,yen_2_N,2,ـِينَ,suffix,تطويل-كسرة | ياء | نون-فتحة,"[Oblique form, i.e. genitive and accusative fo..."


In [36]:
# Export the ambiguous words as a tab-separated file for manual review
df_ar_gf_ambiguous.to_csv(output_dir.joinpath("word_ambiguous.csv"), sep="\t")