# Imports

In [49]:
import re
from pathlib import Path
import pandas as pd
from unicodedata import normalize
from pyarabic.araby import DIACRITICS, SHADDA, LETTERS, is_arabicword
from time import time
from datetime import datetime

# Define paths

In [50]:
data_dir = Path("../data/interim/lexicon")
ar_adjectives_path = data_dir / "20240202.1101_Q79_Q34_Q16_adjectives_lexicon.csv"
# ar_verbs_path = data_dir / "20231201.1559_Q79_Q34_verbs_lexicon.csv"
ar_nouns_path = data_dir / "20240202.1101_Q79_Q34_Q16_nouns_lexicon.csv"
ar_gn_path = Path("../data/interim/ambiguous/") / "20240202.1101_Q79_Q34_Q16_PN_ambiguous.csv"

In [51]:
output_dir = Path("../data/processed/gf")

In [52]:
pd.set_option('max_colwidth', 400)

# Define Variables

In [53]:
# Remove last Harakat from an Arabic Word
NOT_LETTERS_PATTERN = f"[^{LETTERS}]"
DIACRITICS_PATTERN = "".join(DIACRITICS)
LAST_HARAKAT_PATTERN = re.compile(
    rf"[{DIACRITICS_PATTERN}](?={NOT_LETTERS_PATTERN}*$)", re.UNICODE
)

In [54]:
# Set of Harakat
DIACRITICS_SET = set(DIACRITICS)  # Aabic diacritics/short vowels

In [55]:
by_pass_words = ["غير معدود"]


MORPHOLOGY_MAP = {
    "verb_form": "cls",
    "gender": "g",
    "root": "root",
    "plural": "pl",
    "masc_pl": "masc_pl",
    "fem_pl": "fem_pl",
    "imperfect": "imperfect",
}

In [56]:
TIMESTAMP = "_".join(ar_adjectives_path.stem.split("_")[:-2])

# Define Functions

## Utilities Functions

In [57]:
def reorder_shadda(ar_string: str) -> str:
    """unicodedata.normalize put shadda before diacritics; not correct"""
    list_ar_str = list(ar_string)

    for i in range(len(list_ar_str) - 1):
        char = list_ar_str[i]
        next_char = list_ar_str[i + 1]

        if char in DIACRITICS_SET and next_char == SHADDA:
            list_ar_str[i], list_ar_str[i + 1] = (
                next_char,
                char,
            )  # Swap shadda and diacritic

    return "".join(list_ar_str)

In [58]:
def normalize_ar(ar_vocalized: str, verbose: bool = False) -> str:
    """get the normal form for the Unicode string unistr using NFC then fix the shadda order issue"""
    if not isinstance(ar_vocalized, str):
        return ar_vocalized
    ar_norm = normalize("NFC", ar_vocalized)
    ar_norm = reorder_shadda(ar_norm)
    if verbose:
        print([name(char) for char in ar_norm])
    return ar_norm

## Main Functions

In [59]:
def get_lin(row):
    row_dict = dict(row)
    dict_lins = {}
    for morpho, lin_value in row_dict.items():
        if lin_name := MORPHOLOGY_MAP.get(morpho):
            if isinstance(lin_value, str) and lin_value not in by_pass_words:
                if is_arabicword(lin_value):
                    lin_value = f'"{lin_value}"'
                dict_lins[lin_name] = f'{lin_name} = {lin_value}'
    return dict_lins

In [60]:
def build_gf_abstract_entries(row):
    cat = row["wiki_pos"][0].capitalize()
    lemma = row["vocal_forms"]
    idx = row["wiki_idx"]
    senses = row["senses"]
    source = "wikitionary"

    gf_fun_str = "fun '{}_{}' : {} ; "
    comment_str = "-- source: {}, idx: {}, senses: {}"

    gf_fun_str = gf_fun_str.format(lemma, cat, cat)
    comment_str = comment_str.format(source, idx, senses)

    dict_lins = get_lin(row)
    if cat == "V":
        LIN_ENTRY_V = ["cls", "imperfect", "perfect", "root"]
        list_lins = []
        for lin_entry in LIN_ENTRY_V:
            if lin_entry == "perfect":
                list_lins.append(f'perfect = "{lemma}"')
            else:
                if lin_fun := dict_lins.get(lin_entry):
                    list_lins.append(lin_fun)

    elif cat == "N":
        LIN_ENTRY_N = ["g", "pl", "root", "sg"]
        list_lins = []
        for lin_entry in LIN_ENTRY_N:
            if lin_entry == "sg":
                list_lins.append(f'sg = "{lemma}"')
            else:
                if lin_fun := dict_lins.get(lin_entry):
                    list_lins.append(lin_fun)

    elif cat == "A":
        LIN_ENTRY_A = ["fem_pl", "fem_sg", "masc_pl", "masc_sg", "root"]
        list_lins = []
        for lin_entry in LIN_ENTRY_A:
            if lin_entry == "fem_sg":
                if row["gender"] == "fem":
                    list_lins.append(f'fem_sg = "{lemma}"')
                else:
                    list_lins.append(f'fem_sg = "{row["other_gender_form"]}"')
            elif lin_entry == "masc_sg":
                if row["gender"] == "masc":
                    list_lins.append(f'masc_sg = "{lemma}"')
                else:
                    list_lins.append(f'masc_sg = "{row["other_gender_form"]}"')
            else:
                if lin_fun := dict_lins.get(lin_entry):
                    list_lins.append(lin_fun)

    elif cat == "GN":
        list_lins = [row[""]]

    str_lins = " ; ".join(list_lins)
    lin_entry = f"'{lemma}_{cat}'"
    lin = f"lin {lin_entry} = wmk{cat} {{ " + str_lins + " } ;"

    return f"{gf_fun_str}{comment_str}", lin

# Load CSV Files

In [61]:
df_adjs = pd.read_csv(ar_adjectives_path, index_col=0, converters={"senses": pd.eval})
df_nouns = pd.read_csv(ar_nouns_path, index_col=0, converters={"senses": pd.eval})
# df_verbs = pd.read_csv(ar_verbs_path, index_col=0, converters={"senses": pd.eval})


In [77]:
df_pnouns = pd.read_csv(ar_gn_path, index_col=0)

In [62]:
# filter no_en_entry
df_adjs = df_adjs.copy()[df_adjs["no_en_entry"] == 0]
df_nouns = df_nouns.copy()[df_nouns["no_en_entry"] == 0]

In [63]:
df_nouns

Unnamed: 0,wiki_idx,no_en_entry,en_entry,ar,vocal_forms,wiki_pos,ar_letters,senses,tim_translit,file,gender,plural,root
17,-1,0,algonquin_N,ألجونكوين,ألجونكوين,noun,همزة على الألف | لام | جيم | واو | نون | كاف | واو | ياء | نون,[],>ljwnkwyn,,,,
22,-1,0,blackfoot_N,بلاكفوت,بلاكفوت,noun,باء | لام | ألف | كاف | فاء | واو | تاء,[],blAkfwt,,,,
23,-1,0,cayuga_N,كايوجا,كايوجا,noun,كاف | ألف | ياء | واو | جيم | ألف,[],kAywjA,,,,
10,-1,0,chinese_N,صينىة,صِينِيَّة,noun,صاد-كسرة | ياء | نون-كسرة | ياء-شدة-فتحة | تاء مربوطة,Chinese+fem.sg,Siyniy~ap,,fem,,
24,-1,0,chipewyan_N,تشيبويان,تشيبويان,noun,تاء | شين | ياء | باء | واو | ياء | ألف | نون,[],t$ybwyAn,,,,
35,-1,0,english_N,الإنجليزية,إِنْجلِيزِيَّة,noun,ألف | لام | همزة تحت الألف-كسرة | نون-سكون | جيم | لام-كسرة | ياء | زاي-كسرة | ياء-شدة-فتحة | تاء مربوطة,the+English_(language)+fem.sg,Al<inojliyziy~ap,,fem,,
41,-1,0,filipino_2_N,الفلبينية,فِلِبِّينِيَّة,noun,ألف | لام | فاء-كسرة | لام-كسرة | باء-شدة-كسرة | ياء | نون-كسرة | ياء-شدة-فتحة | تاء مربوطة,the+Philippine / Filipino+fem.sg,Alfilib~iyniy~ap,,fem,,
44,-1,0,finnish_N,الفنلندية,فِنْلَنْدِيَّة,noun,ألف | لام | فاء-كسرة | نون-سكون | لام-فتحة | نون-سكون | دال-كسرة | ياء-شدة-فتحة | تاء مربوطة,the+Finnish+fem.sg,Alfinolanodiy~ap,,fem,,
45,-1,0,haida_N,هيدا,هيدا,noun,هاء | ياء | دال | ألف,[],hydA,,,,
64,-1,0,mohawk_2_N,موهوك,مَوَّهُوك,noun,,[],,,,,


# Build Abstract GF

In [64]:
df_adjs["other_gender_form"] = df_adjs.apply({"other_gender_form": normalize_ar})
df_adjs["other_gender_form"] = df_adjs.apply(
    {
        "other_gender_form": lambda s: LAST_HARAKAT_PATTERN.sub("", s)
        if isinstance(s, str)
        else s
    }
)
df_adjs["masc_pl"] = df_adjs.apply({"masc_pl": normalize_ar})
df_adjs["masc_pl"] = df_adjs.apply(
    {"masc_pl": lambda s: LAST_HARAKAT_PATTERN.sub("", s) if isinstance(s, str) else s}
)
df_adjs["fem_pl"] = df_adjs.apply({"fem_pl": normalize_ar})
df_adjs["fem_pl"] = df_adjs.apply(
    {"fem_pl": lambda s: LAST_HARAKAT_PATTERN.sub("", s) if isinstance(s, str) else s}
)
df_adjs["vocal_forms"] = df_adjs.apply({"vocal_forms": normalize_ar})
df_adjs["vocal_forms"] = df_adjs.apply(
    {"vocal_forms": lambda s: LAST_HARAKAT_PATTERN.sub("", s)}
)
df_adjs["abs"], df_adjs["cnc"] = zip(
    *df_adjs.apply(build_gf_abstract_entries, axis="columns")
)

In [65]:
df_adjs["wordnet_ara"] = df_adjs[["en_entry", "vocal_forms"]].apply(lambda x: f"lin {x[0]} = '{x[1]}_A'", axis=1)

In [66]:
df_nouns["vocal_forms"] = df_nouns.apply({"vocal_forms": normalize_ar})
df_nouns["vocal_forms"] = df_nouns.apply(
    {"vocal_forms": lambda s: LAST_HARAKAT_PATTERN.sub("", s)}
)
df_nouns["abs"], df_nouns["cnc"] = zip(
    *df_nouns.apply(build_gf_abstract_entries, axis="columns")
)

In [67]:
df_nouns["wordnet_ara"] = df_nouns[["en_entry", "vocal_forms"]].apply(lambda x: f"lin {x[0]} = '{x[1]}_N'", axis=1)

In [68]:

# df_verbs["vocal_forms"] = df_verbs.apply({"vocal_forms": normalize_ar})
# df_verbs["vocal_forms"] = df_verbs.apply(
#     {
#         "vocal_forms": lambda s: LAST_HARAKAT_PATTERN.sub("", s)
#         if isinstance(s, str)
#         else s
#     }
# )
# df_verbs["imperfect"] = df_verbs.apply({"imperfect": normalize_ar})
# df_verbs["imperfect"] = df_verbs.apply(
#     {
#         "imperfect": lambda s: LAST_HARAKAT_PATTERN.sub("", s)
#         if isinstance(s, str)
#         else s
#     }
# )
# df_verbs["abs"], df_verbs["cnc"] = zip(
#     *df_verbs.apply(build_gf_abstract_entries, axis="columns")
# )

In [69]:
# df_verbs["wordnet_ara"] = df_verbs[["en_entry", "vocal_forms"]].apply(lambda x: f"lin {x[0]} = '{x[1]}_V'", axis=1)

In [70]:
df_adjs

Unnamed: 0,wiki_idx,no_en_entry,en_entry,ar,vocal_forms,wiki_pos,ar_letters,senses,tim_translit,gender,other_gender_form,masc_pl,fem_pl,root,abs,cnc,wordnet_ara
21,7862,0,average_1_N,متوسط,مُتَوَسِّط,adj,ميم-ضمة | تاء-فتحة | واو-فتحة | سين-شدة-كسرة | طاء,"[being in the middle, mediating / middle, central / medium / average, middling, indifferent]",mutawas~iT,masc,مُتَوَسِّطَة,مُتَوَسِّطُون,مُتَوَسِّطَات,وسط,"fun 'مُتَوَسِّط_A' : A ; -- source: wikitionary, idx: 7862, senses: ['being in the middle, mediating / middle, central / medium / average, middling, indifferent']","lin 'مُتَوَسِّط_A' = wmkA { fem_pl = ""مُتَوَسِّطَات"" ; fem_sg = ""مُتَوَسِّطَة"" ; masc_pl = ""مُتَوَسِّطُون"" ; masc_sg = ""مُتَوَسِّط"" ; root = ""وسط"" } ;",lin average_1_N = 'مُتَوَسِّط_A'
1,3016,0,regime_1_N,حاكم,حَاكِم,adj,حاء-فتحة | ألف | كاف-كسرة | ميم,"[ruling, governing / decisive]",HaAkim,masc,حَاكِمَة,حَاكِمُون,حَاكِمَات,حكم,"fun 'حَاكِم_A' : A ; -- source: wikitionary, idx: 3016, senses: ['ruling, governing / decisive']","lin 'حَاكِم_A' = wmkA { fem_pl = ""حَاكِمَات"" ; fem_sg = ""حَاكِمَة"" ; masc_pl = ""حَاكِمُون"" ; masc_sg = ""حَاكِم"" ; root = ""حكم"" } ;",lin regime_1_N = 'حَاكِم_A'


In [71]:
# df_verbs

In [72]:
df_nouns

Unnamed: 0,wiki_idx,no_en_entry,en_entry,ar,vocal_forms,wiki_pos,ar_letters,senses,tim_translit,file,gender,plural,root,abs,cnc,wordnet_ara
17,-1,0,algonquin_N,ألجونكوين,ألجونكوين,noun,همزة على الألف | لام | جيم | واو | نون | كاف | واو | ياء | نون,[],>ljwnkwyn,,,,,"fun 'ألجونكوين_N' : N ; -- source: wikitionary, idx: -1, senses: ['']","lin 'ألجونكوين_N' = wmkN { sg = ""ألجونكوين"" } ;",lin algonquin_N = 'ألجونكوين_N'
22,-1,0,blackfoot_N,بلاكفوت,بلاكفوت,noun,باء | لام | ألف | كاف | فاء | واو | تاء,[],blAkfwt,,,,,"fun 'بلاكفوت_N' : N ; -- source: wikitionary, idx: -1, senses: ['']","lin 'بلاكفوت_N' = wmkN { sg = ""بلاكفوت"" } ;",lin blackfoot_N = 'بلاكفوت_N'
23,-1,0,cayuga_N,كايوجا,كايوجا,noun,كاف | ألف | ياء | واو | جيم | ألف,[],kAywjA,,,,,"fun 'كايوجا_N' : N ; -- source: wikitionary, idx: -1, senses: ['']","lin 'كايوجا_N' = wmkN { sg = ""كايوجا"" } ;",lin cayuga_N = 'كايوجا_N'
10,-1,0,chinese_N,صينىة,صِينِيَّة,noun,صاد-كسرة | ياء | نون-كسرة | ياء-شدة-فتحة | تاء مربوطة,Chinese+fem.sg,Siyniy~ap,,fem,,,"fun 'صِينِيَّة_N' : N ; -- source: wikitionary, idx: -1, senses: Chinese+fem.sg","lin 'صِينِيَّة_N' = wmkN { g = fem ; sg = ""صِينِيَّة"" } ;",lin chinese_N = 'صِينِيَّة_N'
24,-1,0,chipewyan_N,تشيبويان,تشيبويان,noun,تاء | شين | ياء | باء | واو | ياء | ألف | نون,[],t$ybwyAn,,,,,"fun 'تشيبويان_N' : N ; -- source: wikitionary, idx: -1, senses: ['']","lin 'تشيبويان_N' = wmkN { sg = ""تشيبويان"" } ;",lin chipewyan_N = 'تشيبويان_N'
35,-1,0,english_N,الإنجليزية,إِنْجلِيزِيَّة,noun,ألف | لام | همزة تحت الألف-كسرة | نون-سكون | جيم | لام-كسرة | ياء | زاي-كسرة | ياء-شدة-فتحة | تاء مربوطة,the+English_(language)+fem.sg,Al<inojliyziy~ap,,fem,,,"fun 'إِنْجلِيزِيَّة_N' : N ; -- source: wikitionary, idx: -1, senses: the+English_(language)+fem.sg","lin 'إِنْجلِيزِيَّة_N' = wmkN { g = fem ; sg = ""إِنْجلِيزِيَّة"" } ;",lin english_N = 'إِنْجلِيزِيَّة_N'
41,-1,0,filipino_2_N,الفلبينية,فِلِبِّينِيَّة,noun,ألف | لام | فاء-كسرة | لام-كسرة | باء-شدة-كسرة | ياء | نون-كسرة | ياء-شدة-فتحة | تاء مربوطة,the+Philippine / Filipino+fem.sg,Alfilib~iyniy~ap,,fem,,,"fun 'فِلِبِّينِيَّة_N' : N ; -- source: wikitionary, idx: -1, senses: the+Philippine / Filipino+fem.sg","lin 'فِلِبِّينِيَّة_N' = wmkN { g = fem ; sg = ""فِلِبِّينِيَّة"" } ;",lin filipino_2_N = 'فِلِبِّينِيَّة_N'
44,-1,0,finnish_N,الفنلندية,فِنْلَنْدِيَّة,noun,ألف | لام | فاء-كسرة | نون-سكون | لام-فتحة | نون-سكون | دال-كسرة | ياء-شدة-فتحة | تاء مربوطة,the+Finnish+fem.sg,Alfinolanodiy~ap,,fem,,,"fun 'فِنْلَنْدِيَّة_N' : N ; -- source: wikitionary, idx: -1, senses: the+Finnish+fem.sg","lin 'فِنْلَنْدِيَّة_N' = wmkN { g = fem ; sg = ""فِنْلَنْدِيَّة"" } ;",lin finnish_N = 'فِنْلَنْدِيَّة_N'
45,-1,0,haida_N,هيدا,هيدا,noun,هاء | ياء | دال | ألف,[],hydA,,,,,"fun 'هيدا_N' : N ; -- source: wikitionary, idx: -1, senses: ['']","lin 'هيدا_N' = wmkN { sg = ""هيدا"" } ;",lin haida_N = 'هيدا_N'
64,-1,0,mohawk_2_N,موهوك,مَوَّهُوك,noun,,[],,,,,,"fun 'مَوَّهُوك_N' : N ; -- source: wikitionary, idx: -1, senses: ['']","lin 'مَوَّهُوك_N' = wmkN { sg = ""مَوَّهُوك"" } ;",lin mohawk_2_N = 'مَوَّهُوك_N'


# Export GF Files

In [73]:
nouns_abs = "\n".join(df_nouns["abs"].to_list())
adjs_abs = "\n".join(df_adjs["abs"].to_list())
# verbs_abs = "\n".join(df_verbs["abs"].to_list())

nouns_lin = "\n".join(df_nouns["cnc"].to_list())
adjs_lin = "\n".join(df_adjs["cnc"].to_list())
# verbs_lin = "\n".join(df_verbs["cnc"].to_list())

nouns_wordnet = "\n".join(df_nouns["wordnet_ara"].to_list())
adjs_wordnet = "\n".join(df_adjs["wordnet_ara"].to_list())
# verbs_wordnet = "\n".join(df_verbs["wordnet_ara"].to_list())

ar_absolute_grammar = "abstract MorphoDictAraAbs = Cat ** {\n"
ar_absolute_grammar += f"{nouns_abs}" + "\n"
ar_absolute_grammar += f"{adjs_abs}" + "\n"
# ar_absolute_grammar += f"{verbs_abs}" + "\n"
ar_absolute_grammar += "}"

ar_concrete_grammar = "concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra, MoreAra in {\n"
ar_concrete_grammar += f"{nouns_lin}" + "\n"
ar_concrete_grammar += f"{adjs_lin}" + "\n"
# ar_concrete_grammar += f"{verbs_lin}" + "\n"
ar_concrete_grammar += "}"

wordnet_ar = """--# -path=.:./gf-wordnet
concrete WordNetAra of WordNet = CatAra ** open MorphoDictAra, MoreAra, ParadigmsAra in {
lin en = variants {} ; --- guess from"""
wordnet_ar += f"{nouns_wordnet}" + "\n"
wordnet_ar += f"{adjs_wordnet}" + "\n"
# wordnet_ar += f"{verbs_wordnet}" + "\n"
wordnet_ar += "}"

In [74]:
with open(output_dir / "MorphoDictAraAbs.gf", encoding="utf-8", mode="at") as file:
    file.write(ar_absolute_grammar)

In [75]:
with open(output_dir / "MorphoDictAra.gf", encoding="utf-8", mode="at") as file:
    file.write(ar_concrete_grammar)

In [76]:
with open(output_dir / "WordNetAra.gf", encoding="utf-8", mode="at") as file:
    file.write(wordnet_ar)

# Load Names

In [28]:
# from nltk.corpus import wordnet as wn
# from nltk.wsd import lesk

# # The word to be disambiguated
# ambiguous_word = 'Egypt'

# # The context sentence in which the word is used
# english_context_sentence = ['the', 'branch', 'of', 'sociology', 'that', 'studies', 'the', 'characteristics', 'of', 'human', 'populations']

# # Apply the Lesk algorithm using the English context sentence
# best_sense = lesk(english_context_sentence, ambiguous_word)
# print(best_sense.definition())

# # If a best sense is found, find its Arabic equivalent
# if best_sense:
#     # Get the lemma names for the synset in Arabic ('arb')
#     arabic_lemmas = best_sense.lemma_names('arb')
#     if arabic_lemmas:
#         print(f"Best English sense: {best_sense.name()}")
#         # print(f"English definition: {best_sense.definition()}")
#         print(f"Arabic equivalent(s): {', '.join(arabic_lemmas)}")
#         print(f"ID                  : {best_sense.offset()}")

#     else:
#         print("No Arabic equivalent found.")
# else:
#     print("No best English sense found.")


a republic in northeastern Africa known as the United Arab Republic until 1971; site of an ancient civilization that flourished from 2600 to 30 BC
Best English sense: egypt.n.01
Arabic equivalent(s): أرْض_الكِنانة, جُمْهُورِيّة_مِصْر_العربِيّة, مِصْر
ID                  : 8897065
