# Imports

In [1]:
import re
from pathlib import Path
import pandas as pd
from unicodedata import normalize
from pyarabic.araby import DIACRITICS, SHADDA, LETTERS, is_arabicword
from time import time
from datetime import datetime

# Define paths

In [2]:
data_dir = Path("../data/interim/lexicon")
ar_adjectives_path = data_dir / "20231201.1559_Q79_Q34_adjectives_lexicon.csv"
ar_verbs_path = data_dir / "20231201.1559_Q79_Q34_verbs_lexicon.csv"
ar_nouns_path = data_dir / "20231201.1559_Q79_Q34_nouns_lexicon.csv"
ar_gn_path = data_dir / "20231201.1559_Q79_Q34_given_name_lexicon.csv"

In [3]:
output_dir = Path("../data/processed/gf")

In [4]:
pd.set_option('max_colwidth', 400)

# Define Variables

In [5]:
# Remove last Harakat from an Arabic Word
NOT_LETTERS_PATTERN = f"[^{LETTERS}]"
DIACRITICS_PATTERN = "".join(DIACRITICS)
LAST_HARAKAT_PATTERN = re.compile(
    rf"[{DIACRITICS_PATTERN}](?={NOT_LETTERS_PATTERN}*$)", re.UNICODE
)

In [6]:
# Set of Harakat
DIACRITICS_SET = set(DIACRITICS)  # Aabic diacritics/short vowels

In [7]:
by_pass_words = ["غير معدود"]


MORPHOLOGY_MAP = {
    "verb_form": "cls",
    "gender": "g",
    "root": "root",
    "plural": "pl",
    "masc_pl": "masc_pl",
    "fem_pl": "fem_pl",
    "imperfect": "imperfect",
}

In [8]:
TIMESTAMP = "_".join(ar_adjectives_path.stem.split("_")[:-2])

# Define Functions

## Utilities Functions

In [9]:
def reorder_shadda(ar_string: str) -> str:
    """unicodedata.normalize put shadda before diacritics; not correct"""
    list_ar_str = list(ar_string)

    for i in range(len(list_ar_str) - 1):
        char = list_ar_str[i]
        next_char = list_ar_str[i + 1]

        if char in DIACRITICS_SET and next_char == SHADDA:
            list_ar_str[i], list_ar_str[i + 1] = (
                next_char,
                char,
            )  # Swap shadda and diacritic

    return "".join(list_ar_str)

In [10]:
def normalize_ar(ar_vocalized: str, verbose: bool = False) -> str:
    """get the normal form for the Unicode string unistr using NFC then fix the shadda order issue"""
    if not isinstance(ar_vocalized, str):
        return ar_vocalized
    ar_norm = normalize("NFC", ar_vocalized)
    ar_norm = reorder_shadda(ar_norm)
    if verbose:
        print([name(char) for char in ar_norm])
    return ar_norm

## Main Functions

In [11]:
def get_lin(row):
    row_dict = dict(row)
    dict_lins = {}
    for morpho, lin_value in row_dict.items():
        if lin_name := MORPHOLOGY_MAP.get(morpho):
            if isinstance(lin_value, str) and lin_value not in by_pass_words:
                if is_arabicword(lin_value):
                    lin_value = f'"{lin_value}"'
                dict_lins[lin_name] = f'{lin_name} = {lin_value}'
    return dict_lins

In [12]:
def build_gf_abstract_entries(row):
    cat = row["en_entry"].split("_")[-1][0]
    lemma = row["vocal_forms"]
    idx = row["wiki_idx"]
    senses = row["senses"]
    source = "wikitionary"

    gf_fun_str = "fun '{}_{}' : {} ; "
    comment_str = "-- source: {}, idx: {}, senses: {}"

    gf_fun_str = gf_fun_str.format(lemma, cat, cat)
    comment_str = comment_str.format(source, idx, senses)

    dict_lins = get_lin(row)
    if cat == "V":
        LIN_ENTRY_V = ["cls", "imperfect", "perfect", "root"]
        list_lins = []
        for lin_entry in LIN_ENTRY_V:
            if lin_entry == "perfect":
                list_lins.append(f'perfect = "{lemma}"')
            else:
                if lin_fun := dict_lins.get(lin_entry):
                    list_lins.append(lin_fun)

    elif cat == "N":
        LIN_ENTRY_N = ["g", "pl", "root", "sg"]
        list_lins = []
        for lin_entry in LIN_ENTRY_N:
            if lin_entry == "sg":
                list_lins.append(f'sg = "{lemma}"')
            else:
                if lin_fun := dict_lins.get(lin_entry):
                    list_lins.append(lin_fun)

    elif cat == "A":
        LIN_ENTRY_A = ["fem_pl", "fem_sg", "masc_pl", "masc_sg", "root"]
        list_lins = []
        for lin_entry in LIN_ENTRY_A:
            if lin_entry == "fem_sg":
                if row["gender"] == "fem":
                    list_lins.append(f'fem_sg = "{lemma}"')
                else:
                    list_lins.append(f'fem_sg = "{row["other_gender_form"]}"')
            elif lin_entry == "masc_sg":
                if row["gender"] == "masc":
                    list_lins.append(f'masc_sg = "{lemma}"')
                else:
                    list_lins.append(f'masc_sg = "{row["other_gender_form"]}"')
            else:
                if lin_fun := dict_lins.get(lin_entry):
                    list_lins.append(lin_fun)

    elif cat == "GN":
        list_lins = [row[""]]

    str_lins = " ; ".join(list_lins)
    lin_entry = f"'{lemma}_{cat}'"
    lin = f"lin {lin_entry} = wmk{cat} {{ " + str_lins + " } ;"

    return f"{gf_fun_str}{comment_str}", lin

# Load CSV Files

In [13]:
df_adjs = pd.read_csv(ar_adjectives_path, index_col=0, converters={"senses": pd.eval})
df_nouns = pd.read_csv(ar_nouns_path, index_col=0, converters={"senses": pd.eval})
df_verbs = pd.read_csv(ar_verbs_path, index_col=0, converters={"senses": pd.eval})


In [14]:
df_nouns

Unnamed: 0,wiki_idx,en_entry,ar,vocal_forms,wiki_pos,ar_letters,senses,tim_translit,file,gender,plural,root
213,-1,chinese_N,صينىة,صِينِيَّة,noun,صاد-كسرة | ياء | نون-كسرة | ياء-شدة-فتحة | تاء مربوطة,[],Siyniy~ap,10_20231201.1559_Q79_Q34,fem,,
231,-1,dari_N,دارية,دارِيَّة,noun,دال | ألف | راء-كسرة | ياء-شدة-فتحة | تاء مربوطة,[],dAriy~ap,15_20231201.1559_Q79_Q34,fem,,
69,-1,development_2_N,تطوير,تَطْوِير,noun,تاء-فتحة | طاء-سكون | واو-كسرة | ياء | راء,[],taTowiyr,29_20231122.1302_Q79_Q34,masc,,
263,-1,italian_N,ايطالية,إِيطالِيَّة,noun,همزة تحت الألف-كسرة | ياء | طاء | ألف | لام-كسرة | ياء-شدة-فتحة | تاء مربوطة,[],<iyTAliy~ap,31_20231201.1559_Q79_Q34,fem,,
108,-1,krona_1_N,كرون,كَرون,noun,كاف-فتحة | راء | واو | نون,[],karwn,69_20231122.1302_Q79_Q34,fem,,
...,...,...,...,...,...,...,...,...,...,...,...,...
311,3927,starch_1_N,نشاء,نَشَاء,noun,نون-فتحة | شين-فتحة | ألف | همزة,"[starch, cornstarch, farina]",na$aA',49_20231201.1559_Q79_Q34,masc,,نشو
44,2554,age_1_N,عمر,عُمْر,noun,عين-ضمة | ميم-سكون | راء,"[life as a period of time, length of life, lifespan, lifetime / age]",Eumor,0_20231122.1302_Q79_Q34,masc,أَعْمَار,عمر
145,5764,west_2_N,غرب,غَرْب,noun,غين-فتحة | راء-سكون | باء,"[verbal noun of غَرَبَ (ḡaraba) (form I) / west, occident / vehemence, violence, tempestuousness]",garob,136_20231122.1302_Q79_Q34,masc,,
97,695,king_1_N,ملِك,مَلِك,noun,ميم-فتحة | لام-كسرة | كاف,"[king, sovereign, monarch]",malik,67_20231122.1302_Q79_Q34,masc,مُلُوك,


# Build Abstract GF

In [15]:
df_adjs["other_gender_form"] = df_adjs.apply({"other_gender_form": normalize_ar})
df_adjs["other_gender_form"] = df_adjs.apply(
    {
        "other_gender_form": lambda s: LAST_HARAKAT_PATTERN.sub("", s)
        if isinstance(s, str)
        else s
    }
)
df_adjs["masc_pl"] = df_adjs.apply({"masc_pl": normalize_ar})
df_adjs["masc_pl"] = df_adjs.apply(
    {"masc_pl": lambda s: LAST_HARAKAT_PATTERN.sub("", s) if isinstance(s, str) else s}
)
df_adjs["fem_pl"] = df_adjs.apply({"fem_pl": normalize_ar})
df_adjs["fem_pl"] = df_adjs.apply(
    {"fem_pl": lambda s: LAST_HARAKAT_PATTERN.sub("", s) if isinstance(s, str) else s}
)
df_adjs["vocal_forms"] = df_adjs.apply({"vocal_forms": normalize_ar})
df_adjs["vocal_forms"] = df_adjs.apply(
    {"vocal_forms": lambda s: LAST_HARAKAT_PATTERN.sub("", s)}
)
df_adjs["abs"], df_adjs["cnc"] = zip(
    *df_adjs.apply(build_gf_abstract_entries, axis="columns")
)

In [16]:
df_adjs["wordnet_ara"] = df_adjs[["en_entry", "vocal_forms"]].apply(lambda x: f"lin {x[0]} = '{x[1]}_A'", axis=1)

In [17]:
df_nouns["vocal_forms"] = df_nouns.apply({"vocal_forms": normalize_ar})
df_nouns["vocal_forms"] = df_nouns.apply(
    {"vocal_forms": lambda s: LAST_HARAKAT_PATTERN.sub("", s)}
)
df_nouns["abs"], df_nouns["cnc"] = zip(
    *df_nouns.apply(build_gf_abstract_entries, axis="columns")
)

In [18]:
df_nouns["wordnet_ara"] = df_nouns[["en_entry", "vocal_forms"]].apply(lambda x: f"lin {x[0]} = '{x[1]}_N'", axis=1)

In [19]:

df_verbs["vocal_forms"] = df_verbs.apply({"vocal_forms": normalize_ar})
df_verbs["vocal_forms"] = df_verbs.apply(
    {
        "vocal_forms": lambda s: LAST_HARAKAT_PATTERN.sub("", s)
        if isinstance(s, str)
        else s
    }
)
df_verbs["imperfect"] = df_verbs.apply({"imperfect": normalize_ar})
df_verbs["imperfect"] = df_verbs.apply(
    {
        "imperfect": lambda s: LAST_HARAKAT_PATTERN.sub("", s)
        if isinstance(s, str)
        else s
    }
)
df_verbs["abs"], df_verbs["cnc"] = zip(
    *df_verbs.apply(build_gf_abstract_entries, axis="columns")
)

In [20]:
df_verbs["wordnet_ara"] = df_verbs[["en_entry", "vocal_forms"]].apply(lambda x: f"lin {x[0]} = '{x[1]}_V'", axis=1)

In [21]:
df_adjs

Unnamed: 0,wiki_idx,en_entry,ar,vocal_forms,wiki_pos,ar_letters,senses,tim_translit,file,gender,other_gender_form,masc_pl,fem_pl,root,abs,cnc,wordnet_ara
31,-1,added_A,مضاف,مُضاف,adj,ميم-ضمة | ضاد | ألف | فاء,[],muDAf,157_20231122.1302_Q79_Q34,masc,,,,,"fun 'مُضاف_A' : A ; -- source: wikitionary, idx: -1, senses: ['']","lin 'مُضاف_A' = wmkA { fem_sg = ""nan"" ; masc_sg = ""مُضاف"" } ;",lin added_A = 'مُضاف_A'
1,-1,authoritarian_1_A,استبدادي,اِسْتِبْدادِي,adj,ألف-كسرة | سين-سكون | تاء-كسرة | باء-سكون | دال | ألف | دال-كسرة | ياء,[],AisotibodAdiy,7_20231122.1302_Q79_Q34,masc,,,,,"fun 'اِسْتِبْدادِي_A' : A ; -- source: wikitionary, idx: -1, senses: ['']","lin 'اِسْتِبْدادِي_A' = wmkA { fem_sg = ""nan"" ; masc_sg = ""اِسْتِبْدادِي"" } ;",lin authoritarian_1_A = 'اِسْتِبْدادِي_A'
14,-1,median_3_A,الوسيط,الوَسِيط,adj,ألف | لام | واو-فتحة | سين-كسرة | ياء | طاء,[],AlwasiyT,76_20231122.1302_Q79_Q34,masc,,,,,"fun 'الوَسِيط_A' : A ; -- source: wikitionary, idx: -1, senses: ['']","lin 'الوَسِيط_A' = wmkA { fem_sg = ""nan"" ; masc_sg = ""الوَسِيط"" } ;",lin median_3_A = 'الوَسِيط_A'
16,-1,nordic_2_A,الشمال,الشَّمال,adj,,[],,84_20231122.1302_Q79_Q34,masc,,,,,"fun 'الشَّمال_A' : A ; -- source: wikitionary, idx: -1, senses: ['']","lin 'الشَّمال_A' = wmkA { fem_sg = ""nan"" ; masc_sg = ""الشَّمال"" } ;",lin nordic_2_A = 'الشَّمال_A'
2,4973,domestic_1_A,محلي,مَحَلِّي,adj,ميم-فتحة | حاء-فتحة | لام-شدة-كسرة | ياء-شدة,"[local, national]",maHal~iy~,31_20231122.1302_Q79_Q34,masc,مَحَلِّيَّة,مَحَلِّيُّون,مَحَلِّيَّات,,"fun 'مَحَلِّي_A' : A ; -- source: wikitionary, idx: 4973, senses: ['local, national']","lin 'مَحَلِّي_A' = wmkA { fem_pl = ""مَحَلِّيَّات"" ; fem_sg = ""مَحَلِّيَّة"" ; masc_pl = ""مَحَلِّيُّون"" ; masc_sg = ""مَحَلِّي"" } ;",lin domestic_1_A = 'مَحَلِّي_A'
3,117599,following_2_A,تالي,تَالِي,adj,تاء-فتحة | ألف | لام-كسرة | ياء,[],taAliy,41_20231122.1302_Q79_Q34,masc,تَالِية,,,,"fun 'تَالِي_A' : A ; -- source: wikitionary, idx: 117599, senses: ['']","lin 'تَالِي_A' = wmkA { fem_sg = ""تَالِية"" ; masc_sg = ""تَالِي"" } ;",lin following_2_A = 'تَالِي_A'
7,10802,full_3_A,ممتلئ,مُمْتَلِئ,adj,ميم-ضمة | ميم-سكون | تاء-فتحة | لام-كسرة | همزة على الياء,"[full, filled, filled up, replete]",mumotali},45_20231122.1302_Q79_Q34,masc,مُمْتَلِئَة,مُمْتَلِئُون,مُمْتَلِئَات,ملء,"fun 'مُمْتَلِئ_A' : A ; -- source: wikitionary, idx: 10802, senses: ['full, filled, filled up, replete']","lin 'مُمْتَلِئ_A' = wmkA { fem_pl = ""مُمْتَلِئَات"" ; fem_sg = ""مُمْتَلِئَة"" ; masc_pl = ""مُمْتَلِئُون"" ; masc_sg = ""مُمْتَلِئ"" ; root = ""ملء"" } ;",lin full_3_A = 'مُمْتَلِئ_A'
8,124714,gross_1_A,إجمالي,إِجْمَالِي,adj,همزة تحت الألف-كسرة | جيم-سكون | ميم-فتحة | ألف | لام-كسرة | ياء-شدة,"[comprehensive, general, total]",<ijomaAliy~,50_20231122.1302_Q79_Q34,masc,إِجْمَالِيَّة,إِجْمَالِيُّون,إِجْمَالِيَّات,جمل,"fun 'إِجْمَالِي_A' : A ; -- source: wikitionary, idx: 124714, senses: ['comprehensive, general, total']","lin 'إِجْمَالِي_A' = wmkA { fem_pl = ""إِجْمَالِيَّات"" ; fem_sg = ""إِجْمَالِيَّة"" ; masc_pl = ""إِجْمَالِيُّون"" ; masc_sg = ""إِجْمَالِي"" ; root = ""جمل"" } ;",lin gross_1_A = 'إِجْمَالِي_A'
11,124077,individual_4_A,فردي,فَرْدِي,adj,فاء-فتحة | راء-سكون | دال-كسرة | ياء-شدة,"[single / individual, personal / odd, uneven]",farodiy~,59_20231122.1302_Q79_Q34,masc,فَرْدِيَّة,فَرْدِيُّون,فَرْدِيَّات,فرد,"fun 'فَرْدِي_A' : A ; -- source: wikitionary, idx: 124077, senses: ['single / individual, personal / odd, uneven']","lin 'فَرْدِي_A' = wmkA { fem_pl = ""فَرْدِيَّات"" ; fem_sg = ""فَرْدِيَّة"" ; masc_pl = ""فَرْدِيُّون"" ; masc_sg = ""فَرْدِي"" ; root = ""فرد"" } ;",lin individual_4_A = 'فَرْدِي_A'
15,115192,moderate_1_A,معتدل,مُعْتَدِل,adj,ميم-ضمة | عين-سكون | تاء-فتحة | دال-كسرة | لام,"[straight, even, proportionate / temperate, mild, moderate]",muEotadil,79_20231122.1302_Q79_Q34,masc,مُعْتَدِلَة,مُعْتَدِلُون,مُعْتَدِلَات,عدل,"fun 'مُعْتَدِل_A' : A ; -- source: wikitionary, idx: 115192, senses: ['straight, even, proportionate / temperate, mild, moderate']","lin 'مُعْتَدِل_A' = wmkA { fem_pl = ""مُعْتَدِلَات"" ; fem_sg = ""مُعْتَدِلَة"" ; masc_pl = ""مُعْتَدِلُون"" ; masc_sg = ""مُعْتَدِل"" ; root = ""عدل"" } ;",lin moderate_1_A = 'مُعْتَدِل_A'


In [22]:
df_verbs

Unnamed: 0,wiki_idx,en_entry,ar,vocal_forms,wiki_pos,ar_letters,senses,tim_translit,file,verb_form,root,imperfect,abs,cnc,wordnet_ara
194,116328,consider_6_V3,اعتبر,اِعْتَبَر,verb,ألف-كسرة | عين-سكون | تاء-فتحة | باء-فتحة | راء,"[to take example, to take warning, to learn a lesson (بِـ (bi-): from) / to acknowledge, value, respect / to consider, to regard as / to examine, to investigate, to put to test]",AiEotabar,20_20231122.1302_Q79_Q34,FormVIII,عبر,يَعْتَبِر,"fun 'اِعْتَبَر_V' : V ; -- source: wikitionary, idx: 116328, senses: ['to take example, to take warning, to learn a lesson (بِـ (bi-): from) / to acknowledge, value, respect / to consider, to regard as / to examine, to investigate, to put to test']","lin 'اِعْتَبَر_V' = wmkV { cls = FormVIII ; imperfect = ""يَعْتَبِر"" ; perfect = ""اِعْتَبَر"" ; root = ""عبر"" } ;",lin consider_6_V3 = 'اِعْتَبَر_V'
317,20863,succeed_V2,نجح,نَجَح,verb,نون-فتحة | جيم-فتحة | حاء-فتحة,"[to succeed, to be successful]",najaHa,50_20231201.1559_Q79_Q34,FormI,نجح,يَنْجَح,"fun 'نَجَح_V' : V ; -- source: wikitionary, idx: 20863, senses: ['to succeed, to be successful']","lin 'نَجَح_V' = wmkV { cls = FormI ; imperfect = ""يَنْجَح"" ; perfect = ""نَجَح"" ; root = ""نجح"" } ;",lin succeed_V2 = 'نَجَح_V'
165,8070,rank_2_V2,صنف,صَنَّف,verb,صاد-فتحة | نون-شدة-فتحة | فاء-فتحة,"[to sort, classify, or categorize something / to compile or compose]",San~afa,100_20231122.1302_Q79_Q34,FormII,صنف,يُصَنِّف,"fun 'صَنَّف_V' : V ; -- source: wikitionary, idx: 8070, senses: ['to sort, classify, or categorize something / to compile or compose']","lin 'صَنَّف_V' = wmkV { cls = FormII ; imperfect = ""يُصَنِّف"" ; perfect = ""صَنَّف"" ; root = ""صنف"" } ;",lin rank_2_V2 = 'صَنَّف_V'
236,148,designate_4_V2,عين,عَيَّن,verb,عين-فتحة | ياء-شدة-فتحة | نون-فتحة,"[to appoint, to assign, to name, to nominate / to define / to determine, to fix, to identify, to specify / to allocate, to allot, to apportion, to earmark, to itemize, to set aside]",Eay~ana,18_20231201.1559_Q79_Q34,FormII,عين,يُعَيِّن,"fun 'عَيَّن_V' : V ; -- source: wikitionary, idx: 148, senses: ['to appoint, to assign, to name, to nominate / to define / to determine, to fix, to identify, to specify / to allocate, to allot, to apportion, to earmark, to itemize, to set aside']","lin 'عَيَّن_V' = wmkV { cls = FormII ; imperfect = ""يُعَيِّن"" ; perfect = ""عَيَّن"" ; root = ""عين"" } ;",lin designate_4_V2 = 'عَيَّن_V'
190,4743,reach_V2,وصل,وَصَل,verb,واو-فتحة | صاد-فتحة | لام-فتحة,"[(transitive, with إِلَى (ʔilā)) to arrive (“to reach some place”), to arrive (“to reach some place”)]",waSala,158_20231122.1302_Q79_Q34,FormI,وصل,يَصِل,"fun 'وَصَل_V' : V ; -- source: wikitionary, idx: 4743, senses: ['(transitive, with إِلَى (ʔilā)) to arrive (“to reach some place”), to arrive (“to reach some place”)']","lin 'وَصَل_V' = wmkV { cls = FormI ; imperfect = ""يَصِل"" ; perfect = ""وَصَل"" ; root = ""وصل"" } ;",lin reach_V2 = 'وَصَل_V'
181,6248,speak_3_V2,تكلم,تَكَلَّم,verb,تاء-فتحة | كاف-فتحة | لام-شدة-فتحة | ميم-فتحة,"[to talk, to have a discussion or conversation]",takal~ama,118_20231122.1302_Q79_Q34,FormV,كلم,يَتَكَلَّم,"fun 'تَكَلَّم_V' : V ; -- source: wikitionary, idx: 6248, senses: ['to talk, to have a discussion or conversation']","lin 'تَكَلَّم_V' = wmkV { cls = FormV ; imperfect = ""يَتَكَلَّم"" ; perfect = ""تَكَلَّم"" ; root = ""كلم"" } ;",lin speak_3_V2 = 'تَكَلَّم_V'
158,698,have_1_V2,ملك,مَلَك,verb,ميم-فتحة | لام-فتحة | كاف-فتحة,"[to take in possession, to take over, to acquire, to seize / to possess, to lay hold, to own, to have, to be the owner / to dominate, to control / to be the master / to be capable, to be able, to be in a position to / to rule, to reign, to exercise authority, to hold sway, to lord over]",malaka,54_20231122.1302_Q79_Q34,FormI,ملك,يَمْلِك,"fun 'مَلَك_V' : V ; -- source: wikitionary, idx: 698, senses: ['to take in possession, to take over, to acquire, to seize / to possess, to lay hold, to own, to have, to be the owner / to dominate, to control / to be the master / to be capable, to be able, to be in a position to / to rule, to reign, to exercise authority, to hold sway, to lord over']","lin 'مَلَك_V' = wmkV { cls = FormI ; imperfect = ""يَمْلِك"" ; perfect = ""مَلَك"" ; root = ""ملك"" } ;",lin have_1_V2 = 'مَلَك_V'
169,8035,show_2_V2,عرض,عَرَض,verb,عين-فتحة | راء-فتحة | ضاد-فتحة,"[to appear / to happen, to occur / to show, to display, to present / to expose]",EaraDa,113_20231122.1302_Q79_Q34,FormI,عرض,يَعْرِض,"fun 'عَرَض_V' : V ; -- source: wikitionary, idx: 8035, senses: ['to appear / to happen, to occur / to show, to display, to present / to expose']","lin 'عَرَض_V' = wmkV { cls = FormI ; imperfect = ""يَعْرِض"" ; perfect = ""عَرَض"" ; root = ""عرض"" } ;",lin show_2_V2 = 'عَرَض_V'


In [23]:
df_nouns

Unnamed: 0,wiki_idx,en_entry,ar,vocal_forms,wiki_pos,ar_letters,senses,tim_translit,file,gender,plural,root,abs,cnc,wordnet_ara
213,-1,chinese_N,صينىة,صِينِيَّة,noun,صاد-كسرة | ياء | نون-كسرة | ياء-شدة-فتحة | تاء مربوطة,[],Siyniy~ap,10_20231201.1559_Q79_Q34,fem,,,"fun 'صِينِيَّة_N' : N ; -- source: wikitionary, idx: -1, senses: ['']","lin 'صِينِيَّة_N' = wmkN { g = fem ; sg = ""صِينِيَّة"" } ;",lin chinese_N = 'صِينِيَّة_N'
231,-1,dari_N,دارية,دارِيَّة,noun,دال | ألف | راء-كسرة | ياء-شدة-فتحة | تاء مربوطة,[],dAriy~ap,15_20231201.1559_Q79_Q34,fem,,,"fun 'دارِيَّة_N' : N ; -- source: wikitionary, idx: -1, senses: ['']","lin 'دارِيَّة_N' = wmkN { g = fem ; sg = ""دارِيَّة"" } ;",lin dari_N = 'دارِيَّة_N'
69,-1,development_2_N,تطوير,تَطْوِير,noun,تاء-فتحة | طاء-سكون | واو-كسرة | ياء | راء,[],taTowiyr,29_20231122.1302_Q79_Q34,masc,,,"fun 'تَطْوِير_N' : N ; -- source: wikitionary, idx: -1, senses: ['']","lin 'تَطْوِير_N' = wmkN { g = masc ; sg = ""تَطْوِير"" } ;",lin development_2_N = 'تَطْوِير_N'
263,-1,italian_N,ايطالية,إِيطالِيَّة,noun,همزة تحت الألف-كسرة | ياء | طاء | ألف | لام-كسرة | ياء-شدة-فتحة | تاء مربوطة,[],<iyTAliy~ap,31_20231201.1559_Q79_Q34,fem,,,"fun 'إِيطالِيَّة_N' : N ; -- source: wikitionary, idx: -1, senses: ['']","lin 'إِيطالِيَّة_N' = wmkN { g = fem ; sg = ""إِيطالِيَّة"" } ;",lin italian_N = 'إِيطالِيَّة_N'
108,-1,krona_1_N,كرون,كَرون,noun,كاف-فتحة | راء | واو | نون,[],karwn,69_20231122.1302_Q79_Q34,fem,,,"fun 'كَرون_N' : N ; -- source: wikitionary, idx: -1, senses: ['']","lin 'كَرون_N' = wmkN { g = fem ; sg = ""كَرون"" } ;",lin krona_1_N = 'كَرون_N'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,3927,starch_1_N,نشاء,نَشَاء,noun,نون-فتحة | شين-فتحة | ألف | همزة,"[starch, cornstarch, farina]",na$aA',49_20231201.1559_Q79_Q34,masc,,نشو,"fun 'نَشَاء_N' : N ; -- source: wikitionary, idx: 3927, senses: ['starch, cornstarch, farina']","lin 'نَشَاء_N' = wmkN { g = masc ; root = ""نشو"" ; sg = ""نَشَاء"" } ;",lin starch_1_N = 'نَشَاء_N'
44,2554,age_1_N,عمر,عُمْر,noun,عين-ضمة | ميم-سكون | راء,"[life as a period of time, length of life, lifespan, lifetime / age]",Eumor,0_20231122.1302_Q79_Q34,masc,أَعْمَار,عمر,"fun 'عُمْر_N' : N ; -- source: wikitionary, idx: 2554, senses: ['life as a period of time, length of life, lifespan, lifetime / age']","lin 'عُمْر_N' = wmkN { g = masc ; pl = ""أَعْمَار"" ; root = ""عمر"" ; sg = ""عُمْر"" } ;",lin age_1_N = 'عُمْر_N'
145,5764,west_2_N,غرب,غَرْب,noun,غين-فتحة | راء-سكون | باء,"[verbal noun of غَرَبَ (ḡaraba) (form I) / west, occident / vehemence, violence, tempestuousness]",garob,136_20231122.1302_Q79_Q34,masc,,,"fun 'غَرْب_N' : N ; -- source: wikitionary, idx: 5764, senses: ['verbal noun of غَرَبَ (ḡaraba) (form I) / west, occident / vehemence, violence, tempestuousness']","lin 'غَرْب_N' = wmkN { g = masc ; sg = ""غَرْب"" } ;",lin west_2_N = 'غَرْب_N'
97,695,king_1_N,ملِك,مَلِك,noun,ميم-فتحة | لام-كسرة | كاف,"[king, sovereign, monarch]",malik,67_20231122.1302_Q79_Q34,masc,مُلُوك,,"fun 'مَلِك_N' : N ; -- source: wikitionary, idx: 695, senses: ['king, sovereign, monarch']","lin 'مَلِك_N' = wmkN { g = masc ; pl = ""مُلُوك"" ; sg = ""مَلِك"" } ;",lin king_1_N = 'مَلِك_N'


# Export GF Files

In [24]:
nouns_abs = "\n".join(df_nouns["abs"].to_list())
adjs_abs = "\n".join(df_adjs["abs"].to_list())
verbs_abs = "\n".join(df_verbs["abs"].to_list())

nouns_lin = "\n".join(df_nouns["cnc"].to_list())
adjs_lin = "\n".join(df_adjs["cnc"].to_list())
verbs_lin = "\n".join(df_verbs["cnc"].to_list())

nouns_wordnet = "\n".join(df_nouns["wordnet_ara"].to_list())
adjs_wordnet = "\n".join(df_adjs["wordnet_ara"].to_list())
verbs_wordnet = "\n".join(df_verbs["wordnet_ara"].to_list())

ar_absolute_grammar = "abstract MorphoDictAraAbs = Cat ** {\n"
ar_absolute_grammar += f"{nouns_abs}" + "\n"
ar_absolute_grammar += f"{adjs_abs}" + "\n"
ar_absolute_grammar += f"{verbs_abs}" + "\n"
ar_absolute_grammar += "}"

ar_concrete_grammar = "concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra, MoreAra in {\n"
ar_concrete_grammar += f"{nouns_lin}" + "\n"
ar_concrete_grammar += f"{adjs_lin}" + "\n"
ar_concrete_grammar += f"{verbs_lin}" + "\n"
ar_concrete_grammar += "}"

wordnet_ar = """--# -path=.:./gf-wordnet
concrete WordNetAra of WordNet = CatAra ** open MorphoDictAra, MoreAra, ParadigmsAra in {
lin en = variants {} ; --- guess from"""
wordnet_ar += f"{nouns_wordnet}" + "\n"
wordnet_ar += f"{adjs_wordnet}" + "\n"
wordnet_ar += f"{verbs_wordnet}" + "\n"
wordnet_ar += "}"

In [25]:
with open(output_dir / "MorphoDictAraAbs.gf", encoding="utf-8", mode="at") as file:
    file.write(ar_absolute_grammar)

In [26]:
with open(output_dir / "MorphoDictAra.gf", encoding="utf-8", mode="at") as file:
    file.write(ar_concrete_grammar)

In [27]:
with open(output_dir / "WordNetAra.gf", encoding="utf-8", mode="at") as file:
    file.write(wordnet_ar)

In [28]:
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk

# The word to be disambiguated
ambiguous_word = 'Egypt'

# The context sentence in which the word is used
english_context_sentence = ['the', 'branch', 'of', 'sociology', 'that', 'studies', 'the', 'characteristics', 'of', 'human', 'populations']

# Apply the Lesk algorithm using the English context sentence
best_sense = lesk(english_context_sentence, ambiguous_word)
print(best_sense.definition())

# If a best sense is found, find its Arabic equivalent
if best_sense:
    # Get the lemma names for the synset in Arabic ('arb')
    arabic_lemmas = best_sense.lemma_names('arb')
    if arabic_lemmas:
        print(f"Best English sense: {best_sense.name()}")
        # print(f"English definition: {best_sense.definition()}")
        print(f"Arabic equivalent(s): {', '.join(arabic_lemmas)}")
        print(f"ID                  : {best_sense.offset()}")

    else:
        print("No Arabic equivalent found.")
else:
    print("No best English sense found.")


a republic in northeastern Africa known as the United Arab Republic until 1971; site of an ancient civilization that flourished from 2600 to 30 BC
Best English sense: egypt.n.01
Arabic equivalent(s): أرْض_الكِنانة, جُمْهُورِيّة_مِصْر_العربِيّة, مِصْر
ID                  : 8897065
