# Imports

In [1]:
import re
from pathlib import Path
import pandas as pd
from unicodedata import normalize
from pyarabic.araby import DIACRITICS, SHADDA, LETTERS, is_arabicword
from time import time
from datetime import datetime

# Define paths

In [2]:
data_path = Path("../data/interim/lexicon/lexicon.xlsx")
# ar_adjectives_path = data_dir / "20240202.1101_Q79_Q34_Q16_adjectives_lexicon.csv"
# # ar_verbs_path = data_dir / "20231201.1559_Q79_Q34_verbs_lexicon.csv"
# ar_nouns_path = data_dir / "20240202.1101_Q79_Q34_Q16_nouns_lexicon.csv"
# ar_gn_path = Path("../data/interim/ambiguous/") / "20240202.1101_Q79_Q34_Q16_PN_ambiguous.csv"

In [3]:
output_dir = Path("../data/processed/gf")

In [4]:
pd.set_option('max_colwidth', 400)

# Define Variables

In [5]:
# Remove last Harakat from an Arabic Word
NOT_LETTERS_PATTERN = f"[^{LETTERS}]"
DIACRITICS_PATTERN = "".join(DIACRITICS)
LAST_HARAKAT_PATTERN = re.compile(
    rf"[{DIACRITICS_PATTERN}](?={NOT_LETTERS_PATTERN}*$)", re.UNICODE
)

In [6]:
# Set of Harakat
DIACRITICS_SET = set(DIACRITICS)  # Aabic diacritics/short vowels

In [7]:
by_pass_words = ["غير معدود"]


MORPHOLOGY_MAP = {
    "verb_form": "cls",
    "gender": "g",
    "root": "root",
    "plural": "pl",
    "masc_pl": "masc_pl",
    "fem_pl": "fem_pl",
    "imperfect": "imperfect",
}

# Define Functions

## Utilities Functions

In [8]:
def reorder_shadda(ar_string: str) -> str:
    """unicodedata.normalize put shadda before diacritics; not correct"""
    list_ar_str = list(ar_string)

    for i in range(len(list_ar_str) - 1):
        char = list_ar_str[i]
        next_char = list_ar_str[i + 1]

        if char in DIACRITICS_SET and next_char == SHADDA:
            list_ar_str[i], list_ar_str[i + 1] = (
                next_char,
                char,
            )  # Swap shadda and diacritic

    return "".join(list_ar_str)

In [9]:
def normalize_ar(ar_vocalized: str, verbose: bool = False) -> str:
    """get the normal form for the Unicode string unistr using NFC then fix the shadda order issue"""
    if not isinstance(ar_vocalized, str):
        return ar_vocalized
    ar_norm = normalize("NFC", ar_vocalized)
    ar_norm = reorder_shadda(ar_norm)
    if verbose:
        print([name(char) for char in ar_norm])
    return ar_norm

## Main Functions

In [10]:
def get_lin(row):
    row_dict = dict(row)
    dict_lins = {}
    for morpho, lin_value in row_dict.items():
        if lin_name := MORPHOLOGY_MAP.get(morpho):
            if isinstance(lin_value, str) and lin_value not in by_pass_words:
                if is_arabicword(lin_value):
                    lin_value = f'"{lin_value}"'
                dict_lins[lin_name] = f'{lin_name} = {lin_value}'
    return dict_lins

In [11]:
def build_gf_abstract_entries(row):
    cat = row["wiki_pos"][0].capitalize()
    lemma = row["vocal_forms"]
    idx = row["wiki_idx"]
    senses = row["senses"]
    source = "wikitionary"

    gf_fun_str = "fun '{}_{}' : {} ; "
    comment_str = "-- source: {}, idx: {}, senses: {}"

    gf_fun_str = gf_fun_str.format(lemma, cat, cat)
    comment_str = comment_str.format(source, idx, senses)

    dict_lins = get_lin(row)
    if cat == "V":
        LIN_ENTRY_V = ["cls", "imperfect", "perfect", "root"]
        list_lins = []
        for lin_entry in LIN_ENTRY_V:
            if lin_entry == "perfect":
                list_lins.append(f'perfect = "{lemma}"')
            else:
                if lin_fun := dict_lins.get(lin_entry):
                    list_lins.append(lin_fun)

    elif cat == "N":
        LIN_ENTRY_N = ["g", "pl", "root", "sg"]
        list_lins = []
        for lin_entry in LIN_ENTRY_N:
            if lin_entry == "sg":
                list_lins.append(f'sg = "{lemma}"')
            else:
                if lin_fun := dict_lins.get(lin_entry):
                    list_lins.append(lin_fun)

    elif cat == "A":
        LIN_ENTRY_A = ["fem_pl", "fem_sg", "masc_pl", "masc_sg", "root"]
        list_lins = []
        for lin_entry in LIN_ENTRY_A:
            if lin_entry == "fem_sg":
                if row["gender"] == "fem":
                    list_lins.append(f'fem_sg = "{lemma}"')
                else:
                    list_lins.append(f'fem_sg = "{row["other_gender_form"]}"')
            elif lin_entry == "masc_sg":
                if row["gender"] == "masc":
                    list_lins.append(f'masc_sg = "{lemma}"')
                else:
                    list_lins.append(f'masc_sg = "{row["other_gender_form"]}"')
            else:
                if lin_fun := dict_lins.get(lin_entry):
                    list_lins.append(lin_fun)

    elif cat == "GN":
        list_lins = [row[""]]

    str_lins = " ; ".join(list_lins)
    lin_entry = f"'{lemma}_{cat}'"
    lin = f"lin {lin_entry} = wmk{cat} {{ " + str_lins + " } ;"

    return f"{gf_fun_str}{comment_str}", lin

# Load Files

In [16]:
df_adjs = pd.read_excel(data_path, sheet_name="adjectives", index_col=0)
df_nouns = pd.read_excel(data_path, sheet_name="nouns", index_col=0)
df_verbs = pd.read_excel(data_path, sheet_name="verbs", index_col=0)

In [17]:
# df_adjs = pd.read_csv(ar_adjectives_path, index_col=0, converters={"senses": pd.eval})
# df_nouns = pd.read_csv(ar_nouns_path, index_col=0, converters={"senses": pd.eval})
# df_verbs = pd.read_csv(ar_verbs_path, index_col=0, converters={"senses": pd.eval})


In [18]:
df_nouns

Unnamed: 0_level_0,wiki_idx,en_entry,ar,vocal_forms,wiki_pos,ar_letters,senses,tim_translit,file,gender,plural,root
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,-1,chinese_N,صينىة,صِينِيَّة,noun,صاد-كسرة | ياء | نون-كسرة | ياء-شدة-فتحة | تاء مربوطة,"[""-""]",Siyniy~ap,10_20231201.1559_Q79_Q34,fem,,صين
1,-1,dari_N,دارية,دارِيَّة,noun,دال | ألف | راء-كسرة | ياء-شدة-فتحة | تاء مربوطة,"[""-""]",dAriy~ap,15_20231201.1559_Q79_Q34,fem,,داري
2,-1,development_2_N,تطوير,تَطْوِير,noun,تاء-فتحة | طاء-سكون | واو-كسرة | ياء | راء,"[""-""]",taTowiyr,29_20231122.1302_Q79_Q34,masc,,طور
3,-1,italian_N,ايطالية,إِيطالِيَّة,noun,همزة تحت الألف-كسرة | ياء | طاء | ألف | لام-كسرة | ياء-شدة-فتحة | تاء مربوطة,"[""-""]",<iyTAliy~ap,31_20231201.1559_Q79_Q34,fem,,
4,-1,krona_1_N,كرون,كَرون,noun,كاف-فتحة | راء | واو | نون,"[""-""]",karwn,69_20231122.1302_Q79_Q34,fem,,
...,...,...,...,...,...,...,...,...,...,...,...,...
118,117666,unemployment_N,بطالة,بِطَالَة,noun,باء-كسرة | طاء-فتحة | ألف | لام-فتحة | تاء مربوطة,['idleness / unemployment'],biTaAlap,,fem,,
119,2406,coptic_N,قبطي,قِبْطِيّ,noun,قاف-كسرة | باء-سكون | طاء-كسرة | ياء-شدة,['Copt'],qiboTiy~,,masc,,
120,3625,european_union_NP,أوروبي,أُورُوبِّيّ,noun,همزة على الألف-ضمة | واو | راء-ضمة | واو | باء-شدة-كسرة | ياء-شدة,['European'],>uwruwb~iy~,,masc,أُورُوبِّيُّونَ,
121,-1,yen_2_N,ين,يِن,noun,ياء-كسرة | نون-كسرة,"[""-""]",,,,,


# Build Abstract GF

In [19]:
df_adjs["other_gender_form"] = df_adjs.apply({"other_gender_form": normalize_ar})
df_adjs["other_gender_form"] = df_adjs.apply(
    {
        "other_gender_form": lambda s: LAST_HARAKAT_PATTERN.sub("", s)
        if isinstance(s, str)
        else s
    }
)
df_adjs["masc_pl"] = df_adjs.apply({"masc_pl": normalize_ar})
df_adjs["masc_pl"] = df_adjs.apply(
    {"masc_pl": lambda s: LAST_HARAKAT_PATTERN.sub("", s) if isinstance(s, str) else s}
)
df_adjs["fem_pl"] = df_adjs.apply({"fem_pl": normalize_ar})
df_adjs["fem_pl"] = df_adjs.apply(
    {"fem_pl": lambda s: LAST_HARAKAT_PATTERN.sub("", s) if isinstance(s, str) else s}
)
df_adjs["vocal_forms"] = df_adjs.apply({"vocal_forms": normalize_ar})
df_adjs["vocal_forms"] = df_adjs.apply(
    {"vocal_forms": lambda s: LAST_HARAKAT_PATTERN.sub("", s)}
)
df_adjs["abs"], df_adjs["cnc"] = zip(
    *df_adjs.apply(build_gf_abstract_entries, axis="columns")
)

In [20]:
df_adjs["wordnet_ara"] = df_adjs[["en_entry", "vocal_forms"]].apply(lambda x: f"lin {x[0]} = '{x[1]}_A'", axis=1)

In [21]:
df_nouns["vocal_forms"] = df_nouns.apply({"vocal_forms": normalize_ar})
df_nouns["vocal_forms"] = df_nouns.apply(
    {"vocal_forms": lambda s: LAST_HARAKAT_PATTERN.sub("", s)}
)
df_nouns["abs"], df_nouns["cnc"] = zip(
    *df_nouns.apply(build_gf_abstract_entries, axis="columns")
)

In [22]:
df_nouns["wordnet_ara"] = df_nouns[["en_entry", "vocal_forms"]].apply(lambda x: f"lin {x[0]} = '{x[1]}_N'", axis=1)

In [23]:
df_verbs["vocal_forms"] = df_verbs.apply({"vocal_forms": normalize_ar})
df_verbs["vocal_forms"] = df_verbs.apply(
    {
        "vocal_forms": lambda s: LAST_HARAKAT_PATTERN.sub("", s)
        if isinstance(s, str)
        else s
    }
)
df_verbs["imperfect"] = df_verbs.apply({"imperfect": normalize_ar})
df_verbs["imperfect"] = df_verbs.apply(
    {
        "imperfect": lambda s: LAST_HARAKAT_PATTERN.sub("", s)
        if isinstance(s, str)
        else s
    }
)
df_verbs["abs"], df_verbs["cnc"] = zip(
    *df_verbs.apply(build_gf_abstract_entries, axis="columns")
)

In [24]:
df_verbs["wordnet_ara"] = df_verbs[["en_entry", "vocal_forms"]].apply(lambda x: f"lin {x[0]} = '{x[1]}_V'", axis=1)

In [25]:
df_adjs.drop(["ar_letters", "senses"], axis=1)

Unnamed: 0_level_0,wiki_idx,en_entry,ar,vocal_forms,wiki_pos,tim_translit,gender,other_gender_form,masc_pl,fem_pl,root,abs,cnc,wordnet_ara
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,-1,added_A,مضاف,مُضاف,adj,muDAf,masc,مُضافة,مُضافون,مُضافات,ضيف,"fun 'مُضاف_A' : A ; -- source: wikitionary, idx: -1, senses: [""-""]","lin 'مُضاف_A' = wmkA { fem_pl = ""مُضافات"" ; fem_sg = ""مُضافة"" ; masc_pl = ""مُضافون"" ; masc_sg = ""مُضاف"" ; root = ""ضيف"" } ;",lin added_A = 'مُضاف_A'
1,-1,authoritarian_1_A,استبدادي,اِسْتِبْدادِي,adj,AisotibodAdiy,masc,اِسْتِبْدادِية,اِسْتِبْدادِيون,اِسْتِبْدادِيات,بدد,"fun 'اِسْتِبْدادِي_A' : A ; -- source: wikitionary, idx: -1, senses: [""-""]","lin 'اِسْتِبْدادِي_A' = wmkA { fem_pl = ""اِسْتِبْدادِيات"" ; fem_sg = ""اِسْتِبْدادِية"" ; masc_pl = ""اِسْتِبْدادِيون"" ; masc_sg = ""اِسْتِبْدادِي"" ; root = ""بدد"" } ;",lin authoritarian_1_A = 'اِسْتِبْدادِي_A'
2,-1,median_3_A,وسيط,وَسِيط,adj,AlwasiyT,masc,وَسِيط,وُسَطَاء,وُسَطَاء,وسط,"fun 'وَسِيط_A' : A ; -- source: wikitionary, idx: -1, senses: [""-""]","lin 'وَسِيط_A' = wmkA { fem_pl = ""وُسَطَاء"" ; fem_sg = ""وَسِيط"" ; masc_pl = ""وُسَطَاء"" ; masc_sg = ""وَسِيط"" ; root = ""وسط"" } ;",lin median_3_A = 'وَسِيط_A'
3,-1,nordic_2_A,شمال أوروبي,شَّمال أوروبي,adj,,masc,,,,,"fun 'شَّمال أوروبي_A' : A ; -- source: wikitionary, idx: -1, senses: [""-""]","lin 'شَّمال أوروبي_A' = wmkA { fem_sg = ""nan"" ; masc_sg = ""شَّمال أوروبي"" } ;",lin nordic_2_A = 'شَّمال أوروبي_A'
4,4973,domestic_1_A,محلي,مَحَلِّي,adj,maHal~iy~,masc,مَحَلِّيَّة,مَحَلِّيُّون,مَحَلِّيَّات,خلل,"fun 'مَحَلِّي_A' : A ; -- source: wikitionary, idx: 4973, senses: ['local, national']","lin 'مَحَلِّي_A' = wmkA { fem_pl = ""مَحَلِّيَّات"" ; fem_sg = ""مَحَلِّيَّة"" ; masc_pl = ""مَحَلِّيُّون"" ; masc_sg = ""مَحَلِّي"" ; root = ""خلل"" } ;",lin domestic_1_A = 'مَحَلِّي_A'
5,117599,following_2_A,تالي,تَالِي,adj,taAliy,masc,تَالِية,تَالِيون,تَالِيات,,"fun 'تَالِي_A' : A ; -- source: wikitionary, idx: 117599, senses: [""-""]","lin 'تَالِي_A' = wmkA { fem_pl = ""تَالِيات"" ; fem_sg = ""تَالِية"" ; masc_pl = ""تَالِيون"" ; masc_sg = ""تَالِي"" } ;",lin following_2_A = 'تَالِي_A'
6,10802,full_3_A,ممتلئ,مُمْتَلِئ,adj,mumotali},masc,مُمْتَلِئَة,مُمْتَلِئُون,مُمْتَلِئَات,ملء,"fun 'مُمْتَلِئ_A' : A ; -- source: wikitionary, idx: 10802, senses: ['full, filled, filled up, replete']","lin 'مُمْتَلِئ_A' = wmkA { fem_pl = ""مُمْتَلِئَات"" ; fem_sg = ""مُمْتَلِئَة"" ; masc_pl = ""مُمْتَلِئُون"" ; masc_sg = ""مُمْتَلِئ"" ; root = ""ملء"" } ;",lin full_3_A = 'مُمْتَلِئ_A'
7,124714,gross_1_A,إجمالي,إِجْمَالِي,adj,<ijomaAliy~,masc,إِجْمَالِيَّة,إِجْمَالِيُّون,إِجْمَالِيَّات,جمل,"fun 'إِجْمَالِي_A' : A ; -- source: wikitionary, idx: 124714, senses: ['comprehensive, general, total']","lin 'إِجْمَالِي_A' = wmkA { fem_pl = ""إِجْمَالِيَّات"" ; fem_sg = ""إِجْمَالِيَّة"" ; masc_pl = ""إِجْمَالِيُّون"" ; masc_sg = ""إِجْمَالِي"" ; root = ""جمل"" } ;",lin gross_1_A = 'إِجْمَالِي_A'
8,124077,individual_4_A,فردي,فَرْدِي,adj,farodiy~,masc,فَرْدِيَّة,فَرْدِيُّون,فَرْدِيَّات,فرد,"fun 'فَرْدِي_A' : A ; -- source: wikitionary, idx: 124077, senses: ['single / individual, personal / odd, uneven']","lin 'فَرْدِي_A' = wmkA { fem_pl = ""فَرْدِيَّات"" ; fem_sg = ""فَرْدِيَّة"" ; masc_pl = ""فَرْدِيُّون"" ; masc_sg = ""فَرْدِي"" ; root = ""فرد"" } ;",lin individual_4_A = 'فَرْدِي_A'
9,115192,moderate_1_A,معتدل,مُعْتَدِل,adj,muEotadil,masc,مُعْتَدِلَة,مُعْتَدِلُون,مُعْتَدِلَات,عدل,"fun 'مُعْتَدِل_A' : A ; -- source: wikitionary, idx: 115192, senses: ['straight, even, proportionate / temperate, mild, moderate']","lin 'مُعْتَدِل_A' = wmkA { fem_pl = ""مُعْتَدِلَات"" ; fem_sg = ""مُعْتَدِلَة"" ; masc_pl = ""مُعْتَدِلُون"" ; masc_sg = ""مُعْتَدِل"" ; root = ""عدل"" } ;",lin moderate_1_A = 'مُعْتَدِل_A'


In [26]:
df_verbs.drop(["ar_letters", "senses"], axis=1)

Unnamed: 0_level_0,wiki_idx,en_entry,ar,vocal_forms,wiki_pos,tim_translit,file,verb_form,root,imperfect,abs,cnc,wordnet_ara
Column1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,116328,consider_6_V3,اعتبر,اِعْتَبَر,verb,AiEotabar,20_20231122.1302_Q79_Q34,FormVIII,عبر,يَعْتَبِر,"fun 'اِعْتَبَر_V' : V ; -- source: wikitionary, idx: 116328, senses: ['to take example, to take warning, to learn a lesson (بِـ (bi-): from) / to acknowledge, value, respect / to consider, to regard as / to examine, to investigate, to put to test']","lin 'اِعْتَبَر_V' = wmkV { cls = FormVIII ; imperfect = ""يَعْتَبِر"" ; perfect = ""اِعْتَبَر"" ; root = ""عبر"" } ;",lin consider_6_V3 = 'اِعْتَبَر_V'
1,20863,succeed_V2,نجح,نَجَح,verb,najaHa,50_20231201.1559_Q79_Q34,FormI,نجح,يَنْجَح,"fun 'نَجَح_V' : V ; -- source: wikitionary, idx: 20863, senses: ['to succeed, to be successful']","lin 'نَجَح_V' = wmkV { cls = FormI ; imperfect = ""يَنْجَح"" ; perfect = ""نَجَح"" ; root = ""نجح"" } ;",lin succeed_V2 = 'نَجَح_V'
2,8070,rank_2_V2,صنف,صَنَّف,verb,San~afa,100_20231122.1302_Q79_Q34,FormII,صنف,يُصَنِّف,"fun 'صَنَّف_V' : V ; -- source: wikitionary, idx: 8070, senses: ['to sort, classify, or categorize something / to compile or compose']","lin 'صَنَّف_V' = wmkV { cls = FormII ; imperfect = ""يُصَنِّف"" ; perfect = ""صَنَّف"" ; root = ""صنف"" } ;",lin rank_2_V2 = 'صَنَّف_V'
3,148,designate_4_V2,عين,عَيَّن,verb,Eay~ana,18_20231201.1559_Q79_Q34,FormII,عين,يُعَيِّن,"fun 'عَيَّن_V' : V ; -- source: wikitionary, idx: 148, senses: ['to appoint, to assign, to name, to nominate / to define / to determine, to fix, to identify, to specify / to allocate, to allot, to apportion, to earmark, to itemize, to set aside']","lin 'عَيَّن_V' = wmkV { cls = FormII ; imperfect = ""يُعَيِّن"" ; perfect = ""عَيَّن"" ; root = ""عين"" } ;",lin designate_4_V2 = 'عَيَّن_V'
4,4743,reach_V2,وصل,وَصَل,verb,waSala,158_20231122.1302_Q79_Q34,FormI,وصل,يَصِل,"fun 'وَصَل_V' : V ; -- source: wikitionary, idx: 4743, senses: ['(transitive, with إِلَى (ʔilā)) to arrive (“to reach some place”), to arrive (“to reach some place”)']","lin 'وَصَل_V' = wmkV { cls = FormI ; imperfect = ""يَصِل"" ; perfect = ""وَصَل"" ; root = ""وصل"" } ;",lin reach_V2 = 'وَصَل_V'
5,6248,speak_3_V2,تكلم,تَكَلَّم,verb,takal~ama,118_20231122.1302_Q79_Q34,FormV,كلم,يَتَكَلَّم,"fun 'تَكَلَّم_V' : V ; -- source: wikitionary, idx: 6248, senses: ['to talk, to have a discussion or conversation']","lin 'تَكَلَّم_V' = wmkV { cls = FormV ; imperfect = ""يَتَكَلَّم"" ; perfect = ""تَكَلَّم"" ; root = ""كلم"" } ;",lin speak_3_V2 = 'تَكَلَّم_V'
6,698,have_1_V2,ملك,مَلَك,verb,malaka,54_20231122.1302_Q79_Q34,FormI,ملك,يَمْلِك,"fun 'مَلَك_V' : V ; -- source: wikitionary, idx: 698, senses: ['to take in possession, to take over, to acquire, to seize / to possess, to lay hold, to own, to have, to be the owner / to dominate, to control / to be the master / to be capable, to be able, to be in a position to / to rule, to reign, to exercise authority, to hold sway, to lord over']","lin 'مَلَك_V' = wmkV { cls = FormI ; imperfect = ""يَمْلِك"" ; perfect = ""مَلَك"" ; root = ""ملك"" } ;",lin have_1_V2 = 'مَلَك_V'
7,8035,show_2_V2,عرض,عَرَض,verb,EaraDa,113_20231122.1302_Q79_Q34,FormI,عرض,يَعْرِض,"fun 'عَرَض_V' : V ; -- source: wikitionary, idx: 8035, senses: ['to appear / to happen, to occur / to show, to display, to present / to expose']","lin 'عَرَض_V' = wmkV { cls = FormI ; imperfect = ""يَعْرِض"" ; perfect = ""عَرَض"" ; root = ""عرض"" } ;",lin show_2_V2 = 'عَرَض_V'


In [27]:
df_nouns.drop(["ar_letters", "senses"], axis=1)

Unnamed: 0_level_0,wiki_idx,en_entry,ar,vocal_forms,wiki_pos,tim_translit,file,gender,plural,root,abs,cnc,wordnet_ara
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,-1,chinese_N,صينىة,صِينِيَّة,noun,Siyniy~ap,10_20231201.1559_Q79_Q34,fem,,صين,"fun 'صِينِيَّة_N' : N ; -- source: wikitionary, idx: -1, senses: [""-""]","lin 'صِينِيَّة_N' = wmkN { g = fem ; root = ""صين"" ; sg = ""صِينِيَّة"" } ;",lin chinese_N = 'صِينِيَّة_N'
1,-1,dari_N,دارية,دارِيَّة,noun,dAriy~ap,15_20231201.1559_Q79_Q34,fem,,داري,"fun 'دارِيَّة_N' : N ; -- source: wikitionary, idx: -1, senses: [""-""]","lin 'دارِيَّة_N' = wmkN { g = fem ; root = ""داري"" ; sg = ""دارِيَّة"" } ;",lin dari_N = 'دارِيَّة_N'
2,-1,development_2_N,تطوير,تَطْوِير,noun,taTowiyr,29_20231122.1302_Q79_Q34,masc,,طور,"fun 'تَطْوِير_N' : N ; -- source: wikitionary, idx: -1, senses: [""-""]","lin 'تَطْوِير_N' = wmkN { g = masc ; root = ""طور"" ; sg = ""تَطْوِير"" } ;",lin development_2_N = 'تَطْوِير_N'
3,-1,italian_N,ايطالية,إِيطالِيَّة,noun,<iyTAliy~ap,31_20231201.1559_Q79_Q34,fem,,,"fun 'إِيطالِيَّة_N' : N ; -- source: wikitionary, idx: -1, senses: [""-""]","lin 'إِيطالِيَّة_N' = wmkN { g = fem ; sg = ""إِيطالِيَّة"" } ;",lin italian_N = 'إِيطالِيَّة_N'
4,-1,krona_1_N,كرون,كَرون,noun,karwn,69_20231122.1302_Q79_Q34,fem,,,"fun 'كَرون_N' : N ; -- source: wikitionary, idx: -1, senses: [""-""]","lin 'كَرون_N' = wmkN { g = fem ; sg = ""كَرون"" } ;",lin krona_1_N = 'كَرون_N'
...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,117666,unemployment_N,بطالة,بِطَالَة,noun,biTaAlap,,fem,,,"fun 'بِطَالَة_N' : N ; -- source: wikitionary, idx: 117666, senses: ['idleness / unemployment']","lin 'بِطَالَة_N' = wmkN { g = fem ; sg = ""بِطَالَة"" } ;",lin unemployment_N = 'بِطَالَة_N'
119,2406,coptic_N,قبطي,قِبْطِي,noun,qiboTiy~,,masc,,,"fun 'قِبْطِي_N' : N ; -- source: wikitionary, idx: 2406, senses: ['Copt']","lin 'قِبْطِي_N' = wmkN { g = masc ; sg = ""قِبْطِي"" } ;",lin coptic_N = 'قِبْطِي_N'
120,3625,european_union_NP,أوروبي,أُورُوبِّي,noun,>uwruwb~iy~,,masc,أُورُوبِّيُّونَ,,"fun 'أُورُوبِّي_N' : N ; -- source: wikitionary, idx: 3625, senses: ['European']","lin 'أُورُوبِّي_N' = wmkN { g = masc ; pl = ""أُورُوبِّيُّونَ"" ; sg = ""أُورُوبِّي"" } ;",lin european_union_NP = 'أُورُوبِّي_N'
121,-1,yen_2_N,ين,يِن,noun,,,,,,"fun 'يِن_N' : N ; -- source: wikitionary, idx: -1, senses: [""-""]","lin 'يِن_N' = wmkN { sg = ""يِن"" } ;",lin yen_2_N = 'يِن_N'


# Export GF Files

In [28]:
nouns_abs = "\n".join(df_nouns["abs"].drop_duplicates().to_list())
adjs_abs = "\n".join(df_adjs["abs"].drop_duplicates().to_list())
verbs_abs = "\n".join(df_verbs["abs"].drop_duplicates().to_list())

nouns_lin = "\n".join(df_nouns["cnc"].drop_duplicates().to_list())
adjs_lin = "\n".join(df_adjs["cnc"].drop_duplicates().to_list())
verbs_lin = "\n".join(df_verbs["cnc"].drop_duplicates().to_list())

nouns_wordnet = "\n".join(df_nouns["wordnet_ara"].drop_duplicates().to_list())
adjs_wordnet = "\n".join(df_adjs["wordnet_ara"].drop_duplicates().to_list())
verbs_wordnet = "\n".join(df_verbs["wordnet_ara"].drop_duplicates().to_list())

ar_absolute_grammar = "abstract MorphoDictAraAbs = Cat ** {\n"
ar_absolute_grammar += f"{nouns_abs}" + "\n"
ar_absolute_grammar += f"{adjs_abs}" + "\n"
ar_absolute_grammar += f"{verbs_abs}" + "\n"
ar_absolute_grammar += "}"

ar_concrete_grammar = "concrete MorphoDictAra of MorphoDictAraAbs = CatAra ** open ParadigmsAra, MoreAra in {\n"
ar_concrete_grammar += f"{nouns_lin}" + "\n"
ar_concrete_grammar += f"{adjs_lin}" + "\n"
ar_concrete_grammar += f"{verbs_lin}" + "\n"
ar_concrete_grammar += "}"

wordnet_ar = """--# -path=.:./gf-wordnet
concrete WordNetAra of WordNet = CatAra ** open MorphoDictAra, MoreAra, ParadigmsAra in {
lin en = variants {} ; --- guess from\n"""
wordnet_ar += f"{nouns_wordnet}" + "\n"
wordnet_ar += f"{adjs_wordnet}" + "\n"
wordnet_ar += f"{verbs_wordnet}" + "\n"
wordnet_ar += "}"

In [32]:
with open(output_dir / "MorphoDictAraAbs.gf", encoding="utf-8", mode="wt") as file:
    file.write(ar_absolute_grammar)

In [33]:
with open(output_dir / "MorphoDictAra.gf", encoding="utf-8", mode="wt") as file:
    file.write(ar_concrete_grammar)

In [34]:
with open(output_dir / "WordNetAra.gf", encoding="utf-8", mode="wt") as file:
    file.write(wordnet_ar)