# Imports

In [1]:
import re
from pathlib import Path
import pandas as pd
from unicodedata import normalize
from pyarabic.araby import DIACRITICS, SHADDA, LETTERS, is_arabicword

# Define Variables

In [2]:
# Regex used to remove the last harakat (diacritics) of an Arabic word.
# It matches a diacritic only when everything after it, up to the end of the
# string, is a non-letter character.
DIACRITICS_PATTERN = "".join(DIACRITICS)
NOT_LETTERS_PATTERN = "[^{}]".format(LETTERS)
LAST_HARAKAT_PATTERN = re.compile(
    "[{}](?={}*$)".format(DIACRITICS_PATTERN, NOT_LETTERS_PATTERN),
    re.UNICODE,
)

# Maps a lexicon column name (key) to the field label written into the
# generated `lin` record (value). Columns that are not listed are ignored
# by get_lin.
MORPHOLOGY_MAP = dict(
    gender="g",
    root="root",
    plural="pl",
    masc_pl="masc_pl",
    fem_pl="fem_pl",
    imperfect="imperfect",
    verb_form="cls",
)

# Arabic diacritics (harakat / short vowels) as a set, for fast membership
# tests when scanning a string character by character.
DIACRITICS_SET = {*DIACRITICS}

# Define Paths

In [3]:
# Directory holding the interim lexicon CSV files, relative to the notebook.
data_dir = Path("../data/interim/lexicon")

# One lexicon file per part of speech; all three share the same
# "20231028.172908" filename prefix.
ar_adjectives_path = data_dir.joinpath("20231028.172908_adjectives_lexicon.csv")
ar_nouns_path = data_dir.joinpath("20231028.172908_nouns_lexicon.csv")
ar_verbs_path = data_dir.joinpath("20231028.172908_verbs_lexicon.csv")

# Define Functions

## Utility Functions

In [4]:
def reorder_shadda(ar_string: str) -> str:
    """Move each shadda in front of the diacritic that directly precedes it.

    Intended to repair the mark order left by ``unicodedata.normalize``:
    wherever a diacritic is immediately followed by a shadda, the two
    characters are swapped so that the shadda comes first.

    Args:
        ar_string: Arabic text, possibly vocalized.

    Returns:
        A new string of the same length with the adjacent
        (diacritic, shadda) pairs swapped.
    """
    chars = list(ar_string)

    # Single left-to-right pass over adjacent pairs. The swap is done in
    # place, so a pair examined later sees the result of earlier swaps.
    for right in range(1, len(chars)):
        left = right - 1
        if chars[right] == SHADDA and chars[left] in DIACRITICS_SET:
            chars[left], chars[right] = chars[right], chars[left]

    return "".join(chars)

In [5]:
def normalize_ar(ar_vocalized: str, verbose: bool = False) -> str:
    """Return the NFC-normalized form of an Arabic string with the shadda order fixed.

    The string is first normalized with ``unicodedata.normalize("NFC", ...)``
    and then passed through ``reorder_shadda``.

    Args:
        ar_vocalized: Vocalized Arabic string to normalize.
        verbose: When True, print the Unicode name of every character of the
            result (useful to inspect the order of the combining marks).

    Returns:
        The normalized string.
    """
    ar_norm = reorder_shadda(normalize("NFC", ar_vocalized))

    if verbose:
        # Imported locally: only `normalize` is imported from unicodedata at
        # the top of the notebook, so a bare `name(...)` is undefined here.
        import unicodedata

        # Characters without a Unicode name fall back to their code point
        # instead of raising ValueError.
        print(
            [unicodedata.name(char, f"U+{ord(char):04X}") for char in ar_norm]
        )

    return ar_norm

## Main Functions

In [6]:
def get_lin(row):
    """Collect the ``feature = value`` fragments of a row's morphology columns.

    Only columns listed in ``MORPHOLOGY_MAP`` are used; the mapped label
    becomes the left-hand side of the fragment. Values accepted by
    ``is_arabicword`` are wrapped in double quotes, all other values are
    inserted as they are.

    Args:
        row: A mapping-like row (anything accepted by ``dict(row)``).

    Returns:
        A list of strings, in the column order of ``row``.
    """
    fragments = []
    for column, value in dict(row).items():
        feature = MORPHOLOGY_MAP.get(column)
        if not feature:
            continue
        # NOTE(review): missing cells (e.g. NaN) are handed to is_arabicword
        # unchanged -- confirm it copes with non-string input.
        rendered = f'"{value}"' if is_arabicword(value) else value
        fragments.append(f'{feature} = {rendered}')
    return fragments

In [7]:
def _strip_final_harakat(form):
    """Normalize an Arabic form and remove what LAST_HARAKAT_PATTERN matches."""
    return LAST_HARAKAT_PATTERN.sub("", normalize_ar(form))


def build_gf_abstract_entries(row):
    """Build the GF abstract (``fun``) and concrete (``lin``) lines for one lexicon row.

    Args:
        row: Mapping-like lexicon row (for example a DataFrame row) providing
            "en", "vocal_forms", "wiki_idx" and "senses". Adjective rows
            additionally use "gender" and "other_gender_form".

    Returns:
        A 2-tuple ``(abstract, concrete)``:
        - ``abstract``: ``fun <lemma>_<cat> : <cat> ; -- source: ..., idx: ..., senses: ...``
        - ``concrete``: ``lin '<lemma>_<cat>' = wmk<cat> { <field> = <value> ; ... } ;``
    """
    # The category is the FIRST CHARACTER of the last "_"-separated part of
    # the English identifier, so it is always a single character.
    cat = row["en"].split("_")[-1][0]
    lemma = _strip_final_harakat(row["vocal_forms"])
    idx = row["wiki_idx"]
    senses = row["senses"]
    # NOTE(review): "wikitionary" looks like a misspelling of "wiktionary";
    # kept as is because it is part of the emitted text.
    source = "wikitionary"

    gf_fun_str = "fun {}_{} : {} ; ".format(lemma, cat, cat)
    comment_str = "-- source: {}, idx: {}, senses: {}".format(source, idx, senses)

    list_lins = get_lin(row)
    if cat == "V":
        list_lins.append(f'perfect = "{lemma}"')
    elif cat == "N":
        list_lins.append(f'sg = "{lemma}"')
    elif cat == "A":
        # `cat` is one character, so the adjective category is "A"; comparing
        # against "adj" could never match and left this branch unreachable.
        if row.get("gender") == "masc":
            own_field, other_field = "masc_sg", "fem_sg"
        else:
            own_field, other_field = "fem_sg", "masc_sg"
        list_lins.append(f'{own_field} = "{lemma}"')

        # Emit the other gender only when the row actually has one: a missing
        # cell is not a string and cannot be normalized.
        other_form = row.get("other_gender_form")
        if isinstance(other_form, str) and other_form:
            lemma_otherg = _strip_final_harakat(other_form)
            list_lins.append(f'{other_field} = "{lemma_otherg}"')

    str_lins = " ; ".join(list_lins)
    lin_entry = f"'{lemma}_{cat}'"
    # Doubled braces produce the literal "{" and "}" of the GF record.
    # A single "{ {str_lins} }" is a Python set literal and would insert the
    # repr of a set into the output.
    lin = f"lin {lin_entry} = wmk{cat} {{ {str_lins} }} ;"

    return f"{gf_fun_str}{comment_str}", lin

# Load CSV Files

In [8]:
# Load the three lexicons with identical options: the first CSV column is the
# index and the "senses" column is parsed from its textual form by pd.eval.
# NOTE(review): pd.eval evaluates the cell text as an expression -- only use
# it on trusted files.
df_adjs, df_nouns, df_verbs = (
    pd.read_csv(lexicon_path, index_col=0, converters={"senses": pd.eval})
    for lexicon_path in (ar_adjectives_path, ar_nouns_path, ar_verbs_path)
)


# Build Abstract and Concrete GF Entries

In [9]:
# build_gf_abstract_entries returns an (abstract, concrete) pair per row.
# result_type="expand" turns those pairs into a two-column frame, which is
# then assigned to the "abs" and "cnc" columns. Unpacking the Series returned
# by a plain apply() would iterate over the rows, not over the pair.
df_adjs[["abs", "cnc"]] = df_adjs.apply(
    build_gf_abstract_entries, axis="columns", result_type="expand"
)
df_nouns[["abs", "cnc"]] = df_nouns.apply(
    build_gf_abstract_entries, axis="columns", result_type="expand"
)

# Clean the imperfect forms the same way as the lemmas (normalize, then strip
# the final harakat). Missing values are left untouched.
df_verbs["imperfect"] = (
    df_verbs["imperfect"]
    .map(normalize_ar, na_action="ignore")
    .map(lambda form: LAST_HARAKAT_PATTERN.sub("", form), na_action="ignore")
)
# The concrete entries of the verbs belong in df_verbs, not in df_nouns.
df_verbs[["abs", "cnc"]] = df_verbs.apply(
    build_gf_abstract_entries, axis="columns", result_type="expand"
)


TypeError: unhashable type: 'set'