In [3]:
# Install the required packages
!pip install python-bidi arabic-reshaper

import pandas as pd
import re
import unicodedata
import arabic_reshaper
from bidi.algorithm import get_display

Defaulting to user installation because normal site-packages is not writeable
Collecting arabic-reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl.metadata (12 kB)
Downloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Installing collected packages: arabic-reshaper
Successfully installed arabic-reshaper-3.0.0


In [24]:
import json

# Read the prepared shape -> base-letter lookup from its UTF-8 encoded JSON file.
with open("clean_shape_to_base.json", mode="r", encoding="utf-8") as mapping_file:
    shape_to_base = json.load(mapping_file)

In [5]:
# Load the Qabas lemma table from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer="dataset/Qabas-dataset.csv")
df.head()

Unnamed: 0,lemma_id,lemma,language,pos_cat,pos,root,augmentation,number,person,gender,voice,transitivity,uninflected
0,2023254710,سَاوِي,عامية,اسم,صفة,س و ي,,مفرد,,مذكر,,,
1,2023254711,رْكِيد,عامية,اسم,اسم,ر ك د,,مفرد,,مذكر,,,
2,2023254712,دُمَاجٌ,عامية,اسم,اسم,د م ج,,مفرد,,مذكر,,,
3,2023254713,دَامِرٌ,عامية,اسم,اسم,د م ر,,مفرد,,مذكر,,,
4,2023254714,جَعَارٌ,عامية,اسم,صفة,ج ع ر,,مفرد,,مذكر,,,


In [6]:
# Step 2: Define Arabic diacritics regex
# Pre-compiled character class; each match is one code point from the ranges
# U+0610-U+061A, U+064B-U+065F, U+0670, U+06D6-U+06DC, U+06DF-U+06E8 and
# U+06EA-U+06ED. Compiled once here so later cells can reuse the same object.
# NOTE(review): U+0640 (tatweel) is not part of the class, so it is kept in
# the text - confirm that this is intended.
diacritics = re.compile(r'[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E8\u06EA-\u06ED]')

In [7]:
# Step 3: Build shape_to_base dictionary from Arabic Presentation Forms
#
# Maps each single-letter contextual glyph (isolated / initial / medial / final)
# in the two scanned code point ranges to its base letter.
#
# The base letter is taken from the Unicode compatibility decomposition of the
# glyph (e.g. "<final> 0627"), not from a substring search over character
# names. A name search picks the first code point whose name merely *contains*
# the letter name, which yields wrong bases (for example "AIN" is found inside
# "ZAIN", and "ALEF" inside the names of earlier combining signs).
presentation_forms = [(0xFB50, 0xFDFF), (0xFE70, 0xFEFF)]
shape_to_base = {}

# Decomposition tags that mark a positional presentation form.
_form_tags = {"<isolated>", "<initial>", "<medial>", "<final>"}

for start, end in presentation_forms:
    for cp in range(start, end + 1):
        char = chr(cp)
        # Unassigned code points have no name; the default avoids ValueError.
        name = unicodedata.name(char, "")
        if "ARABIC LETTER" not in name:
            continue  # skip ligatures, symbols and unassigned code points
        parts = unicodedata.decomposition(char).split()
        # Keep only "<tag> XXXX": a positional form of exactly one base letter.
        if len(parts) == 2 and parts[0] in _form_tags:
            shape_to_base[char] = chr(int(parts[1], 16))

In [19]:
# Step 4: Clean & tokenize each word
def normalize_and_tokenize(word):
    """Return the characters of ``word`` after diacritic removal and reshaping.

    The input is coerced with ``str()``, every match of the module-level
    ``diacritics`` pattern is removed, and the remainder is passed through
    ``arabic_reshaper.reshape``. The reshaped string is returned as a list
    with one element per character.
    """
    stripped = re.sub(diacritics, '', str(word))
    return [glyph for glyph in arabic_reshaper.reshape(stripped)]

In [20]:
# Step 5: Get position tag
def get_position_tags(letters):
    """Return one position tag per element of ``letters``.

    Tags are assigned purely by index in the sequence: the first element is
    ``'initial'``, the last is ``'final'``, everything in between is
    ``'medial'``, and a lone element is ``'isolated'``.

    Args:
        letters: A sized sequence (list or string) of letters.

    Returns:
        A list of tag strings with the same length as ``letters``. An empty
        input yields an empty list.
    """
    count = len(letters)
    if count == 0:
        # Without this guard the general formula would return two tags
        # (['initial', 'final']) for zero letters.
        return []
    if count == 1:
        return ['isolated']
    return ['initial'] + ['medial'] * (count - 2) + ['final']

In [25]:
# Step 6: Process each word in the dataset
# One record is produced per character of every non-null lemma: the original
# word, the reshaped character, its base letter and its position tag.
records = []

for word in df['lemma'].dropna():
    tokens = normalize_and_tokenize(word)
    positions = get_position_tags(tokens)
    records.extend(
        {
            'word': word,
            'shape_variant': letter,
            # Characters missing from the lookup are kept unchanged.
            'base_form': shape_to_base.get(letter, letter),
            'position': pos,
        }
        for letter, pos in zip(tokens, positions)
    )

In [28]:
# Step 7: Save final DataFrame
# Turn the collected per-letter records into a table and write it to CSV
# without the row index.
final_df = pd.DataFrame(data=records)
final_df.to_csv(path_or_buf="Qabas-dataset-cleaned.csv", index=False)

In [29]:
# Preview the first 20 rows of the per-letter table.
final_df.head(20)

Unnamed: 0,word,shape_variant,base_form,position
0,سَاوِي,ﺳ,س,initial
1,سَاوِي,ﺎ,ا,medial
2,سَاوِي,ﻭ,و,medial
3,سَاوِي,ﻱ,ي,final
4,رْكِيد,ﺭ,ر,initial
5,رْكِيد,ﻛ,ك,medial
6,رْكِيد,ﻴ,ي,medial
7,رْكِيد,ﺪ,د,final
8,دُمَاجٌ,ﺩ,د,initial
9,دُمَاجٌ,ﻣ,م,medial


In [32]:
# Reload the per-letter table from the CSV written in Step 7.
# The path matches the location Step 7 writes to (the working directory), so
# the notebook reads the file it just produced, not a copy placed elsewhere.
df2 = pd.read_csv("Qabas-dataset-cleaned.csv")

In [34]:
# Step 1: Create unique integer encodings for base_form and position
# Base letters are numbered in sorted order of their distinct values;
# positions are numbered in the fixed order listed below.
base_letters = sorted(df2['base_form'].unique())
positions = ['isolated', 'initial', 'medial', 'final']

# Lookup tables: value -> integer index
base_letter_to_idx = dict(zip(base_letters, range(len(base_letters))))
position_to_idx = dict(zip(positions, range(len(positions))))

# Step 2: Apply integer encodings to the dataframe
# Adds two integer columns next to the text columns they encode.
df2['base_form_idx'] = df2['base_form'].map(base_letter_to_idx)
df2['position_idx'] = df2['position'].map(position_to_idx)

In [35]:
# Create dictionaries for encoding
# NOTE(review): rebuilds the same two lookup tables from `base_letters` and
# `positions` that the preceding encoding cell already defines.
base_letter_to_idx = dict(zip(base_letters, range(len(base_letters))))
position_to_idx = dict(zip(positions, range(len(positions))))

In [36]:
# Step 2: Apply integer encodings to the dataframe
# NOTE(review): repeats the column assignments already made in the encoding
# cell above; re-running it yields the same two columns.
encoded_base = df2['base_form'].map(base_letter_to_idx)
df2['base_form_idx'] = encoded_base
encoded_position = df2['position'].map(position_to_idx)
df2['position_idx'] = encoded_position

In [39]:
# Write the encoded per-letter table to CSV without the row index.
df2.to_csv(path_or_buf="encoded_char_features.csv", index=False)

In [40]:
# Read the encoded table back from disk and preview its first 20 rows.
data = pd.read_csv(filepath_or_buffer="encoded_char_features.csv")
data.head(20)

Unnamed: 0,word,shape_variant,base_form,position,base_form_idx,position_idx
0,سَاوِي,ﺳ,س,initial,25,1
1,سَاوِي,ﺎ,ا,medial,14,2
2,سَاوِي,ﻭ,و,medial,41,2
3,سَاوِي,ﻱ,ي,final,42,3
4,رْكِيد,ﺭ,ر,initial,23,1
5,رْكِيد,ﻛ,ك,medial,36,2
6,رْكِيد,ﻴ,ي,medial,42,2
7,رْكِيد,ﺪ,د,final,21,3
8,دُمَاجٌ,ﺩ,د,initial,21,1
9,دُمَاجٌ,ﻣ,م,medial,38,2
