In [1]:
from google.colab import files
uploaded = files.upload()

Saving Qabas-dataset.csv to Qabas-dataset.csv


In [None]:
# Install the necessary libraries to understand and preprocess Arabic letters
!pip install arabic_reshaper
!pip install python-bidi
!pip install farasapy

Collecting arabic_reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl.metadata (12 kB)
Downloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Installing collected packages: arabic_reshaper
Successfully installed arabic_reshaper-3.0.0
Collecting python-bidi
  Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (292 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.9/292.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-bidi
Successfully installed python-bidi-0.6.6
Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl.metadata (8.9 kB)
Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14


In [3]:
import pandas as pd
import numpy as np
import re
import arabic_reshaper
from bidi.algorithm import get_display
from farasa.segmenter import FarasaSegmenter
from farasa.ner import FarasaNamedEntityRecognizer
from farasa.pos import FarasaPOSTagger
from farasa.diacratizer import FarasaDiacritizer
from farasa.stemmer import FarasaStemmer
from collections import defaultdict

In [4]:
df = pd.read_csv('/content/Qabas-dataset.csv')
df.head()

Unnamed: 0,lemma_id,lemma,language,pos_cat,pos,root,augmentation,number,person,gender,voice,transitivity,uninflected
0,2023254710,سَاوِي,عامية,اسم,صفة,س و ي,,مفرد,,مذكر,,,
1,2023254711,رْكِيد,عامية,اسم,اسم,ر ك د,,مفرد,,مذكر,,,
2,2023254712,دُمَاجٌ,عامية,اسم,اسم,د م ج,,مفرد,,مذكر,,,
3,2023254713,دَامِرٌ,عامية,اسم,اسم,د م ر,,مفرد,,مذكر,,,
4,2023254714,جَعَارٌ,عامية,اسم,صفة,ج ع ر,,مفرد,,مذكر,,,


The dataset consists of lemma_id, lemma (the word), language (whether it is in spoken or written Arabic), its POS tag and the root of the word (meaning what letters compose of the word, written in their isolated form).

We will select the relevant columns we need that will be essential for preprocessing, feature extraction, model trainin and evaluation, and to consequently meet the goal of our project.

In [5]:
# Select relevant columns
df = df[['lemma_id', 'lemma', 'language', 'pos_cat', 'pos', 'root', 'gender']]

# Drop rows with missing lemmas
df = df.dropna(subset=['lemma'])

In [6]:
# Normalize Arabic Text: Remove diacritics, normalize Alef and Ya
def normalize_arabic(text):
    text = re.sub(r'[\u064B-\u065F]', '', text)  # Remove diacritics
    text = text.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")  # Normalize Alef
    text = text.replace("ى", "ي")  # Normalize Ya
    return text

df['normalized_lemma'] = df['lemma'].apply(normalize_arabic)
df.head()

Unnamed: 0,lemma_id,lemma,language,pos_cat,pos,root,gender,normalized_lemma
0,2023254710,سَاوِي,عامية,اسم,صفة,س و ي,مذكر,ساوي
1,2023254711,رْكِيد,عامية,اسم,اسم,ر ك د,مذكر,ركيد
2,2023254712,دُمَاجٌ,عامية,اسم,اسم,د م ج,مذكر,دماج
3,2023254713,دَامِرٌ,عامية,اسم,اسم,د م ر,مذكر,دامر
4,2023254714,جَعَارٌ,عامية,اسم,صفة,ج ع ر,مذكر,جعار


The new column consists of the lemmas in a normalized forms, with no diatrics and normalized letters so that we can remove any noise and focus only on the shape of the letters in their respect positions.

In [8]:
# Reshape for Proper Display (For Visualization)
df['reshaped_lemma'] = df['normalized_lemma'].apply(lambda x: get_display(arabic_reshaper.reshape(x)))

In [9]:
df.head()

Unnamed: 0,lemma_id,lemma,language,pos_cat,pos,root,gender,normalized_lemma,reshaped_lemma
0,2023254710,سَاوِي,عامية,اسم,صفة,س و ي,مذكر,ساوي,ﻱﻭﺎﺳ
1,2023254711,رْكِيد,عامية,اسم,اسم,ر ك د,مذكر,ركيد,ﺪﻴﻛﺭ
2,2023254712,دُمَاجٌ,عامية,اسم,اسم,د م ج,مذكر,دماج,ﺝﺎﻣﺩ
3,2023254713,دَامِرٌ,عامية,اسم,اسم,د م ر,مذكر,دامر,ﺮﻣﺍﺩ
4,2023254714,جَعَارٌ,عامية,اسم,صفة,ج ع ر,مذكر,جعار,ﺭﺎﻌﺟ


The reshaped_lemma represents each of the letters in their form based on the position in the lemma, as seperate letters, to seperate the letters into their roots without changing their shape in the word, to help the model better understand the letters regardless of their shape or position.

In [10]:
# Tokenizing words into characters
df['char_tokens'] = df['normalized_lemma'].apply(lambda x: list(x))

In [11]:
df.head()

Unnamed: 0,lemma_id,lemma,language,pos_cat,pos,root,gender,normalized_lemma,reshaped_lemma,char_tokens
0,2023254710,سَاوِي,عامية,اسم,صفة,س و ي,مذكر,ساوي,ﻱﻭﺎﺳ,"[س, ا, و, ي]"
1,2023254711,رْكِيد,عامية,اسم,اسم,ر ك د,مذكر,ركيد,ﺪﻴﻛﺭ,"[ر, ك, ي, د]"
2,2023254712,دُمَاجٌ,عامية,اسم,اسم,د م ج,مذكر,دماج,ﺝﺎﻣﺩ,"[د, م, ا, ج]"
3,2023254713,دَامِرٌ,عامية,اسم,اسم,د م ر,مذكر,دامر,ﺮﻣﺍﺩ,"[د, ا, م, ر]"
4,2023254714,جَعَارٌ,عامية,اسم,صفة,ج ع ر,مذكر,جعار,ﺭﺎﻌﺟ,"[ج, ع, ا, ر]"


In [12]:
# Define Arabic letter shape variations
letter_shapes = {
    'ا': ['ﺍ', 'ﺎ'], 'ب': ['ﺏ', 'ﺒ', 'ﺑ', 'ﺒ'], 'ت': ['ﺕ', 'ﺘ', 'ﺗ', 'ﺘ'],
    'ث': ['ﺙ', 'ﺜ', 'ﺛ', 'ﺜ'], 'ج': ['ﺝ', 'ﺠ', 'ﺟ', 'ﺠ'], 'ح': ['ﺡ', 'ﺤ', 'ﺣ', 'ﺤ'],
    'خ': ['ﺥ', 'ﺨ', 'ﺧ', 'ﺨ'], 'د': ['ﺩ', 'ﺪ'], 'ذ': ['ﺫ', 'ﺬ'], 'ر': ['ﺭ', 'ﺮ'],
    'ز': ['ﺯ', 'ﺰ'], 'س': ['ﺱ', 'ﺴ', 'ﺳ', 'ﺴ'], 'ش': ['ﺵ', 'ﺸ', 'ﺷ', 'ﺸ'],
    'ص': ['ﺹ', 'ﺼ', 'ﺻ', 'ﺼ'], 'ض': ['ﺽ', 'ﻀ', 'ﺿ', 'ﻀ'], 'ط': ['ﻁ', 'ﻄ', 'ﻃ', 'ﻄ'],
    'ظ': ['ﻅ', 'ﻈ', 'ﻇ', 'ﻈ'], 'ع': ['ﻉ', 'ﻌ', 'ﻋ', 'ﻌ'], 'غ': ['ﻍ', 'ﻐ', 'ﻏ', 'ﻐ'],
    'ف': ['ﻑ', 'ﻔ', 'ﻓ', 'ﻔ'], 'ق': ['ﻕ', 'ﻘ', 'ﻗ', 'ﻘ'], 'ك': ['ﻙ', 'ﻜ', 'ﻛ', 'ﻜ'],
    'ل': ['ﻝ', 'ﻠ', 'ﻟ', 'ﻠ'], 'م': ['ﻡ', 'ﻤ', 'ﻣ', 'ﻤ'], 'ن': ['ﻥ', 'ﻨ', 'ﻧ', 'ﻨ'],
    'ه': ['ﻩ', 'ﻬ', 'ﻫ', 'ﻬ'], 'و': ['ﻭ', 'ﻮ'], 'ي': ['ﻱ', 'ﻴ', 'ﻳ', 'ﻴ']
}

In [13]:
# Generate shape variations for each lemma
def get_letter_shapes(word):
    return [[letter] + letter_shapes.get(letter, []) for letter in word]

In [14]:
df['letter_shapes'] = df['normalized_lemma'].apply(get_letter_shapes)

In [15]:
df.head()

Unnamed: 0,lemma_id,lemma,language,pos_cat,pos,root,gender,normalized_lemma,reshaped_lemma,char_tokens,letter_shapes
0,2023254710,سَاوِي,عامية,اسم,صفة,س و ي,مذكر,ساوي,ﻱﻭﺎﺳ,"[س, ا, و, ي]","[[س, ﺱ, ﺴ, ﺳ, ﺴ], [ا, ﺍ, ﺎ], [و, ﻭ, ﻮ], [ي, ﻱ,..."
1,2023254711,رْكِيد,عامية,اسم,اسم,ر ك د,مذكر,ركيد,ﺪﻴﻛﺭ,"[ر, ك, ي, د]","[[ر, ﺭ, ﺮ], [ك, ﻙ, ﻜ, ﻛ, ﻜ], [ي, ﻱ, ﻴ, ﻳ, ﻴ], ..."
2,2023254712,دُمَاجٌ,عامية,اسم,اسم,د م ج,مذكر,دماج,ﺝﺎﻣﺩ,"[د, م, ا, ج]","[[د, ﺩ, ﺪ], [م, ﻡ, ﻤ, ﻣ, ﻤ], [ا, ﺍ, ﺎ], [ج, ﺝ,..."
3,2023254713,دَامِرٌ,عامية,اسم,اسم,د م ر,مذكر,دامر,ﺮﻣﺍﺩ,"[د, ا, م, ر]","[[د, ﺩ, ﺪ], [ا, ﺍ, ﺎ], [م, ﻡ, ﻤ, ﻣ, ﻤ], [ر, ﺭ,..."
4,2023254714,جَعَارٌ,عامية,اسم,صفة,ج ع ر,مذكر,جعار,ﺭﺎﻌﺟ,"[ج, ع, ا, ر]","[[ج, ﺝ, ﺠ, ﺟ, ﺠ], [ع, ﻉ, ﻌ, ﻋ, ﻌ], [ا, ﺍ, ﺎ], ..."


The letter_shapes were placed for each column because this can help the model understand that this is the same letter regardless of its position, to help the model better understand the Arabic language during training. In other words, this helps the model understand that even though a letter may appear differently depending on its position in a word, it is still the same underlying character, helping the model better the complex morphology of the Arabic letters.

In [22]:
# Save the cleaned dataset
df.to_csv('/content/Qabas-dataset-cleaned.csv', index=False)