In [4]:
# GDRIVE_ID_DATA = "1ONRQ36PFPnYNA4R6ZlmM7UQJ4LiAzEH0"
# !gdown $GDRIVE_ID_DATA -O Arabic-Text-Diacritization.zip
# !unzip Arabic-Text-Diacritization.zip

<h1> Arabic letters and diacritics

In [5]:
import pickle
import re
import pyarabic.araby as araby
import pyarabic.number as number
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import unicodedata
import tensorflow as tf
from tensorflow.keras import layers, models
import joblib

## GPU Configuration for Kaggle
Check GPU availability and configure TensorFlow to use GPU

In [6]:
# Check if GPU is available
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Get GPU details
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Set memory growth to avoid OOM errors
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        
        print(f"GPU detected: {gpus}")
        print(f"GPU Name: {tf.test.gpu_device_name()}")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU found. Training will use CPU.")

Num GPUs Available:  0
No GPU found. Training will use CPU.


In [7]:
arabic_letters = []
diacritics = []
diacritics_to_id = {}
with open('./utils/arabic_letters.pickle', 'rb') as f:
    arabic_letters = pickle.load(f)
with open('./utils/diacritics.pickle', 'rb') as f:
    diacritics = pickle.load(f)
with open('./utils/diacritic2id.pickle', 'rb') as f:
    diacritics_to_id = pickle.load(f)

arabic_letters_sorted = sorted(arabic_letters)
char_to_id = {char: idx + 1 for idx, char in enumerate(arabic_letters_sorted)}
char_to_id['<PAD>'] = 0
char_to_id['UNK'] = len(char_to_id)

## Load Dictionaries and Create Mappings

<h2> Read train and val data

In [8]:
train_data = []
val_data = []
with open('./data/train.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        train_data.append(line.strip())
with open('./data/val.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        val_data.append(line.strip())
print(len(train_data))
print(len(val_data))

50000
2500


<h2> Clean data

In [9]:
def clean_arabic_text(text):
    """
    Clean text to keep only Arabic letters, diacritics, and spaces
    """
    # Create a set of allowed characters (Arabic letters + diacritics + space)
    allowed_chars = arabic_letters.union(diacritics, {' ', '\t', '\n'})
    
    # Filter the text to keep only allowed characters
    cleaned_text = ''.join(char for char in text if char in allowed_chars)
    
    # Normalize whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

In [11]:
train_data_witout_diacritics = []
val_data_witout_diacritics = []
for i in range(len(train_data)):
    train_data[i] = clean_arabic_text(train_data[i])
    train_data_witout_diacritics.append(araby.strip_diacritics(train_data[i]))
for i in range(len(val_data)):
    val_data[i] = clean_arabic_text(val_data[i])
    val_data_witout_diacritics.append(araby.strip_diacritics(val_data[i]))

In [12]:
def is_diacritic(ch):
    # Unicode combining marks (Arabic diacritics are combining marks)
    return unicodedata.combining(ch) != 0

def extract_base_and_diacritics(text):
    # normalize to NFC so base+combining marks are consistent
    text = unicodedata.normalize('NFC', text)
    bases = []
    diacs = []
    current_base = None
    current_diac = ''
    for ch in text:
        if is_diacritic(ch):
            # accumulate diacritics for current base
            current_diac += ch
        else:
            # new base character
            if current_base is not None:
                bases.append(current_base)
                diacs.append(current_diac)
            current_base = ch
            current_diac = ''
    # append last
    if current_base is not None:
        bases.append(current_base)
        diacs.append(current_diac)
    return bases, diacs

In [13]:
# Prepare training data - extract characters and their diacritics
x_train_raw = []
y_train_raw = []

# Use a constant for unknown diacritic instead of hardcoded value
UNKNOWN_DIACRITIC_ID = diacritics_to_id.get('', len(diacritics_to_id) - 1)

for text in train_data:
    bases, diacs = extract_base_and_diacritics(text)
    # convert letters to IDs
    x_train_raw.append([char_to_id.get(c, char_to_id['UNK']) for c in bases])
    y_train_raw.append([diacritics_to_id.get(d, UNKNOWN_DIACRITIC_ID) for d in diacs])

KeyboardInterrupt: 

In [None]:
vocab_size = len(char_to_id)
num_diacritics = len(diacritics_to_id)

model = models.Sequential([
    # TRAINABLE EMBEDDINGS (THIS LAYER LEARNS)
    layers.Embedding(input_dim=vocab_size,
                     output_dim=128,     # embedding size (trainable)
                     mask_zero=True),

    # BiLSTM for sequence modeling
    layers.Bidirectional(layers.LSTM(128, return_sequences=True)),

    # Predict diacritic for each character - use Dense directly instead of TimeDistributed
    layers.Dense(num_diacritics, activation='softmax')
])

model.compile(loss="sparse_categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

In [None]:
# Pad sequences to same length
PAD_DIACRITIC_ID = diacritics_to_id.get('', 0)  # Use empty string diacritic for padding

x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train_raw, padding='post', value=0)
y_train = tf.keras.preprocessing.sequence.pad_sequences(y_train_raw, padding='post', value=PAD_DIACRITIC_ID)

print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")

x_train shape: (50000, 7095)
y_train shape: (50000, 7095)


In [None]:
# # Train model on GPU
# with tf.device('/GPU:0'):
#     history = model.fit(x_train, y_train, epochs=20, batch_size=64, verbose=1)

In [None]:
# joblib.dump(model, "/kaggle/working/model1.joblib")

In [None]:
# Prepare validation data
x_val_raw = []
y_val_raw = []

for text in val_data:
    bases, diacs = extract_base_and_diacritics(text)
    # convert letters to IDs
    x_val_raw.append([char_to_id.get(c, char_to_id['UNK']) for c in bases])
    y_val_raw.append([diacritics_to_id.get(d, UNKNOWN_DIACRITIC_ID) for d in diacs])

In [None]:
# Pad validation sequences
x_val = tf.keras.preprocessing.sequence.pad_sequences(x_val_raw, padding='post', value=0)
y_val = tf.keras.preprocessing.sequence.pad_sequences(y_val_raw, padding='post', value=PAD_DIACRITIC_ID)

print(f"x_val shape: {x_val.shape}")
print(f"y_val shape: {y_val.shape}")

x_val shape: (2500, 1172)
y_val shape: (2500, 1172)


In [None]:
y_pred = model.predict(x_val)

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 331ms/step


In [None]:
from sklearn.metrics import accuracy_score
y_pred_classes = np.argmax(y_pred, axis=-1)
y_true = y_val
accuracy = accuracy_score(y_true.flatten(), y_pred_classes.flatten())
print(f'Validation Accuracy: {accuracy:.4f}')

Validation Accuracy: 0.0287


In [14]:
model = joblib.load("./models/LSTM.joblib")

In [15]:
def test_model(test_sent):
    test_sent.strip()
    x_test_raw, y_test_raw = [], []
    bases_test, diacs_test = extract_base_and_diacritics(test_sent)
    x_test_raw.append([char_to_id.get(c, char_to_id['UNK']) for c in bases_test])
    return x_test_raw

In [16]:
def merge(x_test, y_pred_classes, char_to_id, diacritics_to_id):
    """
    Merge character sequences with predicted diacritics to reconstruct text
    
    Args:
        x_test: numpy array of character IDs (samples × sequence_length)
        y_pred_classes: numpy array of predicted diacritic IDs (samples × sequence_length)
        char_to_id: dictionary mapping characters to IDs
        diacritics_to_id: dictionary mapping diacritics to IDs
    
    Returns:
        List of reconstructed diacritized text strings
    """
    # Create reverse mappings
    id_to_char = {v: k for k, v in char_to_id.items()}
    id_to_diacritic = {v: k for k, v in diacritics_to_id.items()}
    
    reconstructed_texts = []
    
    # Process each sample
    for char_seq, diac_seq in zip(x_test, y_pred_classes):
        text = ""
        
        for char_id, diac_id in zip(char_seq, diac_seq):
            # Skip padding
            if char_id == 0:  # PAD character
                break
            
            # Get character
            char = id_to_char.get(char_id, '')
            
            # Get diacritic
            diacritic = id_to_diacritic.get(diac_id, '')
            
            # Combine character with diacritic
            text += char + diacritic
        
        text = text.replace("UNK", " ")
        reconstructed_texts.append(text)
    
    return reconstructed_texts

In [None]:
test_sent = "هذا نص تجريبي لاختبار نموذج تشكيل النص العربي."

x_test = test_model(test_sent)
y_test_pred = model.predict(np.array(x_test))
y_test_pred_classes = np.argmax(y_test_pred, axis=-1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step


In [32]:
y_test_pred.shape

(1, 703, 15)

In [36]:
output_sentences = merge(x_test, y_test_pred_classes, char_to_id, diacritics_to_id)[0]
print(output_sentences)

قَوْلُهُ وَلَوْ ادعَى وَلَدَ أَمَةٍ مُشْتَرِكَةٍ ثَبَتَ نَسَبُهُ وَهِيَ أُم وَلَدِهِ وَلَزِمَهُ نِصْفُ قِيمَتِهَا وَنِصْفَ عُقْرِهَا لَا قِيمَتُهُ أَما ثُبُوتُ النسَبِ فَلِأَنهُ لَما ثَبَتَ فِي نِصْفِهِ لِمُصَادَفَتِهِ مِلْكَهُ ثَبَتَ فِي الْبَاقِي ضَرُورَةً أَنهُ لَا يَتَجَزأُ لِمَا أَن سَبَبَهُ لَا يَتَجَزأُ وَهُوَ الْعُلُوقُ إذْ الْوَلَدُ الْوَاحِدُ لَا يُعَلَقُ مِنْ مَاءَيْنِ وَأَما صَيْرُورَتُهَا أُم وَلَدٍ فَلِأَن الِاسْتِيلَادَ لَا يَتَجَزأُ عِنْدَهُ وَعِنْدَهُمَا يَصِيرُ نَصِيبَهُ أُم وَلَدٍ لَهُ ثُم يَتَمَلكُ نَصِيبَ صَاحِبِهِ إذْ هُوَ قَابِلٌ لِلْمِلْكِ وَأَما ضَمَانُ نِصْفِ الْقِيمَةِ فَلِأَنهُ تَمْلكُ نَصِيبَ صَاحِبِهِ لِمَا اسْتَكْمَلَ الِاسْتِيلَادُ وَأَما ضَمَانُ نِصْفِ الْعُقْرِ فَلِأَنهُ وَطِئَ جَارِيَةً مُشْتَرِكَةٍ إذْ الْمِلْكُ ثَبَتَ حُكْمًا لِلِاسْتِيلَادِ فَيَعْقِبُهُ الْمِلْكُ فِي نَصِيبِ صَاحِبِهِ بِخِلَافِ الْأَبِ إذَا اسْتَوَلَدَ جَارِيَةَ ابْنِهِ لِأَن الْمِلْكَ هُنَاكَ ثَبَتَ شَرْطًا لِلِاسْتِيلَادِ فَيَتَقَدمُهُ فَصَارَ وَاطِئًا مِلْكَ نَفْسِهِ وَأَما عَدَ