# Download data and models

In [113]:
# GDRIVE_ID_DATA = "1ONRQ36PFPnYNA4R6ZlmM7UQJ4LiAzEH0"
# !gdown $GDRIVE_ID_DATA -O Arabic-Text-Diacritization.zip
# !unzip Arabic-Text-Diacritization.zip

In [114]:
# GDRIVE_ID_LSTM_MODEL = "1kLRQ3o7m57qK1OJOTA-K9OBYL29zuXjo"
# !gdown $GDRIVE_ID_LSTM_MODEL -O LSTM.joblib

In [115]:
# import tarfile

# file_path = "/kaggle/input/tashkeela/Tashkeela-arabic-diacritized-text-utf8-0.3.tar.bz2"
# extract_path = "/kaggle/working/tashkeela_extracted"

# # Extract tar.bz2 file
# with tarfile.open(file_path, "r:bz2") as tar:
#     tar.extractall(path=extract_path)

# extract_path

<h1> Arabic letters and diacritics

In [116]:
import pickle
import re
import pyarabic.araby as araby
import pyarabic.number as number
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import unicodedata
import tensorflow as tf
from tensorflow.keras import layers, models
import joblib
from sklearn.metrics import accuracy_score
from tensorflow.keras.callbacks import EarlyStopping

<h2> Constants

In [117]:
window_size = 1500

## GPU Configuration for Kaggle
Check GPU availability and configure TensorFlow to use GPU

In [118]:
# Check if GPU is available
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Get GPU details
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Set memory growth to avoid OOM errors
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        
        print(f"GPU detected: {gpus}")
        print(f"GPU Name: {tf.test.gpu_device_name()}")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU found. Training will use CPU.")

Num GPUs Available:  2
GPU detected: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
GPU Name: /device:GPU:0


I0000 00:00:1764391369.209563      47 gpu_device.cc:2022] Created device /device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1764391369.209792      47 gpu_device.cc:2022] Created device /device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


## Load Dictionaries and Create Mappings

In [119]:
arabic_letters = []
diacritics = []
diacritics_to_id = {}
with open('./utils/arabic_letters.pickle', 'rb') as f:
    arabic_letters = pickle.load(f)
with open('./utils/diacritics.pickle', 'rb') as f:
    diacritics = pickle.load(f)
with open('./utils/diacritic2id.pickle', 'rb') as f:
    diacritics_to_id = pickle.load(f)

arabic_letters_sorted = sorted(arabic_letters)
char_to_id = {char: idx + 1 for idx, char in enumerate(arabic_letters_sorted)}
char_to_id['<PAD>'] = 0
char_to_id['UNK'] = len(char_to_id)

<h2> Read train and val data

In [138]:
train_data = []
val_data = []
test_data = []
with open('./data/train.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        train_data.append(line.strip())
with open('./data/val.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        val_data.append(line.strip())
with open('/kaggle/working/tashkeela_extracted/Tashkeela-arabic-diacritized-text-utf8-0.3/texts.txt/غذاء الألباب في شرح منظومة الآداب.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        test_data.append(line.strip())
        
print(len(train_data))
print(len(val_data))
print(len(test_data))

50000
2500
12213


<h2> Clean data

In [139]:
def clean_arabic_text(text):
    """
    Clean text to keep only Arabic letters, diacritics, and spaces
    """
    # Create a set of allowed characters (Arabic letters + diacritics + space)
    allowed_chars = arabic_letters.union(diacritics, {' ', '\t', '\n'})
    
    # Filter the text to keep only allowed characters
    cleaned_text = ''.join(char for char in text if char in allowed_chars)
    
    # Normalize whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text


def split_sentences(sentences, window_size=window_size):
    all_segments = []
    
    for sentence in sentences:
        words = araby.tokenize(sentence)
        current_segment = []
        current_len = 0
        
        for word in words:
            word_len = len(word)
            add_space = 1 if current_segment else 0
            
            if current_len + word_len + add_space <= window_size:
                current_segment.append(word)
                current_len += word_len + add_space
            else:
                # save the segment
                if current_segment:
                    all_segments.append(" ".join(current_segment))
                
                # start new segment
                current_segment = [word]
                current_len = word_len
        
        # append the final segment of the sentence
        if current_segment:
            all_segments.append(" ".join(current_segment))

    return all_segments



def sentence_tokeniz(sentences):
    tokenized_sentences = []
    for sentence in sentences:
        subsentences = araby.sentence_tokenize(sentence)
        tokenized_sentences.extend(subsentences)
    return tokenized_sentences
        


In [140]:
train_data = sentence_tokeniz(train_data)
val_data = sentence_tokeniz(val_data)
test_data = sentence_tokeniz(test_data)

for i in range(len(train_data)):
    train_data[i] = clean_arabic_text(train_data[i])
for i in range(len(val_data)):
    val_data[i] = clean_arabic_text(val_data[i])
for i in range(len(test_data)):
    test_data[i] = clean_arabic_text(test_data[i])

train_data = split_sentences(train_data, window_size)
val_data = split_sentences(val_data, window_size)
test_data = split_sentences(test_data, window_size)

In [143]:
def is_diacritic(ch):
    # Unicode combining marks (Arabic diacritics are combining marks)
    return unicodedata.combining(ch) != 0

def extract_base_and_diacritics(text):
    # normalize to NFC so base+combining marks are consistent
    text = unicodedata.normalize('NFC', text)
    bases = []
    diacs = []
    current_base = None
    current_diac = ''
    for ch in text:
        if is_diacritic(ch):
            # accumulate diacritics for current base
            current_diac += ch
        else:
            # new base character
            if current_base is not None:
                bases.append(current_base)
                diacs.append(current_diac)
            current_base = ch
            current_diac = ''
    # append last
    if current_base is not None:
        bases.append(current_base)
        diacs.append(current_diac)
    return bases, diacs

<h2> Prepare data for the model

In [144]:
# Pad sequences to same length
PAD_DIACRITIC_ID = diacritics_to_id.get('', 0)  # Use empty string diacritic for padding

In [145]:
# Prepare training data - extract characters and their diacritics
x_train_raw = []
y_train_raw = []

# Use a constant for unknown diacritic instead of hardcoded value
UNKNOWN_DIACRITIC_ID = diacritics_to_id.get('', len(diacritics_to_id) - 1)

for text in train_data:
    bases, diacs = extract_base_and_diacritics(text)
    # convert letters to IDs
    x_train_raw.append([char_to_id.get(c, char_to_id['UNK']) for c in bases])
    y_train_raw.append([diacritics_to_id.get(d, UNKNOWN_DIACRITIC_ID) for d in diacs])



In [146]:
# Pad validation sequences
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train_raw, padding='post', value=0)
y_train = tf.keras.preprocessing.sequence.pad_sequences(y_train_raw, padding='post', value=PAD_DIACRITIC_ID)

print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")

x_train shape: (171895, 909)
y_train shape: (171895, 909)


In [147]:
# Prepare validation data
x_val_raw = []
y_val_raw = []

for text in val_data:
    bases, diacs = extract_base_and_diacritics(text)
    # convert letters to IDs
    x_val_raw.append([char_to_id.get(c, char_to_id['UNK']) for c in bases])
    y_val_raw.append([diacritics_to_id.get(d, UNKNOWN_DIACRITIC_ID) for d in diacs])

In [149]:
# Pad validation sequences
x_val = tf.keras.preprocessing.sequence.pad_sequences(x_val_raw, padding='post', value=0)
y_val = tf.keras.preprocessing.sequence.pad_sequences(y_val_raw, padding='post', value=PAD_DIACRITIC_ID)

print(f"x_val shape: {x_val.shape}")
print(f"y_val shape: {y_val.shape}")

x_val shape: (8314, 899)
y_val shape: (8314, 899)


In [150]:
# Prepare test data
x_test_raw = []
y_test_raw = []

for text in test_data:
    bases, diacs = extract_base_and_diacritics(text)
    # convert letters to IDs
    x_test_raw.append([char_to_id.get(c, char_to_id['UNK']) for c in bases])
    y_test_raw.append([diacritics_to_id.get(d, UNKNOWN_DIACRITIC_ID) for d in diacs])

In [151]:
# Pad test sequences
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test_raw, padding='post', value=0)
y_test = tf.keras.preprocessing.sequence.pad_sequences(y_test_raw, padding='post', value=PAD_DIACRITIC_ID)

print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")

x_test shape: (35302, 909)
y_test shape: (35302, 909)


In [131]:
vocab_size = len(char_to_id)
num_diacritics = len(diacritics_to_id)

model = models.Sequential([
    # TRAINABLE EMBEDDINGS (THIS LAYER LEARNS)
    layers.Embedding(input_dim=vocab_size,
                     output_dim=128,     # embedding size (trainable)
                     mask_zero=True),

    # BiLSTM for sequence modeling
    layers.Bidirectional(layers.LSTM(128, return_sequences=True)),

    # Predict diacritic for each character - use Dense directly instead of TimeDistributed
    layers.Dense(num_diacritics, activation='softmax')
])

model.compile(loss="sparse_categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

In [132]:
# Train model on GPU
early = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

with tf.device('/GPU:0'):
    history = model.fit(x_train, y_train, 
                        epochs=10, 
                        batch_size=64,
                        verbose=1)

Epoch 1/3
[1m2686/2686[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 39ms/step - accuracy: 0.1215 - loss: 0.5949
Epoch 2/3
[1m2686/2686[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 39ms/step - accuracy: 0.9953 - loss: 0.2135
Epoch 3/3
[1m2686/2686[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 39ms/step - accuracy: 0.9963 - loss: 0.1735


In [133]:
joblib.dump(model, "/kaggle/working/model1.joblib")

['/kaggle/working/model1.joblib']

In [155]:
# model = joblib.load("model1.joblib")

In [156]:
def calculate_der_by_position(x_val, y_true, y_pred, char_to_id):
    """
    Calculate DER separately for last characters and non-last characters in words
    
    Args:
        x_val: Character sequences (samples × sequence_length)
        y_true: Ground truth diacritic labels (samples × sequence_length)
        y_pred: Predicted diacritic labels (samples × sequence_length)
        char_to_id: Dictionary mapping characters to IDs
    
    Returns:
        Tuple of (DER_non_last, DER_last, overall_DER)
    """
    # Get space character ID
    space_id = char_to_id.get(' ', char_to_id.get('UNK'))
    pad_id = char_to_id.get('<PAD>', 0)
    
    non_last_errors = 0
    non_last_total = 0
    last_errors = 0
    last_total = 0
    
    # Process each sequence
    for char_seq, y_true_seq, y_pred_seq in zip(x_val, y_true, y_pred):
        # Find valid (non-padding) characters
        valid_mask = char_seq != pad_id
        valid_indices = np.where(valid_mask)[0]
        
        if len(valid_indices) == 0:
            continue
        
        # Identify word boundaries (spaces and end of sequence)
        i = 0
        while i < len(valid_indices):
            idx = valid_indices[i]
            
            # Skip spaces
            if char_seq[idx] == space_id:
                i += 1
                continue
            
            # Find the end of current word
            word_start = i
            while i < len(valid_indices) and char_seq[valid_indices[i]] != space_id:
                i += 1
            word_end = i - 1
            
            # Mark positions in the word
            for j in range(word_start, word_end + 1):
                pos_idx = valid_indices[j]
                
                # Skip if this position has padding in y_true
                if y_true_seq[pos_idx] == PAD_DIACRITIC_ID:
                    continue
                
                is_correct = (y_true_seq[pos_idx] == y_pred_seq[pos_idx])
                
                # Last character in word
                if j == word_end:
                    last_total += 1
                    if not is_correct:
                        last_errors += 1
                # Non-last character in word
                else:
                    non_last_total += 1
                    if not is_correct:
                        non_last_errors += 1
    
    # Calculate DER for each category
    der_non_last = (non_last_errors / non_last_total * 100) if non_last_total > 0 else 0
    der_last = (last_errors / last_total * 100) if last_total > 0 else 0
    
    total_errors = non_last_errors + last_errors
    total_chars = non_last_total + last_total
    der_overall = (total_errors / total_chars * 100) if total_chars > 0 else 0
    
    return der_non_last, der_last, der_overall

In [157]:
y_pred = model.predict(x_test)

[1m1104/1104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step


In [158]:
y_pred_classes = np.argmax(y_pred, axis=-1)
y_true = y_test

# Calculate accuracy
accuracy = accuracy_score(y_true.flatten(), y_pred_classes.flatten())
print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.9964


In [159]:
# Calculate DER by character position in words
der_non_last, der_last, der_overall = calculate_der_by_position(x_test, y_true, y_pred_classes, char_to_id)

print("\n" + "="*60)
print("DER Analysis by Character Position in Words")
print("="*60)
print(f"DER for non-last characters: {der_non_last:.2f}%")
print(f"DER for last characters:     {der_last:.2f}%")
print(f"Overall DER:                 {der_overall:.2f}%")
print(f"\nAccuracy for non-last characters: {100 - der_non_last:.2f}%")
print(f"Accuracy for last characters:     {100 - der_last:.2f}%")
print(f"Acutual Accuracy: {100 - der_overall:.2f}%")
print("="*60)


DER Analysis by Character Position in Words
DER for non-last characters: 9.01%
DER for last characters:     14.80%
Overall DER:                 10.38%

Accuracy for non-last characters: 90.99%
Accuracy for last characters:     85.20%
Acutual Accuracy: 89.62%


<h2> Test data

In [None]:
def merge(x_test, y_pred_classes, char_to_id, diacritics_to_id):
    """
    Merge character sequences with predicted diacritics to reconstruct text
    
    Args:
        x_test: numpy array of character IDs (samples × sequence_length)
        y_pred_classes: numpy array of predicted diacritic IDs (samples × sequence_length)
        char_to_id: dictionary mapping characters to IDs
        diacritics_to_id: dictionary mapping diacritics to IDs
    
    Returns:
        List of reconstructed diacritized text strings
    """
    # Create reverse mappings
    id_to_char = {v: k for k, v in char_to_id.items()}
    id_to_diacritic = {v: k for k, v in diacritics_to_id.items()}
    
    reconstructed_texts = []
    
    # Process each sample
    for char_seq, diac_seq in zip(x_test, y_pred_classes):
        text = ""
        
        for char_id, diac_id in zip(char_seq, diac_seq):
            # Skip padding
            if char_id == 0:  # PAD character
                break
            
            # Get character
            char = id_to_char.get(char_id, '')
            
            # Get diacritic
            diacritic = id_to_diacritic.get(diac_id, '')
            
            # Combine character with diacritic
            text += char + diacritic
        
        text = text.replace("UNK", " ")
        reconstructed_texts.append(text)
    
    return reconstructed_texts

In [None]:
test_sent = "هذا نص تجريبي لاختبار نموذج تشكيل النص العربي."

x_test = test_model(test_sent)
y_test_pred = model.predict(np.array(x_test))
y_test_pred_classes = np.argmax(y_test_pred, axis=-1)

In [None]:
y_test_pred.shape

In [None]:
output_sentences = merge(x_test, y_test_pred_classes, char_to_id, diacritics_to_id)[0]
print(output_sentences)