# Download data and models

In [29]:
# GDRIVE_ID_DATA = "1ONRQ36PFPnYNA4R6ZlmM7UQJ4LiAzEH0"
# !gdown $GDRIVE_ID_DATA -O Arabic-Text-Diacritization.zip
# !unzip Arabic-Text-Diacritization.zip

In [30]:
# import tarfile

# file_path = "/kaggle/input/tashkeela/Tashkeela-arabic-diacritized-text-utf8-0.3.tar.bz2"
# extract_path = "/kaggle/working/tashkeela_extracted"

# # Extract tar.bz2 file
# with tarfile.open(file_path, "r:bz2") as tar:
#     tar.extractall(path=extract_path)

# extract_path

<h1> Arabic letters and diacritics

In [31]:
import pickle
import re
import pyarabic.araby as araby
import numpy as np
import unicodedata
import tensorflow as tf
from tensorflow.keras import layers, Input, Model
from sklearn.metrics import accuracy_score
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.models import load_model
import os

<h2> Constants

In [32]:
window_size = 1000

ARABIC_LETTERS_PATH = './utils/arabic_letters.pickle'
DIACRITICS_PATH = './utils/diacritics.pickle'
DIACRITICS_TO_ID_PATH = './utils/diacritic2id.pickle'
CHAR_TO_ID_PATH = './utils/char2id.pickle'
WORD_TO_ID_PATH = './utils/word2id.pickle'

TRAIN_PATH = './data/train.txt'
VAL_PATH = './data/val.txt'

MODEL_SAVE_DIR = "./models"
MODEL_WEIGHTS_PATH = './models/best_weights.keras'


## GPU Configuration for Kaggle
Check GPU availability and configure TensorFlow to use GPU

In [33]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        
        print(f"GPU detected: {gpus}")
        print(f"GPU Name: {tf.test.gpu_device_name()}")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU found. Training will use CPU.")

Num GPUs Available:  2
GPU detected: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
GPU Name: /device:GPU:0


I0000 00:00:1765282087.107092      47 gpu_device.cc:2022] Created device /device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1765282087.107326      47 gpu_device.cc:2022] Created device /device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


## Load Dictionaries and Create Mappings

In [34]:
arabic_letters = []
diacritics = []
diacritics_to_id = {}

with open(ARABIC_LETTERS_PATH, 'rb') as f:
    arabic_letters = pickle.load(f)
with open(DIACRITICS_PATH, 'rb') as f:
    diacritics = pickle.load(f)
with open(DIACRITICS_TO_ID_PATH, 'rb') as f:
    diacritics_to_id = pickle.load(f)
    
arabic_letters_sorted = sorted(arabic_letters)
char_to_id = {char: idx + 1 for idx, char in enumerate(arabic_letters_sorted)}
char_to_id['<PAD>'] = 0
char_to_id['UNK'] = len(char_to_id)

In [35]:
def build_word_vocabulary(data):
    """
    Build word vocabulary from training data
    
    Args:
        data: List of text samples
    
    Returns:
        Dictionary mapping words to IDs
    """
    word_counts = {}
    for text in data:
        text_no_diac = araby.strip_diacritics(text)
        words = araby.tokenize(text_no_diac)
        for word in words:
            if word.strip():  
                word_counts[word] = word_counts.get(word, 0) + 1
    
    sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    
    word_to_id = {'<PAD>': 0, '<UNK>': 1}
    for idx, (word, _) in enumerate(sorted_words):
        word_to_id[word] = idx + 2
    
    return word_to_id

word_to_id = {}

<h2> Read train and val data

In [36]:
train_data = []
val_data = []
with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        train_data.append(line.strip())
with open(VAL_PATH, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        val_data.append(line.strip())
print(len(train_data))
print(len(val_data))

50000
2500


<h2> Clean data

In [37]:
def clean_arabic_text(text):
    """
    Clean text to keep only Arabic letters, diacritics, and spaces
    """
    allowed_chars = arabic_letters.union(diacritics, {' ', '\t', '\n'})
    
    cleaned_text = ''.join(char for char in text if char in allowed_chars)
    
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text


def split_sentences(sentences, window_size=window_size):
    all_segments = []
    
    for sentence in sentences:
        words = araby.tokenize(sentence)
        current_segment = []
        current_len = 0
        
        for word in words:
            word_len = len(word)
            add_space = 1 if current_segment else 0
            
            if current_len + word_len + add_space <= window_size:
                current_segment.append(word)
                current_len += word_len + add_space
            else:
                if current_segment:
                    all_segments.append(" ".join(current_segment))
                
                current_segment = [word]
                current_len = word_len
        
        if current_segment:
            all_segments.append(" ".join(current_segment))

    return all_segments


def sentence_tokeniz(sentences):
    tokenized_sentences = []
    for sentence in sentences:
        subsentences = araby.sentence_tokenize(sentence)
        tokenized_sentences.extend(subsentences)
    return tokenized_sentences

In [38]:
train_data = sentence_tokeniz(train_data)
val_data = sentence_tokeniz(val_data)

for i in range(len(train_data)):
    train_data[i] = clean_arabic_text(train_data[i])
for i in range(len(val_data)):
    val_data[i] = clean_arabic_text(val_data[i])

train_data = split_sentences(train_data, window_size)
val_data = split_sentences(val_data, window_size)

print("Building word vocabulary...")
word_to_id = build_word_vocabulary(train_data)
print(f"Word vocabulary size: {len(word_to_id)}")

Building word vocabulary...
Word vocabulary size: 105864


In [39]:
def is_diacritic(ch):
    return unicodedata.combining(ch) != 0

def extract_base_and_diacritics(text):
    text = unicodedata.normalize('NFC', text)
    bases = []
    diacs = []
    current_base = None
    current_diac = ''
    for ch in text:
        if is_diacritic(ch):
            current_diac += ch
        else:
            if current_base is not None:
                bases.append(current_base)
                diacs.append(current_diac)
            current_base = ch
            current_diac = ''
    if current_base is not None:
        bases.append(current_base)
        diacs.append(current_diac)
    return bases, diacs

In [40]:
def extract_char_and_word_features(text, word_to_id):
    """
    Extract both character-level and word-level features from text
    
    Args:
        text: Input text with diacritics
        word_to_id: Dictionary mapping words to IDs
    
    Returns:
        Tuple of (char_ids, diacritic_ids, word_ids, word_positions)
        - char_ids: List of character IDs
        - diacritic_ids: List of diacritic IDs for each character
        - word_ids: List of word IDs aligned with characters
        - word_positions: List indicating position in word (0=not end, 1=end, 2=space)
    """
    bases, diacs = extract_base_and_diacritics(text)
    
    UNKNOWN_DIACRITIC_ID = diacritics_to_id.get('', len(diacritics_to_id) - 1)

    char_ids = [char_to_id.get(c, char_to_id['UNK']) for c in bases]
    diacritic_ids = [diacritics_to_id.get(d, UNKNOWN_DIACRITIC_ID) for d in diacs]
    
    text_no_diac = araby.strip_diacritics(text)
    words = araby.tokenize(text_no_diac)
    
    word_ids = []
    word_positions = [] # 0=not end, 1=end, 2=space
    
    char_idx = 0
    for word in words:
        if not word.strip():
            continue
        
        word_id = word_to_id.get(word, word_to_id['<UNK>'])
        word_len = len(word)
        
        for i in range(word_len):
            if char_idx < len(char_ids):
                word_ids.append(word_id)
                if i == word_len - 1:
                    word_positions.append(1)
                else:
                    word_positions.append(0)
                char_idx += 1
        
        if char_idx < len(char_ids) and bases[char_idx] == ' ':
            word_ids.append(0)  # for padding
            word_positions.append(2)
            char_idx += 1
    
    while len(word_ids) < len(char_ids):
        word_ids.append(0)
        word_positions.append(2)
    
    return char_ids, diacritic_ids, word_ids, word_positions

In [41]:
pickle.dump(char_to_id, open(CHAR_TO_ID_PATH, 'wb'))
pickle.dump(word_to_id, open(WORD_TO_ID_PATH, 'wb'))

<h2> Prepare data for the model

In [42]:
x_train_char_raw = []
y_train_raw = []
x_train_word_raw = []
x_train_word_position_raw = []

UNKNOWN_DIACRITIC_ID = diacritics_to_id.get('', len(diacritics_to_id) - 1)

print("Extracting character and word-level features from training data...")
for text in train_data:
    char_ids, diacritic_ids, word_ids, word_positions = extract_char_and_word_features(text, word_to_id)
    
    x_train_char_raw.append(char_ids)
    y_train_raw.append(diacritic_ids)
    x_train_word_raw.append(word_ids)
    x_train_word_position_raw.append(word_positions)

print(f"Training samples: {len(x_train_char_raw)}")

Extracting character and word-level features from training data...
Training samples: 172467


In [43]:
PAD_DIACRITIC_ID = diacritics_to_id.get('', 0)

x_train_char = tf.keras.preprocessing.sequence.pad_sequences(x_train_char_raw, padding='post', value=0)
x_train_word = tf.keras.preprocessing.sequence.pad_sequences(x_train_word_raw, padding='post', value=0)
x_train_position = tf.keras.preprocessing.sequence.pad_sequences(x_train_word_position_raw, padding='post', value=2)
y_train = tf.keras.preprocessing.sequence.pad_sequences(y_train_raw, padding='post', value=PAD_DIACRITIC_ID)

print(f"x_train_char shape: {x_train_char.shape}")
print(f"x_train_word shape: {x_train_word.shape}")
print(f"x_train_position shape: {x_train_position.shape}")
print(f"y_train shape: {y_train.shape}")

x_train_char shape: (172467, 607)
x_train_word shape: (172467, 607)
x_train_position shape: (172467, 607)
y_train shape: (172467, 607)


In [44]:
x_val_char_raw = []
y_val_raw = []
x_val_word_raw = []
x_val_word_position_raw = []

print("Extracting character and word-level features from validation data...")
for text in val_data:
    char_ids, diacritic_ids, word_ids, word_positions = extract_char_and_word_features(text, word_to_id)
    
    x_val_char_raw.append(char_ids)
    y_val_raw.append(diacritic_ids)
    x_val_word_raw.append(word_ids)
    x_val_word_position_raw.append(word_positions)

print(f"Validation samples: {len(x_val_char_raw)}")

Extracting character and word-level features from validation data...
Validation samples: 8332


In [45]:
x_val_char = tf.keras.preprocessing.sequence.pad_sequences(x_val_char_raw, padding='post', value=0)
x_val_word = tf.keras.preprocessing.sequence.pad_sequences(x_val_word_raw, padding='post', value=0)
x_val_position = tf.keras.preprocessing.sequence.pad_sequences(x_val_word_position_raw, padding='post', value=2)
y_val = tf.keras.preprocessing.sequence.pad_sequences(y_val_raw, padding='post', value=PAD_DIACRITIC_ID)

print(f"x_val_char shape: {x_val_char.shape}")
print(f"x_val_word shape: {x_val_word.shape}")
print(f"x_val_position shape: {x_val_position.shape}")
print(f"y_val shape: {y_val.shape}")

x_val_char shape: (8332, 597)
x_val_word shape: (8332, 597)
x_val_position shape: (8332, 597)
y_val shape: (8332, 597)


In [46]:
class DERMetric(tf.keras.metrics.Metric):
    def __init__(self, pad_id=0, space_id=None, name='DER', **kwargs):
        super(DERMetric, self).__init__(name=name, **kwargs)
        self.pad_id = pad_id
        self.space_id = space_id
        self.total = self.add_weight(name='total', initializer='zeros')
        self.errors = self.add_weight(name='errors', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """
        y_true: shape (batch_size, seq_len)
        y_pred: shape (batch_size, seq_len, num_classes)
        """
        y_pred_labels = tf.argmax(y_pred, axis=-1, output_type=tf.int32)

        mask = tf.not_equal(y_true, self.pad_id)
        if self.space_id is not None:
            mask = tf.logical_and(mask, tf.not_equal(y_true, self.space_id))

        correct = tf.equal(y_true, y_pred_labels)
        correct = tf.logical_and(correct, mask)

        batch_errors = tf.reduce_sum(tf.cast(~correct, tf.float32))
        batch_total = tf.reduce_sum(tf.cast(mask, tf.float32))

        self.errors.assign_add(batch_errors)
        self.total.assign_add(batch_total)

    def result(self):
        return (self.errors / self.total) * 100  # DER in %

    def reset_state(self):
        self.errors.assign(0)
        self.total.assign(0)

In [47]:
def calculate_der_by_position(x_val_char, y_true, y_pred, char_to_id):
    """
    Calculate DER separately for last characters and non-last characters in words
    
    Args:
        x_val_char: Character sequences (samples x sequence_length)
        y_true: Ground truth diacritic labels (samples x sequence_length)
        y_pred: Predicted diacritic labels (samples x sequence_length)
        char_to_id: Dictionary mapping characters to IDs
    
    Returns:
        Tuple of (DER_non_last, DER_last, overall_DER)
    """
    space_id = char_to_id.get(' ', char_to_id.get('UNK'))
    pad_id = char_to_id.get('<PAD>', 0)
    
    non_last_errors = 0
    non_last_total = 0
    last_errors = 0
    last_total = 0
    
    for char_seq, y_true_seq, y_pred_seq in zip(x_val_char, y_true, y_pred):
        valid_mask = char_seq != pad_id
        valid_indices = np.where(valid_mask)[0]
        
        if len(valid_indices) == 0:
            continue
        
        i = 0
        while i < len(valid_indices):
            idx = valid_indices[i]
            
            if char_seq[idx] == space_id:
                i += 1
                continue
            
            word_start = i
            while i < len(valid_indices) and char_seq[valid_indices[i]] != space_id:
                i += 1
            word_end = i - 1
            
            for j in range(word_start, word_end + 1):
                pos_idx = valid_indices[j]
                
                if y_true_seq[pos_idx] == PAD_DIACRITIC_ID:
                    continue
                
                is_correct = (y_true_seq[pos_idx] == y_pred_seq[pos_idx])
                
                if j == word_end:
                    last_total += 1
                    if not is_correct:
                        last_errors += 1
                else:
                    non_last_total += 1
                    if not is_correct:
                        non_last_errors += 1
    
    der_non_last = (non_last_errors / non_last_total * 100) if non_last_total > 0 else 0
    der_last = (last_errors / last_total * 100) if last_total > 0 else 0
    
    total_errors = non_last_errors + last_errors
    total_chars = non_last_total + last_total
    der_overall = (total_errors / total_chars * 100) if total_chars > 0 else 0
    
    return der_non_last, der_last, der_overall

In [None]:
def build_model(vocab_size, word_vocab_size, num_diacritics, pad_id):
    char_input = Input(shape=(None,), name='char_input')
    char_embedding = layers.Embedding(
        input_dim=vocab_size,
        output_dim=128,
        mask_zero=True,
        name='char_embedding'
    )(char_input)

    word_input = Input(shape=(None,), name='word_input')
    word_embedding = layers.Embedding(
        input_dim=word_vocab_size,
        output_dim=128,
        mask_zero=True,
        name='word_embedding'
    )(word_input)

    position_input = Input(shape=(None,), name='position_input')
    position_embedding = layers.Embedding(
        input_dim=3,
        output_dim=16,
        mask_zero=False,
        name='position_embedding'
    )(position_input)

    combined = layers.Concatenate(name='feature_concat')([
        char_embedding,
        word_embedding,
        position_embedding
    ])

    combined._keras_mask = char_embedding._keras_mask

    bilstm1 = layers.Bidirectional(
        layers.LSTM(
            256,
            return_sequences=True,
            activation='tanh',
            recurrent_activation='sigmoid'
        ),
        name='bilstm_1'
    )(combined)


    bilstm2 = layers.Bidirectional(
        layers.LSTM(
            256,
            return_sequences=True,
            activation='tanh',
            recurrent_activation='sigmoid'
        ),
        name='bilstm_2'
    )(bilstm1)


    dense1 = layers.Dense(
        256,
        activation='relu',
        name='dense_1'
    )(bilstm2)


    dense2 = layers.Dense(
        256,
        activation='relu',
        name='dense_2'
    )(dense1)

    output = layers.Dense(
        num_diacritics,
        activation='softmax',
        name='diacritic_output'
    )(dense2)

    model = Model(
        inputs=[char_input, word_input, position_input],
        outputs=output
    )

    der_metric = DERMetric(pad_id=0, space_id=pad_id)

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=[der_metric]
    )

    print("\nModel Architecture:")
    print(model.summary())
    
    return model


In [49]:
model = build_model(
    vocab_size=len(char_to_id),
    word_vocab_size=len(word_to_id),
    num_diacritics=len(diacritics_to_id),
    pad_id=char_to_id.get('<PAD>', 0)
)


Model Architecture:


None


## Setup Callbacks for Training
Configure early stopping and model checkpointing

In [50]:
if not os.path.exists(MODEL_SAVE_DIR):
    os.makedirs(MODEL_SAVE_DIR)

early_stopping = EarlyStopping(
    monitor='val_DER',           
    patience=3,                 
    mode='min',                 
    verbose=1,
    restore_best_weights=True    
)

checkpoint_path = MODEL_WEIGHTS_PATH
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_DER',           
    mode='min',                  
    save_best_only=True,    
    verbose=1
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_DER',
    factor=0.5,                  
    patience=2,                  
    mode='min',
    verbose=1,
    min_lr=1e-7                
)

callbacks = [early_stopping, model_checkpoint, reduce_lr]

## Train Model with Validation and Callbacks

In [51]:
with tf.device('/GPU:0'):
    history = model.fit(
        {'char_input': x_train_char, 'word_input': x_train_word, 'position_input': x_train_position},
        y_train,
        validation_data=(
            {'char_input': x_val_char, 'word_input': x_val_word, 'position_input': x_val_position},
            y_val
        ),
        epochs=10,                    
        batch_size=64,
        callbacks=callbacks,
        verbose=1
    )

Epoch 1/10
[1m2695/2695[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step - DER: 4.2778 - loss: 0.1674
Epoch 1: val_DER improved from inf to 3.19949, saving model to ./models/best_weights.keras
[1m2695/2695[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m353s[0m 129ms/step - DER: 4.2774 - loss: 0.1673 - val_DER: 3.1995 - val_loss: 0.0085 - learning_rate: 0.0010
Epoch 2/10
[1m2695/2695[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step - DER: 3.0416 - loss: 0.0065
Epoch 2: val_DER improved from 3.19949 to 3.14476, saving model to ./models/best_weights.keras
[1m2695/2695[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m350s[0m 130ms/step - DER: 3.0416 - loss: 0.0065 - val_DER: 3.1448 - val_loss: 0.0072 - learning_rate: 0.0010
Epoch 3/10
[1m2695/2695[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - DER: 3.0003 - loss: 0.0048
Epoch 3: val_DER improved from 3.14476 to 3.12581, saving model to ./models/best_weights.keras
[1m2695/2695[0m [

In [52]:
model.save("BiLSTM_model.keras")

## Load Best Model 
Load the best model saved during training locally

In [53]:
MODEL_PATH = './models/BiLSTM_model.keras'
model = tf.keras.models.load_model(MODEL_PATH, custom_objects={"DERMetric": DERMetric})
model.save_weights("BiLSTM_weights.weights.h5")



ValueError: File not found: filepath=./models/BiLSTM_model.keras. Please ensure the file is an accessible `.keras` zip file.

<h2> Test Val</h2>

In [None]:
y_pred = model.predict({'char_input': x_val_char, 'word_input': x_val_word, 'position_input': x_val_position})
y_pred_classes = np.argmax(y_pred, axis=-1)

In [None]:
y_true = y_val

accuracy = accuracy_score(y_true.flatten(), y_pred_classes.flatten())
print(f'Validation Accuracy: {accuracy:.4f}')

In [None]:
der_non_last, der_last, der_overall = calculate_der_by_position(x_val_char, y_true, y_pred_classes, char_to_id)

print("\n" + "="*60)
print("DER Analysis by Character Position in Words")
print("="*60)
print(f"DER for non-last characters: {der_non_last:.2f}%")
print(f"DER for last characters:     {der_last:.2f}%")
print(f"Overall DER:                 {der_overall:.2f}%")
print(f"\nAccuracy for non-last characters: {100 - der_non_last:.2f}%")
print(f"Accuracy for last characters:     {100 - der_last:.2f}%")
print(f"Total Accuracy: {100 - der_overall:.2f}%")
print("="*60)