# Arabic letters and diacritics

In [33]:
import pickle
import re
import pyarabic.araby as araby
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Input, Model
import pandas as pd

## Constants

In [34]:
# Character budget per segment; split_sentences uses it as its default window
# when cutting long sentences at word boundaries.
window_size = 1000

# Pickled lookup tables loaded in the "Load Dictionaries" cell.
ARABIC_LETTERS_PATH = './utils/arabic_letters.pickle'
DIACRITICS_PATH = './utils/diacritics.pickle'
DIACRITICS_TO_ID_PATH = './utils/diacritic2id.pickle'
CHAR_TO_ID_PATH = './utils/char2id.pickle'
WORD_TO_ID_PATH = './utils/word2id.pickle'

# Text file read line by line as the input to diacritize.
TEST_PATH = './data/dataset_no_diacritics.txt'

# NOTE(review): not referenced anywhere else in the visible notebook.
MODEL_WEIGHTS_PATH = './models/best_weights.ckpt'

# Saved Keras model loaded with tf.keras.models.load_model.
MODEL_PATH = './models/BiLSTM_V2_model.keras'

# CSV whose 'label' column is compared against the filtered predictions.
GOLD_PATH = './data/sample_test_set_gold.csv'

## GPU Configuration for Kaggle
Check GPU availability and configure TensorFlow to use GPU

In [35]:
# Ask TensorFlow once which GPUs are visible and reuse the answer below.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))

if not gpus:
    print("No GPU found. Training will use CPU.")
else:
    try:
        # Enable memory growth on every visible GPU before reporting them.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)

        print(f"GPU detected: {gpus}")
        print(f"GPU Name: {tf.test.gpu_device_name()}")
    except RuntimeError as e:
        # Report the configuration failure and carry on with the notebook.
        print(e)

Num GPUs Available:  0
No GPU found. Training will use CPU.


## Load Dictionaries and Create Mappings

In [36]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point the *_PATH constants at files you produced or trust.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Lookup tables used by the cleaning and feature-extraction cells below.
# arabic_letters must support .union() (see clean_arabic_text); the three
# *_to_id objects are used as dictionaries.
arabic_letters = _load_pickle(ARABIC_LETTERS_PATH)
diacritics = _load_pickle(DIACRITICS_PATH)
diacritics_to_id = _load_pickle(DIACRITICS_TO_ID_PATH)
char_to_id = _load_pickle(CHAR_TO_ID_PATH)
word_to_id = _load_pickle(WORD_TO_ID_PATH)

# Gold labels; the evaluation cell compares its 'label' column with the predictions.
gold = pd.read_csv(GOLD_PATH)

## Read test data

In [37]:
# One entry per line of the input file, with surrounding whitespace removed.
with open(TEST_PATH, 'r', encoding='utf-8') as f:
    test_data = [raw_line.strip() for raw_line in f]
print(len(test_data))

2461


## Clean data

In [38]:
def clean_arabic_text(text):
    """
    Drop every character that is not an Arabic letter, a diacritic or
    whitespace, then collapse each whitespace run into a single space and
    trim both ends.
    """
    whitespace = {' ', '\t', '\n'}
    keep = arabic_letters.union(diacritics, whitespace)

    kept_chars = filter(keep.__contains__, text)
    collapsed = re.sub(r'\s+', ' ', ''.join(kept_chars))
    return collapsed.strip()


def split_sentences(sentences, window_size=window_size):
    """
    Cut each sentence into space-joined segments, breaking only between the
    tokens returned by araby.tokenize.

    A token is added to the current segment while the segment (tokens plus
    one separating space each) stays within `window_size` characters;
    otherwise the segment is closed and the token starts a new one. A single
    token longer than `window_size` therefore becomes its own over-long
    segment. Sentences that yield no tokens contribute nothing.

    Returns:
        Flat list of segments for all sentences, in input order.
    """
    segments = []

    def flush(tokens):
        # Emit the pending tokens as one segment, if there are any.
        if tokens:
            segments.append(" ".join(tokens))

    for sentence in sentences:
        pending = []
        pending_len = 0

        for token in araby.tokenize(sentence):
            # Length this token would add, including the joining space.
            needed = len(token) + (1 if pending else 0)

            if pending_len + needed > window_size:
                flush(pending)
                pending, pending_len = [token], len(token)
            else:
                pending.append(token)
                pending_len += needed

        flush(pending)

    return segments


def sentence_tokeniz(sentences):
    """
    Run araby.sentence_tokenize on every input string and return all the
    resulting pieces as one flat list, in input order.
    """
    return [
        piece
        for sentence in sentences
        for piece in araby.sentence_tokenize(sentence)
    ]

In [39]:
# Preprocessing pipeline: sentence-split each line, clean every sentence,
# then cut the cleaned sentences into windows of at most `window_size` chars.
test_data = sentence_tokeniz(test_data)
test_data = [clean_arabic_text(sentence) for sentence in test_data]
test_data = split_sentences(test_data, window_size)

In [40]:
import unicodedata

def is_diacritic(ch):
    """Return True when `ch` has a non-zero Unicode canonical combining class."""
    combining_class = unicodedata.combining(ch)
    return combining_class != 0

def extract_base_and_diacritics(text):
    """
    Split NFC-normalised `text` into parallel lists of base characters and
    the combining marks that follow each of them.

    Returns:
        (bases, diacs): bases[i] is a non-combining character and diacs[i] is
        the (possibly empty) string of combining marks found between it and
        the next base. Marks that appear before the first base are discarded.
    """
    normalized = unicodedata.normalize('NFC', text)
    bases, diacs = [], []

    for ch in normalized:
        if not is_diacritic(ch):
            # A new base starts with no marks attached yet.
            bases.append(ch)
            diacs.append('')
        elif bases:
            # Attach the mark to the most recent base.
            diacs[-1] += ch

    return bases, diacs

def extract_char_and_word_features(text, word_to_id):
    """
    Build the three aligned, per-character input sequences for one text.

    Diacritics are stripped from `text` first, so every returned list has one
    entry per character of the stripped text (spaces included).

    Args:
        text: Input text (any diacritics it contains are removed).
        word_to_id: Dictionary mapping words to IDs; must contain '<UNK>'
            when the text contains a word that is not in the dictionary.

    Returns:
        Tuple of (char_ids, word_ids, word_positions)
        - char_ids: ID of each character from the module-level char_to_id,
          falling back to char_to_id['UNK'].
        - word_ids: ID of the word each character belongs to; 0 for spaces
          and for any trailing characters not covered by a token.
        - word_positions: 0 = inside a word (not its last character),
          1 = last character of a word, 2 = space / not covered by a token.
    """
    text_no_diac = araby.strip_diacritics(text)
    bases = list(text_no_diac)

    char_ids = [char_to_id.get(c, char_to_id['UNK']) for c in bases]
    n_chars = len(char_ids)

    word_ids = []
    word_positions = []  # 0=not end, 1=end, 2=space

    # char_idx walks the character sequence in step with the tokens so that
    # each token's ID is repeated once per character it spans.
    char_idx = 0
    for word in araby.tokenize(text_no_diac):
        if not word.strip():
            continue

        word_id = word_to_id.get(word, word_to_id['<UNK>'])
        last = len(word) - 1

        for i in range(len(word)):
            if char_idx >= n_chars:
                # Tokens ran past the character sequence; nothing left to label.
                break
            word_ids.append(word_id)
            word_positions.append(1 if i == last else 0)
            char_idx += 1

        # A single space directly after the word gets the filler word ID.
        if char_idx < n_chars and bases[char_idx] == ' ':
            word_ids.append(0)
            word_positions.append(2)
            char_idx += 1

    # Keep all three sequences the same length as char_ids.
    missing = n_chars - len(word_ids)
    word_ids.extend([0] * missing)
    word_positions.extend([2] * missing)

    return char_ids, word_ids, word_positions


## Prepare data for the model

In [41]:
print("Extracting character and word-level features from testing data...")

# One (char_ids, word_ids, word_positions) triple per segment.
extracted_features = [
    extract_char_and_word_features(segment, word_to_id)
    for segment in test_data
]

# Unpack the triples into the three parallel, still-unpadded input lists.
x_test_char_raw = [features[0] for features in extracted_features]
x_test_word_raw = [features[1] for features in extracted_features]
x_test_word_position_raw = [features[2] for features in extracted_features]

print(f"Testing samples: {len(x_test_char_raw)}")

Extracting character and word-level features from testing data...
Testing samples: 5735


In [42]:
_pad = tf.keras.preprocessing.sequence.pad_sequences

# Right-pad every sequence to the longest one. Characters and words are
# padded with 0; positions are padded with 2, the same code used for spaces.
x_test_char = _pad(x_test_char_raw, padding='post', value=0)
x_test_word = _pad(x_test_word_raw, padding='post', value=0)
x_test_position = _pad(x_test_word_position_raw, padding='post', value=2)

for label, padded in (
    ("x_test_char", x_test_char),
    ("x_test_word", x_test_word),
    ("x_test_position", x_test_position),
):
    print(f"{label} shape: {padded.shape}")

x_test_char shape: (5735, 999)
x_test_word shape: (5735, 999)
x_test_position shape: (5735, 999)


In [43]:
class DERMetric(tf.keras.metrics.Metric):
    """
    Streaming Diacritic Error Rate: the percentage of counted positions whose
    predicted label differs from the true label.

    Positions whose true label equals `pad_id` (and `space_id`, when given)
    are excluded from both the error count and the total.

    Args:
        pad_id: Label value to ignore.
        space_id: Optional second label value to ignore.
        name: Metric name reported by Keras.
    """

    def __init__(self, pad_id=0, space_id=None, name='DER', **kwargs):
        super().__init__(name=name, **kwargs)
        self.pad_id = pad_id
        self.space_id = space_id
        self.total = self.add_weight(name='total', initializer='zeros')
        self.errors = self.add_weight(name='errors', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """
        Accumulate errors and counted positions for one batch.

        y_true: shape (batch_size, seq_len)
        y_pred: shape (batch_size, seq_len, num_classes)
        sample_weight: accepted for API compatibility; not used.
        """
        # Labels may arrive as float or int64; compare in the same dtype as
        # the argmax output.
        y_true = tf.cast(y_true, tf.int32)
        y_pred_labels = tf.argmax(y_pred, axis=-1, output_type=tf.int32)

        mask = tf.not_equal(y_true, self.pad_id)
        if self.space_id is not None:
            mask = tf.logical_and(mask, tf.not_equal(y_true, self.space_id))

        # An error is a mismatch at a counted position. Negating
        # (match AND mask) would also count every ignored position as an
        # error, so the mismatch is masked directly instead.
        wrong = tf.logical_and(tf.not_equal(y_true, y_pred_labels), mask)

        self.errors.assign_add(tf.reduce_sum(tf.cast(wrong, tf.float32)))
        self.total.assign_add(tf.reduce_sum(tf.cast(mask, tf.float32)))

    def result(self):
        # Returns 0 instead of NaN when no position has been counted yet.
        return tf.math.divide_no_nan(self.errors, self.total) * 100  # DER in %

    def reset_state(self):
        self.errors.assign(0)
        self.total.assign(0)

    def get_config(self):
        """Include pad_id/space_id so a saved model restores the same metric."""
        config = super().get_config()
        config.update({'pad_id': self.pad_id, 'space_id': self.space_id})
        return config

In [44]:
def calculate_der_by_position(x_val_char, y_true, y_pred, char_to_id):
    """
    Compute the diacritic error rate (in %) separately for the last character
    of each word and for all other characters, plus the combined rate.

    Words are runs of non-space characters after padding positions have been
    removed from the sequence. Positions whose true label equals
    diacritics_to_id[''] (0 when that key is absent) are not counted.

    Args:
        x_val_char: Character ID sequences (samples x sequence_length)
        y_true: Ground truth diacritic labels (samples x sequence_length)
        y_pred: Predicted diacritic labels (samples x sequence_length)
        char_to_id: Dictionary mapping characters to IDs

    Returns:
        Tuple of (DER_non_last, DER_last, overall_DER); a rate is 0 when
        nothing was counted for it.
    """
    ignored_label = diacritics_to_id.get('', 0)

    space_id = char_to_id.get(' ', char_to_id.get('UNK'))
    pad_id = char_to_id.get('<PAD>', 0)

    # tallies[is_last_char] = [errors, total]
    tallies = {False: [0, 0], True: [0, 0]}

    for chars, truths, preds in zip(x_val_char, y_true, y_pred):
        kept = np.where(chars != pad_id)[0]

        # Group the non-padding positions into words, splitting on spaces.
        words = []
        current = []
        for pos in kept:
            if chars[pos] == space_id:
                if current:
                    words.append(current)
                    current = []
            else:
                current.append(pos)
        if current:
            words.append(current)

        for word in words:
            final_pos = word[-1]
            for pos in word:
                if truths[pos] == ignored_label:
                    continue

                tally = tallies[bool(pos == final_pos)]
                tally[1] += 1
                if truths[pos] != preds[pos]:
                    tally[0] += 1

    non_last_errors, non_last_total = tallies[False]
    last_errors, last_total = tallies[True]

    der_non_last = (non_last_errors / non_last_total * 100) if non_last_total > 0 else 0
    der_last = (last_errors / last_total * 100) if last_total > 0 else 0

    total_errors = non_last_errors + last_errors
    total_chars = non_last_total + last_total
    der_overall = (total_errors / total_chars * 100) if total_chars > 0 else 0

    return der_non_last, der_last, der_overall

## Load and test the model

In [45]:
def build_model(vocab_size, word_vocab_size, num_diacritics, pad_id):
    """
    Build and compile the three-input BiLSTM diacritization model.

    Character, word and word-position IDs are embedded separately,
    concatenated per time step, passed through one bidirectional LSTM and
    classified per character with a softmax layer.

    Args:
        vocab_size: Number of character IDs (input_dim of the char embedding).
        word_vocab_size: Number of word IDs (input_dim of the word embedding).
        num_diacritics: Number of output classes per character.
        pad_id: Forwarded to DERMetric as `space_id` (see the note below).

    Returns:
        The compiled tf.keras Model with inputs named 'char_input',
        'word_input' and 'position_input'.
    """
    char_input = Input(shape=(None,), name='char_input')
    char_embedding = layers.Embedding(
        input_dim=vocab_size,
        output_dim=128,
        mask_zero=True,
        name='char_embedding'
    )(char_input)

    word_input = Input(shape=(None,), name='word_input')
    word_embedding = layers.Embedding(
        input_dim=word_vocab_size,
        output_dim=128,
        mask_zero=True,
        name='word_embedding'
    )(word_input)

    # Three position classes: 0 = inside word, 1 = word end, 2 = space.
    position_input = Input(shape=(None,), name='position_input')
    position_embedding = layers.Embedding(
        input_dim=3,
        output_dim=16,
        mask_zero=False,
        name='position_embedding'
    )(position_input)

    combined = layers.Concatenate(name='feature_concat')([
        char_embedding,
        word_embedding,
        position_embedding
    ])

    # Overwrite the mask on the concatenated tensor with the character mask.
    # NOTE(review): this writes a private Keras attribute; presumably it keeps
    # the word embedding's mask (word ID 0 is also used for spaces) from
    # masking real characters - confirm on the Keras version in use.
    combined._keras_mask = char_embedding._keras_mask

    lstm_out = layers.Bidirectional(
        layers.LSTM(
            256,
            return_sequences=True,
            activation='tanh',
            recurrent_activation='sigmoid'
        ),
        name='bilstm'
    )(combined)

    output = layers.Dense(
        num_diacritics,
        activation='softmax',
        name='diacritic_output'
    )(lstm_out)

    model = Model(
        inputs=[char_input, word_input, position_input],
        outputs=output
    )

    # NOTE(review): the `pad_id` argument is passed as the metric's space_id
    # while the metric's own pad_id is hard-coded to 0 - confirm this is the
    # intended label convention.
    der_metric = DERMetric(pad_id=0, space_id=pad_id)

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=[der_metric]
    )

    print("\nModel Architecture:")
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print() (that would add a stray "None" line).
    model.summary()

    return model


In [46]:
# Register the notebook's DERMetric class so load_model can resolve that name.
custom_objects = {"DERMetric": DERMetric}
model = tf.keras.models.load_model(MODEL_PATH, custom_objects=custom_objects)

In [47]:
# Inputs are passed by name; the keys are the model's input layer names.
model_inputs = {
    'char_input': x_test_char,
    'word_input': x_test_word,
    'position_input': x_test_position,
}
y_pred = model.predict(model_inputs)

# Most probable diacritic class for every position.
y_pred_classes = y_pred.argmax(axis=-1)
print(y_pred_classes.shape)

[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m519s[0m 3s/step
(5735, 999)


In [48]:
id_to_diacritic = {idx: diacritic for diacritic, idx in diacritics_to_id.items()}
id_to_char = {idx: char for char, idx in char_to_id.items()}

space_id = char_to_id.get(' ', None)

# Keep predictions only where the input character is not padding (0 or the
# 'PAD' entry), not the 'UNK' character and, when spaces have an ID, not a space.
valid_mask = (
    (x_test_char != 0)
    & (x_test_char != char_to_id.get('PAD', 0))
    & (x_test_char != char_to_id.get('UNK'))
)
if space_id is not None:
    valid_mask = valid_mask & (x_test_char != space_id)

# Boolean indexing walks the 2-D array row by row, so the flattened order is
# sample 0's kept predictions, then sample 1's, and so on.
all_predictions = y_pred_classes[valid_mask].tolist()

y_pred_classes_filtered = np.array(all_predictions)
df = pd.DataFrame({
    "ID": np.arange(len(y_pred_classes_filtered)),
    "label": y_pred_classes_filtered
})

# save to CSV
df.to_csv("predictions.csv", index=False)

# Second file: only the rows that the test CSV flags as case endings.
df_test = pd.read_csv("test_no_diacritics.csv")
is_case_ending = df_test["case_ending"].astype(str).str.upper().eq("TRUE")
df_test_filtered = df_test[is_case_ending]

df_pred_filtered = df.loc[df_test_filtered.index]
df_pred_filtered.to_csv("predictions_case_ending.csv", index=False)

print(f"Total predictions after filtering padding and spaces: {len(y_pred_classes_filtered)}")
print(f"Gold labels count: {len(gold)}")


Total predictions after filtering padding and spaces: 237240
Gold labels count: 182


In [49]:
n_predictions = len(y_pred_classes_filtered)
n_gold = len(gold)

if n_predictions != n_gold:
    # The two label lists cannot be compared element-wise; report the gap.
    print(f"ERROR: Mismatch in lengths!")
    print(f"Predictions: {n_predictions}")
    print(f"Gold labels: {n_gold}")
    print(f"Difference: {abs(n_predictions - n_gold)}")

    print("\nFirst 10 predictions:", y_pred_classes_filtered[:10])
    print("First 10 gold labels:", gold['label'].values[:10])
else:
    n_correct = (gold['label'] == y_pred_classes_filtered).sum()
    accuracy = n_correct / n_gold * 100
    print(f"\n{'='*60}")
    print(f"Test Set Evaluation")
    print(f"{'='*60}")
    print(f"Total characters (excluding spaces): {n_gold}")
    print(f"Correct predictions: {n_correct}")
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"DER (Diacritization Error Rate): {100 - accuracy:.2f}%")
    print(f"{'='*60}")






ERROR: Mismatch in lengths!
Predictions: 237240
Gold labels: 182
Difference: 237058

First 10 predictions: [ 4 14 14  6  0  0 14  4 14  6]
First 10 gold labels: [ 0  6  0  4  6  0  4 14  4  4]


In [None]:
def reconstruct_diacritized_text(char_sequences, pred_sequences, char_to_id, id_to_diacritic, original_texts):
    """
    Rebuild diacritized text from character IDs and predicted diacritic IDs.

    Padding positions are skipped (they are not written as spaces), and no
    diacritic is attached to whitespace characters. A character stored as the
    'UNK' ID is restored from the matching position of the original segment
    when that segment has exactly one character per non-padding ID;
    otherwise the vocabulary entry for that ID is written.

    Args:
        char_sequences: Array of character IDs (one row per segment)
        pred_sequences: Array of predicted diacritic IDs, aligned with
            char_sequences
        char_to_id: Dictionary mapping characters to IDs
        id_to_diacritic: Dictionary mapping IDs to diacritics
        original_texts: List of original text segments, one per row

    Returns:
        List of diacritized text segments, one per original text.
    """
    id_to_char = {v: k for k, v in char_to_id.items()}
    # ID 0 is always treated as padding, in addition to the '<PAD>' entry.
    pad_ids = {char_to_id.get('<PAD>', 0), 0}
    unk_id = char_to_id.get('UNK')

    diacritized_texts = []

    for sample_chars, sample_preds, original_text in zip(char_sequences, pred_sequences, original_texts):
        # The original segment can only be used to recover unknown characters
        # when it lines up one-to-one with the non-padding IDs.
        real_length = sum(1 for char_id in sample_chars if char_id not in pad_ids)
        aligned = len(original_text) == real_length

        pieces = []
        text_pos = 0  # index of the current non-padding character
        for char_id, diac_id in zip(sample_chars, sample_preds):
            if char_id in pad_ids:
                continue

            char = id_to_char.get(char_id, '')
            if aligned and char_id == unk_id:
                char = original_text[text_pos]
            text_pos += 1

            pieces.append(char)
            # A combining mark after a space is never meaningful; drop it.
            if not char.isspace():
                pieces.append(id_to_diacritic.get(diac_id, ''))

        diacritized_texts.append(''.join(pieces))

    return diacritized_texts

print("Reconstructing diacritized text...")
# One output string per input segment, built from the padded character IDs
# and the predicted diacritic class at each position.
diacritized_results = reconstruct_diacritized_text(
    char_sequences=x_test_char,
    pred_sequences=y_pred_classes,
    char_to_id=char_to_id,
    id_to_diacritic=id_to_diacritic,
    original_texts=test_data,
)

In [None]:
# Write one diacritized segment per line.
output_file = "diacritized_output.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(f"{text}\n" for text in diacritized_results)