# Download data and models

In [75]:
# GDRIVE_ID_DATA = "1ONRQ36PFPnYNA4R6ZlmM7UQJ4LiAzEH0"
# !gdown $GDRIVE_ID_DATA -O Arabic-Text-Diacritization.zip
# !unzip Arabic-Text-Diacritization.zip

In [76]:
# GDRIVE_ID_LSTM_MODEL = "1kLRQ3o7m57qK1OJOTA-K9OBYL29zuXjo"
# !gdown $GDRIVE_ID_LSTM_MODEL -O LSTM.joblib

In [77]:
# import tarfile

# file_path = "/kaggle/input/tashkeela/Tashkeela-arabic-diacritized-text-utf8-0.3.tar.bz2"
# extract_path = "/kaggle/working/tashkeela_extracted"

# # Extract tar.bz2 file
# with tarfile.open(file_path, "r:bz2") as tar:
#     tar.extractall(path=extract_path)

# extract_path

<h1> Arabic letters and diacritics

In [78]:
import pickle
import re
import pyarabic.araby as araby
import pyarabic.number as number
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import unicodedata
import tensorflow as tf
from tensorflow.keras import layers, models, Input, Model
import joblib
from sklearn.metrics import accuracy_score

<h2> Constants

In [79]:
window_size = 1000

## GPU Configuration for Kaggle
Check GPU availability and configure TensorFlow to use GPU

In [80]:
# Check if GPU is available
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Get GPU details
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Set memory growth to avoid OOM errors
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        
        print(f"GPU detected: {gpus}")
        print(f"GPU Name: {tf.test.gpu_device_name()}")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU found. Training will use CPU.")

Num GPUs Available:  1
Physical devices cannot be modified after being initialized


## Load Dictionaries and Create Mappings

In [81]:
arabic_letters = []
diacritics = []
diacritics_to_id = {}
with open('./utils/arabic_letters.pickle', 'rb') as f:
    arabic_letters = pickle.load(f)
with open('./utils/diacritics.pickle', 'rb') as f:
    diacritics = pickle.load(f)
with open('./utils/diacritic2id.pickle', 'rb') as f:
    diacritics_to_id = pickle.load(f)

arabic_letters_sorted = sorted(arabic_letters)
char_to_id = {char: idx + 1 for idx, char in enumerate(arabic_letters_sorted)}
char_to_id['<PAD>'] = 0
char_to_id['UNK'] = len(char_to_id)

In [82]:
# Build word vocabulary from training data
def build_word_vocabulary(data):
    """
    Build word vocabulary from training data
    
    Args:
        data: List of text samples
    
    Returns:
        Dictionary mapping words to IDs
    """
    word_counts = {}
    for text in data:
        # Remove diacritics for word tokenization
        text_no_diac = araby.strip_diacritics(text)
        words = araby.tokenize(text_no_diac)
        for word in words:
            if word.strip():  # Skip empty strings
                word_counts[word] = word_counts.get(word, 0) + 1
    
    # Sort by frequency and create word_to_id mapping
    sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    
    word_to_id = {'<PAD>': 0, '<UNK>': 1}
    for idx, (word, _) in enumerate(sorted_words):
        word_to_id[word] = idx + 2
    
    return word_to_id

# Will be populated after loading training data
word_to_id = {}

<h2> Read train and val data

In [83]:
train_data = []
val_data = []
with open('./data/train.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        train_data.append(line.strip())
with open('./data/val.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        val_data.append(line.strip())
print(len(train_data))
print(len(val_data))

50000
2500


<h2> Clean data

In [84]:
def clean_arabic_text(text):
    """
    Clean text to keep only Arabic letters, diacritics, and spaces
    """
    # Create a set of allowed characters (Arabic letters + diacritics + space)
    allowed_chars = arabic_letters.union(diacritics, {' ', '\t', '\n'})
    
    # Filter the text to keep only allowed characters
    cleaned_text = ''.join(char for char in text if char in allowed_chars)
    
    # Normalize whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text


def split_sentences(sentences, window_size=window_size):
    all_segments = []
    
    for sentence in sentences:
        words = araby.tokenize(sentence)
        current_segment = []
        current_len = 0
        
        for word in words:
            word_len = len(word)
            add_space = 1 if current_segment else 0
            
            if current_len + word_len + add_space <= window_size:
                current_segment.append(word)
                current_len += word_len + add_space
            else:
                # save the segment
                if current_segment:
                    all_segments.append(" ".join(current_segment))
                
                # start new segment
                current_segment = [word]
                current_len = word_len
        
        # append the final segment of the sentence
        if current_segment:
            all_segments.append(" ".join(current_segment))

    return all_segments



def sentence_tokeniz(sentences):
    tokenized_sentences = []
    for sentence in sentences:
        subsentences = araby.sentence_tokenize(sentence)
        tokenized_sentences.extend(subsentences)
    return tokenized_sentences
        


In [85]:
train_data = sentence_tokeniz(train_data)
val_data = sentence_tokeniz(val_data)

for i in range(len(train_data)):
    train_data[i] = clean_arabic_text(train_data[i])
for i in range(len(val_data)):
    val_data[i] = clean_arabic_text(val_data[i])

train_data = split_sentences(train_data, window_size)
val_data = split_sentences(val_data, window_size)

# Build word vocabulary from training data
print("Building word vocabulary...")
word_to_id = build_word_vocabulary(train_data)
print(f"Word vocabulary size: {len(word_to_id)}")

Building word vocabulary...
Word vocabulary size: 105864


In [86]:
def is_diacritic(ch):
    # Unicode combining marks (Arabic diacritics are combining marks)
    return unicodedata.combining(ch) != 0

def extract_base_and_diacritics(text):
    # normalize to NFC so base+combining marks are consistent
    text = unicodedata.normalize('NFC', text)
    bases = []
    diacs = []
    current_base = None
    current_diac = ''
    for ch in text:
        if is_diacritic(ch):
            # accumulate diacritics for current base
            current_diac += ch
        else:
            # new base character
            if current_base is not None:
                bases.append(current_base)
                diacs.append(current_diac)
            current_base = ch
            current_diac = ''
    # append last
    if current_base is not None:
        bases.append(current_base)
        diacs.append(current_diac)
    return bases, diacs

In [87]:
def extract_char_and_word_features(text, word_to_id):
    """
    Extract both character-level and word-level features from text
    
    Args:
        text: Input text with diacritics
        word_to_id: Dictionary mapping words to IDs
    
    Returns:
        Tuple of (char_ids, diacritic_ids, word_ids, word_positions)
        - char_ids: List of character IDs
        - diacritic_ids: List of diacritic IDs for each character
        - word_ids: List of word IDs aligned with characters
        - word_positions: List indicating position in word (0=start, 1=middle, 2=end, 3=single-char word)
    """
    # Extract base characters and diacritics
    bases, diacs = extract_base_and_diacritics(text)
    
    UNKNOWN_DIACRITIC_ID = diacritics_to_id.get('', len(diacritics_to_id) - 1)

    # Convert characters to IDs
    char_ids = [char_to_id.get(c, char_to_id['UNK']) for c in bases]
    diacritic_ids = [diacritics_to_id.get(d, UNKNOWN_DIACRITIC_ID) for d in diacs]
    
    # Extract word-level features
    text_no_diac = araby.strip_diacritics(text)
    words = araby.tokenize(text_no_diac)
    
    # Create word ID sequence aligned with characters
    word_ids = []
    word_positions = []  # 0=start, 1=middle, 2=end, 3=single-char, 4=space
    
    char_idx = 0
    for word in words:
        if not word.strip():
            continue
        
        word_id = word_to_id.get(word, word_to_id['<UNK>'])
        word_len = len(word)
        
        # Assign same word ID to all characters in the word
        for i in range(word_len):
            if char_idx < len(char_ids):
                word_ids.append(word_id)
                
                # Determine position in word
                if word_len == 1:
                    word_positions.append(3)  # Single character word
                elif i == 0:
                    word_positions.append(0)  # Start of word
                elif i == word_len - 1:
                    word_positions.append(2)  # End of word
                else:
                    word_positions.append(1)  # Middle of word
                
                char_idx += 1
        
        # Handle space after word
        if char_idx < len(char_ids) and bases[char_idx] == ' ':
            word_ids.append(0)  # PAD for space
            word_positions.append(4)  # Space position
            char_idx += 1
    
    # Fill remaining with PAD
    while len(word_ids) < len(char_ids):
        word_ids.append(0)
        word_positions.append(4)  # Treat padding as space
    
    return char_ids, diacritic_ids, word_ids, word_positions

<h2> Prepare data for the model

In [88]:
# Prepare training data - extract both character and word level features
x_train_char_raw = []
y_train_raw = []
x_train_word_raw = []
x_train_word_position_raw = []

# Use a constant for unknown diacritic instead of hardcoded value
UNKNOWN_DIACRITIC_ID = diacritics_to_id.get('', len(diacritics_to_id) - 1)

print("Extracting character and word-level features from training data...")
for text in train_data:
    char_ids, diacritic_ids, word_ids, word_positions = extract_char_and_word_features(text, word_to_id)
    
    x_train_char_raw.append(char_ids)
    y_train_raw.append(diacritic_ids)
    x_train_word_raw.append(word_ids)
    x_train_word_position_raw.append(word_positions)

print(f"Training samples: {len(x_train_char_raw)}")

Extracting character and word-level features from training data...
Training samples: 172467


In [89]:
# vocab sizes (your variables)
vocab_size = len(char_to_id)
num_diacritics = len(diacritics_to_id)
word_vocab_size = len(word_to_id)

# ==============================
# 1. CHARACTER INPUT
# ==============================
char_input = Input(shape=(None,), name='char_input')
char_embedding = layers.Embedding(
    input_dim=vocab_size,
    output_dim=128,
    mask_zero=True,
    name='char_embedding'
)(char_input)

# ==============================
# 2. WORD INPUT
# ==============================
word_input = Input(shape=(None,), name='word_input')
word_embedding = layers.Embedding(
    input_dim=word_vocab_size,
    output_dim=128,
    mask_zero=True,
    name='word_embedding'
)(word_input)

# ==============================
# 3. WORD POSITION INPUT (0=start, 1=middle, 2=end, 3=single-char, 4=space)
# ==============================
position_input = Input(shape=(None,), name='position_input')
position_embedding = layers.Embedding(
    input_dim=5,  # 5 position types
    output_dim=16,
    mask_zero=False,
    name='position_embedding'
)(position_input)

# ==============================
# 4. CONCATENATE FEATURES
# ==============================
combined = layers.Concatenate(name='feature_concat')([
    char_embedding,
    word_embedding,
    position_embedding
])

# IMPORTANT FIX: Use only char mask to avoid GPU mask mismatch
combined._keras_mask = char_embedding._keras_mask

# ==============================
# 5. BiLSTM FOR DIACRITIZATION
# ==============================
lstm_out = layers.Bidirectional(
    layers.LSTM(
        256,
        return_sequences=True,
        activation='tanh',
        recurrent_activation='sigmoid'
    ),
    name='bilstm'
)(combined)

# ==============================
# 6. OUTPUT LAYER
# ==============================
output = layers.Dense(
    num_diacritics,
    activation='softmax',
    name='diacritic_output'
)(lstm_out)

# ==============================
# 7. CREATE & COMPILE MODEL
# ==============================
model = Model(
    inputs=[char_input, word_input, position_input],
    outputs=output
)

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

print("\nModel Architecture:")
print("Position encoding: 0=start, 1=middle, 2=end, 3=single-char, 4=space")
model.summary()



Model Architecture:
Position encoding: 0=start, 1=middle, 2=end, 3=single-char, 4=space


In [90]:
# Pad sequences to same length
PAD_DIACRITIC_ID = diacritics_to_id.get('', 0)  # Use empty string diacritic for padding

x_train_char = tf.keras.preprocessing.sequence.pad_sequences(x_train_char_raw, padding='post', value=0)
x_train_word = tf.keras.preprocessing.sequence.pad_sequences(x_train_word_raw, padding='post', value=0)
x_train_position = tf.keras.preprocessing.sequence.pad_sequences(x_train_word_position_raw, padding='post', value=4)
y_train = tf.keras.preprocessing.sequence.pad_sequences(y_train_raw, padding='post', value=PAD_DIACRITIC_ID)

print(f"x_train_char shape: {x_train_char.shape}")
print(f"x_train_word shape: {x_train_word.shape}")
print(f"x_train_position shape: {x_train_position.shape}")
print(f"y_train shape: {y_train.shape}")

x_train_char shape: (172467, 607)
x_train_word shape: (172467, 607)
x_train_position shape: (172467, 607)
y_train shape: (172467, 607)


In [91]:
# # Train model with multi-input (character + word features)
with tf.device('/GPU:0'):
    history = model.fit(
        {'char_input': x_train_char, 'word_input': x_train_word, 'position_input': x_train_position},
        y_train,
        epochs=10,
        batch_size=128,
        verbose=1
    )

Epoch 1/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 104ms/step - accuracy: 0.9784 - loss: 1.9717
Epoch 2/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 104ms/step - accuracy: 0.9974 - loss: 0.5436
Epoch 3/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 104ms/step - accuracy: 0.9980 - loss: 0.1661
Epoch 4/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 104ms/step - accuracy: 0.9983 - loss: 0.0683
Epoch 5/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 104ms/step - accuracy: 0.9985 - loss: 0.0332
Epoch 6/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 103ms/step - accuracy: 0.9988 - loss: 0.0178
Epoch 7/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 104ms/step - accuracy: 0.9990 - loss: 0.0103
Epoch 8/10
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 104ms/step - accuracy: 0.9991 - loss:

In [92]:
# joblib.dump(model, "/kaggle/working/LSTM2.joblib")

In [93]:
# model = joblib.load("./models/LSTM.joblib")

In [94]:
# Prepare validation data - extract both character and word level features
x_val_char_raw = []
y_val_raw = []
x_val_word_raw = []
x_val_word_position_raw = []

print("Extracting character and word-level features from validation data...")
for text in val_data:
    char_ids, diacritic_ids, word_ids, word_positions = extract_char_and_word_features(text, word_to_id)
    
    x_val_char_raw.append(char_ids)
    y_val_raw.append(diacritic_ids)
    x_val_word_raw.append(word_ids)
    x_val_word_position_raw.append(word_positions)

print(f"Validation samples: {len(x_val_char_raw)}")

Extracting character and word-level features from validation data...
Validation samples: 8332


In [95]:
# Pad validation sequences
x_val_char = tf.keras.preprocessing.sequence.pad_sequences(x_val_char_raw, padding='post', value=0)
x_val_word = tf.keras.preprocessing.sequence.pad_sequences(x_val_word_raw, padding='post', value=0)
x_val_position = tf.keras.preprocessing.sequence.pad_sequences(x_val_word_position_raw, padding='post', value=4)
y_val = tf.keras.preprocessing.sequence.pad_sequences(y_val_raw, padding='post', value=PAD_DIACRITIC_ID)

print(f"x_val_char shape: {x_val_char.shape}")
print(f"x_val_word shape: {x_val_word.shape}")
print(f"x_val_position shape: {x_val_position.shape}")
print(f"y_val shape: {y_val.shape}")

x_val_char shape: (8332, 597)
x_val_word shape: (8332, 597)
x_val_position shape: (8332, 597)
y_val shape: (8332, 597)


In [96]:
y_pred = model.predict({'char_input': x_val_char, 'word_input': x_val_word, 'position_input': x_val_position})

[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step


In [97]:
def calculate_der_by_position(x_val_char, y_true, y_pred, char_to_id):
    """
    Calculate DER separately for last characters and non-last characters in words
    
    Args:
        x_val_char: Character sequences (samples × sequence_length)
        y_true: Ground truth diacritic labels (samples × sequence_length)
        y_pred: Predicted diacritic labels (samples × sequence_length)
        char_to_id: Dictionary mapping characters to IDs
    
    Returns:
        Tuple of (DER_non_last, DER_last, overall_DER)
    """
    # Get space character ID
    space_id = char_to_id.get(' ', char_to_id.get('UNK'))
    pad_id = char_to_id.get('<PAD>', 0)
    
    non_last_errors = 0
    non_last_total = 0
    last_errors = 0
    last_total = 0
    
    # Process each sequence
    for char_seq, y_true_seq, y_pred_seq in zip(x_val_char, y_true, y_pred):
        # Find valid (non-padding) characters
        valid_mask = char_seq != pad_id
        valid_indices = np.where(valid_mask)[0]
        
        if len(valid_indices) == 0:
            continue
        
        # Identify word boundaries (spaces and end of sequence)
        i = 0
        while i < len(valid_indices):
            idx = valid_indices[i]
            
            # Skip spaces
            if char_seq[idx] == space_id:
                i += 1
                continue
            
            # Find the end of current word
            word_start = i
            while i < len(valid_indices) and char_seq[valid_indices[i]] != space_id:
                i += 1
            word_end = i - 1
            
            # Mark positions in the word
            for j in range(word_start, word_end + 1):
                pos_idx = valid_indices[j]
                
                # Skip if this position has padding in y_true
                if y_true_seq[pos_idx] == PAD_DIACRITIC_ID:
                    continue
                
                is_correct = (y_true_seq[pos_idx] == y_pred_seq[pos_idx])
                
                # Last character in word
                if j == word_end:
                    last_total += 1
                    if not is_correct:
                        last_errors += 1
                # Non-last character in word
                else:
                    non_last_total += 1
                    if not is_correct:
                        non_last_errors += 1
    
    # Calculate DER for each category
    der_non_last = (non_last_errors / non_last_total * 100) if non_last_total > 0 else 0
    der_last = (last_errors / last_total * 100) if last_total > 0 else 0
    
    total_errors = non_last_errors + last_errors
    total_chars = non_last_total + last_total
    der_overall = (total_errors / total_chars * 100) if total_chars > 0 else 0
    
    return der_non_last, der_last, der_overall

In [98]:
y_pred_classes = np.argmax(y_pred, axis=-1)
y_true = y_val

# Calculate accuracy
accuracy = accuracy_score(y_true.flatten(), y_pred_classes.flatten())
print(f'Validation Accuracy: {accuracy:.4f}')

Validation Accuracy: 0.9974


In [99]:
# Calculate DER by character position in words
der_non_last, der_last, der_overall = calculate_der_by_position(x_val_char, y_true, y_pred_classes, char_to_id)

print("\n" + "="*60)
print("DER Analysis by Character Position in Words")
print("="*60)
print(f"DER for non-last characters: {der_non_last:.2f}%")
print(f"DER for last characters:     {der_last:.2f}%")
print(f"Overall DER:                 {der_overall:.2f}%")
print(f"\nAccuracy for non-last characters: {100 - der_non_last:.2f}%")
print(f"Accuracy for last characters:     {100 - der_last:.2f}%")
print(f"Acutual Accuracy: {100 - der_overall:.2f}%")
print("="*60)


DER Analysis by Character Position in Words
DER for non-last characters: 2.97%
DER for last characters:     5.81%
Overall DER:                 3.65%

Accuracy for non-last characters: 97.03%
Accuracy for last characters:     94.19%
Acutual Accuracy: 96.35%


<h2> Test data

In [100]:
def merge(x_test, y_pred_classes, char_to_id, diacritics_to_id):
    """
    Merge character sequences with predicted diacritics to reconstruct text
    
    Args:
        x_test: numpy array of character IDs (samples × sequence_length)
        y_pred_classes: numpy array of predicted diacritic IDs (samples × sequence_length)
        char_to_id: dictionary mapping characters to IDs
        diacritics_to_id: dictionary mapping diacritics to IDs
    
    Returns:
        List of reconstructed diacritized text strings
    """
    # Create reverse mappings
    id_to_char = {v: k for k, v in char_to_id.items()}
    id_to_diacritic = {v: k for k, v in diacritics_to_id.items()}
    
    reconstructed_texts = []
    
    # Process each sample
    for char_seq, diac_seq in zip(x_test, y_pred_classes):
        text = ""
        
        for char_id, diac_id in zip(char_seq, diac_seq):
            # Skip padding
            if char_id == 0:  # PAD character
                break
            
            # Get character
            char = id_to_char.get(char_id, '')
            
            # Get diacritic
            diacritic = id_to_diacritic.get(diac_id, '')
            
            # Combine character with diacritic
            text += char + diacritic
        
        text = text.replace("UNK", " ")
        reconstructed_texts.append(text)
    
    return reconstructed_texts

In [101]:
def test_model_with_word_features(text, word_to_id):
    """
    Prepare test text for prediction with word-level features
    
    Args:
        text: Input text without diacritics
        word_to_id: Dictionary mapping words to IDs
    
    Returns:
        Tuple of (char_ids, word_ids, word_positions) padded and ready for prediction
    """
    # Strip diacritics if present
    text_clean = araby.strip_diacritics(text)
    
    # Get characters
    char_ids = [char_to_id.get(c, char_to_id['UNK']) for c in text_clean]
    
    # Get words
    words = araby.tokenize(text_clean)
    
    # Create word ID sequence aligned with characters
    word_ids = []
    word_positions = []  # 0=start, 1=middle, 2=end, 3=single-char, 4=space
    
    char_idx = 0
    for word in words:
        if not word.strip():
            continue
        
        word_id = word_to_id.get(word, word_to_id['<UNK>'])
        word_len = len(word)
        
        for i in range(word_len):
            if char_idx < len(char_ids):
                word_ids.append(word_id)
                
                # Determine position in word
                if word_len == 1:
                    word_positions.append(3)  # Single character word
                elif i == 0:
                    word_positions.append(0)  # Start of word
                elif i == word_len - 1:
                    word_positions.append(2)  # End of word
                else:
                    word_positions.append(1)  # Middle of word
                
                char_idx += 1
        
        # Handle space
        if char_idx < len(char_ids) and text_clean[char_idx] == ' ':
            word_ids.append(0)
            word_positions.append(4)  # Space position
            char_idx += 1
    
    # Fill remaining
    while len(word_ids) < len(char_ids):
        word_ids.append(0)
        word_positions.append(4)
    
    # Pad to model expected shape (add batch dimension)
    char_ids_padded = tf.keras.preprocessing.sequence.pad_sequences([char_ids], padding='post', value=0)
    word_ids_padded = tf.keras.preprocessing.sequence.pad_sequences([word_ids], padding='post', value=0)
    positions_padded = tf.keras.preprocessing.sequence.pad_sequences([word_positions], padding='post', value=4)
    
    return char_ids_padded, word_ids_padded, positions_padded

In [102]:
test_sent = "هذا نص تجريبي لاختبار نموذج تشكيل النص العربي"

x_test_char, x_test_word, x_test_position = test_model_with_word_features(test_sent, word_to_id)
y_test_pred = model.predict({
    'char_input': x_test_char, 
    'word_input': x_test_word, 
    'position_input': x_test_position
})
y_test_pred_classes = np.argmax(y_test_pred, axis=-1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 267ms/step


In [103]:
y_test_pred.shape

(1, 45, 15)

In [104]:
output_sentences = merge(x_test_char, y_test_pred_classes, char_to_id, diacritics_to_id)[0]
print("Input text:", test_sent)
print("Diacritized output:", output_sentences)

Input text: هذا نص تجريبي لاختبار نموذج تشكيل النص العربي
Diacritized output: هَذَا نَص تَجْرِيبِي لِاخْتِبَارٍ نَمُوذِجٍ تَشْكِيلِ النص الْعَرَبِي
