In [None]:
!pip install -q "protobuf==3.20.*"
!pip install -q transformers arabert preprocess

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 3.20.3 which is incompatible.
onnx 1.18.0 requires protobuf>=4.25.1, but you have protobuf 3.20.3 which is incompatible.
a2a-sdk 0.3.10 requires protobuf>=5.29.5, but you have protobuf 3.20.3 which is incompatible.
ray 2.51.1 requires click!=8.3.0,>=7.0, but you have click 8.3.0 which is incompatible.
bigframes 2.12.0 requires rich<14,>=12.4.4, but you have rich 14.2.0 which is incompatible.
tensorflow-metadata 1.17.2 requires protobuf>=4.25.2; python_version >= "3.11", but you have protobuf 3.20.

In [None]:
import re
import pickle
import numpy as np
import tensorflow as tf
import itertools
import json
from tqdm import tqdm
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from arabert.preprocess import ArabertPreprocessor

2025-12-09 16:41:38.717103: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765298498.949045      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765298499.022595      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Utils

In [None]:
# JSON file mapping each character to an integer index (read by get_char_map).
char2idx_path = '/kaggle/input/arabicia-3/char2idx.json'
# Pickle file holding the Arabic letters (read by get_arabic_characters).
arabic_letters_map = '/kaggle/input/arabic-letters-map/arabic_letters.pickle'
# model_path = '/kaggle/input/arabic-diacritizer-residual/keras/1/1/model_with_features_v2_res.keras'
# NumPy array of character embeddings; rows are looked up by character index.
char_embeddings_path = '/kaggle/input/embeddings-chars/keras/default/1/embedding_matrix(1).npy'

In [None]:
# Text file to diacritize, read line by line by prepare_for_predict.
test_text_path = '/kaggle/input/normal/dataset_no_diacritics.txt'

In [None]:
def get_diacritics_map():
    """Build the diacritic <-> class-id lookup tables.

    Returns:
        tuple: ``(diacritic2id, idx2label)``. ``diacritic2id`` maps a diacritic
        string (a single mark, a two-mark combination, or ``""`` for "no
        diacritic") to an integer id in ``0..14``; ``idx2label`` is the inverse
        mapping from id back to the diacritic string.
    """
    # The table was previously loaded from JSON; it is hard-coded below instead.
    # with open(diacritic2id_path, 'r', encoding='utf-8') as f:
    #     diacritic2id = json.load(f)
    diacritic2id = {
        "َ": 0,
        "ً": 1,
        "ُ": 2,
        "ٌ": 3,
        "ِ": 4,
        "ٍ": 5,
        "ْ": 6,
        "ّ": 7,
        "َّ": 8,
        "ًّ": 9,
        "ُّ": 10,
        "ٌّ": 11,
        "ِّ": 12,
        "ٍّ": 13,
        "": 14
    }
    # Invert the mapping so predicted ids can be turned back into diacritics.
    idx2label = {v: k for k, v in diacritic2id.items()}

    return diacritic2id, idx2label

def get_char_map():
    """Load the character vocabulary and return it with its inverse.

    The JSON file at ``char2idx_path`` is read, every non-zero index is shifted
    down by one (an index of 0 is left untouched), and the inverse lookup is
    derived from the shifted mapping.

    NOTE(review): if the file contains both index 0 and index 1, they collapse
    onto 0 after the shift and the inverse keeps only the later entry — confirm
    this is intended.

    Returns:
        tuple: ``(char2idx, idx2char)``.
    """
    with open(char2idx_path, 'r', encoding='utf-8') as handle:
        raw_vocab = json.load(handle)

    char2idx = {
        symbol: (index - 1 if index != 0 else index)
        for symbol, index in raw_vocab.items()
    }
    idx2char = {index: symbol for symbol, index in char2idx.items()}

    return char2idx, idx2char

In [None]:
def get_arabic_characters():
    """Return the object unpickled from ``arabic_letters_map``.

    Callers use the result for ``char in ...`` membership tests.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only point
    ``arabic_letters_map`` at a trusted file.
    """
    with open(arabic_letters_map, 'rb') as handle:
        return pickle.load(handle)

In [None]:
# Notebook-level lookup tables: character <-> index and diacritic <-> class id.
char2idx, idx2char = get_char_map()
diacritic2id, idx2label = get_diacritics_map()

# Preprocessing

In [None]:
# Matches a single code point in the range U+064B..U+0652 (Arabic diacritic marks).
DIACRITICS_PATTERN = re.compile(r'[\u064B-\u0652]')

In [None]:
def split_text_and_diacritics(text):
    """Separate base characters from the diacritics attached to them.

    Args:
        text: string that may contain diacritic marks.

    Returns:
        tuple: ``(letters, labels)`` where ``letters`` is ``text`` with every
        character matched by ``DIACRITICS_PATTERN`` removed, and ``labels`` is a
        list (one entry per remaining character) holding the concatenated
        diacritics that followed that character (``""`` when none did).
        Diacritics that appear before any base character are dropped.
    """
    base_chars = []
    marks_per_char = []

    for symbol in text:
        if not DIACRITICS_PATTERN.match(symbol):
            # A base character opens a new, initially empty, label slot.
            base_chars.append(symbol)
            marks_per_char.append("")
        elif marks_per_char:
            # A diacritic is attached to the most recent base character.
            marks_per_char[-1] += symbol

    return "".join(base_chars), marks_per_char

In [None]:
!pip install -q emoji

In [None]:
import emoji
import unicodedata

In [None]:
# Parenthesised "digits / digits" group, e.g. "(3/12)", with optional inner whitespace.
numeric_pattern = r"\(\s*\d+\s*/\s*\d+\s*\)"
# A single ASCII letter.
english = r"[a-zA-Z]"
# A run of digits together with any whitespace around it.
numbers = r"\s*\d+\s*"
# A run of digits followed by a hyphen (plus surrounding whitespace), e.g. "1- ".
numering_items = r"\s*\d+\s*[-]\s*"
# Bracket or quote pairs with nothing but whitespace inside: (), [], {}, <<>>, "" and ''.
empty_brackets = r'\(\s*\)|\[\s*\]|\{\s*\}|<<\s*>>|"\s*"|\'\s*\''


def clean_punctuation_sequence(text):
    """Collapse repeated punctuation into a single occurrence.

    A run of the *same* punctuation mark, optionally separated by whitespace
    (for example ``"!!"`` or ``". ."``), is replaced by one copy of that mark.
    Runs of different marks (``"?!"``) are left untouched.
    """
    marks = re.escape(".,:;!?'\"/،؛؟")
    repeated_mark = re.compile(rf"([{marks}])(?:\s*\1)+")

    # Keep only the first captured mark of every repeated run.
    return repeated_mark.sub(lambda hit: hit.group(1), text)

def remove_emojis(text):
    """Return ``text`` with every emoji deleted.

    Delegates to ``emoji.replace_emoji`` with an empty replacement string.
    """
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji


def remove_unbalanced_brackets(text):
    """Delete bracket characters that have no matching partner.

    A stack of pending openers is maintained while scanning ``text``. A closer
    that does not match the opener on top of the stack (or arrives when the
    stack is empty) is removed, and so is every opener still pending at the end.

    NOTE(review): ``"`` and ``'`` appear both as openers and as closers. The
    opener test runs first, so a quote is always pushed and never popped; the
    net effect is that every quote character is removed, balanced or not.
    Downstream steps may rely on quotes never reaching them, so this behaviour
    is kept as-is — confirm before changing it.

    Args:
        text: input string.

    Returns:
        str: ``text`` without the unmatched bracket characters.
    """
    closer_to_opener = {')': '(', '}': '{', ']': '[', '>':'<', '»': '«', '"':'"', "'":"'"}
    opening_marks = {'(', '{', '[', '<', '«', '"', "'"}

    pending = []   # (opener, position) pairs that are still waiting for a closer
    drop = set()   # positions of the characters to delete

    for pos, symbol in enumerate(text):
        if symbol in opening_marks:
            pending.append((symbol, pos))
            continue

        expected_opener = closer_to_opener.get(symbol)
        if expected_opener is None:
            # Not a bracket at all.
            continue

        if pending and pending[-1][0] == expected_opener:
            pending.pop()
        else:
            drop.add(pos)

    # Whatever is still open never found its closer.
    drop.update(pos for _, pos in pending)

    return "".join(symbol for pos, symbol in enumerate(text) if pos not in drop)

def remove_formatting_codes(text):
    """Drop characters whose Unicode general category is ``Cf`` or ``No``.

    ``Cf`` covers invisible format controls (e.g. directional marks) and ``No``
    covers "other" numerics such as superscript digits.
    """
    excluded_categories = ("Cf", "No")
    return "".join(
        filter(lambda ch: unicodedata.category(ch) not in excluded_categories, text)
    )

def initial_process(line):
    """Normalise one raw line before it is chunked and fed to the models.

    Steps, in order:
      1. regex substitutions: strip list numbering, "(n/m)" groups, ASCII
         letters, digits and empty bracket pairs; map Latin ``, ; ?`` to their
         Arabic counterparts; normalise ``% / * – _`` and the ellipsis;
      2. delete U+200F / U+200D and any other ``Cf`` / ``No`` characters;
      3. delete emojis;
      4. collapse repeated punctuation;
      5. delete unbalanced brackets;
      6. squeeze whitespace runs to a single space and trim both ends.

    Args:
        line: raw input line.

    Returns:
        str: the cleaned line.
    """
    # (pattern, replacement) pairs; the order matters and mirrors the pipeline.
    substitutions = (
        (numering_items, ''),
        (numeric_pattern, ''),
        (english, ' '),
        (numbers, ''),
        (empty_brackets, ''),
        (',', '،'),
        (';', '؛'),
        (r'\?', '؟'),
        (r'%', ' '),
        (r'/', ''),
        (r'\*', ''),
        (r'–', '-'),
        (r'_', ' - '),
        (r'\u2026', '.'),
    )

    res = line
    for pattern, replacement in substitutions:
        res = re.sub(pattern, replacement, res)

    for invisible in ('\u200f', '\u200d'):
        res = res.replace(invisible, '')

    for cleaner in (
        remove_formatting_codes,
        remove_emojis,
        clean_punctuation_sequence,
        remove_unbalanced_brackets,
    ):
        res = cleaner(res)

    return re.sub(r"\s+", " ", res).strip()


def split_citations_raw(line):
    """Cut ``line`` into segments that each start at a citation trigger.

    A trigger is either one of the listed "say" verbs followed by a colon, or a
    form of "قول" (optionally prefixed by و / ف and followed by ه and/or
    "تعالى"). The line is cut right *before* every trigger, so concatenating the
    returned segments gives back the original line.

    NOTE(review): the second trigger has no word-boundary guard, so it can also
    fire in the middle of a longer word that contains "قول" — confirm this is
    intended.

    Args:
        line: cleaned input line.

    Returns:
        list[str]: the segments. When no trigger is found the list holds the
        stripped line only; otherwise the segments are returned unstripped.
    """
    speech_verbs = [
        "قال", "قالت", "قالوا", "قلت", "قلنا",
        "أقول", "يقول", "يقولون", "قيل", "يقال"
    ]
    verb_alternation = "|".join(speech_verbs)
    verb_then_colon = rf"(?:{verb_alternation})\s*[:：]"
    saying_noun = r"(?:و|ف)?قول(?:ه)?(?:\s*تعالى)?"
    trigger = rf"({verb_then_colon}|{saying_noun})"

    cut_points = [hit.start() for hit in re.finditer(trigger, line)]
    if not cut_points:
        return [line.strip()]

    pieces = []
    previous = 0
    for cut in cut_points:
        # Skip the empty slice produced when a trigger sits at the very start.
        if cut > previous:
            pieces.append(line[previous:cut])
        previous = cut
    pieces.append(line[previous:])

    return pieces

def slide_window_raw(text, overlap=50, max_len=807):
    """Split ``text`` into overlapping windows of at most ``max_len`` characters.

    Each new window ideally starts ``max_len - overlap`` characters after the
    previous one; the start is then moved back to just after the nearest
    preceding space so that a window does not begin in the middle of a word.
    If the previous stride contains no space, the ideal start is used.

    The returned ``overlaps`` are such that
    ``"".join(chunk[ov:] for chunk, ov in zip(chunks, overlaps)) == text``.

    Args:
        text: the string to split.
        overlap: desired minimum number of characters shared by two consecutive
            windows. Must satisfy ``0 <= overlap < max_len`` when ``text`` is
            longer than ``max_len``.
        max_len: maximum window length in characters.

    Returns:
        tuple: ``(chunks, overlaps)`` — the windows, and for each window the
        number of leading characters it shares with the previous one (0 for the
        first window).

    Raises:
        ValueError: if ``text`` needs splitting and ``overlap`` is negative or
            not smaller than ``max_len``. Such values previously caused an
            endless loop or silently lost text.
    """
    text_len = len(text)
    if text_len <= max_len:
        return [text], [0]

    if not 0 <= overlap < max_len:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_len (got overlap={overlap}, max_len={max_len})"
        )

    chunks = [text[:max_len]]
    overlaps = [0]
    stride = max_len - overlap
    current_start = 0

    # Keep going until the current window already reaches the end of the text.
    while current_start + max_len < text_len:
        ideal_next_start = current_start + stride
        window_end = current_start + max_len

        # Never look for a space at or beyond the end of the current window:
        # starting the next window after such a space would skip it entirely
        # (this only matters when overlap == 0).
        search_from = min(ideal_next_start, window_end - 1)

        next_start = ideal_next_start
        for i in range(search_from, current_start, -1):
            if text[i] == ' ':
                next_start = i + 1
                break

        # next_start <= window_end, so the overlap is never negative.
        overlaps.append(window_end - next_start)
        chunks.append(text[next_start : next_start + max_len])

        current_start = next_start

    return chunks, overlaps


def prepare_for_predict():
    """Read the input file and turn it into model-ready text chunks.

    Every line of ``test_text_path`` is cleaned with ``initial_process``, split
    into citation segments and then into overlapping windows.

    Returns:
        tuple: ``(chunks, overlaps, recovery, texts, tashkeel)`` where
          * ``chunks`` is the flat list of text windows for all lines;
          * ``overlaps`` holds, per window, the number of leading characters it
            shares with the previous window of the same segment;
          * ``recovery`` has one list per input line containing, for each of the
            line's windows, its position inside its segment (a 0 therefore marks
            the start of a new segment);
          * ``texts`` is the cleaned text of every line;
          * ``tashkeel`` is only filled when the internal test flag is off
            (it is hard-coded on, so the list stays empty).
    """
    is_test_mode = True  # input has no diacritics, so nothing is stripped

    chunk_texts = []
    chunk_overlaps = []
    per_line_recovery = []
    clean_lines = []
    gold_tashkeel = []

    with open(f'{test_text_path}', "r", encoding="utf-8") as file:
        for raw_line in file:
            cleaned = initial_process(raw_line.strip())

            if is_test_mode:
                clean_lines.append(cleaned)
                plain = cleaned
            else:
                plain, tashkeel = split_text_and_diacritics(cleaned)
                clean_lines.append(plain)
                gold_tashkeel.append(tashkeel)

            line_recovery = []
            for seg in split_citations_raw(plain):
                t_chunks, t_overlaps = slide_window_raw(seg, overlap=50, max_len=807)
                assert len(t_chunks) == len(t_overlaps), print(len(t_chunks), len(t_overlaps))

                # Remember each window's index inside its segment.
                line_recovery.extend(range(len(t_chunks)))
                chunk_texts.extend(t_chunks)
                chunk_overlaps.extend(t_overlaps)

            per_line_recovery.append(line_recovery)

    print(f"Generated {len(chunk_texts)} chunks.")
    return chunk_texts, chunk_overlaps, per_line_recovery, clean_lines, gold_tashkeel

In [None]:
# Clean and chunk the input file; `recovery` records how chunks map back to lines.
chunks, overlaps, recovery, assertions_text, assertions_tashkeel = prepare_for_predict()

Generated 3191 chunks.


# Post Processing

In [None]:
def reconstruct_text_window(chunks, overlaps):
    """Stitch overlapping text windows back into one string.

    Args:
        chunks: text windows of one segment, in order.
        overlaps: for each window, how many of its leading characters repeat
            the end of the previous window.

    Returns:
        str: the concatenation of every window with its overlapping prefix
        removed, or ``""`` when ``chunks`` is empty.
    """
    if not chunks:
        return ""

    return "".join(window[shared:] for window, shared in zip(chunks, overlaps))


def arabic_only_text_and_tashkeel(text, tashkeel):
    """Keep only Arabic letters and spaces, together with their labels.

    Args:
        text: string whose characters align one-to-one with ``tashkeel``.
        tashkeel: per-character labels, indexable by position.

    Returns:
        tuple: ``(filtered_text, filtered_labels)`` restricted to the positions
        whose character is in the Arabic letter set or is a space.
    """
    allowed = get_arabic_characters()
    kept_positions = [
        pos for pos, symbol in enumerate(text) if symbol in allowed or symbol == " "
    ]
    return (
        "".join(text[pos] for pos in kept_positions),
        [tashkeel[pos] for pos in kept_positions],
    )

def arabic_only_text_and_tashkeel_no_spaces(text, tashkeel):
    """Keep only Arabic letters (spaces excluded), together with their labels.

    Args:
        text: string whose characters align one-to-one with ``tashkeel``.
        tashkeel: per-character labels, indexable by position.

    Returns:
        tuple: ``(filtered_text, filtered_labels)`` restricted to the positions
        whose character is in the Arabic letter set.
    """
    allowed = get_arabic_characters()
    kept_positions = [pos for pos, symbol in enumerate(text) if symbol in allowed]
    return (
        "".join(text[pos] for pos in kept_positions),
        [tashkeel[pos] for pos in kept_positions],
    )

def post_process(chunks, overlaps, recovery):
    """Rebuild one string per input line from the flat list of text windows.

    ``recovery[i]`` lists, for line ``i``, the index of each window inside its
    segment, so a 0 marks the first window of a new segment. Windows are
    consumed from ``chunks`` / ``overlaps`` in order; every segment is stitched
    with ``reconstruct_text_window`` and the segments of a line are concatenated.

    Args:
        chunks: flat list of text windows for all lines.
        overlaps: per-window overlap with the previous window.
        recovery: per-line lists of within-segment window indices.

    Returns:
        list[str]: the reconstructed text of every line.
    """
    rebuilt_lines = []
    seg_first = 0   # index of the first window of the segment being collected
    seg_last = 0    # index of the last window seen so far for that segment

    for line_markers in recovery:
        segment_open = False
        pieces = []

        for marker in line_markers:
            if marker != 0:
                # Continuation window: extend the current segment.
                seg_last += 1
                continue

            if segment_open:
                # A new segment starts, so flush the one collected so far.
                span = slice(seg_first, seg_last + 1)
                pieces.append(reconstruct_text_window(chunks[span], overlaps[span]))
                seg_last += 1
                seg_first = seg_last
            segment_open = True

        # Flush the last segment of the line and move on to the next window.
        span = slice(seg_first, seg_last + 1)
        pieces.append(reconstruct_text_window(chunks[span], overlaps[span]))
        seg_last += 1
        seg_first = seg_last

        rebuilt_lines.append("".join(pieces))

    return rebuilt_lines

In [None]:
def reconstruct_diacritics_window(chunks, overlaps):
    """Stitch overlapping label windows back into one array.

    Args:
        chunks: sequence of per-window label arrays, in order.
        overlaps: for each window, how many leading entries repeat the end of
            the previous window.

    Returns:
        numpy.ndarray: every window with its overlapping prefix removed,
        concatenated; an empty array when ``chunks`` is empty.
    """
    if not chunks:
        return np.array([])

    trimmed = [window[shared:] for window, shared in zip(chunks, overlaps)]
    return np.concatenate(trimmed)


def post_process_diacritics(chunks, overlaps, recovery):
    """Rebuild one label array per input line from the per-window predictions.

    Works like ``post_process`` but on label arrays: ``recovery[i]`` lists the
    within-segment index of each window of line ``i`` (0 = first window of a
    segment). Windows are consumed in order, each segment is stitched with
    ``reconstruct_diacritics_window`` and the segments of a line are
    concatenated.

    Args:
        chunks: flat list of per-window label arrays.
        overlaps: per-window overlap with the previous window.
        recovery: per-line lists of within-segment window indices.

    Returns:
        list[numpy.ndarray]: the reconstructed labels of every line.
    """
    merged_per_line = []
    seg_first = 0   # index of the first window of the segment being collected
    seg_last = 0    # index of the last window seen so far for that segment

    for line_markers in recovery:
        segment_open = False
        line_labels = np.array([], dtype=int)

        for marker in line_markers:
            if marker != 0:
                # Continuation window: extend the current segment.
                seg_last += 1
                continue

            if segment_open:
                # A new segment starts, so flush the one collected so far.
                span = slice(seg_first, seg_last + 1)
                stitched = reconstruct_diacritics_window(chunks[span], overlaps[span])
                line_labels = np.concatenate([line_labels, stitched])
                seg_last += 1
                seg_first = seg_last
            segment_open = True

        # Flush the last segment of the line and move on to the next window.
        span = slice(seg_first, seg_last + 1)
        stitched = reconstruct_diacritics_window(chunks[span], overlaps[span])
        line_labels = np.concatenate([line_labels, stitched])
        seg_last += 1
        seg_first = seg_last

        merged_per_line.append(line_labels)

    return merged_per_line

In [None]:
def get_finals(results, labels, tokens=True):
    """Interleave characters with their diacritics to build diacritized text.

    Args:
        results: list of plain strings (one per line).
        labels: per-line label sequences; flattened, they must provide one
            label for every character across all of ``results``.
        tokens: when true the labels are class ids and are converted to
            diacritic strings through ``idx2label``; when false they are used
            as-is.

    Returns:
        list[str]: one string per entry of ``results`` where every character is
        followed by its label.
    """
    flattened = [label for group in labels for label in group]
    if tokens:
        flattened = [idx2label[label] for label in flattened]

    rendered = []
    cursor = 0  # position in the flattened label list, shared across lines
    for letters in results:
        pieces = []
        for symbol in letters:
            pieces.append(symbol + flattened[cursor])
            cursor += 1
        rendered.append("".join(pieces))

    return rendered

# Extract Features

In [None]:

# Earlier AraBERT setup, kept commented out for reference.
# arabert_model_name = "aubmindlab/bert-base-arabertv02"
# bert_tokenizer = AutoTokenizer.from_pretrained(arabert_model_name)
# bert_model = AutoModel.from_pretrained(arabert_model_name)
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# bert_model.to(device)
# bert_model.eval()
# arabert_prep = ArabertPreprocessor(model_name=arabert_model_name)


# Hugging Face checkpoint used for the contextual token embeddings.
MODEL_NAME = "aubmindlab/araelectra-base-discriminator"

electra_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
electra_model = AutoModel.from_pretrained(MODEL_NAME)
electra_model.to(device)
# Inference only: switch the module to evaluation mode.
electra_model.eval()


# Character embedding array; indexed by character id in extract_custom_char_embeddings.
custom_char_embedding = np.load(char_embeddings_path)

# def get_arabert_embeddings(sentence: str):

#     tokens = bert_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
#     tokens = {k: v.to(device) for k, v in tokens.items()}

#     with torch.no_grad():
#         output = bert_model(**tokens)

#     emb = output.last_hidden_state.squeeze(0).cpu()
#     token_list = bert_tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])

#     return emb.numpy(), token_list


def get_araelectra_embeddings(sentence, device="cuda"):
    """
    Get token-level embeddings from AraELECTRA.

    Args:
        sentence: text to encode. It is truncated by the tokenizer to at most
            512 tokens.
        device: device to run the model on. A CUDA device is silently replaced
            by ``"cpu"`` when CUDA is not available, so the default also works
            on CPU-only runtimes.

    Returns:
        tuple: ``(token_embeddings, tokens)`` where ``token_embeddings`` is the
        model's last hidden state with the leading batch dimension squeezed out
        (a tensor living on ``device``) and ``tokens`` is the list of token
        strings produced by the tokenizer for the first sequence, including any
        special tokens it adds.
    """
    # The default used to be an unconditional "cuda", which fails without a GPU.
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        device = "cpu"

    electra_model.to(device)
    inputs = electra_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Convert the ids back to token strings so callers can regroup word pieces.
    tokens = electra_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = electra_model(**inputs)

    # [batch_size, seq_len, hidden_size] -> [seq_len, hidden_size] for a single sentence
    token_embeddings = outputs.last_hidden_state.squeeze(0)
    return token_embeddings, tokens

# Character vocabulary, loaded on first use and then reused for every lookup.
_CHAR2IDX_CACHE = None


def extract_custom_char_embeddings(char):
    """Return the custom embedding of a single character.

    The character vocabulary is loaded with ``get_char_map`` only once and
    cached; previously the JSON file was re-opened and re-parsed for every
    character of every sentence.

    Args:
        char: the character to look up.

    Returns:
        The entry of ``custom_char_embedding`` at the character's index.

    Raises:
        KeyError: if ``char`` is not part of the vocabulary.
    """
    global _CHAR2IDX_CACHE
    if _CHAR2IDX_CACHE is None:
        _CHAR2IDX_CACHE, _ = get_char_map()
    return custom_char_embedding[_CHAR2IDX_CACHE[char]]

def tokens_to_word_embeddings(tokens, embeddings):
    """Average word-piece embeddings into one vector per word.

    A token that starts with ``"##"`` continues the current word; any other
    token starts a new one. Each word's vector is the mean of its pieces.

    Args:
        tokens: token strings aligned with ``embeddings``.
        embeddings: iterable of per-token vectors (torch tensors or NumPy
            arrays; arrays are converted to tensors).

    Returns:
        torch.Tensor: the stacked per-word vectors, in order.
    """
    groups = []  # one list of piece vectors per word

    for piece, vector in zip(tokens, embeddings):
        if isinstance(vector, np.ndarray):
            vector = torch.tensor(vector)

        if piece.startswith("##") and groups:
            groups[-1].append(vector)
        else:
            # A regular token (or a leading "##" piece) opens a new word.
            groups.append([vector])

    word_vectors = [torch.mean(torch.stack(pieces), dim=0) for pieces in groups]
    return torch.stack(word_vectors)

tokenizer_config.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

In [None]:
# def zizo_features(sentence: str):

#     sentence_vec = []

#     arabert_emb, tokens = get_arabert_embeddings(sentence)
#     final_arabert_emb = tokens_to_word_embeddings(tokens, arabert_emb)

#     words_raw = sentence.split()
#     word_idx = 0
#     char_in_word_idx = 0

#     emb_dim = final_arabert_emb[0].shape[0]

#     for i, char in enumerate(sentence):

#         char_emb = extract_custom_char_embeddings(char)
#         char_emb_array = np.array(char_emb).flatten()

#         if char == ' ':
#             bert_vec = np.zeros(emb_dim)

#         else:
#             bert_vec = final_arabert_emb[word_idx]
#             if isinstance(bert_vec, torch.Tensor):
#                 bert_vec = bert_vec.numpy()

#             char_in_word_idx += 1

#             if char_in_word_idx == len(words_raw[word_idx]):
#                 word_idx += 1
#                 char_in_word_idx = 0

#         char_vector = np.concatenate([bert_vec, char_emb_array])
#         sentence_vec.append(char_vector)

#     return sentence_vec

def zizo_features_electra(sentence: str):
    """Build one feature vector per character of ``sentence``.

    Each vector is the concatenation of
      * the AraELECTRA embedding of the word the character is assigned to
        (all zeros for characters listed in ``punctuation``), and
      * the custom embedding of the character itself.

    The word embeddings are copied to host memory once per sentence instead of
    once per character; the produced values are unchanged.

    NOTE(review): the word index only advances once the number of
    non-punctuation characters seen equals ``len(words_raw[word_idx])``, which
    counts punctuation attached to the word, and the word-embedding list is
    built from every token returned by the tokenizer (including any special
    tokens). Character-to-word alignment therefore may not match the
    whitespace-split words — confirm against the feature extraction used at
    training time before changing it.

    Args:
        sentence: the text chunk to featurise.

    Returns:
        list[numpy.ndarray]: one concatenated vector per character.
    """
    sentence_vec = []

    araelectra_emb, tokens = get_araelectra_embeddings(sentence)
    final_araelectra_emb = tokens_to_word_embeddings(tokens, araelectra_emb)

    # Single device -> host transfer for the whole sentence.
    if isinstance(final_araelectra_emb, torch.Tensor):
        word_vectors = final_araelectra_emb.cpu().numpy()
    else:
        word_vectors = np.asarray(final_araelectra_emb)

    emb_dim = word_vectors.shape[1]
    zero_vec = np.zeros(emb_dim)

    words_raw = sentence.split()
    word_idx = 0
    char_in_word_idx = 0

    for char in sentence:

        char_emb_array = np.array(extract_custom_char_embeddings(char)).flatten()

        if char in punctuation:
            # Punctuation and spaces carry no word-level context.
            araelectra_vec = zero_vec

        else:
            araelectra_vec = word_vectors[word_idx]

            char_in_word_idx += 1

            # Move on to the next word once this one has been fully consumed.
            if char_in_word_idx == len(words_raw[word_idx]):
                word_idx += 1
                char_in_word_idx = 0

        sentence_vec.append(np.concatenate([araelectra_vec, char_emb_array]))

    return sentence_vec

# def extract_features(sentences):
#     all_sentence_features = []

#     for i in tqdm(range(len(sentences)), total=len(sentences), desc="extracting features"):
#         sent = sentences[i]
#         features_list = zizo_features("".join(sent))

#         all_sentence_features.append(np.array(features_list, dtype=np.float16))

#     return all_sentence_features

def extract_features_electra(sentences):
    """Compute the per-character feature matrix of every sentence.

    Args:
        sentences: list of text chunks (each a string or a sequence of string
            pieces, which are joined before featurisation).

    Returns:
        list[numpy.ndarray]: one ``float16`` array per sentence, built from the
        vectors returned by ``zizo_features_electra``.
    """
    per_sentence_features = []

    for sent in tqdm(sentences, total=len(sentences), desc="extracting features"):
        char_vectors = zizo_features_electra("".join(sent))
        per_sentence_features.append(np.array(char_vectors, dtype=np.float16))

    return per_sentence_features

# Predict

In [None]:
# Two Keras models, loaded for inference only (compile=False):
# `electra_all_model` labels every character, while `electra_lastchar_model`
# supplies the label used for the last character of each word (see predict).
electra_all_model = tf.keras.models.load_model(f'/kaggle/input/arabic-diacritizer-araelectra-all-v2/keras/1/1/araelectra_all_v2.keras', compile=False)
electra_lastchar_model = tf.keras.models.load_model(f'/kaggle/input/last_char_electra_test_v3/keras/1/1/last_char_electra_test_v3.keras', compile=False)

I0000 00:00:1765298542.991061      47 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13392 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1765298542.991757      47 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [None]:

# Configuration
# NOTE(review): INTAHA is not referenced anywhere in the visible notebook — confirm it is still needed.
INTAHA = r'\s+ا\s*هـ?\s+'
# Number of chunks per prediction batch.
BATCH_SIZE = 32
# Value used to pad feature rows up to the common sequence length.
PADDING_INPUT = -99999.0
# Width of one per-character feature vector.
INPUT_DIM = 1024

def predict(text_chunks, chunk_features=None):
    """Predict a diacritic class id for every character of every chunk.

    Both models are run on fixed-shape padded batches. The result of
    ``electra_all_model`` is used for every position, except that the character
    right before each space and the final character of a chunk that does not
    end in a space take the prediction of ``electra_lastchar_model``.

    Args:
        text_chunks: list of text chunks, aligned with ``chunk_features``.
        chunk_features: list of per-chunk feature arrays of shape
            ``(len(chunk), INPUT_DIM)``. When omitted, the notebook-level
            ``features`` variable is used, which is what this function always
            did before the parameter existed.

    Returns:
        list[numpy.ndarray]: one 1-D array of class ids per chunk, trimmed to
        the chunk's real (unpadded) length.

    Raises:
        ValueError: if ``text_chunks`` and the features differ in length.
    """
    if chunk_features is None:
        # Backward-compatible fallback to the implicit notebook global.
        chunk_features = features

    if len(chunk_features) != len(text_chunks):
        raise ValueError(
            f"Got {len(text_chunks)} text chunks but {len(chunk_features)} feature arrays"
        )

    if len(chunk_features) == 0:
        return []

    sentence_lengths = [len(f) for f in chunk_features]

    # Pad every batch to the same global length so the input shape stays
    # constant across batches and the models are not retraced.
    global_max_len = max(sentence_lengths)
    print(f"Padding all batches to fixed length: {global_max_len}")

    def test_set_generator():
        for feats, length, chunk in zip(chunk_features, sentence_lengths, text_chunks):
            yield feats, [length], chunk

    test_dataset = tf.data.Dataset.from_generator(
        test_set_generator,
        output_signature=(
            tf.TensorSpec(shape=(None, INPUT_DIM), dtype=tf.float32),
            tf.TensorSpec(shape=(1,), dtype=tf.int32),
            tf.TensorSpec(shape=(), dtype=tf.string)
        )
    ).padded_batch(
        BATCH_SIZE,
        # Fixed shapes: (global_max_len, INPUT_DIM) features, (1,) length, scalar text.
        padded_shapes=(
            [global_max_len, INPUT_DIM],
            [1],
            []
        ),
        padding_values=(PADDING_INPUT, 0, "")
    )

    all_predictions = []

    print("Starting prediction...")

    for batch_x, batch_lens, batch_text in test_dataset:

        batch_probs = electra_all_model.predict_on_batch(batch_x)
        batch_pred_ids = np.argmax(batch_probs, axis=-1)

        batch_probs_lc = electra_lastchar_model.predict_on_batch(batch_x)
        batch_pred_ids_lc = np.argmax(batch_probs_lc, axis=-1)

        current_batch_lengths = batch_lens.numpy().flatten()
        batch_size_current = batch_pred_ids.shape[0]

        for k in range(batch_size_current):

            valid_len = current_batch_lengths[k]

            current_text_str = batch_text[k].numpy().decode('utf-8')

            # Drop the predictions made on padding positions.
            pred_seq = batch_pred_ids[k][:valid_len]

            # Keep the text aligned with the trimmed predictions.
            current_text_str = current_text_str[:valid_len]

            # The position right before every space takes the last-char model's label.
            for i, char in enumerate(current_text_str):
                if char == ' ':
                    if i > 0:
                        pred_seq[i - 1] = batch_pred_ids_lc[k][i - 1]

            # So does the final position when the chunk does not end in a space.
            if len(current_text_str) > 0 and current_text_str[-1] != ' ':
                pred_seq[-1] = batch_pred_ids_lc[k][valid_len - 1]

            all_predictions.append(pred_seq)

    return all_predictions

In [None]:
# Characters that zizo_features_electra gives an all-zero word embedding.
# The function reads this notebook-level name, so it must exist before the call below.
punctuation = ['.', ':', '{', '}', '[', ']', '(', ')', '؛', '«', '»', '!', '،', '؟', '-', ' ']
# Per-chunk feature arrays, in the same order as `chunks`.
features = extract_features_electra(chunks)

extracting features: 100%|██████████| 3191/3191 [04:58<00:00, 10.71it/s]


In [None]:
# Run both models over every chunk; relies on the notebook-level `features` computed above.
all_predictions = predict(chunks)

Padding all batches to fixed length: 807
Starting prediction...


I0000 00:00:1765298854.091937     161 cuda_dnn.cc:529] Loaded cuDNN version 90300


In [None]:
# results = post_process(chunks, overlaps, recovery)

In [None]:
# Merge the per-chunk predictions back into one label array per input line.
pred_diac = post_process_diacritics(all_predictions, overlaps, recovery)

In [None]:
# predicted_text = get_finals(results, pred_diac)

In [None]:
# Re-read the raw input lines so they can be paired with the per-line predictions.
with open(f'{test_text_path}', "r", encoding="utf-8") as file:
    lines = file.readlines()

In [None]:
# start_index = 0
# current_lines = lines
# current_preds = predicted_text
# matches = 0
# total = 0

# for line_str, pred_str in zip(current_lines, current_preds):
#     og_text, og_tashkeel = split_text_and_diacritics(initial_process(line_str.strip()))
#     ll, og = arabic_only_text_and_tashkeel(og_text, og_tashkeel)

#     pred_text, pred_tashkeel = split_text_and_diacritics(pred_str.strip())
#     _, pred = arabic_only_text_and_tashkeel(pred_text, pred_tashkeel)

#     for i, (char, o, p) in enumerate(zip(ll, og, pred)):

#         if char == ' ':
#             continue

#         is_last_char = (i == len(ll) - 1) or (ll[i+1] == ' ')

#         if not is_last_char:
#             if o == p:
#                 matches += 1
#             total += 1

# print(f"Internal Diacritic Accuracy (No Last Char): {matches * 100 / total:.2f}%")

In [None]:
import pandas as pd

In [None]:
# NOTE(review): start_index, matches and total are never read in this cell; they
# look like leftovers from the commented-out accuracy check — confirm before removing.
start_index = 0
current_lines = lines
current_preds = pred_diac
matches = 0
total = 0
# Flat list of predicted class ids for Arabic letters only, across all lines.
all_labels = []

# Clean each raw line the same way prepare_for_predict does, then keep only the
# predictions that sit on Arabic letters.
# NOTE(review): assumes len(pred_label) == len(cleaned) for every line — TODO confirm.
for line_str, pred_label in zip(current_lines, current_preds):
    cleaned = initial_process(line_str.strip())
    new_sent, pred = arabic_only_text_and_tashkeel_no_spaces(cleaned, pred_label)
    all_labels.extend(pred)

# Sequential ids 0..len(all_labels)-1, one per kept letter.
ids = [i for i in range(len(all_labels))]
# print(f"Internal Diacritic Accuracy (No Last Char): {matches * 100 / total:.2f}%")

In [None]:
# Submission table: one row per Arabic letter with its predicted class id.
df = pd.DataFrame({
    'ID': ids,
    'label': all_labels
})

In [None]:
# Per-letter metadata of the test set (columns shown below: id, line_number, letter, case_ending).
test_df = pd.read_csv('/kaggle/input/normal/test_no_diacritics.csv')

In [None]:
# Write the predictions for all letters, without the DataFrame index column.
df.to_csv('submissions_all_chars_4.csv', index=False)

In [None]:
test_df

Unnamed: 0,id,line_number,letter,case_ending
0,0,0,ف,False
1,1,0,ي,True
2,2,0,ا,False
3,3,0,ل,False
4,4,0,م,False
...,...,...,...,...
237235,237235,2468,ب,False
237236,237236,2468,ن,True
237237,237237,2468,ش,False
237238,237238,2468,ي,False


In [None]:
# Keep only the letters flagged as case endings.
test_df_ce = test_df[test_df['case_ending'] == True]

In [None]:
test_df_ce

Unnamed: 0,id,line_number,letter,case_ending
1,1,0,ي,True
7,7,0,ل,True
12,12,0,م,True
18,18,0,د,True
20,20,0,ن,True
...,...,...,...,...
237224,237224,2468,ا,True
237228,237228,2468,ن,True
237231,237231,2468,ى,True
237236,237236,2468,ن,True


In [None]:
# Restrict the predictions to the case-ending letters.
# NOTE(review): assumes df['ID'] and test_df['id'] number the letters identically — TODO confirm.
ce_df = df[df['ID'].isin(test_df_ce['id'])]

In [None]:
ce_df[ce_df['ID'] == 208]

Unnamed: 0,ID,label
208,208,14


In [None]:
# Write the case-ending-only predictions, without the DataFrame index column.
ce_df.to_csv('submissions_last_char_2.csv', index=False)