In [1]:
import regex as re
import json
import os
from typing import Pattern

from tokenizer import BPETokenizerV3
from quraan_utils import get_quraan_uthmani, get_raw_quraan

# Reading and Processing Quraan

In [2]:
# Load the Uthmani Quraan, reusing the local JSON cache when it already exists.
filename = "artifacts/quraan.json"
if os.path.isfile(filename):
    # The file holds Arabic text, so decode it explicitly as UTF-8 instead of
    # relying on the platform default encoding (which is not UTF-8 everywhere).
    with open(filename, encoding="utf-8") as f:
        surahs = json.load(f)['surahs']
else:
    # No cache yet: get_quraan_uthmani receives the cache path.
    # NOTE(review): presumably it fetches the text and writes it to `filename` — confirm.
    surahs = get_quraan_uthmani(filename)['surahs']

## Understanding Quranic Encodings

Reference: the Unicode code chart for the [Arabic block](https://www.unicode.org/charts/nameslist/n_0600.html).
We first remove characters that are unnecessary for our purposes, such as:

- Honorifics
    - 0610	 ◌ؐ 	Arabic Sign Sallallahou Alayhe Wassallam
 	 	•	represents sallallahu alayhe wasallam "may God's peace and blessings be upon him"
    - 0611	 ◌ؑ 	Arabic Sign Alayhe Assallam
 	 	•	represents alayhe assalam "upon him be peace"
 	 	→	FD47 ﵇ arabic ligature alayhi as-salaam
    - 0612	 ◌ؒ 	Arabic Sign Rahmatullah Alayhe
 	 	•	represents rahmatullah alayhe "may God have mercy upon him"
 	 	→	FD40 ﵀ arabic ligature rahimahu allaah
    - 0613	 ◌ؓ 	Arabic Sign Radi Allahou Anhu
 	 	•	represents radi allahu 'anhu "may God be pleased with him"
 	 	→	FD41 ﵁ arabic ligature radi allaahu anh
    - 0614	 ◌ؔ 	Arabic Sign Takhallus
 	 	•	sign placed over the name or nom-de-plume of a poet, or in some writings used to mark all proper names
- Extended Arabic mark:
    - 0616	 ◌ؖ 	Arabic Small High Ligature Alef With Lam With Yeh
 	 	※	ARABIC SMALL HIGH LIGATURE ALEF WITH YEH BARREE
 	 	•	early Persian
- Quranic annotation signs:
    - 06E5	 ‎ۥ‎ 	Arabic Small Waw
 	 	→	08D3 ◌࣓ arabic small low waw
 	 	→	08F3 ◌ࣳ arabic small high waw
    - 0617	 ◌ؗ 	Arabic Small High Zain
    - 0618	 ◌ؘ 	Arabic Small Fatha
    	 	•	should not be confused with 064E ◌َ FATHA
    - 0619	 ◌ؙ 	Arabic Small Damma
 	 	•	should not be confused with 064F ◌ُ DAMMA
    - 061A	 ◌ؚ 	Arabic Small Kasra
 	 	•	should not be confused with 0650 ◌ِ KASRA
    - 06DC	 ◌ۜ 	Arabic Small High Seen
    -  06DE	 ۞ 	Arabic Start Of Rub El Hizb
 	 	•	indicates boundaries of parts of sections
 	 	•	typically depicted as an eight-sided symbol, which may or may not appear starlike
    - 06DF	 ◌۟ 	Arabic Small High Rounded Zero
 	 	•	smaller than the typical circular shape used for 0652 ◌ْ
    - 06E0	 ◌۠ 	Arabic Small High Upright Rectangular Zero
 	 	•	the term "rectangular zero" is a translation of the Arabic name of this sign
    - 06E1	 ◌ۡ 	Arabic Small High Dotless Head Of Khah
 	 	=	Arabic jazm
 	 	•	presentation form of 0652 ◌ْ, using font technology to select the variant is preferred
 	 	•	used in some Qurans to mark absence of a vowel
 	 	→	0652 ◌ْ arabic sukun
    - 06E2	 ◌ۢ 	Arabic Small High Meem Isolated Form
    - 06E3	 ◌ۣ 	Arabic Small Low Seen
    - 06E4	 ◌ۤ 	Arabic Small High Madda
 	 	•	typically used with 06E5 ‎ۥ‎, 06E6 ‎ۦ‎, 06E7 ◌ۧ, and 08F3 ◌ࣳ
    - 06E5	 ‎ۥ‎ 	Arabic Small Waw
 	 	→	08D3 ◌࣓ arabic small low waw
 	 	→	08F3 ◌ࣳ arabic small high waw
    - 06E6	 ‎ۦ‎ 	Arabic Small Yeh
    - 06E7	 ◌ۧ 	Arabic Small High Yeh
    - 06E8	 ◌ۨ 	Arabic Small High Noon
    - 06E9	 ۩ 	Arabic Place Of Sajdah
 	 	•	there is a range of acceptable glyphs for this character
    - 06EA	 ◌۪ 	Arabic Empty Centre Low Stop
    - 06EB	 ◌۫ 	Arabic Empty Centre High Stop
    - 06EC	 ◌۬ 	Arabic Rounded High Stop With Filled Centre
 	 	•	also used in Quranic text in African and other orthographies to represent wasla, ikhtilas, etc.
    - 06ED	 ◌ۭ 	Arabic Small Low Meem

Then we split the text following the GPT-4 pattern, extended to include:

- Arabic words with all recitations.
- 06D6	 ◌ۖ 	Arabic Small High Ligature Sad With Lam With Alef Maksura
- 06D7	 ◌ۗ 	Arabic Small High Ligature Qaf With Lam With Alef Maksura
- 06D8	 ◌ۘ 	Arabic Small High Meem Initial Form
- 06D9	 ◌ۙ 	Arabic Small High Lam Alef
- 06DA	 ◌ۚ 	Arabic Small High Jeem
- 06DB	 ◌ۛ 	Arabic Small High Three Dots

In [3]:
def compress_regex(verbose_pattern: str) -> str:
    """Strip the layout from a VERBOSE/expanded regex pattern.

    Removes unescaped whitespace and ``#`` comments that appear outside of
    character classes, so the result can be compiled without ``re.VERBOSE``.

    Rules applied while scanning left to right:

    * A backslash and the character following it are always kept.
    * Everything between ``[`` and its closing ``]`` is kept verbatim,
      including whitespace and ``#``.
    * A ``]`` that is the first member of a class (``[]...]`` or ``[^]...]``)
      is a literal member and does not close the class.
    * Outside a class, ``#`` starts a comment that runs to the end of the line.
    * Outside a class, space, tab, newline, carriage return, form feed and
      vertical tab are dropped.

    Args:
        verbose_pattern: Pattern text written in verbose style.

    Returns:
        The same pattern with comments and insignificant whitespace removed.
    """
    out = []
    in_char_class = False
    escaped = False
    # Index at which a ']' would still be a literal class member
    # (directly after '[' or after '[^'); -1 means "no such position".
    literal_bracket_pos = -1
    i = 0
    n = len(verbose_pattern)

    while i < n:
        c = verbose_pattern[i]

        if escaped:
            # The character after a backslash is always emitted as-is.
            out.append(c)
            escaped = False

        elif c == "\\":
            # Start of an escape sequence (valid inside and outside a class).
            out.append(c)
            escaped = True

        elif in_char_class:
            # Inside a class every character is significant; only a ']' that
            # is not in the leading literal position closes the class.
            out.append(c)
            if c == "]" and i != literal_bracket_pos:
                in_char_class = False

        elif c == "[":
            out.append(c)
            in_char_class = True
            literal_bracket_pos = i + 1
            if literal_bracket_pos < n and verbose_pattern[literal_bracket_pos] == "^":
                # Negated class: the literal-']' slot comes after the '^'.
                literal_bracket_pos += 1

        elif c == "#":
            # Comment: advance to the end of the line. The shared `i += 1`
            # below then steps over the newline itself.
            while i < n and verbose_pattern[i] != "\n":
                i += 1

        elif c in " \t\n\r\f\v":
            # Unescaped whitespace outside a class is layout only.
            pass

        else:
            # Any other character is significant.
            out.append(c)

        i += 1

    return "".join(out)


def compile_from_verbose(verbose_pattern: str) -> Pattern:
    """Compile a verbose-style pattern after stripping its layout.

    The pattern is first reduced with ``compress_regex`` (comments and
    insignificant whitespace removed) and then compiled with no flags, so
    ``re.VERBOSE`` is not needed.
    """
    return re.compile(compress_regex(verbose_pattern))

In [5]:
# Demo input: the first ayah of the surah stored at index 3 of `surahs`.
ayah = surahs[3]['ayahs'][0]['text']

# 1) strip marks from the demo ayah before splitting. The character class removes:
#      - U+08D3–U+08E1 and U+08F0 (Arabic Extended-A)
#      - U+0610–U+061A
#      - U+06DC–U+06ED
#    Only this demo `text` is cleaned here; `ayah` itself is left untouched.
text = re.sub(r'[\u08D3-\u08E1\u08F0\u0610-\u061A\u06DC-\u06ED]', '', ayah)

# 2) define the split pattern in verbose form (whitespace and `#` comments inside
#    the string are layout only; they are stripped by compress_regex below)
quraan_pattern = r"""
    # — clitics like ’s, ’d, etc.
    '(?i:(?:s|d|m|t)|ll|ve|re)

  | # — an entire “word”  (letters + ANY number of diacritics)
    [^\r\n\p{L}\p{N}]?                      # optional leading non-letter (e.g. space or “”)
    (?:
      \p{L}                                #   a base letter
      (?:[\u06D6-\u06DB\u06DD\u064B-\u065F\u06D6-\u06ED\u0670])*
                                           #   zero-or-more of:
                                           #     • Qur’ānic signs (\u06D6-\u06DB\u06DD)
                                           #     • all standard Arabic diacritics U+064B–065F
                                           #     • superscript‐Alef U+0670
    )+                                     # one-or-more of (letter+diacritics)

  | # — small numbers
    \p{N}{1,3}

  | # — punctuation (one-or-more) + possible newlines
    [^\s\p{L}\p{N}]+[\r\n]*

  | # — whitespace at very end of string
    \s+$

  | # — standalone newlines
    \s*[\r\n]

  | # — whitespace before non-space
    \s+(?!\S)

  | # — single spaces
    \s
"""

# From here on `quraan_pattern` holds the compressed (non-verbose) pattern string;
# this is the value later passed as `pattern=` when the tokenizer is constructed.
quraan_pattern = compress_regex(quraan_pattern)

# 3) compile the compressed pattern and split the cleaned demo text into chunks
tokens = re.findall(re.compile(quraan_pattern), text)

# Display the original ayah next to the resulting chunks.
ayah, tokens

('بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ يَٰٓأَيُّهَا ٱلنَّاسُ ٱتَّقُوا۟ رَبَّكُمُ ٱلَّذِى خَلَقَكُم مِّن نَّفْسٍۢ وَٰحِدَةٍۢ وَخَلَقَ مِنْهَا زَوْجَهَا وَبَثَّ مِنْهُمَا رِجَالًۭا كَثِيرًۭا وَنِسَآءًۭ ۚ وَٱتَّقُوا۟ ٱللَّهَ ٱلَّذِى تَسَآءَلُونَ بِهِۦ وَٱلْأَرْحَامَ ۚ إِنَّ ٱللَّهَ كَانَ عَلَيْكُمْ رَقِيبًۭا',
 ['بِسْمِ',
  ' ٱللَّهِ',
  ' ٱلرَّحْمَٰنِ',
  ' ٱلرَّحِيمِ',
  ' يَٰٓأَيُّهَا',
  ' ٱلنَّاسُ',
  ' ٱتَّقُوا',
  ' رَبَّكُمُ',
  ' ٱلَّذِى',
  ' خَلَقَكُم',
  ' مِّن',
  ' نَّفْسٍ',
  ' وَٰحِدَةٍ',
  ' وَخَلَقَ',
  ' مِنْهَا',
  ' زَوْجَهَا',
  ' وَبَثَّ',
  ' مِنْهُمَا',
  ' رِجَالًا',
  ' كَثِيرًا',
  ' وَنِسَآءً',
  ' ',
  'ۚ',
  ' وَٱتَّقُوا',
  ' ٱللَّهَ',
  ' ٱلَّذِى',
  ' تَسَآءَلُونَ',
  ' بِهِ',
  ' وَٱلْأَرْحَامَ',
  ' ',
  'ۚ',
  ' إِنَّ',
  ' ٱللَّهَ',
  ' كَانَ',
  ' عَلَيْكُمْ',
  ' رَقِيبًا'])

## Prepare Training Corpus

In [6]:
# Corpus for tokenizer training: a single space is passed as the ayat separator and
# an empty string as the surah separator, so no dedicated split markers need to be
# encoded in the vocabulary.
raw_quraan = get_raw_quraan(surahs, ayat_split_token=" ", surah_split_token="")

In [7]:
# Sanity check: total corpus length and a preview of its first 1000 characters.
len(raw_quraan), raw_quraan[:1000]

(709683,
 'سُورَةُ ٱلْفَاتِحَةِ: بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ مَٰلِكِ يَوْمِ ٱلدِّينِ إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ ٱهْدِنَا ٱلصِّرَٰطَ ٱلْمُسْتَقِيمَ صِرَٰطَ ٱلَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ ٱلْمَغْضُوبِ عَلَيْهِمْ وَلَا ٱلضَّآلِّينَ سُورَةُ البَقَرَةِ: بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ الٓمٓ ذَٰلِكَ ٱلْكِتَٰبُ لَا رَيْبَ ۛ فِيهِ ۛ هُدًى لِّلْمُتَّقِينَ ٱلَّذِينَ يُؤْمِنُونَ بِٱلْغَيْبِ وَيُقِيمُونَ ٱلصَّلَوٰةَ وَمِمَّا رَزَقْنَٰهُمْ يُنفِقُونَ وَٱلَّذِينَ يُؤْمِنُونَ بِمَآ أُنزِلَ إِلَيْكَ وَمَآ أُنزِلَ مِن قَبْلِكَ وَبِٱلْءَاخِرَةِ هُمْ يُوقِنُونَ أُولَٰٓئِكَ عَلَىٰ هُدًى مِّن رَّبِّهِمْ ۖ وَأُولَٰٓئِكَ هُمُ ٱلْمُفْلِحُونَ إِنَّ ٱلَّذِينَ كَفَرُوا سَوَآءٌ عَلَيْهِمْ ءَأَنذَرْتَهُمْ أَمْ لَمْ تُنذِرْهُمْ لَا يُؤْمِنُونَ خَتَمَ ٱللَّهُ عَلَىٰ قُلُوبِهِمْ وَعَلَىٰ سَمْعِهِمْ ۖ وَعَلَىٰٓ أَبْصَٰرِهِمْ غِشَٰوَةٌ ۖ وَلَهُمْ عَذَابٌ عَظِيمٌ وَمِنَ ٱلنَّاسِ مَن يَقُولُ ءَامَنَّا بِٱللَّهِ وَبِٱ

## Train Tokenizer

In [8]:
# Tokenizer settings.
GPT4_N_TOKENS = 100256              # vocabulary-size target handed to BPETokenizerV3
END_SURAH_TOKEN = "<|endoftext|>"   # registered as a special token when fitting
tokenizer_file_name = "artifacts/quraan_tokenizer.bpe"

# Train once and persist; later runs reuse the saved file.
# NOTE: the saved file is reused whenever it exists, even if `quraan_pattern` or
# `raw_quraan` changed since it was written — delete it to force retraining.
if not os.path.isfile(tokenizer_file_name):
    quraan_tokenizer = BPETokenizerV3(pattern=quraan_pattern, vocab_size=GPT4_N_TOKENS)
    quraan_tokenizer.fit(raw_quraan, special_tokens=[END_SURAH_TOKEN])
    quraan_tokenizer.save(tokenizer_file_name)
else:
    quraan_tokenizer = BPETokenizerV3.load(tokenizer_file_name)

In [9]:
# Size of the tokenizer's vocabulary. The recorded output (32826) is well below the
# GPT4_N_TOKENS target.
len(quraan_tokenizer)  # tokenizer converged earlier than GPT4_N_TOKENS

32826

In [10]:
# Encode a sample passage to inspect the resulting token ids.
# NOTE(review): allowed_special=True is forwarded to BPETokenizerV3.encode — presumably
# it permits special tokens in the input text; confirm against the tokenizer.
quraan_tokenizer.encode("بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ يَٰٓأَيُّهَا ٱلنَّاسُ ٱتَّقُوا۟ رَبَّكُمُ ٱلَّذِى خَلَقَكُم مِّن نَّفْسٍۢ وَٰحِدَةٍۢ وَخَلَقَ مِنْهَا زَوْجَهَا وَبَثَّ مِنْهُمَا رِجَالًۭا كَثِيرًۭا وَنِسَآءًۭ ۚ وَٱتَّقُوا۟ ٱللَّهَ ٱلَّذِى تَسَآءَلُونَ بِهِۦ وَٱلْأَرْحَامَ ۚ إِنَّ ٱللَّهَ", allowed_special=True)

[216,
 168,
 8751,
 388,
 841,
 892,
 816,
 1586,
 3198,
 219,
 159,
 6057,
 660,
 4638,
 384,
 5390,
 219,
 162,
 5666,
 219,
 162,
 6438,
 1316,
 9038,
 8615,
 7100,
 2243,
 366,
 300,
 219,
 173,
 284,
 3728,
 300,
 219,
 173,
 284,
 11630,
 219,
 173,
 32,
 219,
 154,
 1692,
 219,
 159,
 449,
 660,
 17553,
 581,
 219,
 166,
 17556,
 32,
 219,
 154,
 440,
 449]

# Dataloader

When encoding for training, the LLM needs to be able to distinguish between ayat and surahs, so we insert special tokens that mark the boundaries between them:

- Ayat split token: ۝
- Surah split token: <|endoftext|>

In [11]:
# Rebuild the corpus with get_raw_quraan's default split tokens; the printed preview
# shows "۝" after each ayah and "<|endoftext|>" between surahs.
# NOTE(review): this rebinds `raw_quraan`, which previously held the corpus built with
# ayat_split_token=" " / surah_split_token="" that the tokenizer-training cell reads.
# Re-running that training cell after this one would fit on the tagged text instead.
raw_quraan = get_raw_quraan(surahs)
print(raw_quraan[:1200])

سُورَةُ ٱلْفَاتِحَةِ: بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ ۝ ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ ۝ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ ۝ مَٰلِكِ يَوْمِ ٱلدِّينِ ۝ إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ ۝ ٱهْدِنَا ٱلصِّرَٰطَ ٱلْمُسْتَقِيمَ ۝ صِرَٰطَ ٱلَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ ٱلْمَغْضُوبِ عَلَيْهِمْ وَلَا ٱلضَّآلِّينَ ۝  <|endoftext|> سُورَةُ البَقَرَةِ: بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ الٓمٓ ۝ ذَٰلِكَ ٱلْكِتَٰبُ لَا رَيْبَ ۛ فِيهِ ۛ هُدًى لِّلْمُتَّقِينَ ۝ ٱلَّذِينَ يُؤْمِنُونَ بِٱلْغَيْبِ وَيُقِيمُونَ ٱلصَّلَوٰةَ وَمِمَّا رَزَقْنَٰهُمْ يُنفِقُونَ ۝ وَٱلَّذِينَ يُؤْمِنُونَ بِمَآ أُنزِلَ إِلَيْكَ وَمَآ أُنزِلَ مِن قَبْلِكَ وَبِٱلْءَاخِرَةِ هُمْ يُوقِنُونَ ۝ أُولَٰٓئِكَ عَلَىٰ هُدًى مِّن رَّبِّهِمْ ۖ وَأُولَٰٓئِكَ هُمُ ٱلْمُفْلِحُونَ ۝ إِنَّ ٱلَّذِينَ كَفَرُوا سَوَآءٌ عَلَيْهِمْ ءَأَنذَرْتَهُمْ أَمْ لَمْ تُنذِرْهُمْ لَا يُؤْمِنُونَ ۝ خَتَمَ ٱللَّهُ عَلَىٰ قُلُوبِهِمْ وَعَلَىٰ سَمْعِهِمْ ۖ وَعَلَىٰٓ أَبْصَٰرِهِمْ غِشَٰوَةٌ ۖ وَلَهُمْ عَذَابٌ عَظِيمٌ ۝ وَمِنَ ٱلنَّاسِ مَن ي

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader


class QuraanDataset(Dataset):
    """Sliding-window next-token dataset built from the Quraan text.

    The text is encoded once with ``tokenizer.encode(quraan, allowed_special=True)``.
    Windows of ``max_length`` token ids are then taken every ``stride`` positions;
    each sample pairs a window with the same window shifted one token to the right.

    Args:
        quraan: Text handed to ``tokenizer.encode``.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sliceable sequence of token ids.
        max_length: Number of token ids per input/target window.
        stride: Distance between the starts of consecutive windows.
    """

    def __init__(self, quraan, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(quraan, allowed_special=True)

        # Window start offsets. The upper bound leaves at least one token after
        # every window, so each target window is full length.
        starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length]) for start in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1]) for start in starts
        ]

    def __len__(self):
        # Inputs and targets are built in lockstep, so either list gives the count.
        return len(self.input_ids)

    def __getitem__(self, item):
        return (self.input_ids[item], self.target_ids[item])

In [13]:
def create_dataloader(data, tokenizer, max_length=8, stride=10, **kwargs):
    """Wrap the text in a ``QuraanDataset`` and return a ``DataLoader`` over it.

    Args:
        data: Text passed to ``QuraanDataset``.
        tokenizer: Tokenizer passed to ``QuraanDataset``.
        max_length: Window length in tokens.
        stride: Offset between consecutive window starts.
        **kwargs: Forwarded unchanged to ``torch.utils.data.DataLoader``
            (e.g. ``batch_size``, ``shuffle``, ``drop_last``, ``num_workers``).

    Note:
        With the defaults the stride (10) is larger than the window (8), so
        consecutive windows do not touch and some tokens fall between them.
    """
    dataset = QuraanDataset(data, tokenizer, max_length=max_length, stride=stride)
    return DataLoader(dataset, **kwargs)

In [14]:
# Small demo loader: 8-token windows shifted by one token, four samples per batch,
# kept in corpus order.
train_dl = create_dataloader(
    raw_quraan,
    quraan_tokenizer,
    max_length=8,
    stride=1,
    batch_size=4,
    shuffle=False,
    drop_last=True,
    num_workers=0,
)

# Decode and print only the first batch so the output stays short.
for i, (x, y) in enumerate(train_dl):
    for i_batch, (x_item, y_item) in enumerate(zip(x, y)):
        print(
            f"\t{i_batch}: ",
            quraan_tokenizer.decode(x_item.tolist()),
            "====>",
            quraan_tokenizer.decode(y_item.tolist()),
        )
    break

	0:  سُورَةُ ٱلْفَاتِحَةِ: بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ  ====>  ٱلْفَاتِحَةِ: بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ �
	1:   ٱلْفَاتِحَةِ: بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ � ====> : بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ ۝
	2:  : بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ ۝ ====>  بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ ۝ ٱلْحَمْدُ
	3:   بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ ۝ ٱلْحَمْدُ ====>  ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ ۝ ٱلْحَمْدُ لِلَّهِ
