<a href="https://colab.research.google.com/github/taaha3244/gpt2-scratch/blob/main/LLM_from_scratch_DataLoading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.2 MB[0m [31m9.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [None]:
from importlib.metadata import version
import re
import tiktoken
import torch
import matplotlib

# Report the installed version of each core dependency, one line per package.
for label, package_name in (("torch version:", "torch"), ("tiktoken version:", "tiktoken")):
    print(label, version(package_name))

torch version: 2.5.1+cu124
tiktoken version: 0.8.0


In [None]:
def load_and_clean_text(file_path: str, preview_chars: int = 100) -> str:
    """Read a UTF-8 text file and collapse every whitespace run to one space.

    Prints the cleaned text's length, its number of distinct characters and
    a short preview.

    Args:
        file_path: Path of the text file to read.
        preview_chars: Number of leading characters of the cleaned text to
            print as a preview.

    Returns:
        The file contents with each run of whitespace (spaces, tabs, line
        breaks) replaced by a single space.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    print("Successfully loaded file with UTF-8 encoding")

    # Line breaks count as whitespace, so the result is one long line.
    text = re.sub(r'\s+', ' ', text)

    print(f"Total characters: {len(text)}")
    print(f"Unique characters: {len(set(text))}")
    print(f"\nFirst {preview_chars} characters of cleaned text:")
    # Slice with the same count the heading announces (previously one extra
    # character was printed).
    print(text[:preview_chars])

    return text

# Load and normalise the corpus once; the tokenizer and data-loader cells reuse it.
cleaned_text = load_and_clean_text(file_path='/content/iqbal.txt')

Successfully loaded file with UTF-8 encoding
Total characters: 349701
Unique characters: 74

First 100 characters of cleaned text:
۱ گُلزارِ ہست و بود نہ بیگانہ وار دیکھ نہ آتے، ہمیں اس میں تکرار کیا تھی عجب و اعظ کی دِین داری ہے یا


In [None]:
class UrduTokenizer:
    """Character-level tokenizer with four reserved special tokens.

    Ids 0-3 are reserved for ``<PAD>``, ``<UNK>``, ``<BOS>`` and ``<EOS>``.
    ``fit`` then assigns the following ids to the distinct characters of a
    text, in sorted order.
    """

    def __init__(self):
        # Special tokens with proper symbols
        self.PAD_TOKEN = "<PAD>"
        self.UNK_TOKEN = "<UNK>"
        self.BOS_TOKEN = "<BOS>"
        self.EOS_TOKEN = "<EOS>"

        # Special token indices
        self.PAD_IDX = 0
        self.UNK_IDX = 1
        self.BOS_IDX = 2
        self.EOS_IDX = 3

        # Forward mapping (token string -> id), seeded with the special tokens.
        self.char_to_idx = {
            self.PAD_TOKEN: self.PAD_IDX,
            self.UNK_TOKEN: self.UNK_IDX,
            self.BOS_TOKEN: self.BOS_IDX,
            self.EOS_TOKEN: self.EOS_IDX
        }
        # Reverse mapping (id -> token string), derived from the forward one
        # so the two cannot drift apart.
        self.idx_to_char = {idx: token for token, idx in self.char_to_idx.items()}
        self.vocab_size = len(self.char_to_idx)

    def fit(self, text):
        """Extend the vocabulary with every distinct character of ``text``.

        Characters already in the vocabulary keep their ids, so calling
        ``fit`` again only appends characters that are new.

        Args:
            text: String whose characters should become tokens.

        Returns:
            This tokenizer, to allow ``UrduTokenizer().fit(text)``.
        """
        # Sorting makes the id assignment deterministic for a given text.
        unique_chars = sorted(set(text))

        # Add to vocabulary (after special tokens)
        for char in unique_chars:
            if char not in self.char_to_idx:
                # Compute the id once so both mappings always agree.
                new_idx = len(self.char_to_idx)
                self.char_to_idx[char] = new_idx
                self.idx_to_char[new_idx] = char

        self.vocab_size = len(self.char_to_idx)

        print("Vocabulary statistics:")
        print(f"Total vocab size: {self.vocab_size}")
        print(f"Character vocab size: {len(unique_chars)}")
        print(f"Special tokens: {[self.PAD_TOKEN, self.UNK_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN]}")
        return self

    def encode(self, text, add_special_tokens=False):
        """Convert ``text`` to a list of token ids, one per character.

        Args:
            text: String to encode.
            add_special_tokens: When true, wrap the ids in BOS ... EOS.

        Returns:
            List of int ids; characters missing from the vocabulary map to
            ``UNK_IDX``.
        """
        tokens = [self.char_to_idx.get(char, self.UNK_IDX) for char in text]
        if add_special_tokens:
            tokens = [self.BOS_IDX, *tokens, self.EOS_IDX]
        return tokens

    def decode(self, tokens, skip_special_tokens=True):
        """Convert token ids back to text.

        Args:
            tokens: Iterable of int ids, or an object exposing ``tolist()``
                (such as a 1-D tensor or array) that yields one.
            skip_special_tokens: When true, PAD/UNK/BOS/EOS are left out of
                the result; when false, their symbols are emitted.

        Returns:
            The decoded string. Ids that are not in the vocabulary are
            treated exactly like ``UNK_IDX``.
        """
        # Tensor elements hash by identity, so looking them up directly never
        # matches the int keys; convert to plain Python ints first.
        if hasattr(tokens, "tolist"):
            tokens = tokens.tolist()

        special_tokens = {self.PAD_TOKEN, self.UNK_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN}
        pieces = []
        for token in tokens:
            # Unknown ids fall back to UNK *before* the skip check, so they
            # obey the same rule as a real UNK id.
            piece = self.idx_to_char.get(token, self.UNK_TOKEN)
            if skip_special_tokens and piece in special_tokens:
                continue
            pieces.append(piece)

        return ''.join(pieces)

    def pad_sequence(self, tokens, max_length):
        """Return ``tokens`` truncated or right-padded to ``max_length``.

        Args:
            tokens: List of token ids.
            max_length: Desired length of the result.

        Returns:
            A new list: the first ``max_length`` ids when ``tokens`` is
            longer, otherwise ``tokens`` followed by ``PAD_IDX`` fillers.
        """
        if len(tokens) > max_length:
            return tokens[:max_length]
        return tokens + [self.PAD_IDX] * (max_length - len(tokens))



In [None]:
def test_tokenizer(text):
    """Fit a new UrduTokenizer on ``text`` and print an encode/decode demo.

    The first 50 characters of ``text`` are round-tripped twice: once
    without special tokens, and once with BOS/EOS added and kept in the
    decoded output.

    Returns:
        The fitted tokenizer.
    """
    fitted = UrduTokenizer()
    fitted.fit(text)

    snippet = text[:50]
    print(f"\nTesting with sample: {snippet}")

    # Round trip without special tokens.
    plain_ids = fitted.encode(snippet)
    plain_text = fitted.decode(plain_ids)
    print(f"\nEncoded tokens: {plain_ids[:10]}...")
    print(f"Decoded text: {plain_text}")

    # Round trip with BOS/EOS added and preserved in the decoded string.
    special_ids = fitted.encode(snippet, add_special_tokens=True)
    special_text = fitted.decode(special_ids, skip_special_tokens=False)
    print(f"\nWith special tokens:")
    print(f"Encoded: {special_ids[:10]}...")
    print(f"Decoded: {special_text}")

    return fitted

# Fit the tokenizer on the full corpus and keep it for the data-loader cells.
tokenizer = test_tokenizer(text=cleaned_text)

Vocabulary statistics:
Total vocab size: 78
Character vocab size: 74
Special tokens: ['<PAD>', '<UNK>', '<BOS>', '<EOS>']

Testing with sample: ۱ گُلزارِ ہست و بود نہ بیگانہ وار دیکھ نہ آتے، ہمی

Encoded tokens: [71, 4, 63, 47, 41, 30, 20, 29, 48, 4]...
Decoded text: ۱ گُلزارِ ہست و بود نہ بیگانہ وار دیکھ نہ آتے، ہمی

With special tokens:
Encoded: [2, 71, 4, 63, 47, 41, 30, 20, 29, 48]...
Decoded: <BOS>۱ گُلزارِ ہست و بود نہ بیگانہ وار دیکھ نہ آتے، ہمی<EOS>


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class UrduTextDataset(Dataset):
    """Sliding-window next-token dataset over one tokenized text.

    The text is encoded once with BOS/EOS added. Sample ``k`` starts at token
    ``k * stride``: its input is ``max_length`` consecutive token ids and its
    target is the same window shifted one position to the right.

    Only full windows are produced, so no sample is ever padded. A text with
    ``max_length`` or fewer tokens yields an empty dataset, and up to
    ``stride - 1`` trailing tokens that do not complete a window are dropped.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object whose ``encode(text, add_special_tokens=True)``
            returns a list of int token ids.
        max_length: Number of tokens per sample; must be positive.
        stride: Distance between consecutive window starts; must be positive.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        # A negative stride used to give a silently empty dataset and a
        # non-positive max_length gave meaningless windows; reject both.
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text, add_special_tokens=True)

        # The target needs one token beyond the input window, so the last
        # usable start index is len(token_ids) - max_length - 1. Every slice
        # below therefore has exactly max_length ids and needs no padding.
        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            self.input_ids.append(
                torch.tensor(token_ids[start:end], dtype=torch.long)
            )
            self.target_ids.append(
                torch.tensor(token_ids[start + 1:end + 1], dtype=torch.long)
            )

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

def create_data_loaders(text, tokenizer, max_length=128, stride=4,
                       batch_size=4, val_split=0.1, seed=None):
    """Split ``text`` into train/validation parts and wrap each in a DataLoader.

    The split is made on characters: the first ``1 - val_split`` fraction of
    ``text`` becomes the training text and the remainder the validation text.
    Each part is turned into its own ``UrduTextDataset``.

    Args:
        text: Full corpus as a single string.
        tokenizer: Tokenizer handed to ``UrduTextDataset``.
        max_length: Tokens per sample.
        stride: Distance between consecutive window starts.
        batch_size: Samples per batch for both loaders.
        val_split: Fraction of characters held out for validation; must
            satisfy ``0 <= val_split < 1``.
        seed: Optional seed for the training shuffle. ``None`` keeps the
            default, unseeded shuffling.

    Returns:
        ``(train_loader, val_loader)``. The training loader shuffles; the
        validation loader does not.

    Raises:
        ValueError: If ``val_split`` is outside ``[0, 1)`` or the training
            text is too short to produce a single sample.
    """
    # Outside [0, 1) the slice index below goes negative or past the end and
    # the split is silently wrong.
    if not 0 <= val_split < 1:
        raise ValueError(f"val_split must be in [0, 1), got {val_split}")

    train_len = int(len(text) * (1 - val_split))
    train_text = text[:train_len]
    val_text = text[train_len:]

    train_dataset = UrduTextDataset(
        train_text, tokenizer, max_length, stride
    )
    val_dataset = UrduTextDataset(
        val_text, tokenizer, max_length, stride
    )

    # Fail early with an actionable message rather than letting the shuffling
    # sampler reject the empty dataset.
    if len(train_dataset) == 0:
        raise ValueError(
            "Training split is too short to produce a single sample; "
            "provide more text or lower max_length / val_split."
        )

    # A dedicated generator makes the shuffle order reproducible without
    # touching the global torch RNG state.
    shuffle_generator = None
    if seed is not None:
        shuffle_generator = torch.Generator().manual_seed(seed)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        generator=shuffle_generator
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0
    )

    print("\nDataLoader Statistics:")
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    print(f"Batch size: {batch_size}")
    print(f"Training batches: {len(train_loader)}")
    print(f"Validation batches: {len(val_loader)}")

    return train_loader, val_loader



In [None]:
# Build both loaders: 128-token windows that advance 8 tokens at a time,
# served in batches of 8.
train_data, val_data = create_data_loaders(
    text=cleaned_text,
    tokenizer=tokenizer,
    max_length=128,
    stride=8,
    batch_size=8,
)



DataLoader Statistics:
Training samples: 39326
Validation samples: 4356
Batch size: 8
Training batches: 4916
Validation batches: 545


In [None]:
# Pull one batch from the (shuffled) training loader to inspect it; the cell
# displays the pair of input and target tensors.
next(iter(train_data))

[tensor([[69, 64,  4,  ..., 24, 31,  4],
         [62, 44, 19,  ...,  4, 62, 69],
         [ 4, 20, 40,  ..., 69,  4, 31],
         ...,
         [29,  4, 40,  ..., 44, 69, 20],
         [42, 24, 65,  ..., 48, 24, 20],
         [ 4, 38, 42,  ..., 35, 20,  4]]),
 tensor([[64,  4, 57,  ..., 31,  4, 57],
         [44, 19, 69,  ..., 62, 69,  4],
         [20, 40, 21,  ...,  4, 31, 70],
         ...,
         [ 4, 40, 21,  ..., 69, 20,  4],
         [24, 65,  4,  ..., 24, 20, 21],
         [38, 42, 48,  ..., 20,  4, 67]])]