## Read short story for tokenzation

### Step 1: Create tokens

In [1]:
with open("the-verdict.txt", 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Print total number of characters
print("Total number of characters:", len(raw_text))

# print first 100 characters
print(raw_text[:99])


Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
# Use regular expression library
import re

text = "Hello world! This is a test, working now."

# Split based on white spaces: use \s as regex
result = re.split(r'\s', text) # This will remove the white space

print(result)

result = re.split(r'(\s)', text) # This will include the white spaces
print(result)


['Hello', 'world!', 'This', 'is', 'a', 'test,', 'working', 'now.']
['Hello', ' ', 'world!', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test,', ' ', 'working', ' ', 'now.']


In [3]:
# Now we want to include punctuation characters (commas full stops and exclamation marks) also as separate strings
result = re.split(r'(\W)', text)
print(result)

['Hello', ' ', 'world', '!', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test', ',', '', ' ', 'working', ' ', 'now', '.', '']


In [4]:
# Remove empty strings and white space characters now
result = [item for item in result if item.strip()]

print(result)

['Hello', 'world', '!', 'This', 'is', 'a', 'test', ',', 'working', 'now', '.']


In [5]:
# for the book text, there are ? -- : _ ; ' " () are also present. Hence we need to consider them while splitting
# Hence our tokenizer code is as follows, as of now. This is a simple tokenizer. 
# For LLMs, different tokenizer scheme is used which we will see later
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [t for t in preprocessed if t.strip()]
print (len(preprocessed))

4690


### Step 2: Creating token IDs

In [6]:
# Get all unique words and sorted
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [7]:
# Create a vocabulary
vocab = {}
counter = 0
for word in all_words:
    vocab[word] = counter
    counter = counter + 1

items = list(vocab.items())[:20]
for key,value in items:
    print(f"({key}, {value})")

(!, 0)
(", 1)
(', 2)
((, 3)
(), 4)
(,, 5)
(--, 6)
(., 7)
(:, 8)
(;, 9)
(?, 10)
(A, 11)
(Ah, 12)
(Among, 13)
(And, 14)
(Are, 15)
(Arrt, 16)
(As, 17)
(At, 18)
(Be, 19)


In [8]:
# Now we build a simple tokenizer class that will be used to encode the given text to feed IDs to LLM and decode the Token IDs from LLM to convert back to text.
class SimpleTokenizerV1:
    def __init__(self, vocab):
        # For encoder
        self.str_to_int = vocab

        # For decoder, int to string mapping
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])

        # Get rid of spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [9]:
print(vocab['"'])

1


In [10]:
# initialize the tokenizer
tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""

ids = tokenizer.encode(text)
print(ids)


[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [11]:
decoded_text = tokenizer.decode(ids)
print(decoded_text)

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [12]:
# The encoder fails when the words are not in the vocabulary
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

KeyError: 'Hello'

#### This shows that we need a large and diverse training set to extend the vocabulary when working on LLMs.

Another solution is to use special context tokens, user by GPT as well.

### SPECIAL CONTEXT TOKENS <|unk|> and <|endoftext|> to existing vocabulary

In [13]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer, token in enumerate(all_tokens)}
len(vocab.items())

1132

In [14]:
# print last 5 entries in the updated vocab
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [15]:
# Updated existing tokenizer class to include the new special tokens
class SimpleTokenizerV2:
    def __init__(self, vocab):
        # For encoder
        self.str_to_int = vocab

        # For decoder, int to string mapping
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        # Split
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        # Remove white spaces
        preprocessed = [item.strip() for item in preprocessed if item.strip()]

        # It item is not present in vocab, the token is 'unknown'
        preprocessed = [
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed
        ]

        # Convert tokens to token IDs
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])

        # Get rid of spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [16]:
tokenizer = SimpleTokenizerV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))
print(text)


Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [17]:
re.split(r'([,.:;?_!"()\']|--|\s)', text)[:10]

['Hello', ',', '', ' ', 'do', ' ', 'you', ' ', 'like', ' ']

In [18]:
text

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.'

In [19]:
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [20]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

<div class="alert alert-block alert-info">
<h4>GPT uses Byte Pair Encoding.</h4>
</div>

#### After understanding how BPE works in notes, it is fairly complicated algorithm to implement. Hence we will use an existing python library called 'tiktoken' that's fast [BPE tokenizer](https://github.com/openai/tiktoken)

In [21]:
!pip install tiktoken




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
import importlib
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))


tiktoken version: 0.11.0


In [23]:
# Initialize BPE
tokenizer = tiktoken.get_encoding("gpt2")

# someunknownPlace will be recognized as well which in our encoder fails
text = (
    "Hello, do you like tea? <|endoftext|> In the sublit tarraces"
    "of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print('total token size:', tokenizer.max_token_value)
print(integers)

# note that 50256 is the <|endoftext|> token and vocab list for gpt2 is 50k.
# The actual size of words in English language is around 200k.
# This shows BPE has considerably reduced the size of vocabulary.

# The following line will demonstrate how 'someunknownPlace' has been tokenized to.
print("Decode of tokens for someunknownPlace [617, 34680, 27271]:", tokenizer.decode([617, 34680, 27271]))


total token size: 50256
[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 850, 18250, 256, 3258, 2114, 1659, 617, 34680, 27271, 13]
Decode of tokens for someunknownPlace [617, 34680, 27271]:  someunknownPlace


In [24]:
# Convert the token IDs back into the text
print(tokenizer.decode(integers))

Hello, do you like tea? <|endoftext|> In the sublit tarracesof someunknownPlace.


<div class="alert alert-block alert-info">
<h5>Observations:</h5>
<ol>
  <li>'<|endoftext|>' token is assigned a relatively large token ID: 50256</li>
  <li>BPE tokenizer encodes and decodes unknown words, e.g., 'someunknownPlace'</li>
  <li>The algo underlying BPE breaks down out of vocab words into smaller subwords or even characters,<br/>which enables it to handle out of vocab words.</li>
</ol>
</div>

In [25]:
# Simple example to illustrate how the BPE tokenizer delas with unknown tokens
integers = tokenizer.encode("Akwirw ier")
print(integers)

print(tokenizer.decode(integers))

print(tokenizer.decode([86,220,959]))

print('Print each token:')
for t in integers:
    txt = tokenizer.decode([t])
    print(t, txt)


[33901, 86, 343, 86, 220, 959]
Akwirw ier
w ier
Print each token:
33901 Ak
86 w
343 ir
86 w
220  
959 ier


## Creating Input Target Pairs

#### We will implement a data loader that fetches the input-target pairs using Sliding Window approach.

##### We will use the BPE tokenizer to tokenize The Verdict story.

In [26]:
import tiktoken
encoding = tiktoken.get_encoding("gpt2")
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = encoding.encode(raw_text)
print("Vocabulary size:", len(enc_text))

# Print first 10 tokens
for i in range(10):
    print(enc_text[i], encoding.decode([enc_text[i]]))

Vocabulary size: 5145
40 I
367  H
2885 AD
1464  always
1807  thought
3619  Jack
402  G
271 is
10899 burn
2138  rather


In [27]:
# Now to demonstrate, remove first fifty tokens
enc_sample = enc_text[50:]

# Context size: How many tokens to have as an input to LLM, to predict next word. 
# The model is trained to look at a sequence of context_size tokens to predict next word in the sequence
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]
print(x, y)
for i in range(context_size):
    print(f"Input: {x[:i+1]}, Output/Target: {y[i]}")

[290, 4920, 2241, 287] [4920, 2241, 287, 257]
Input: [290], Output/Target: 4920
Input: [290, 4920], Output/Target: 2241
Input: [290, 4920, 2241], Output/Target: 287
Input: [290, 4920, 2241, 287], Output/Target: 257


In [28]:
# Take the previous code and repeat for decoded tokens to text
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]
print(x, y)
for i in range(context_size):
    print(f"Input: {tokenizer.decode(x[:i+1])}, Output/Target: {tokenizer.decode([y[i]])}")

[290, 4920, 2241, 287] [4920, 2241, 287, 257]
Input:  and, Output/Target:  established
Input:  and established, Output/Target:  himself
Input:  and established himself, Output/Target:  in
Input:  and established himself in, Output/Target:  a


Now we have implemented the logic of sliding window to generate IO pairs. 

Now we need to utilize the same in more structured way and generate a data loader that we can generate in parallel on multiple CPUs that will generate pytorch tensors (multi dimensional arrays)

### Implement a Data Loader

We will use Pytorch's Dataset and Dataloader classes

Step 1: Tokenize the entire text<br/>
Step 2: Use the sliding window to chunk the book into overlapping sequences of max_length<br/>
Step 3: Return the total number of rows in the dataset<br/>
Step 4: Return a single row from the dataset

<div class="alert alert-block alert-warning">
The class is based on PyTorch Dataset class<br/>
It defines how individual rows are fetched from the dataset<br/>

In [None]:
from torch.utils.data import Dataset, DataLoader

# Functions of this class is based on Pytorch Dataset class and is implemented based on the dataloader documentation in pytorch
# See: https://docs.pytorch.org/docs/stable/data.html#map-style-datasets:~:text=style%20datasets.-,Map%2Dstyle%20datasets,-%23
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Step 1: Tokenize the entire dataset
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequence of max_length
        # stride is jump so that next row picks beyond stride. Note that it is not context size
        # max_length is also context size - number of elements in a row of tensor, it may be different than stride
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i + max_length]
            output_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(output_chunk))

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]
        

### Code for using above class (Create data loader)
Step 1: Initialize the tokenizer<br/>
Step 2: Create dataset<br/>
Step 3: drop_last is set to true to drop the last batch if it is shorter than the batch_size to prevent loss spikes during training<br/>
Step 4: The number of CPU processes to use for preprocessing

In [None]:
import tiktoken
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """
    Dataloader to do preprocessing in parallel

    Args:
        txt (string): Input text
        batch_size (int): Number of threads - number of batches that model processes before updating its parameters
        max_length (int): Context length
        stride (int): How much to skip the tokens for each row - shift within batch, This means the next batch will have first row which is already shifted stride*batch_size places, see example below.
        shuffle (boolean):
        drop_last (boolean):
        num_workers (int): Number of CPU threads - parallel processing
    """
    # Initilize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader, which is going to look at the GPTDatasetV1.__getitem__ function and create the IO tensors
    # dataloader will help us to load data in parallel and analyze multiple datasets at one time
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

Test the dataloader with batch size = 1 and context size = 4

In [39]:
import torch

with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

dataloader = create_dataloader_v1(raw_text, batch_size=2, max_length=4, stride=2, shuffle=False)

data_iter = iter(dataloader)

first_batch = next(data_iter)

print(first_batch)

[tensor([[  40,  367, 2885, 1464],
        [2885, 1464, 1807, 3619]]), tensor([[ 367, 2885, 1464, 1807],
        [1464, 1807, 3619,  402]])]


In [40]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 1807,  3619,   402,   271],
        [  402,   271, 10899,  2138]]), tensor([[ 3619,   402,   271, 10899],
        [  271, 10899,  2138,   257]])]


Notes:<br/>
1. Small batch sizes require less memory during training but lead to noisy model updates<br/>
2. Batch size is a tradeoff and hyperparameter to experiment with when training LLMs.<br/>
3. Larger strides enables to utilize the data set fully and also avoid overlap between batches. More overlap could lead to increased overfitting.

In [44]:
# Effect of batch size
import torch

with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)

input, target = next(data_iter)

print(f"Input\n {input}")
print(f"Target\n {target}")

Input
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Target
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
