# Working with Text Data

## Tokenizing Text

In [2]:
import os
import urllib.request

# Download the short story once. The file name is bound a single time, outside
# the `if`, so it is defined whether or not the download runs.
file_path = "the-verdict.txt"

if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/varunvohra94/"
           "llms-from-scratch/refs/heads/main/Lesson%201/the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

In [3]:
# Read the whole story into memory as one string, decoded as UTF-8.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [5]:
# Number of characters in the loaded text.
len(raw_text)

20479

### Let's try to manually tokenize our data without using libraries like tiktoken

In [7]:
import re

# Split on single whitespace characters; the capture group keeps every
# separator in the resulting list.
text = "Hello World, This, is a test."
whitespace_splitter = re.compile(r'(\s)')
result = whitespace_splitter.split(text)
result

['Hello', ' ', 'World,', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']

In [8]:
# Also split on commas and periods. The pattern is a raw string so that `\s`
# reaches the regex engine untouched instead of being parsed as an invalid
# Python string escape (a SyntaxWarning on recent Python versions).
result = re.split(r'([,.]|\s)', text)
result

['Hello',
 ' ',
 'World',
 ',',
 '',
 ' ',
 'This',
 ',',
 '',
 ' ',
 'is',
 ' ',
 'a',
 ' ',
 'test',
 '.',
 '']

In [9]:
# Discard the empty strings and whitespace-only separators left by the split.
result = list(filter(str.strip, result))
result

['Hello', 'World', ',', 'This', ',', 'is', 'a', 'test', '.']

In [13]:
text = "Hello, world. Is this-- a test?"

# Split on whitespace, on "--" and on common punctuation, keeping the
# separators as tokens. A raw string is used so the backslash sequences are
# interpreted by the regex engine rather than as Python string escapes.
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
# Remove empty and whitespace-only pieces.
result = [item.strip() for item in result if item.strip()]
result

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']

### Let's apply the above logic to our `raw_text` (the-verdict.txt)

In [16]:
# Tokenize the full story with the same splitting rule. The pattern is a raw
# string so `\s` and `\'` are handled by the regex engine, not by Python's
# string-escape parsing.
result = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# Remove empty and whitespace-only pieces.
result = [item.strip() for item in result if item.strip()]
preprocessed = result
len(preprocessed)

4690

## Convert tokens into token IDs

Token IDs are unique integers assigned to tokens. Producing them involves the following steps:
* Building a vocabulary (a one-to-one mapping from each unique token to a unique integer)
* Using this vocabulary to convert the tokenized training data into token IDs (any new sentence is converted using the same mapping)

### Building the vocabulary

In [17]:
# Unique tokens in sorted order; their count is the vocabulary size.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
vocab_size

1130

In [19]:
# Map every unique token to its index in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))
vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'Ah': 12,
 'Among': 13,
 'And': 14,
 'Are': 15,
 'Arrt': 16,
 'As': 17,
 'At': 18,
 'Be': 19,
 'Begin': 20,
 'Burlington': 21,
 'But': 22,
 'By': 23,
 'Carlo': 24,
 'Chicago': 25,
 'Claude': 26,
 'Come': 27,
 'Croft': 28,
 'Destroyed': 29,
 'Devonshire': 30,
 'Don': 31,
 'Dubarry': 32,
 'Emperors': 33,
 'Florence': 34,
 'For': 35,
 'Gallery': 36,
 'Gideon': 37,
 'Gisburn': 38,
 'Gisburns': 39,
 'Grafton': 40,
 'Greek': 41,
 'Grindle': 42,
 'Grindles': 43,
 'HAD': 44,
 'Had': 45,
 'Hang': 46,
 'Has': 47,
 'He': 48,
 'Her': 49,
 'Hermia': 50,
 'His': 51,
 'How': 52,
 'I': 53,
 'If': 54,
 'In': 55,
 'It': 56,
 'Jack': 57,
 'Jove': 58,
 'Just': 59,
 'Lord': 60,
 'Made': 61,
 'Miss': 62,
 'Money': 63,
 'Monte': 64,
 'Moon-dancers': 65,
 'Mr': 66,
 'Mrs': 67,
 'My': 68,
 'Never': 69,
 'No': 70,
 'Now': 71,
 'Nutley': 72,
 'Of': 73,
 'Oh': 74,
 'On': 75,
 'Once': 76,
 'Only': 77,
 '

### Creating a Simple Tokenizer

Now that we have our vocabulary, we can combine the above preprocessing steps (splitting the text into tokens, stripping whitespace, etc.) into a class with encoding and decoding methods.

In [42]:
from typing import List, Dict
class SimpleTokenizerV1:
    """
    A simple tokenizer that encodes text into token IDs and decodes token IDs
    back into text, using a vocabulary supplied by the caller.

    Text is split on whitespace, on "--" and on a fixed set of punctuation
    characters; empty and whitespace-only pieces are discarded.
    """
    def __init__(self, vocab: Dict[str, int]):
        """
        Args:
            vocab: Mapping from token string to its integer ID.
        """
        self.str_to_int = vocab
        # Inverse mapping (ID -> token string) used by decode().
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text: str) -> List[int]:
        """
        Convert text into a list of token IDs.

        Args:
            text: The text to tokenize.

        Returns:
            The vocabulary ID of every token, in order of appearance.

        Raises:
            KeyError: If a token is not present in the vocabulary.
        """
        # Raw string so `\s` and `\'` are interpreted by the regex engine
        # rather than as (invalid) Python string escapes.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids: List[int]) -> str:
        """
        Convert a list of token IDs back into text.

        Tokens are joined with single spaces, so the original whitespace is
        not reproduced exactly.

        Args:
            ids: Token IDs to decode.

        Returns:
            The decoded text.

        Raises:
            KeyError: If an ID is not present in the vocabulary.
        """
        # `token_id` instead of `id` to avoid shadowing the builtin.
        text = " ".join([self.int_to_str[token_id] for token_id in ids])
        # Remove the spaces before the specified punctuation characters
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [29]:
# Build the tokenizer from the vocabulary and encode a short phrase.
tokenizer = SimpleTokenizerV1(vocab=vocab)
tokenizer.encode("this is a")

[999, 584, 115]

In [35]:
# Multi-line sample containing quotes, an apostrophe and newlines.
text = """
"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride
"""
text

'\n"It\'s the last he painted, you know,"\nMrs. Gisburn said with pardonable pride\n'

In [31]:
# Token IDs for the sample text.
ids = tokenizer.encode(text)

In [36]:
# Round trip: encode, then decode. Tokens are re-joined with single spaces, so
# the original newlines and spacing are not reproduced exactly.
tokenizer.decode(tokenizer.encode(text=text))

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride'

## Adding Special context tokens

We will now extend our vocabulary with special tokens, for example:
* A token for unknown words, i.e. words that are not in the vocabulary
* An end-of-text token (will cover this later)

In [38]:
# Limitation of the existing tokenizer (it can't tokenize unknown words)
# text = "Hello World"
# tokenizer.encode(text=text)
### This will throw an error since the word "Hello" is not in the vocabulary.

In [40]:
# Append two special tokens after the sorted unique tokens, then rebuild the
# token -> ID mapping over the extended list.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab_extended = dict(zip(all_tokens, range(len(all_tokens))))
len(vocab), len(vocab_extended)

(1130, 1132)

Let's modify our simple tokenizer to use the new special tokens

In [43]:
class SimpleTokenizerV2:
    """
    Tokenizer that encodes text into token IDs and decodes them back, mapping
    any token missing from the vocabulary to the "<|unk|>" token.

    Text is split on whitespace, on "--" and on a fixed set of punctuation
    characters; empty and whitespace-only pieces are discarded.
    """
    def __init__(self, vocab: Dict[str, int]):
        """
        Args:
            vocab: Mapping from token string to its integer ID. It must
                contain "<|unk|>" for unknown tokens to be encodable.
        """
        self.str_to_int = vocab
        # Inverse mapping (ID -> token string) used by decode().
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text: str) -> List[int]:
        """
        Convert text into a list of token IDs.

        Args:
            text: The text to tokenize.

        Returns:
            The vocabulary ID of every token, in order of appearance; tokens
            not in the vocabulary get the ID of "<|unk|>".

        Raises:
            KeyError: If an unknown token is met and the vocabulary has no
                "<|unk|>" entry.
        """
        # Raw string so `\s` and `\'` are interpreted by the regex engine
        # rather than as (invalid) Python string escapes.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        preprocessed = [item.strip() for item in preprocessed if item.strip()]

        # Substitute the unknown-token marker for out-of-vocabulary tokens.
        preprocessed = [
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids: List[int]) -> str:
        """
        Convert a list of token IDs back into text.

        Tokens are joined with single spaces, so the original whitespace is
        not reproduced exactly.

        Args:
            ids: Token IDs to decode.

        Returns:
            The decoded text.

        Raises:
            KeyError: If an ID is not present in the vocabulary.
        """
        # `token_id` instead of `id` to avoid shadowing the builtin.
        text = " ".join([self.int_to_str[token_id] for token_id in ids])
        # Remove the spaces before the specified punctuation characters
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [44]:
# Rebuild the tokenizer on the extended vocabulary and encode a single word.
tokenizer = SimpleTokenizerV2(vocab=vocab_extended)
tokenizer.encode("Hello")

[1131]

In [45]:
# Round trip of the same word through encode and decode.
tokenizer.decode(tokenizer.encode("Hello"))

'<|unk|>'

## Byte Pair Encoding

From the above example we can see that the `<|unk|>` token is not ideal: every unknown word is replaced by it, so we cannot recreate the original text from the IDs.

Byte-Pair Encoding (BPE) is an algorithm that helps us take this to the next level. It is a really popular algorithm for tokenizing
text. GPT-1, GPT-2, GPT-3 and GPT-4 all use byte-pair encoding.

It is an algorithm that breaks any word into sub-word tokens, so even words that were never seen before can be encoded.
Example of the algorithm in action: https://tiktokenizer.vercel.app/

GPT-2 Implementation of Byte-Pair encoding: https://github.com/openai/gpt-2/blob/master/src/encoder.py 

TikToken: https://github.com/openai/tiktoken/tree/main is implemented in Rust. Rust is a programming language primarily used for high-performance computing, and such code is generally wrapped to provide a Python API.

In [46]:
import tiktoken
tiktoken.__version__

'0.9.0'

In [47]:
# Rebind `tokenizer` (previously a SimpleTokenizerV2) to tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")



In [48]:
# Encode a short string with the "gpt2" encoding.
tokenizer.encode("Hello World")

[15496, 2159]

In [49]:
# Round trip of the same string through encode and decode.
tokenizer.decode(tokenizer.encode("Hello World"))

'Hello World'

In [60]:
# Sample containing the "<|endoftext|>" marker and a made-up word ("unknownplace").
text = """Hello, do you like tea? <|endoftext|> In the sunlit terraces
of some unknownplace"""

# The marker is passed via `allowed_special` when encoding.
tokenized_text = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(f"Original text:\n{text}\n")
# Only the first five token IDs are printed.
print(f"Tokenized text:\n{tokenized_text[:5]} ....\n")
print(f"Decoded tokenized text:\n{tokenizer.decode(tokenized_text)}")

Original text:
Hello, do you like tea? <|endoftext|> In the sunlit terraces
of some unknownplace

Tokenized text:
[15496, 11, 466, 345, 588] ....

Decoded tokenized text:
Hello, do you like tea? <|endoftext|> In the sunlit terraces
of some unknownplace


## Data Sampling with Sliding Window
In this section we will create a PyTorch dataset and PyTorch Dataloader for our text data.

In [64]:
# Encode the whole story; the result's length is the number of tokens.
enc_text = tokenizer.encode(raw_text)
len(enc_text)

5145

In [65]:
# First ten token IDs.
enc_text[:10]

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]

In [69]:
# Skip the first 50 tokens and work with the remainder.
enc_sample = enc_text[50:]
len(enc_sample)

5095

In [68]:
# Inputs are the first `context_size` tokens; targets are the same window
# shifted one position to the right.
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:\t{y}")

x: [290, 4920, 2241, 287]
y:	[4920, 2241, 287, 257]


In [73]:
# Each growing prefix of the sample is a context; the token right after it is
# the value to predict. Printed first as IDs, then as decoded text.
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "------>", desired)
    print(tokenizer.decode(context), "------>", tokenizer.decode([desired]))

[290] ------> 4920
 and ------>  established
[290, 4920] ------> 2241
 and established ------>  himself
[290, 4920, 2241] ------> 287
 and established himself ------>  in
[290, 4920, 2241, 287] ------> 257
 and established himself in ------>  a


In [74]:
import torch
torch.__version__

'2.6.0'

In [101]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """
    Dataset of (input, target) token-ID windows cut from a single text.

    The text is tokenized once, then a window of `max_length` tokens is slid
    over the token sequence in steps of `stride`. For every window the target
    is the same window shifted one token to the right.
    """
    def __init__(self, txt, tokenizer, max_length, stride):
        """
        Args:
            txt: The text to tokenize.
            tokenizer: Object exposing `encode(text, allowed_special=...)`
                that returns a sequence of token IDs.
            max_length: Number of tokens per window; must be >= 1.
            stride: Step between the starts of consecutive windows; must be >= 1.

        Raises:
            ValueError: If `max_length` or `stride` is not positive.
            AssertionError: If the text has no more than `max_length` tokens.
        """
        # Reject values that would otherwise silently produce empty windows
        # (max_length < 1) or an empty dataset (stride < 0).
        if max_length < 1:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the book into overlapping sequences of max_length.
        # The range stops at len - max_length so every target chunk is full length.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair for window `idx`."""
        return self.input_ids[idx], self.target_ids[idx]

In [102]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """
    Build a DataLoader over sliding-window token chunks of `txt`.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    GPTDatasetV1 using `max_length` and `stride`; the remaining arguments are
    forwarded to `torch.utils.data.DataLoader`.

    Returns:
        The configured DataLoader.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(windows, **loader_options)

In [103]:
# stride=1 moves the window by one token at a time; batch_size=1 puts a single
# window in each batch. shuffle=False keeps the windows in text order.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [104]:
# Next batch from the same iterator.
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [105]:
# stride equals max_length here, so consecutive input windows do not overlap.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


## Creating Token Embeddings

In [107]:
# Four example token IDs.
input_ids = torch.tensor([2,3,5,1])

In [111]:
# Tiny embedding table for illustration: 6 rows with 3 values each.
# NOTE: this rebinds `vocab_size`, which earlier held the size of the word vocabulary.
vocab_size = 6
output_dim = 3

# Fix the seed before the layer is created so its initial weights are repeatable.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [112]:
# Show the embedding weight matrix.
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [114]:
# Look up token ID 3; the result matches row index 3 of the weight matrix.
embedding_layer(torch.tensor([3]))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

## Encoding word positions

In [119]:
# 50257 is presumably the size of tiktoken's "gpt2" vocabulary — TODO confirm.
vocab_size = 50257
# NOTE(review): 1024 looks like the GPT-2 context length (or the GPT-2 medium
# width); GPT-2 small is usually quoted with 768-dim embeddings — confirm.
output_dim = 256 # GPT was trained on output dim of 1024

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [120]:
# One batch of 8 non-overlapping windows (stride == max_length) of 4 tokens each.
max_length = 4
dataloader = create_dataloader_v1(
    txt=raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [118]:
# Token IDs of the batch and the batch tensor's shape.
print("TokenIDs:\n", inputs)
print("\nInputs Shape:\n", inputs.shape)

TokenIDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs Shape:
 torch.Size([8, 4])


In [123]:
# Replace every token ID in the batch with its embedding vector.
token_embeddings = token_embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

The shape is `[8, 4, 256]`: 8 is the batch size, 4 is the number of tokens per item in the batch, and each of those tokens is now represented by a 256-dimensional token embedding instead of a single token ID.

In [126]:
# Second embedding table with one row per position in the context window.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [127]:
# Show the positional embedding weight matrix.
pos_embedding_layer.weight

Parameter containing:
tensor([[-1.4693,  1.0024,  0.6403,  ..., -0.7098, -0.4741,  1.3287],
        [-0.3833,  0.5006,  2.1007,  ..., -0.1256,  0.8334, -1.8840],
        [ 0.3221,  0.9576, -1.5949,  ...,  0.4771, -0.7206,  0.2753],
        [ 0.1482, -1.1207,  1.1867,  ...,  0.5207, -1.0125, -0.3823]],
       requires_grad=True)

In [129]:
# Look up the embeddings for positions 0 .. max_length - 1.
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
pos_embeddings

tensor([[-1.4693,  1.0024,  0.6403,  ..., -0.7098, -0.4741,  1.3287],
        [-0.3833,  0.5006,  2.1007,  ..., -0.1256,  0.8334, -1.8840],
        [ 0.3221,  0.9576, -1.5949,  ...,  0.4771, -0.7206,  0.2753],
        [ 0.1482, -1.1207,  1.1867,  ...,  0.5207, -1.0125, -0.3823]],
       grad_fn=<EmbeddingBackward0>)

In [130]:
# Shape of the positional embeddings.
pos_embeddings.shape

torch.Size([4, 256])

In [132]:
# Add the positional embeddings to the token embeddings; the positional tensor
# has no batch dimension, so it is broadcast over every item in the batch.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
