# Text Encoding process 

This notebook is to work on the text encoding process.

Use the same example as LLMs from scratch to work on text encoding.

In [9]:
from pathlib import Path
import urllib.request
data_path = Path("../data")
file_path = data_path / "the-verdict.txt"
if not file_path.exists():
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    if not data_path.exists():
        data_path.mkdir()
    
    urllib.request.urlretrieve(url, file_path)

In [10]:
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()
    
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [11]:
import re

simple_example = "This is a simple example. Let's see how it works!"

# Split by space, comma, or period, keeping the delimiters
tokens = re.split(r'([,.]|\s)', simple_example)
# Remove empty tokens
tokens = [token for token in tokens if token.strip()]

print(tokens)


['This', 'is', 'a', 'simple', 'example', '.', "Let's", 'see', 'how', 'it', 'works!']


In [12]:
preprocessed_text = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed_text = [token.strip() for token in preprocessed_text if token.strip()]

preprocessed_text[:20]


['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius',
 '--',
 'though',
 'a',
 'good',
 'fellow',
 'enough',
 '--',
 'so',
 'it',
 'was']

## Vanilla text processing 

Now we have our processed text, we can work on encoding it.

Let's create a simple vocabulary mapping each unique token to an integer index.

In [13]:
vocab = {token: idx for idx, token in enumerate(sorted(set(preprocessed_text)))}

In [14]:
class TextEncoderV1:
    def __init__(self, vocab):
        self.vocab = vocab
        self.inv_vocab = {idx: token for token, idx in vocab.items()}
    
    def encode(self, text):
        tokens = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [token.strip() for token in tokens if token.strip()]
        return [self.vocab[token] for token in tokens]
    
    def decode(self, indices):
        text = [self.inv_vocab[idx] for idx in indices]
        return re.sub(r' ([,.:;?_!"()\'])', r'\1', ' '.join(text))
    
encoder = TextEncoderV1(vocab)
print("Vocabulary size:", len(encoder.vocab))
print("Encoded:", encoder.encode("This is an."))
print("Decoded:", encoder.decode(encoder.encode("This is an.")))

Vocabulary size: 1130
Encoded: [97, 584, 156, 7]
Decoded: This is an.


In [15]:
# Add <unk> token for unknown words and end-of-sequence token <eos>
new_vocbab = vocab.copy()
new_vocbab["<unk>"] = len(new_vocbab)
new_vocbab["<eos>"] = len(new_vocbab)

class TextEncoderV2:
    def __init__(self, vocab):
        self.vocab = vocab
        self.inv_vocab = {idx: token for token, idx in vocab.items()}
    
    def encode(self, text):
        tokens = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [token.strip() for token in tokens if token.strip()]
        tokens = [token if token in self.vocab else "<unk>" for token in tokens
                   ]
        tokens.append("<eos>")
        return [self.vocab[token] for token in tokens]
    
    def decode(self, indices):
        text = [self.inv_vocab[idx] for idx in indices if idx != self.vocab["<eos>"]]
        return re.sub(r' ([,.:;?_!"()\'])', r'\1', ' '.join(text))

In [16]:
encoder_v2 = TextEncoderV2(new_vocbab)
print("Encoded with V2:", encoder_v2.encode("This is an unknownword."))
print("Decoded with V2:", encoder_v2.decode(encoder_v2.encode("This is an unknownword.")))


Encoded with V2: [97, 584, 156, 1130, 7, 1131]
Decoded with V2: This is an <unk>.


Exercise

In [17]:
from tiktoken._educational import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

words = "Akwirw ier"

tokens = tokenizer.encode(words)

print(tokens)

decoded_words = tokenizer.decode(tokens)

print(decoded_words)



[33901, 86, 343, 86, 220, 959]
Akwirw ier


Implementing rolling windows

In [18]:
with open("data/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)

print("Total number of characters:", len(raw_text))
print("Total number of tokens:", len(enc_text))

Total number of characters: 20479
Total number of tokens: 5145


In [19]:
enc_seq = enc_text[50:]

In [20]:
context_window = 4 

x = enc_seq[:context_window]
y = enc_seq[1:context_window + 1]

print(f"Input tokens: {x}")
print(f"Target tokens:     {y}")


Input tokens: [290, 4920, 2241, 287]
Target tokens:     [4920, 2241, 287, 257]


Pytorch dataset implementation 

In [21]:
from torch.utils.data import Dataset
import torch

class GPTDatasetV1(Dataset):
    def __init__(self, text, tokenizer, max_length, stride) -> None:
        self.input_ids = []
        self.output_ids = []

        tokens = tokenizer.encode(text)

        for i in range(0, len(tokens) - max_length, stride):
            input_seq = tokens[i:i + max_length]
            target_seq = tokens[i + 1:i + max_length + 1]

            self.input_ids.append(input_seq)
            self.output_ids.append(target_seq)

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return torch.Tensor(self.input_ids[index]), torch.Tensor(self.output_ids[index])
    

# Example usage
dataset = GPTDatasetV1(raw_text, tokenizer, max_length=4, stride=4)

# Exercise: Implementing rolling windows
for i in range(3):
    input_seq, target_seq = dataset[i]
    print(f"Input sequence {i}: {input_seq}")
    print(f"Target sequence {i}:     {target_seq}")
    print()
    


Input sequence 0: tensor([  40.,  367., 2885., 1464.])
Target sequence 0:     tensor([ 367., 2885., 1464., 1807.])

Input sequence 1: tensor([1807., 3619.,  402.,  271.])
Target sequence 1:     tensor([ 3619.,   402.,   271., 10899.])

Input sequence 2: tensor([10899.,  2138.,   257.,  7026.])
Target sequence 2:     tensor([ 2138.,   257.,  7026., 15632.])



In [22]:

from torch.utils.data import DataLoader

dataloader = DataLoader(dataset, batch_size=8, shuffle=True, drop_last=True)

first_batch = next(iter(dataloader))
print(first_batch)

[tensor([[1.8900e+03, 2.6200e+02, 7.1700e+02, 6.4000e+02],
        [4.3800e+02, 1.5464e+04, 1.1000e+01, 3.5500e+02],
        [8.1720e+03, 6.4500e+02, 8.6500e+02, 2.9100e+02],
        [6.3480e+03, 5.3910e+03, 1.1000e+01, 2.9000e+02],
        [2.6200e+02, 5.0085e+04, 1.3000e+01, 8.8700e+02],
        [1.5500e+03, 1.4580e+04, 1.1000e+01, 3.4000e+02],
        [1.1000e+01, 5.0800e+02, 5.5000e+02, 1.8459e+04],
        [8.9200e+02, 2.8600e+02, 3.4000e+02, 1.1000e+01]]), tensor([[2.6200e+02, 7.1700e+02, 6.4000e+02, 6.1600e+02],
        [1.5464e+04, 1.1000e+01, 3.5500e+02, 3.4000e+02],
        [6.4500e+02, 8.6500e+02, 2.9100e+02, 1.2000e+01],
        [5.3910e+03, 1.1000e+01, 2.9000e+02, 4.6500e+02],
        [5.0085e+04, 1.3000e+01, 8.8700e+02, 6.7300e+02],
        [1.4580e+04, 1.1000e+01, 3.4000e+02, 1.1070e+03],
        [5.0800e+02, 5.5000e+02, 1.8459e+04, 1.0680e+03],
        [2.8600e+02, 3.4000e+02, 1.1000e+01, 6.1600e+02]])]


Look at the positionnal embeddings

In [23]:
from torch import nn
import torch

vocab_size = tokenizer.n_vocab
print("Vocabulary size:", vocab_size)
output_dim = 256 

token_embedding_layer = nn.Embedding(vocab_size, output_dim)
example = next(iter(dataloader))[0].long()
example = token_embedding_layer(example)

example.shape

Vocabulary size: 50257


torch.Size([8, 4, 256])

In [24]:
positional_embedding_layer = nn.Embedding(context_window, output_dim)

pos_embedding = positional_embedding_layer(torch.arange(context_window))
pos_embedding


tensor([[ 0.1687,  0.3737, -1.0760,  ..., -1.2372, -0.3413, -0.8533],
        [-1.4922, -1.4779, -0.1543,  ..., -0.1339, -0.5327,  1.4610],
        [ 1.7481,  0.1681,  0.7422,  ...,  1.1188, -0.6430, -0.2751],
        [-0.7316, -0.7112, -0.0641,  ..., -1.2814,  0.5148,  0.2427]],
       grad_fn=<EmbeddingBackward0>)

In [25]:
input_embedding = example + pos_embedding
input_embedding.shape

torch.Size([8, 4, 256])