In [3]:
with open('data/the_verdict.txt', 'r') as f:
    raw_text = f.read()

In [4]:
len(raw_text)

20479

simple tokenization with regex

In [5]:
import re

text = "Hello, world! This is a test text for tokenization"

preprocessed_text = [i.strip() for i in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text) if i.strip()]
len(preprocessed_text)


4690

turn tokens into id (build vocabulary, token->id)

In [29]:
all_words = sorted(set(preprocessed_text)) # remove dups and sort
vocab_size = len(all_words)


In [30]:
# simple dict comprehension
vocab = {token:integer for integer, token in enumerate(all_words)}
len(vocab.items())


1130

tokenize the vocab

In [17]:
class SimpleTokenizerV1:
    def __init__ (self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()} # reverse vocab
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'([,.:;?_!"()\']|--|\s)', r'\1' , text) # replace spaces
        return text



In [18]:
tokenizer = SimpleTokenizerV1(vocab)
text = "I found the couple at tea beneath their palm-trees;"

ids = tokenizer.encode(text)
ids

[53, 469, 988, 296, 180, 975, 215, 989, 751, 9]

In [19]:
tokenizer.decode(ids)

'I found the couple at tea beneath their palm-trees ;'

special tokens

In [21]:
text = "Hello, do you like tea. is this-- a test?"

tokenizer.encode(text)

KeyError: 'Hello'

the errow above is expected! hello does not exist in our vocab, let's improve the tokenizer

In [31]:
all_words.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer, token in enumerate(all_words)}
len(vocab.items())

1132

In [33]:
class SimpleTokenizerV2:
    def __init__ (self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()} # reverse vocab
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'([,.:;?_!"()\']|--|\s)', r'\1' , text) # replace spaces
        return text



In [34]:
tokenizer = SimpleTokenizerV2(vocab)
text = "Hello, do you like tea. is this-- a test?"

ids = tokenizer.encode(text)
ids

[1131, 5, 355, 1126, 628, 975, 7, 584, 999, 6, 115, 1131, 10]

great! we solved the problem, unkown test is recognizable

In [36]:
text

'Hello, do you like tea. is this-- a test?'

In [35]:
tokenizer.decode(ids)

'<|unk|> , do you like tea . is this -- a <|unk|> ?'

there is still another problem, decoding text returns the unkown character and the llm can't distinguish between each unknown character. we solve this with byte pair encoding!

current models use byte pair encoding, llama3, gpt, etc

github.com/openai/gpt-2 and check sebastian book bonus material for the BPE from scratch

In [38]:
import tiktoken

In [40]:
tokenizer = tiktoken.get_encoding("gpt2")

In [41]:
tokenizer.encode("Hello world")

[15496, 995]

In [43]:
tokenizer.decode(tokenizer.encode("Hello world"))

'Hello world'

In [53]:
text = (
    "Hello, do tou like tea? <|endoftext|> In the sunit terraces"
    "of someunknownPlace auhshaskkkasas"
)


tokenizer.encode(text, allowed_special={"<|endoftext|>"})


[15496,
 11,
 466,
 2819,
 588,
 8887,
 30,
 220,
 50256,
 554,
 262,
 4252,
 270,
 8812,
 2114,
 1659,
 617,
 34680,
 27271,
 257,
 7456,
 1477,
 2093,
 28747,
 292,
 292]

changing or adding random text does not yield errors anymore!

now, data sampling! - feed parts of the text to the llm insteaf of full text

In [55]:
with open('data/the_verdict.txt', 'r') as f:
    raw_text = f.read()

tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)
len(enc_text)


5145

In [59]:
enc_sample = enc_text[50:] # truncate for visualization purposes

In [65]:
context_size = 4

In [None]:
# always predict the next word
for i in range(1, context_size + 1):

    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [63]:
import torch

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [64]:
torch.__version__

'2.8.0'

In [70]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__ (self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids [idx]


In [74]:
# drop_last = drop the last batch if it can't have the appropriate size
# e.g. text has 9 tokens, batch_size is 2, 4 batches with 2 tokens
# and 1 lonely batch of 1 token, this last batch gets dropped
def create_dataloader_v1(
        txt, batch_size=4, max_length=256, 
        stride=128, shuffle=True, drop_last=True, num_workers=0
    ):
    tokenizer = tiktoken.get_encoding("gpt2")

    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [75]:
with open('data/the_verdict.txt', 'r') as f:
    raw_text = f.read()

In [81]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=4, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [82]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[1807, 3619,  402,  271]]), tensor([[ 3619,   402,   271, 10899]])]


note: stride is by how much we move text between batches, it avoids overfitting by allowing only new tokens as inputs, e.g.

"I like dogs but also like cats too",
with stride1:

["I", "like", "dogs", "but"], ["like", "dogs", "but", "also"]

with stride 4:

["I", "like", "dogs", "but"], ["also", "like", "cats", "too"]

In [83]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4, stride=4, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]


batch = input, target pair

now, token embeddings!!!

we go from raw text -> tokenized text -> token ids -> embedding vectors!!

In [84]:
input_ids = torch.tensor([2, 3, 5, 1])

In [85]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [86]:
embedding_layer.weight

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)

check bonus material for difference of embedding layers and linear layers

In [None]:
embedding_layer(torch.tensor([2])) # same row as the weight

tensor([[ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)

In [90]:
embedding_layer(input_ids)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)

word positional information

In [91]:
vocab_size = 50257
output_dim = 256 # very small still even for gpt2

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [92]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [93]:
print(f"Token ids: {inputs}")
print(f"input shape", inputs.shape)

Token ids: tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
input shape torch.Size([8, 4])


In [95]:
token_embeddings = token_embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

each token id is converted to a 256 dimension vector

In [96]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim) # this is actually a large layer

In [97]:
torch.arange(max_length)

tensor([0, 1, 2, 3])

In [98]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))

In [99]:
token_embeddings.shape

torch.Size([8, 4, 256])

In [100]:
pos_embeddings.shape

torch.Size([4, 256])

In [101]:
input_embeddings = token_embeddings + pos_embeddings
input_embeddings.shape

torch.Size([8, 4, 256])

## learnings

### the input pipeline:
raw text ->
tokenized text, text split into "words" or "wokens" ->
token ids, building a vocabulary assinging each unique token to a unique id ->
token embeddings, turn ids into N dimensional vectors ->
token embedding + positional embedding (information about the position of the token in the input in order to differentiate multiple tokens in the same batch) = input embedding

---

byte pair encoding (BPE) is how to properly encode tokens, allowing unknown words to be properly tokenized - again, check bnus material for an implementation from scratch

each input batch in training should(should?) have unique tokens in order to avoid overfitting, I thought that it was sequential
