 # Tokenize: from text to words/tokens

In [20]:
import re

def tokenize(text):
    # Split by punctuation and whitespace
    tokens = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    # Remove empty strings and strip whitespace
    tokens = [t.strip() for t in tokens if t.strip()]
    return tokens


In [21]:
with open("Peter_Rabbit.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

tokens = tokenize(raw_text)
print(tokens[:10])

['Once', 'upon', 'a', 'time', 'there', 'were', 'four', 'little', 'Rabbits', ',']


# Encode: from text to IDs

In [22]:
def build_vocab(whole_text):
    tokens = tokenize(whole_text)
    vocab = {token:id for id,token in enumerate(sorted(set(tokens)))}
    return vocab

In [23]:
vocab = build_vocab(raw_text)

print(len(vocab))
print(list(vocab.items())[:20])

405
[('!', 0), ("'", 1), (',', 2), ('--', 3), ('.', 4), (':', 5), (';', 6), ('A', 7), ('After', 8), ('Also', 9), ('An', 10), ('And', 11), ('Benjamin', 12), ('Bunny', 13), ('But', 14), ('Cotton-tail', 15), ('Cottontail', 16), ('END', 17), ('Father', 18), ('First', 19)]


In [24]:
def encode(vocab, text):
    return [vocab[token] for token in tokenize(text)]

In [25]:
print(encode(vocab, "Once upon a time there were four little Rabbits"))

[33, 373, 46, 354, 346, 386, 155, 210, 38]


# Decode: from IDs to text

In [26]:
def decode(vocab, ids):
    vocab_inverse = {id:token for token,id in vocab.items()}
    text= " ".join([vocab_inverse[id] for id in ids])
    return text

In [27]:
print(decode(vocab,[33, 373, 46, 354, 346, 386, 155, 210, 38]))

Once upon a time there were four little Rabbits


# Tokenizer: vocab, encode, decode

In [28]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.vocab = vocab
        self.vocab_inverse = {id:token for token,id in vocab.items()}

    def encode(self, text):
        return [self.vocab[token] for token in tokenize(text)]

    def decode(self, ids):
        return " ".join([self.vocab_inverse[id] for id in ids])

In [29]:
tokenizer = SimpleTokenizerV1(vocab)
print(tokenizer.decode(tokenizer.encode("Once upon a time there were four little Rabbits")))

Once upon a time there were four little Rabbits


# Special token: UNKnown/EndOfSentence

In [30]:
# print(tokenizer.decode(tokenizer.encode("Once upon a time there were four little Rabbits, and they were all very happy.")))

In [31]:
vocab['<unk>'] = len(vocab)

print(list(vocab.items())[-5:])

[('wriggled', 401), ('you', 402), ('young', 403), ('your', 404), ('<unk>', 405)]


In [32]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.vocab = vocab
        self.vocab_inverse = {id:token for token,id in vocab.items()}

    def encode(self, text):
        unk_id = self.vocab.get("<unk>")
        return [self.vocab.get(token,unk_id) for token in tokenize(text)]

    def decode(self, ids):
        return " ".join([self.vocab_inverse[id] for id in ids])

In [33]:
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.decode(tokenizer.encode("Once upon a time there were four little Rabbits, and they were all very happy.")))

Once upon a time there were four little Rabbits , and <unk> were all very <unk> .


# BytePair Encoding: break words into chunks/subwords
- tiktoken: https://tiktokenizer.vercel.app/


In [34]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("unbelievability"))

[403, 6667, 11203, 1799]


In [35]:
print(tokenizer.decode([403,12,6667,12,11203,12,1799]))

un-bel-iev-ability


In [36]:
print("vocab size of gpt2: ",tokenizer.n_vocab)

vocab size of gpt2:  50257


# Data Sampling with Sliding Window

In [37]:
with open("Peter_Rabbit.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print("Raw characters: ",len(raw_text))
print("Raw unique characters: ",len(set(raw_text)))
print("Raw words:", len(raw_text.split()))
print("Raw unique words:", len(set(raw_text.split())))
print("tokens: ",len(enc_text))
print("first 15 token IDs: ", enc_text[:15])
print("first 15 tokens: ","|".join(tokenizer.decode([token]) for token in enc_text[:15]))

Raw characters:  5583
Raw unique characters:  54
Raw words: 975
Raw unique words: 440
tokens:  1547
first 15 token IDs:  [7454, 2402, 257, 640, 612, 547, 1440, 1310, 22502, 896, 11, 290, 511, 3891, 198]
first 15 tokens:  Once| upon| a| time| there| were| four| little| Rabb|its|,| and| their| names|



In [38]:
context_size = 5
for i in range(1,context_size+1):
    context = enc_text[:i]
    desired = enc_text[i]
    print(context, "-->", desired)

[7454] --> 2402
[7454, 2402] --> 257
[7454, 2402, 257] --> 640
[7454, 2402, 257, 640] --> 612
[7454, 2402, 257, 640, 612] --> 547


In [39]:
context_size = 5
for i in range(1,context_size+1):
    context = enc_text[:i]
    desired = enc_text[i]
    print(tokenizer.decode(context), "-->", tokenizer.decode([desired]))

Once -->  upon
Once upon -->  a
Once upon a -->  time
Once upon a time -->  there
Once upon a time there -->  were


In [40]:
from torch.utils.data import Dataset
import torch

class GPTDatasetV1(Dataset):
    def __init__(self, txt,tokenizer, context_size, stride):
        token_ids = tokenizer.encode(txt)
        assert len(token_ids) > context_size, "Text is too short"

        self.input_ids = [torch.tensor(token_ids[i:i+context_size])
                          for i in range(0, len(token_ids)-context_size, stride)]
        self.target_ids = [torch.tensor(token_ids[i+1:i+context_size+1])
                          for i in range(0, len(token_ids)-context_size, stride)]
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [41]:
from torch.utils.data import DataLoader
import tiktoken

def dataloader_v1(txt,batch_size=3,context_size=5,stride=2,shuffle=False,drop_last=True,num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt,tokenizer,context_size,stride)
    return DataLoader(dataset, batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)


In [42]:
with open("Peter_Rabbit.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
dataloader = dataloader_v1(raw_text)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("shape of input: ",inputs.shape)
print("first batch, input: \n", inputs,"\n targets: \n", targets)

shape of input:  torch.Size([3, 5])
first batch, input: 
 tensor([[ 7454,  2402,   257,   640,   612],
        [  257,   640,   612,   547,  1440],
        [  612,   547,  1440,  1310, 22502]]) 
 targets: 
 tensor([[ 2402,   257,   640,   612,   547],
        [  640,   612,   547,  1440,  1310],
        [  547,  1440,  1310, 22502,   896]])


# Token Embedding: From Words to Vectors
Vectors are
- high-dimensional
- dense
- learnable

Embedding is
- looking up vectors from a big table
- usually a matrix with shape (vocab_size, embed_dim)
- initialized with random values
- updated during training

In [43]:
from torch import nn

vocab_size = 10
embed_dim = 4
torch.manual_seed(123)
token_embedding_layer = nn.Embedding(vocab_size, embed_dim)
print("token_embedding_layer shape: ", token_embedding_layer.weight.shape)
print("token_embedding_layer weight: ", token_embedding_layer.weight)

token_embedding_layer shape:  torch.Size([10, 4])
token_embedding_layer weight:  Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035, -0.5880],
        [ 0.3486,  0.6603, -0.2196, -0.3792],
        [ 0.7671, -1.1925,  0.6984, -1.4097],
        [ 0.1794,  1.8951,  0.4954,  0.2692],
        [-0.0770, -1.0205, -0.1690,  0.9178],
        [ 1.5810,  1.3010,  1.2753, -0.2010],
        [ 0.9624,  0.2492, -0.4845, -2.0929],
        [-0.8199, -0.4210, -0.9620,  1.2825],
        [-0.3430, -0.6821, -0.9887, -1.7018],
        [-0.7498, -1.1285,  0.4135,  0.2892]], requires_grad=True)


In [44]:
input_ids = torch.tensor([2,3,5])
token_embeddings = token_embedding_layer(input_ids)
print("token_embeddings: \n", token_embeddings) # return row 2,3,5 of token_embedding_layer.weight

token_embeddings: 
 tensor([[ 0.7671, -1.1925,  0.6984, -1.4097],
        [ 0.1794,  1.8951,  0.4954,  0.2692],
        [ 1.5810,  1.3010,  1.2753, -0.2010]], grad_fn=<EmbeddingBackward0>)


- GPT-2 uses an embedding table of size (50257 tokens × 768 dimensions).

In [45]:
from torch import nn

vocab_size = 50527
embed_dim = 768
torch.manual_seed(123)
token_embedding_layer_gpt2 = nn.Embedding(vocab_size, embed_dim)
print("token_embedding_layer_gpt2 shape: ", token_embedding_layer_gpt2.weight.shape)
print("token_embedding_layer_gpt2 weight: ", token_embedding_layer_gpt2.weight)

token_embedding_layer_gpt2 shape:  torch.Size([50527, 768])
token_embedding_layer_gpt2 weight:  Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035,  ..., -0.3181, -1.3936,  0.5226],
        [ 0.2579,  0.3420, -0.8168,  ..., -0.4098,  0.4978, -0.3721],
        [ 0.7957,  0.5350,  0.9427,  ..., -1.0749,  0.0955, -1.4138],
        ...,
        [-1.8239,  0.0192,  0.9472,  ..., -0.2287,  1.0394,  0.1882],
        [-0.8952, -1.3001,  1.4985,  ..., -0.5879, -0.0340, -0.0092],
        [-1.3114, -2.2304, -0.4247,  ...,  0.8176,  1.3480, -0.5107]],
       requires_grad=True)


In [46]:
input_ids = torch.tensor([2,3,5])
print(token_embedding_layer_gpt2(input_ids))

tensor([[ 0.7957,  0.5350,  0.9427,  ..., -1.0749,  0.0955, -1.4138],
        [-0.0312,  1.6913, -2.2380,  ...,  0.2379, -1.1839, -0.3179],
        [-0.4334, -0.5095, -0.7118,  ...,  0.8329,  0.2992,  0.2496]],
       grad_fn=<EmbeddingBackward0>)


# Position Embedding: From Positon to Vectors
position embeddin is
- a matrix with shape (context_size, embed_dim)
- initialized with random values
- a learnable parameter, updated during training

In [47]:
from torch import nn

context_size = 5
embed_dim = 4
torch.manual_seed(123)
position_embedding_layer = nn.Embedding(context_size, embed_dim)
print("position_embedding_layer shape: ", position_embedding_layer.weight.shape)
print("position_embedding_layer weight: ", position_embedding_layer.weight)

position_embedding_layer shape:  torch.Size([5, 4])
position_embedding_layer weight:  Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035, -0.5880],
        [ 1.5810,  1.3010,  1.2753, -0.2010],
        [-0.1606, -0.4015,  0.6957, -1.8061],
        [-1.1589,  0.3255, -0.6315, -2.8400],
        [-0.7849, -1.4096, -0.4076,  0.7953]], requires_grad=True)


Position embedding has nothing to do with token values or IDs — it's purely based on their positions in the sequence.

In [48]:
input_ids = torch.tensor([2,3,5])
position_embeddings = position_embedding_layer(torch.arange(len(input_ids))) # use Position of input_ids, NOT values of it
print("position_embeddings: \n", position_embeddings) # return row 0,1,2 of position_embedding_layer.weight

position_embeddings: 
 tensor([[ 0.3374, -0.1778, -0.3035, -0.5880],
        [ 1.5810,  1.3010,  1.2753, -0.2010],
        [-0.1606, -0.4015,  0.6957, -1.8061]], grad_fn=<EmbeddingBackward0>)


input_embeddings = token_embeddings + pos_embeddings

In [49]:
input_embeddings = token_embeddings + position_embeddings
print("shape of input_embeddings : ",input_embeddings.shape)
print("input_embeddings: ", input_embeddings)

shape of input_embeddings :  torch.Size([3, 4])
input_embeddings:  tensor([[ 1.1045, -1.3703,  0.3948, -1.9977],
        [ 1.7603,  3.1962,  1.7707,  0.0682],
        [ 1.4204,  0.8996,  1.9710, -2.0070]], grad_fn=<AddBackward0>)


GPT-2 uses a position embedding table of size (1024 positions × 768 dimensions).



In [50]:
from torch import nn

context_size = 1024
embed_dim = 768
torch.manual_seed(123)
position_embedding_layer_gpt2 = nn.Embedding(context_size, embed_dim)
print("position_embedding_layer_gpt2 shape: ", position_embedding_layer_gpt2.weight.shape)
print("position_embedding_layer_gpt2 weight: ", position_embedding_layer_gpt2.weight)

position_embedding_layer_gpt2 shape:  torch.Size([1024, 768])
position_embedding_layer_gpt2 weight:  Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035,  ..., -0.3181, -1.3936,  0.5226],
        [ 0.2579,  0.3420, -0.8168,  ..., -0.4098,  0.4978, -0.3721],
        [ 0.7957,  0.5350,  0.9427,  ..., -1.0749,  0.0955, -1.4138],
        ...,
        [-1.2094,  0.6397,  0.6342,  ..., -0.4582,  1.4911,  1.2406],
        [-0.2253, -0.1078,  0.0479,  ...,  0.2521, -0.2893, -0.5639],
        [-0.5375, -1.1562,  2.2554,  ...,  1.4322,  1.2488,  0.1897]],
       requires_grad=True)


In [51]:
with open("Peter_Rabbit.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
dataloader = dataloader_v1(raw_text,batch_size=3, context_size=1024,stride=2)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("shape of input: ",inputs.shape)
print("first batch, input: \n", inputs,"\n targets: \n", targets)

shape of input:  torch.Size([3, 1024])
first batch, input: 
 tensor([[ 7454,  2402,   257,  ...,   480,   517,   290],
        [  257,   640,   612,  ...,   290,   517, 36907],
        [  612,   547,  1440,  ..., 36907,    13,  1763]]) 
 targets: 
 tensor([[ 2402,   257,   640,  ...,   517,   290,   517],
        [  640,   612,   547,  ...,   517, 36907,    13],
        [  547,  1440,  1310,  ...,    13,  1763,  1473]])


In [52]:
token_embeddings = token_embedding_layer_gpt2(inputs)
print("shape of token_embeddings: ",token_embeddings.shape)
# print("token_embeddings: ", token_embeddings)

position_embeddings = position_embedding_layer_gpt2(torch.arange(context_size))
print("shape of position_embeddings: ",position_embeddings.shape)
# print("position_embeddings: ", position_embeddings)

# token_embeddings shape: [batch_size, seq_len, embedding_dim]
# position_embeddings shape: [seq_len, embedding_dim]
# PyTorch automatically broadcasts position_embeddings across batch dimension
input_embeddings = token_embeddings + position_embeddings
print("shape of input_embeddings : ",input_embeddings.shape)
# print("input_embeddings: ", input_embeddings)

shape of token_embeddings:  torch.Size([3, 1024, 768])
shape of position_embeddings:  torch.Size([1024, 768])
shape of input_embeddings :  torch.Size([3, 1024, 768])
