In [16]:
import urllib.request

url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")

file_path = "data/the-verdict.txt"

urllib.request.urlretrieve(url, file_path)

('data/the-verdict.txt', <http.client.HTTPMessage at 0x111128fe0>)

In [66]:
import re

book_text = open(file_path).read()

# with open("the-verdict.txt", "r", encoding="utf-8") as f:
#     raw_book_text = f.read()
    
print("Total number of character:", len(book_text))


result = re.split(r'([,.:;?_!"()\']|--|\s)', book_text)
result = [item.strip() for item in result if item.strip()]

print("Total number of tokens:", len(result))

Total number of character: 20479
Total number of tokens: 4690


In [26]:
all_words = sorted(set(result))

print(len(all_words))

vocab = {token:integer for integer, token in enumerate(all_words)}

1130


In [None]:
class SimpleTokenizer:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {v:k for k,v in self.str_to_int.items()}
        
    def encode(self, text):
        result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        return [self.str_to_int[token] for token in result if token.strip()]

    def decode(self, ids):
        text = ' '.join([self.int_to_str[id] for id in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text.strip()

tokenizer = SimpleTokenizer(vocab)

ids = tokenizer.encode("\"It's the last he painted, you know,\"\nMrs. Gisburn said with pardonable pride.")

print(ids)

print(tokenizer.decode(ids))


[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [None]:
UNK_TOKEN = '<|unk|>'
END_OF_TEXT_TOKEN = '<|endoftext|>'

class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.str_to_int[UNK_TOKEN] = len(vocab)
        self.str_to_int[END_OF_TEXT_TOKEN] = len(vocab)
        self.int_to_str = {v:k for k,v in self.str_to_int.items()}
        
    def encode(self, text):
        result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        result = [token.strip() for token in result if token.strip()]
        result = [token if token in self.str_to_int else UNK_TOKEN for token in result]
        return [self.str_to_int[token] for token in result]

    def decode(self, ids):
        text = ' '.join([self.int_to_str[id] for id in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text.strip()

tokenizer = SimpleTokenizer(vocab)

ids = tokenizer.encode("\"It's the last he painted, you know,\"\nMrs. Gisburn said with pardonable pride.")

print(ids)

print(tokenizer.decode(ids))

In [61]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

text = ("Hello, do you like tea? <|endoftext|> In the sunlit terraces "
        "of someunknownPlace.")

ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(ids)

print(tokenizer.decode(ids))


tokenizer.encode("made")

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


[9727]

In [112]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        ids = tokenizer.encode(text)

        for i in range(0, len(ids) - max_length, stride):
            input_chunk = ids[i:i+max_length]
            target_chunk = ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

dataset = GPTDatasetV1(book_text, tokenizer, max_length=4, stride=4)


loader = DataLoader(dataset, batch_size = 8, shuffle=False)

next(iter(loader))


[tensor([[   40,   367,  2885,  1464],
         [ 1807,  3619,   402,   271],
         [10899,  2138,   257,  7026],
         [15632,   438,  2016,   257],
         [  922,  5891,  1576,   438],
         [  568,   340,   373,   645],
         [ 1049,  5975,   284,   502],
         [  284,  3285,   326,    11]]),
 tensor([[  367,  2885,  1464,  1807],
         [ 3619,   402,   271, 10899],
         [ 2138,   257,  7026, 15632],
         [  438,  2016,   257,   922],
         [ 5891,  1576,   438,   568],
         [  340,   373,   645,  1049],
         [ 5975,   284,   502,   284],
         [ 3285,   326,    11,   287]])]

In [153]:
torch.manual_seed(123)

vocab_size = 50357
out_dim = 3
context_length = 4

embedding_layer = nn.Embedding(vocab_size, out_dim)

position_embedding_layer = nn.Embedding(context_length, out_dim)

position_embeddings = position_embedding_layer(torch.arange(context_length))

print(next(iter(loader))[0].shape)

embedding_layer(next(iter(loader))[0]).shape

position_embeddings.shape

inputs, targets = next(iter(loader))

token_embeddings = embedding_layer(inputs)

input_embeddings = token_embeddings + position_embeddings

input_embeddings.shape


torch.Size([8, 4])


torch.Size([8, 4, 3])

In [161]:
i = input_embeddings[0]

atten_scores = i @ i.T

atten_weights = torch.softmax(atten_scores, dim = -1)

all_context_vec = atten_weights @ i 

all_context_vec


tensor([[ 1.1975, -0.1398,  0.0658],
        [-1.6562,  2.8028, -1.6308],
        [ 1.0092,  0.1583,  0.3552],
        [ 0.7691,  1.2651, -2.3320]], grad_fn=<MmBackward0>)

In [None]:
class SelfAttentionV1(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_keys = nn.Parameter(torch.rand(d_in, d_out))
        self.W_values = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        query = x @ self.W_query
        keys = x @ self.W_keys
        values = x @ self.W_values

        attention_scores = query @ keys.T
        attention_weights = torch.softmax(attention_scores / keys.shape[-1]**0.5, dim = -1)

        context_vec = attention_weights @ values 
        return context_vec

In [None]:
class CausalAttention(nn.Module):
    def __init__(self, dim_in, dim_out, context_length, dropout, qkv_bias = False) -> None:
        super().__init__()
        self.W_query = nn.Linear(dim_in, dim_out, bias = qkv_bias)
        self.W_keys = nn.Linear(dim_in, dim_out, bias = qkv_bias)
        self.W_values = nn.Linear(dim_in, dim_out, bias = qkv_bias)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        query = self.W_query(x)
        keys = self.W_keys(x)
        values = self.W_values(x)

        attention_scores = query @ keys.transpose(1, 2)
        attention_scores.masked_fill_(
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf
        )

        attention_weights = torch.softmax(attention_scores * keys.shape[-1] ** 0.5, dim = -1)
        attention_weights = self.dropout(self.attention_weights)

        context_values = attention_weights @ values

        return context_values

In [None]:
class MultiHeadAttentionV1(nn.Module):
    def __init__(self, dim_in, dim_out, context_length, dropout, qkv_bias = False, num_heads = 2) -> None:
        super().__init__()
        self.heads = nn.ModuleList(
            [
                CausalAttention(dim_in, dim_out, context_length, dropout, qkv_bias)
                for i in range(num_heads)
            ]
        )

    def forward(self, x):
        return torch.cat([head(x) for head in self.heads], dim = -1)

In [168]:
class MutliHeadAttentionV2(nn.Module):
    def __init__(self, dim_in, dim_out, context_length, dropout, qkv_bias = False, num_heads = 2) -> None:
        super().__init__()
        self.d_out = dim_out 
        self.num_heads = num_heads

        assert dim_out % num_heads == 0, "d_out must be divisible by num_heads"
        self.head_dim = dim_out // num_heads

        self.W_query = nn.Linear(dim_in, dim_out, bias = qkv_bias)
        self.W_keys = nn.Linear(dim_in, dim_out, bias = qkv_bias)
        self.W_values = nn.Linear(dim_in, dim_out, bias = qkv_bias)

        self.dropout = nn.Dropout(dropout)

        # TODO: Why is the bias left out here?
        self.output = nn.Linear(dim_out, dim_out)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, dim_in = x.shape
        query = self.W_query(x)
        keys = self.W_keys(x)
        values = self.W_values(x)

        query = query.view(b, num_tokens, self.num_heads, self.head_dim)
        keys = query.view(b, num_tokens, self.num_heads, self.head_dim)
        values = query.view(b, num_tokens, self.num_heads, self.head_dim)

        query = query.transponse(1, 2)
        keys = keys.transponse(1, 2)
        values = values.transponse(1, 2)

        attention_scores = query @ keys.transpose(2, 3)
        attention_scores = query.masked_fill_(
            self.mask.bool()[:num_tokens, :num_tokens],
            -torch.inf
        )

        attention_weights = torch.softmax(
            attention_scores / self.head_dim ** 0.5,
            dim = - 1
        )

        context_values = (attention_weights @ values).transpose(1, 2)

        context_values = context_values.contiguous().view(b, num_tokens, self.d_out)

        context_values = self.output(context_values)

        return context_values

        

In [None]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

class DummyTransformer(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()

    def forward(self, x):
        return x

class LayerNorm(nn.Module):
    def __init__(self, dim) -> None:
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(dim))
        self.shift = nn.Parameter(torch.zeroes(dim))

    def forward(self, x: torch.Tensor):
        mean = x.mean(dim = -1, keepdim=True)
        var = x.var(dim = -1, unbiased=False, keepdim=True)
        x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * x + self.shift

class DummyGPTModel(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[DummyTransformer(cfg) for i in range(cfg["n_layers"])]
        )
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias = False)

    def forward(self, x):
        batch, seq_len = x.shape

        token_embeddings = self.tok_emb(x)
        pos_embeddings = self.pos_emb(torch.arange(seq_len, device = x.device))
        input_embeddings = token_embeddings + pos_embeddings

        input_embeddings = self.drop_emb(input_embeddings)

        out = self.trf_blocks(input_embeddings)
        out = self.final_norm(out)
        logits = self.out_head(out)

        return logits

In [176]:
x = torch.tensor([
    tokenizer.encode("Every effort moves you"),
    tokenizer.encode("Every day holds a")
])

model = DummyGPTModel(GPT_CONFIG_124M)


out = model(x)

out.shape

torch.Size([2, 4, 50257])

In [230]:
torch.manual_seed(123)
x = torch.rand(2,3)
x = x.view(1, 1, 2, 3)
m = x.mean(dim = -1, keepdim = True)

print(x)
print(m)

x - m

tensor([[[[0.2961, 0.5166, 0.2517],
          [0.6886, 0.0740, 0.8665]]]])
tensor([[[[0.3548],
          [0.5430]]]])


tensor([[[[-0.0587,  0.1618, -0.1031],
          [ 0.1455, -0.4690,  0.3235]]]])