In [1]:
with open("the-verdict.txt", "r", encoding = "utf-8") as f:
    raw_text = f.read()
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
import re
text = "Hello, World. This, is a test."
result = re.split(r'(\s)', text)
print(result)

['Hello,', ' ', 'World.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [3]:
result = re.split(r'([,.]|\s)', text)
print(result)

['Hello', ',', '', ' ', 'World', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [4]:
result = [item for item in result if item.strip()]

In [5]:
result

['Hello', ',', 'World', '.', 'This', ',', 'is', 'a', 'test', '.']

In [6]:
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4649


In [7]:
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


## Converting tokens into token IDs
This conversion is an intermediate step before converting the token IDs into embedding vectors.

In [8]:
all_words = sorted(list(set(preprocessed)))

In [9]:
vocab_size = len(all_words)

In [10]:
print(vocab_size)

1159


In [11]:
vocab = {token: integer for integer, token in enumerate(all_words)}

Implementing a simple text tokenizer

In [15]:
class SimpleTextTokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Text is split on whitespace, ``--`` and the punctuation characters
    ``, . ? _ ! " ( ) '``; pieces that are empty after stripping are dropped.
    Every remaining piece must already be a key of the vocabulary.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and derive the reverse id -> token map."""
        # The forward map is the caller's dict itself, not a copy.
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Return the list of vocabulary ids for the tokens in ``text``.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        ids = []
        # The capturing group makes re.split keep the delimiters as pieces.
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            token = piece.strip()
            if token:
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Turn ``ids`` back into text.

        Tokens are joined with single spaces, then any whitespace directly
        before one of ``, . ? ! " ( ) '`` is removed.
        """
        words = [self.int_to_str[i] for i in ids]
        spaced = ' '.join(words)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [16]:
tokenizer = SimpleTextTokenizerV1(vocab)


text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 58, 2, 872, 1013, 615, 541, 763, 5, 1155, 608, 5, 1, 69, 7, 39, 873, 1136, 773, 812, 7]


In [17]:
ids = tokenizer.encode(text)
ids

[1,
 58,
 2,
 872,
 1013,
 615,
 541,
 763,
 5,
 1155,
 608,
 5,
 1,
 69,
 7,
 39,
 873,
 1136,
 773,
 812,
 7]

In [18]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [None]:
# Demonstrate the limitation of the vocabulary-only tokenizer: "Hello" is not
# a key of the vocabulary, so the lookup inside encode() raises KeyError.
# The error is caught and printed so the notebook still runs top to bottom.
text = "Hello, do you like tea?"
try:
    tokenizer.encode(text)
except KeyError as err:
    print("KeyError:", err)

KeyError: 'Hello'

In [20]:
## Extend the vocabulary with two special tokens: end-of-text and unknown
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]  # appended after sorting, so they take the last two ids
vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(len(vocab))

1161


In [21]:
# Show the last five (token, id) pairs of the vocabulary.
for item in list(vocab.items())[-5:]:
    print(item)

('younger', 1156)
('your', 1157)
('yourself', 1158)
('<|endoftext|>', 1159)
('<|unk|>', 1160)


In [26]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Splitting follows the same rule as the V1 tokenizer: whitespace, ``--``
    and the characters ``, . ? _ ! " ( ) '`` delimit tokens, and pieces that
    are empty after stripping are dropped.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and derive the reverse id -> token map."""
        # The forward map is the caller's dict itself, not a copy.
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Return the vocabulary ids for ``text``; unknown tokens become ``<|unk|>``.

        Raises:
            KeyError: if an unknown token is met and the vocabulary has no
                ``<|unk|>`` entry.
        """
        ids = []
        # The capturing group makes re.split keep the delimiters as pieces.
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                continue
            if token not in self.str_to_int:
                token = '<|unk|>'
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Turn ``ids`` back into text.

        Tokens are joined with single spaces, then any whitespace directly
        before one of ``, . ? ! " ( ) '`` is removed.
        """
        words = [self.int_to_str[i] for i in ids]
        spaced = ' '.join(words)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [24]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [27]:
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(text))

[1160, 5, 362, 1155, 642, 1000, 10, 1159, 57, 1013, 981, 1009, 738, 1013, 1160, 7]


In [28]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

# Byte pair encoding 
Since implementing BPE can be relatively complicated, we will use an
existing Python open-source library called tiktoken
(https://github.com/openai/tiktoken), which implements the BPE algorithm
very efficiently based on source code in Rust. Similar to other Python
libraries, we can install the tiktoken library via Python's pip installer from the
terminal (the version is pinned to replicate the book's code):
`pip install tiktoken==0.5.1`


In [30]:
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.5.1


In [31]:
tokenizer = tiktoken.get_encoding("gpt2")

In [33]:
# Sample text with a known phrase, the "<|endoftext|>" marker, and a made-up
# word, encoded with the GPT-2 BPE tokenizer.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    # NOTE(review): adjacent string literals are concatenated with no separator,
    # so the text reads "terracesof" — confirm whether a space was intended.
     "of someunknownPlace."
)

# allowed_special names the special-token strings permitted in the input;
# presumably required for "<|endoftext|>" to be accepted — verify against tiktoken docs.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [34]:
tokenizer.decode(integers)

'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'

In [35]:
## Applying BPE tokenizer for entire text

with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [40]:
enc_sample = enc_text[50:]

In [41]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:       {y}")

x: [290, 4920, 2241, 287]
y:       [4920, 2241, 287, 257]


In [42]:
## Next-word prediction tasks: each growing prefix of the sample is an input,
## and the token that immediately follows that prefix is its target.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(context, "---->", desired)


[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [44]:
## Decode each prefix and its target token back to text for readability.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))


 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [46]:
## We need an efficient data loader that returns two tensors: an input tensor
## containing the text that the LLM sees, and a target tensor containing the
## tokens that the LLM should predict.
import torch 
from torch.utils.data import DataLoader, Dataset

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. Each sample is a window of ``max_length``
    token ids, paired with the same window shifted one position to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object whose ``encode(txt)`` returns a sequence of token ids.
        max_length: Number of tokens per window; must be positive.
        stride: Offset between the starts of consecutive windows; must be
            positive.

    Raises:
        ValueError: if ``max_length`` or ``stride`` is not positive.

    Note:
        If the text yields ``max_length`` tokens or fewer, no full window with
        a target exists and the dataset is empty.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Non-positive values would otherwise silently produce empty or
        # meaningless windows instead of failing.
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)
        # Stop at len - max_length so every target window (shifted by one) is
        # still fully inside token_ids.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [47]:
def create_dataloader(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader of sliding-window (input, target) batches over ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and windowed by
    ``GPTDatasetV1`` using ``max_length`` and ``stride``. The remaining
    arguments are passed straight through to ``DataLoader``.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [48]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text= f.read()

In [52]:
dataloader = create_dataloader(raw_text, batch_size=1, max_length = 4, stride = 1, shuffle = False)

In [None]:
## Note that an input size of 4 is relatively small and only chosen for
## illustration purposes. It is common to train LLMs with input sizes of at
## least 256.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [None]:
## The stride setting dictates the number of
## positions the inputs shift across batches, emulating a sliding window
## approach
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [56]:
dataloader_v2 = create_dataloader(raw_text, batch_size=1, max_length = 2, stride = 2, shuffle = False)
data_iter_v2 = iter(dataloader_v2)
first_batch_v2 = next(data_iter_v2)
print(first_batch_v2)
second_batch_v2 = next(data_iter_v2)
print(second_batch_v2)

[tensor([[ 40, 367]]), tensor([[ 367, 2885]])]
[tensor([[2885, 1464]]), tensor([[1464, 1807]])]


In [57]:
dataloader_v2 = create_dataloader(raw_text, batch_size=1, max_length = 8, stride = 2, shuffle = False)
data_iter_v2 = iter(dataloader_v2)
first_batch_v2 = next(data_iter_v2)
print(first_batch_v2)
second_batch_v2 = next(data_iter_v2)
print(second_batch_v2)

[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]
[tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]), tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]


In [None]:
# Note that we increase the stride to 4. This is to utilize the data set fully (we
# don't skip a single word) but also avoid any overlap between the batches,
# since more overlap could lead to increased overfitting.
dataloader = create_dataloader(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


As a preliminary step, we initialize the embedding weights with random values.
This initialization serves as the starting point for the LLM's
learning process.

In [59]:
## token ids conversion to embedding vector
input_ids = torch.tensor([2,3,5,1])

In [60]:
## example
vocab_size = 6
output_dim = 3

In [61]:
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [62]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [63]:
## embedding layer is essentially a lookup operation that retrieves rows from the embedding layer's weight matrix
## via a token id
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


An important point to note here:
The embedding layer converts a token ID into the same vector representation
regardless of where it is located in the input sequence. For example, the token ID 5, whether it's
in the first or third position in the token ID input vector, will result in the same embedding
vector.

In principle, the deterministic, position-independent embedding of the token
ID is good for reproducibility purposes. However, since the self-attention
mechanism of LLMs itself is also position-agnostic, it is helpful to inject
additional position information into the LLM.
To achieve this, there are two broad categories of position-aware
embeddings: relative positional embeddings and absolute positional
embeddings.

Previously, we focused on very small embedding sizes in this chapter for
illustration purposes. We now consider more realistic and useful embedding
sizes and encode the input tokens into a 256-dimensional vector
representation. This is smaller than what the original GPT-3 model used (in
GPT-3, the embedding size is 12,288 dimensions) but still reasonable for
experimentation. Furthermore, we assume that the token IDs were created by
the BPE tokenizer that we used earlier (tiktoken's GPT-2 encoding), which has
a vocabulary size of 50,257:

In [64]:
## encoding word positions
output_dim = 256
vocab_size = 50257

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [65]:
token_embedding_layer

Embedding(50257, 256)

Using the `token_embedding_layer` above, if we sample data from the data
loader, we embed each token in each batch into a 256-dimensional vector. If
we have a batch size of 8 with four tokens each, the result will be an 8 x 4 x
256 tensor.

In [66]:
max_length = 4
dataloader = create_dataloader(
    raw_text, batch_size = 8, max_length = max_length, stride = max_length,
    shuffle = False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [67]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [69]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [70]:
# For a GPT model's absolute embedding approach, we just need to create
# another embedding layer that has the same dimension as the
# token_embedding_layer:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [71]:
torch.arange(max_length)

tensor([0, 1, 2, 3])

In [72]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [73]:
# As we can see, the positional embedding tensor consists of four 256-
# dimensional vectors. We can now add these directly to the token embeddings:
# PyTorch broadcasts the 4x256 pos_embeddings tensor across the leading
# dimension of the 8x4x256 token_embeddings tensor, so the same positional
# vectors are added to each of the 8 sequences in the batch.

input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
