In [1]:
import torch
import tiktoken

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [1]:
import os
import urllib.request

# Location of the-verdict.txt inside the rasbt/LLMs-from-scratch GitHub repository.
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
file_path = "../data/the-verdict.txt"

# urlretrieve does not create missing parent directories, so make sure the
# target folder exists first (no-op when it is already there).
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Last expression of the cell: displays the (local path, headers) tuple.
urllib.request.urlretrieve(url, file_path)

('../data/the-verdict.txt', <http.client.HTTPMessage at 0x21eb62954d0>)

In [2]:
# Read the whole file at `file_path` into a single string
with open(file_path, mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

# Sanity check: total size, followed by the first 1000 characters
print("Total number of characters in the text:", len(raw_text))
preview = raw_text[:1000]
print(preview)

Total number of characters in the text: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not look

In [6]:
# Naive regex tokenization: split on a comma, a period, or any whitespace character
import re

text = "Hello, world! this is a test."

# The delimiters are not captured, so they are dropped from the result, and
# two delimiters in a row (e.g. ", ") leave an empty string behind.
delimiter = re.compile(r"[,.]|\s")
result = delimiter.split(text)
print(result)

['Hello', '', 'world!', 'this', 'is', 'a', 'test', '']


In [2]:
import tiktoken

# Load tiktoken's "gpt2" encoding
tokenizer = tiktoken.get_encoding("gpt2")

# The two adjacent literals are concatenated into one sample string that
# contains the special token in the middle.
text = (
    "jhjhdhd Hello, world! this is a test.\n <|endoftext|>"
    "This is a new line. Let's see how this tokenizer works."
)

# The special token is explicitly allowed for this encode call
special_tokens = {"<|endoftext|>"}
integers = tokenizer.encode(text, allowed_special=special_tokens)
print(integers)

# Round trip: decode the ids back into text
decoded_text = tokenizer.decode(integers)
print(decoded_text)

[73, 71, 73, 31298, 31298, 18435, 11, 995, 0, 428, 318, 257, 1332, 13, 198, 220, 50256, 1212, 318, 257, 649, 1627, 13, 3914, 338, 766, 703, 428, 11241, 7509, 2499, 13]
jhjhdhd Hello, world! this is a test.
 <|endoftext|>This is a new line. Let's see how this tokenizer works.


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of ``max_length`` tokens is then
    moved over the token sequence in steps of ``stride``; each target is its
    input window shifted one token to the right.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object exposing a tiktoken-style
            ``encode(text, allowed_special=...)`` method that returns a
            sequence of integer token ids.
        max_length: Number of tokens in every input/target window.
        stride: Offset between the starts of consecutive windows.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Allow the "<|endoftext|>" marker so that texts containing it are
        # encoded rather than rejected by the tokenizer.
        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Stop max_length tokens before the end so that the target window,
        # which starts one token later, is always full length.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i + max_length]
            target_chunk = token_ids[i + 1 : i + max_length + 1]

            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding token windows of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; the remaining arguments are forwarded to ``DataLoader``.

    Args:
        txt: Raw text to tokenize.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window.
        stride: Offset between the starts of consecutive windows.
        shuffle: Passed to ``DataLoader`` as ``shuffle``.
        drop_last: Passed to ``DataLoader`` as ``drop_last``.
        num_workers: Passed to ``DataLoader`` as ``num_workers``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

# Load the full sample text from disk
with open("../data/the-verdict.txt", mode="r", encoding="utf-8") as text_file:
    txt = text_file.read()

# One 4-token window per batch, advanced a single token at a time, unshuffled
dataloader = create_dataloader_v1(
    txt,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

# Fetch and show the first batch produced by the loader
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [7]:
# Trying out a small token embedding model

# Use the XPU backend when this PyTorch build exposes one and a device is
# available; otherwise fall back to the CPU so the cell runs on any machine.
device = torch.device(
    "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available() else "cpu"
)

input_ids = torch.tensor([[1, 2, 3, 4]]).to(device)

vocab_size = 6
output_dim = 4

# Seed right before creating the layer so its random initial weights are reproducible
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim).to(device)
print(embedding_layer.weight)

# The lookup returns, for every id, the weight-matrix row with that index
embeddings = embedding_layer(input_ids)
print(embeddings)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035, -0.5880],
        [ 0.3486,  0.6603, -0.2196, -0.3792],
        [-0.1606, -0.4015,  0.6957, -1.8061],
        [ 1.8960, -0.1750,  1.3689, -1.6033],
        [-0.7849, -1.4096, -0.4076,  0.7953],
        [ 0.9985,  0.2212,  1.8319, -0.3378]], device='xpu:0',
       requires_grad=True)
tensor([[[ 0.3486,  0.6603, -0.2196, -0.3792],
         [-0.1606, -0.4015,  0.6957, -1.8061],
         [ 1.8960, -0.1750,  1.3689, -1.6033],
         [-0.7849, -1.4096, -0.4076,  0.7953]]], device='xpu:0',
       grad_fn=<EmbeddingBackward0>)


In [13]:
# Creating the full embedding model
vocab_size = 50257 # size of the gpt-2 vocabulary
output_dim = 256
token_embeding_layer = torch.nn.Embedding(vocab_size, output_dim)

max_length = 4
dataloader = create_dataloader_v1(txt, batch_size=8, max_length=max_length, stride=max_length, shuffle=False, num_workers=0)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

token_embeddings = token_embeding_layer(inputs)
print("\nEmbeddings shape:\n", token_embeddings.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])

Embeddings shape:
 torch.Size([8, 4, 256])


In [15]:
# One embedding vector for each position in the context window
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length, embedding_dim=output_dim
)
position_ids = torch.arange(context_length)
pos_embeddings = pos_embedding_layer(position_ids)

print(pos_embeddings)
print(pos_embeddings.shape)

# The (context_length, output_dim) position table is added to the token
# embeddings of every sequence in the batch via broadcasting.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings)

tensor([[-0.2527, -0.0394,  0.2733,  ..., -1.1980,  0.5375,  2.2997],
        [-0.7995, -0.0521,  0.4565,  ..., -0.9853,  0.9330,  0.7574],
        [ 0.4896, -0.3548, -0.2768,  ..., -0.5923,  1.0320,  0.1859],
        [-0.7416, -1.4633,  1.1775,  ...,  1.8541, -0.6434,  0.7267]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([4, 256])
tensor([[[-1.3734e+00, -6.8528e-03, -7.6597e-01,  ..., -1.0742e+00,
           2.2177e+00,  1.6062e+00],
         [-2.0430e+00,  6.5746e-02,  1.3244e+00,  ..., -6.5890e-01,
           1.9324e+00,  2.3588e+00],
         [ 3.6466e-01,  7.3413e-02,  8.4166e-01,  ...,  9.4218e-01,
          -6.7741e-01,  2.4261e+00],
         [-3.4853e-01, -7.4823e-01,  1.8472e-01,  ...,  1.1741e+00,
          -1.7265e+00, -1.0318e+00]],

        [[ 4.9798e-01,  6.3419e-01,  1.3085e+00,  ...,  7.0523e-02,
           4.7192e-01,  1.0729e+00],
         [-7.5285e-01, -2.3526e-01, -8.4214e-01,  ...,  3.1610e-01,
          -3.5245e-01,  2.0653e+00],
         [-1.2019e+00, -1.106