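# Embedding

Tokenize `the-verdict.txt` with the GPT-2 BPE tokenizer, batch the token ids into fixed-length context windows, and build the model input by adding a token embedding and a positional embedding for each position.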

In [1]:
import tiktoken
# Read the whole text into memory, then tokenize it with the GPT-2 BPE encoding.
with open("the-verdict.txt", "r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

tokenizer = tiktoken.get_encoding("gpt2")
tokens = tokenizer.encode(raw_text)

# Compare the number of characters with the number of tokens.
print(f"raw text size = {len(raw_text)} token size = {len(tokens)}")

raw text size = 20479 token size = 5145


In [2]:
from gpt_dataset_v1 import GPTDatasetV1, create_dataloader_v1

In [3]:
# Wrap the raw text in a dataset of token-id windows.
# NOTE(review): the positional 4 and 1 are presumably the context length and the
# stride — confirm against GPTDatasetV1's signature.
gpt2_encoding = tiktoken.get_encoding('gpt2')
ds = GPTDatasetV1(gpt2_encoding, raw_text, 4, 1)

# Each item prints as a pair of tensors; the second is the first shifted by one token.
print(f"Dataset size = {len(ds)}")
print(f"Dataset[50] = {ds[50]}")

Dataset size = 5141
Dataset[50] = (tensor([ 290, 4920, 2241,  287]), tensor([4920, 2241,  287,  257]))


In [4]:
CONTEXT_SIZE = 4  # token ids per window (columns of each batch tensor)
BATCH_SIZE = 8    # windows per batch (rows of each batch tensor)

# `raw_text` and `tokenizer` were already created in the first cell; reuse them
# instead of re-reading the file and rebuilding the encoding here.
# NOTE(review): stride is set to BATCH_SIZE, which is unrelated to the window
# length. Assuming stride is the token offset between consecutive windows, a
# stride of 8 with 4-token windows leaves gaps between windows. If gap-free,
# non-overlapping windows are intended this should probably be
# stride=CONTEXT_SIZE — confirm before changing, since it alters the batch.
dataloader = create_dataloader_v1(
    raw_text,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
    context_size=CONTEXT_SIZE,
    stride=BATCH_SIZE,
    shuffle=False,
)

# Pull only the first batch; it is a two-element list of [inputs, targets].
data_iter = iter(dataloader)
first_batch = next(data_iter)
first_batch

[tensor([[   40,   367,  2885,  1464],
         [10899,  2138,   257,  7026],
         [  922,  5891,  1576,   438],
         [ 1049,  5975,   284,   502],
         [  287,   262,  6001,   286],
         [  550,  5710,   465, 12036],
         [27075,    11,   290,  4920],
         [   64,   319,   262, 34686]]),
 tensor([[  367,  2885,  1464,  1807],
         [ 2138,   257,  7026, 15632],
         [ 5891,  1576,   438,   568],
         [ 5975,   284,   502,   284],
         [  262,  6001,   286,   465],
         [ 5710,   465, 12036,    11],
         [   11,   290,  4920,  2241],
         [  319,   262, 34686, 41976]])]

In [5]:
import torch
EMBEDDING_DIM = 4  # length of each embedding vector
SEED = 123         # fixed seed so the random embedding weights are reproducible

# Both embedding tables are randomly initialised. Seed the generator first so
# the weights (and everything derived from them) are identical after a kernel
# restart and full re-run.
torch.manual_seed(SEED)

# One row per vocabulary entry of the tokenizer.
input_embedding_layer = torch.nn.Embedding(tokenizer.n_vocab, EMBEDDING_DIM)
# One row per position inside a context window.
pos_embedding_layer = torch.nn.Embedding(CONTEXT_SIZE, EMBEDDING_DIM)

In [43]:
# Look up an embedding vector for every token id in the input half of the batch.
input_ids = first_batch[0]
input_embedding = input_embedding_layer(input_ids)

# Display the embedded tokens of the first window only.
input_embedding[0]

tensor([[-0.9220,  0.3543, -1.7453, -0.0172],
        [-0.0062,  0.5936,  0.4708,  0.8263],
        [-1.2343,  1.3128, -0.9582, -1.5700],
        [-0.6964,  1.3787,  1.7570, -2.0223]], grad_fn=<SelectBackward0>)

In [41]:
# Embed the position indices 0 .. CONTEXT_SIZE - 1: one vector per slot in a window.
position_ids = torch.arange(CONTEXT_SIZE)
pos_embedding = pos_embedding_layer(position_ids)
pos_embedding

tensor([[-1.5441, -0.0155, -0.3817,  0.2478],
        [-0.5039,  0.2037,  1.7721,  0.5258],
        [ 0.4856, -1.0139, -1.4608,  0.4787],
        [ 0.7816, -1.5947, -0.1290, -0.1381]], grad_fn=<EmbeddingBackward0>)

In [42]:
# Add the positional vectors to the token vectors of every window; the 2-D
# positional table is broadcast across the leading (batch) dimension.
final_embedding = torch.add(input_embedding, pos_embedding)

# Display the combined embedding of the first window only.
final_embedding[0]

tensor([[-2.4660,  0.3388, -2.1270,  0.2306],
        [-0.5101,  0.7974,  2.2429,  1.3521],
        [-0.7487,  0.2989, -2.4190, -1.0913],
        [ 0.0853, -0.2159,  1.6280, -2.1604]], grad_fn=<SelectBackward0>)

In [44]:
# Manual spot check of element [0][0] of the first window: the token-embedding
# value plus the positional-embedding value, typed in from the rounded numbers
# printed above. The result should match final_embedding[0][0][0] up to rounding.
-0.922-1.5441

-2.4661