## Linear vs Embedding

In [64]:
import torch
# Seed PyTorch's global RNG so the randomly initialised layers below are reproducible.
torch.manual_seed(42)

<torch._C.Generator at 0x115707650>

In [65]:
# Toy input: five token ids, each of 0..4 appearing exactly once.
inputs = torch.tensor([0, 2, 1, 4, 3])

In [66]:
# Bias-free linear layer that will be applied to one-hot rows. Its input width
# is the vocabulary size (largest token id + 1), not the number of tokens in
# `inputs`; the two only happen to coincide (both 5) in this toy example.
linear = torch.nn.Linear(int(inputs.max()) + 1, 5, bias=False)

In [67]:
# Embedding table with one row per vocabulary entry. The row count must be the
# largest token id + 1; `inputs.shape[0]` is the number of tokens, which equals
# the vocabulary size only by coincidence here (both are 5).
embedding = torch.nn.Embedding(int(inputs.max()) + 1, 5)

In [68]:
# Give the linear layer the embedding table as its weight. nn.Linear computes
# x @ weight.T, hence the transpose: linear(x) then equals x @ embedding.weight.
linear.weight = torch.nn.Parameter(embedding.weight.T)

In [69]:
# Transposed embedding table: column j here is row j of embedding.weight.
linear.weight

Parameter containing:
tensor([[-0.6866, -2.3169,  0.5258, -0.8371,  1.8446],
        [-0.4934, -0.2168, -0.4880, -0.9224, -1.1845],
        [ 0.2415, -1.3847,  1.1914, -0.0635,  1.3835],
        [-1.1109, -0.3957, -0.8140,  0.6756, -1.2024],
        [ 0.0915,  0.0780, -0.7360, -0.0978,  0.7078]], requires_grad=True)

In [70]:
# The embedding table itself: row i is the vector for token id i.
embedding.weight

Parameter containing:
tensor([[-0.6866, -0.4934,  0.2415, -1.1109,  0.0915],
        [-2.3169, -0.2168, -1.3847, -0.3957,  0.0780],
        [ 0.5258, -0.4880,  1.1914, -0.8140, -0.7360],
        [-0.8371, -0.9224, -0.0635,  0.6756, -0.0978],
        [ 1.8446, -1.1845,  1.3835, -1.2024,  0.7078]], requires_grad=True)

In [71]:
# One-hot encode the token ids. The width is passed explicitly: by default
# one_hot infers it as max(inputs) + 1, which would be too narrow for the
# weight matrix whenever the largest vocabulary id is absent from `inputs`.
onehot = torch.nn.functional.one_hot(inputs, num_classes=embedding.num_embeddings)
onehot

tensor([[1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0]])

In [72]:
# one_hot returns integers, so cast to float to match the weight dtype.
# Multiplying one-hot rows by the shared weight selects rows of the table:
# output row i is embedding.weight[inputs[i]], i.e. an embedding lookup.
linear(onehot.float())

tensor([[-0.6866, -0.4934,  0.2415, -1.1109,  0.0915],
        [ 0.5258, -0.4880,  1.1914, -0.8140, -0.7360],
        [-2.3169, -0.2168, -1.3847, -0.3957,  0.0780],
        [ 1.8446, -1.1845,  1.3835, -1.2024,  0.7078],
        [-0.8371, -0.9224, -0.0635,  0.6756, -0.0978]], grad_fn=<MmBackward0>)

## Encoding 

In [73]:
import tiktoken

In [74]:
# Load tiktoken's "gpt2" encoding, used below to turn text into token ids.
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# Two sample strings, each tokenised with the same encoder settings.
input1, input2 = "Hi how are you", "transformer architecture"

idx1, idx2 = (
    tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    for text in (input1, input2)
)


In [82]:
# Token ids produced for input1 and input2 respectively.
idx1, idx2

([17250, 703, 389, 345], [7645, 16354, 10959])

In [110]:
from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenised once; each sample pairs a window of ``max_length``
    token ids with the same window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenise ``txt`` and pre-build every (input, target) window.

        Args:
            txt: Text to tokenise.
            tokenizer: Object exposing ``encode(text, allowed_special=...)``
                that returns a list of token ids.
            max_length: Number of tokens in each window.
            stride: Step between the start positions of consecutive windows.

        Raises:
            AssertionError: If the text yields ``max_length`` tokens or fewer,
                in which case no shifted target window exists.
        """
        self.token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        ids = self.token_ids
        assert len(ids) > max_length, "Number of tokenized inputs must at least be equal to max_length + 1"

        # Start offsets stop early enough that the shifted target is full length.
        starts = range(0, len(ids) - max_length, stride)
        self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(ids[s + 1:s + 1 + max_length]) for s in starts]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        window = self.input_ids[idx]
        shifted = self.target_ids[idx]
        return window, shifted

In [111]:
def create_dataloader(txt, batch_size=4, max_length=256,
                      stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window samples over ``txt``.

    Args:
        txt: Text to tokenise with tiktoken's "gpt2" encoding.
        batch_size: Number of windows per batch.
        max_length: Number of tokens in each window.
        stride: Step between the start positions of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` over a ``GPTDataset`` built from ``txt``.
    """
    windows = GPTDataset(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [112]:
# Read the training corpus. `raw` is deliberately not pre-initialised to "":
# if the file cannot be opened, the failure should surface here rather than
# leave an empty string bound for later cells to consume.
with open("discipline.txt", "r", encoding="utf-8") as f:
    raw = f.read()

In [113]:
# Peek at the first 100 characters to confirm the file was read.
print(raw[:100])

The Power of Discipline: A Comprehensive Analysis
Discipline is a multifaceted concept that plays a


In [126]:
# Tiny windows (4 tokens, stride 1) in file order so the shapes are easy to inspect.
dataloader = create_dataloader(raw, batch_size=4, max_length = 4, stride = 1, shuffle=False)
dataiter = iter(dataloader)
# Take the first batch only. NOTE: this rebinds `inputs`, which earlier held
# the toy tensor from the "Linear vs Embedding" section.
inputs, targets = next(dataiter)

In [134]:
# Token-embedding table: one `ndim`-dimensional row per entry in the
# tokenizer's vocabulary. Taking the row count from the tokenizer keeps the
# table in step with the encoding instead of hard-coding 50257 (the "gpt2"
# vocabulary size).
ndim = 768
embeddings = torch.nn.Embedding(tokenizer.n_vocab, ndim)

In [135]:
# Expected shape: (batch, tokens per window, embedding dim).
embeddings(inputs).shape

torch.Size([4, 4, 768])

In [136]:
# Targets are token ids as well, so they embed to the same shape as the inputs.
embeddings(targets).shape

torch.Size([4, 4, 768])

In [138]:
# Look up the embedding vector of every token id in the batch.
token_embeddings = embeddings(inputs)

In [144]:
# One learnable vector per position inside a window; inputs.shape[1] is the
# window length (4 for the batch drawn above).
positional_embeddings_layer = torch.nn.Embedding(inputs.shape[1], ndim)

In [146]:
# Embed positions 0 .. window_length - 1; the result has one row per position.
positional_embeddings = positional_embeddings_layer(torch.arange(inputs.shape[1]))

In [147]:
# Add position information to each token vector; the (positions, ndim)
# positional rows broadcast across the leading batch dimension.
input_embeddings = token_embeddings + positional_embeddings

In [148]:
# Same shape as the token embeddings: (batch, tokens per window, embedding dim).
input_embeddings.shape

torch.Size([4, 4, 768])