In [None]:
import torch
import tiktoken

In [None]:
# Read the whole source text into memory. The encoding is pinned so the decoded
# text (and therefore every token ID derived from it) does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes humphreys.txt is stored as UTF-8 -- confirm.
with open("humphreys.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Peek at the first 50 characters as a sanity check
raw_text[:50]

'ABOUT fifteen years ago, on a date late in August '

In [None]:
# Load tiktoken's "gpt2" encoding; this `tokenizer` object is reused by the encode/decode cells below
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# Encode the full text into a flat sequence of integer token IDs
enc_text = tokenizer.encode(raw_text)

In [None]:
# Show the first 20 token IDs
print(enc_text[:20])

[6242, 12425, 17280, 812, 2084, 11, 319, 257, 3128, 2739, 287, 2932, 393, 1903, 287, 2693, 11, 257, 4512, 9859]


In [None]:
# Decode only the first two token IDs back into text
print( tokenizer.decode( enc_text[:2]))

ABOUT


In [None]:
# Total number of token IDs produced for the text
len( enc_text)

14643

In [None]:
# Walk through the first nine next-token examples: everything before position
# `pos` is the context, and the token at `pos` is what should be predicted
for pos in range(1, 10):
    context_text = tokenizer.decode(enc_text[:pos])
    next_token_text = tokenizer.decode([enc_text[pos]])
    print("Input:", context_text, "Target:", next_token_text)

Input: AB Target: OUT
Input: ABOUT Target:  fifteen
Input: ABOUT fifteen Target:  years
Input: ABOUT fifteen years Target:  ago
Input: ABOUT fifteen years ago Target: ,
Input: ABOUT fifteen years ago, Target:  on
Input: ABOUT fifteen years ago, on Target:  a
Input: ABOUT fifteen years ago, on a Target:  date
Input: ABOUT fifteen years ago, on a date Target:  late


In [None]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs for next-token prediction.

    The text is tokenized once; every sample is a window of ``max_length``
    token IDs paired with the same window shifted one position to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs.
        max_length: Number of tokens in each input/target window (must be > 0).
        stride: Offset between the starts of consecutive windows (must be > 0).

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
        AssertionError: If the text yields fewer than ``max_length + 1`` tokens.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail early with a clear message: stride == 0 would make range() raise
        # an unrelated-looking error, a negative stride would silently produce
        # an empty dataset, and a non-positive max_length would produce empty
        # or malformed windows.
        if max_length <= 0:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the book into overlapping sequences of max_length.
        # The range stops at len(token_ids) - max_length so that the target window,
        # which reaches one token further than the input, never runs past the end.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [None]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Tokenize ``txt`` with tiktoken's "gpt2" encoding and wrap it in a DataLoader.

    Args:
        txt: Raw text handed to ``GPTDatasetV1``.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length forwarded to ``GPTDatasetV1``.
        stride: Window offset forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` over the ``GPTDatasetV1`` built from ``txt``.
    """
    # Build the windowed dataset with a freshly loaded "gpt2" tokenizer
    windows = GPTDatasetV1(
        txt,
        tiktoken.get_encoding("gpt2"),
        max_length,
        stride,
    )

    # Hand the batching options straight through to the loader
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [None]:
# Non-overlapping windows (stride == max_length), kept in text order, 8 windows per batch
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

# Pull only the first batch and show its input and target tensors
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[ 6242, 12425, 17280,   812],
        [ 2084,    11,   319,   257],
        [ 3128,  2739,   287,  2932],
        [  393,  1903,   287,  2693],
        [   11,   257,  4512,  9859],
        [  510,   379,  5187,   301],
        [17899,   431,    11,   257],
        [ 1499,  4429,   287,  8345]])

Targets:
 tensor([[12425, 17280,   812,  2084],
        [   11,   319,   257,  3128],
        [ 2739,   287,  2932,   393],
        [ 1903,   287,  2693,    11],
        [  257,  4512,  9859,   510],
        [  379,  5187,   301, 17899],
        [  431,    11,   257,  1499],
        [ 4429,   287,  8345,  4492]])


In [None]:
# the tokenizer's decoder works on plain Python lists of IDs, so turn the batch
# tensor `inputs` into a list of rows first and decode each row on its own line
for row_ids in inputs.tolist():
    print(tokenizer.decode(row_ids))

ABOUT fifteen years
 ago, on a
 date late in August
 or early in September
, a train drew
 up at Wilst
horpe, a
 country station in Eastern


In [None]:
# we don't send these IDs to the LLM for training; we associate a vector a.k.a. tensor with each ID and then train the LLM on the vectors
# as a first example, let's create embedding vectors of length 3 for each token in a vocabulary of 6 tokens
# the embedding weights are initialised randomly, so seed torch's global RNG first:
# that way this cell prints the same matrix on every Restart & Run All
# (values recorded from an earlier, unseeded run will differ until the cells are re-run)
torch.manual_seed(123)
vocab_size = 6
output_dim = 3
embedding = torch.nn.Embedding( vocab_size, output_dim )
print(embedding.weight)

Parameter containing:
tensor([[ 1.2456,  0.5841,  0.7223],
        [-2.7521,  0.1307, -0.4288],
        [-0.1844, -0.2347, -0.7426],
        [-0.5396, -0.8305,  1.2291],
        [ 0.1089,  0.2475, -0.9558],
        [-0.6353, -0.7698, -1.2909]], requires_grad=True)


In [None]:
# if you just want the tensor part of this without the requires_grad=True bit
# method 1 (the `.data` attribute; compare with `.detach()` in the next cell):
embedding.weight.data

tensor([[ 1.2456,  0.5841,  0.7223],
        [-2.7521,  0.1307, -0.4288],
        [-0.1844, -0.2347, -0.7426],
        [-0.5396, -0.8305,  1.2291],
        [ 0.1089,  0.2475, -0.9558],
        [-0.6353, -0.7698, -1.2909]])

In [None]:
# if you just want the tensor part of this without the requires_grad=True bit
# method 2:
embedding.weight.detach()

tensor([[ 1.2456,  0.5841,  0.7223],
        [-2.7521,  0.1307, -0.4288],
        [-0.1844, -0.2347, -0.7426],
        [-0.5396, -0.8305,  1.2291],
        [ 0.1089,  0.2475, -0.9558],
        [-0.6353, -0.7698, -1.2909]])

In [None]:
# call this A for some examples:
# (A is the detached weight matrix: vocab_size rows by output_dim columns)
A = embedding.weight.detach()

In [None]:
# first row (indices are 0-based, so this is index 0):
A[0]

tensor([1.2456, 0.5841, 0.7223])

In [None]:
# second row (index 1):
A[1]

tensor([-2.7521,  0.1307, -0.4288])

In [None]:
# first column (`:` selects every row, 0 selects the column):
A[:,0]

tensor([ 1.2456, -2.7521, -0.1844, -0.5396,  0.1089, -0.6353])

In [None]:
# element in row 2, column 3 (counting from 1; with 0-based indices that is [1,2]):
A[1,2]

tensor(-0.4288)

In [None]:
# tensors can also be built straight from Python lists
x, y = (torch.tensor(values) for values in ([1.2, 2.1], [2.7, 1.5]))
# one tensor per output line
print(x, y, sep="\n")

tensor([1.2000, 2.1000])
tensor([2.7000, 1.5000])


In [None]:
# dot product of the two 1-D tensors: the sum of their elementwise products
torch.dot( x,y)

tensor(6.3900)

In [None]:
# check by hand with plain Python floats (agrees with the torch.dot result above up to rounding):
1.2*2.7 + 2.1*1.5

6.390000000000001