In [2]:
# load the raw corpus used throughout the rest of the notebook.
# the encoding is pinned so the text decodes the same way on every platform
# (without it, open() falls back to the locale's default encoding).
with open('../the-verdict.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# quick sanity check: the character count and the first 99 characters.
print(f"Total length: {len(text)}")
print(text[:99])

Total length: 20480
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [None]:
import re

# let's try creating a simple tokenizer by hand!


def create_vocab(corpus):
    """
    given a corpus, create a *vocabulary* for that corpus.
    a vocabulary is a mapping from tokens to integer IDs, covering all the tokens in the corpus.

    the corpus is split on whitespace, on "--", and on the punctuation characters
    . , : ; ? _ ! " ( ) ' -- every delimiter except whitespace becomes a token of its own.
    IDs follow sorted token order; two special tokens, "<|endoftext|>" and "<|unk|>",
    always take the last two IDs.

    Args:
        corpus: the text (str) to build the vocabulary from.

    Returns:
        a dict mapping each token (str) to a unique integer ID in range(len(vocab)).
    """
    special_tokens = ["<|endoftext|>", "<|unk|>"]
    # split the `corpus` argument, not the notebook-level `text` variable.
    # the capture group keeps the delimiters in the result, so punctuation gets IDs too.
    pieces = re.split(r'([.,:;?_!"()\']|--|\s)', corpus)
    tokens = {piece.strip() for piece in pieces if piece.strip()}
    # if the corpus itself spells out a special token, don't list it twice:
    # a duplicate entry would leave a gap in the ID range.
    all_tokens = sorted(tokens - set(special_tokens))
    all_tokens.extend(special_tokens)
    return {token: i for i, token in enumerate(all_tokens)}


class SimpleTokenizer:
    """
    a word-and-punctuation tokenizer backed by a fixed token -> ID vocabulary.
    """

    def __init__(self, vocab):
        """
        store the vocabulary and build the reverse (ID -> token) lookup.
        """
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())


    def encode(self, text):
        """
        turn a text into a list of integer IDs.
        tokens missing from the vocabulary are encoded as "<|unk|>".
        """
        ids = []
        for piece in re.split(r'([.,:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # whitespace and empty split artifacts carry no token
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids


    def decode(self, ids):
        """
        turn a list of integer IDs back into a text string.
        """
        tokens = [self.int_to_str[token_id] for token_id in ids]
        joined = ' '.join(tokens)
        # tokens were joined with spaces; drop the space in front of certain punctuation
        return re.sub(r'\s+([,.!?"()\'])', r'\1', joined)


# build the vocabulary from the full corpus, then round-trip that same corpus through the tokenizer.
vocab = create_vocab(text)
tokenizer = SimpleTokenizer(vocab)

encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)

# it works! (mostly.) the round trip is lossy: encode() drops all whitespace and decode()
# re-joins tokens with single spaces, so line breaks disappear and the spacing around "--",
# quotes, apostrophes, and parentheses doesn't match the original text.
print(decoded)

I HAD always thought Jack Gisburn rather a cheap genius -- though a good fellow enough -- so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera.( Though I rather thought it would have been Rome or Florence.)" The height of his glory" -- that was what the women called it. I can hear Mrs. Gideon Thwing -- his last Chicago sitter -- deploring his unaccountable abdication." Of course it' s going to send the value of my picture' way up ; but I don' t think of that, Mr. Rickham -- the loss to Arrt is all I think of." The word, on Mrs. Thwing' s lips, multiplied its _ rs _ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn' s" Moon-dancers" to say, with tears in her eyes :" We shall not look upon its like again"? Well

In [4]:
import tiktoken

# now let's try using a real tokenizer, which uses BPE.
# note: this rebinds `tokenizer`, `encoded`, and `decoded` from the SimpleTokenizer cell above.

tokenizer = tiktoken.get_encoding("gpt2")

encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)

# unlike the hand-rolled tokenizer, this round trip keeps the original spacing and line breaks.
print(decoded)

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?

Well!--even through th

In [None]:
# let's try using a BPE encoder with a sliding window. this is one way we might set up for word-prediction tasks.
# we'll also convert the text to a PyTorch tensor.

import torch

from torch.utils.data import Dataset, DataLoader


class GptDataset(Dataset):
    """
    a dataset of (input, target) token-ID windows cut from one text for next-token prediction.

    every input is `max_length` consecutive token IDs; its target is the same window
    shifted one token to the right. consecutive windows start `stride` tokens apart.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        """
        tokenize `text` with `tokenizer.encode` and pre-build every window as a tensor.
        """
        token_ids = tokenizer.encode(text)

        # a window may only start where a full target (one token further along) still fits
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """the (input, target) tensor pair for window `idx`."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader(text, batch_size, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """
    build a DataLoader of sliding-window (input, target) batches over `text`.

    the text is tokenized with tiktoken's "gpt2" encoding and windowed by GptDataset
    using `max_length` and `stride`; `batch_size`, `shuffle`, `drop_last`, and
    `num_workers` are passed straight through to the DataLoader.
    """
    bpe_tokenizer = tiktoken.get_encoding("gpt2")
    return DataLoader(
        GptDataset(text, bpe_tokenizer, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


# smoke test: batch_size=1, a 4-token window, and stride=1, with shuffling off
# so that the first batch is the first window of the text.
data_loader = create_dataloader(
    text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(data_loader)
first_batch = next(data_iter)

# the batch is an [inputs, targets] pair; the targets are the inputs shifted one token to the right.
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [None]:
# ok, now let's do some positional embedding.

vocab_size = 50257  # this magic number is the GPT-2 vocab size
output_dim = 256  # size of each embedding vector
max_length = 4  # tokens per example, and the number of positions to embed

# NOTE(review): the embedding layers below are randomly initialized and no seed is set in this
# cell, so the embedding values differ between runs -- consider calling torch.manual_seed first.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

# load a batch of data, using a context length of 4 and a stride so that there's no overlap.
data_loader = create_dataloader(
    text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)
data_iter = iter(data_loader)
# NOTE(review): `input` shadows Python's builtin input() for the rest of the notebook --
# consider renaming (e.g. `inputs, targets`) once nothing later depends on these names.
input, target = next(data_iter)

# raw token embeddings: one output_dim-sized vector per token, i.e. shape (8, 4, 256)
token_embeddings = token_embedding_layer(input)

# positional embeddings: one vector per position 0..max_length-1, i.e. shape (4, 256)
pos_embedding_layer = torch.nn.Embedding(max_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(max_length))

# incorporate the positional embeddings with simple summation.
# broadcasting adds the same (4, 256) positional table to every example in the batch.
input_embeddings = token_embeddings + pos_embeddings

In [78]:
# time to explore self-attention!

# a small hand-written example to play with: 6 tokens, each a 3-dimensional embedding.
one_example = torch.tensor(
    [
        [0.43, 0.15, 0.89],
        [0.55, 0.87, 0.66],
        [0.57, 0.85, 0.64],
        [0.22, 0.58, 0.33],
        [0.77, 0.25, 0.10],
        [0.05, 0.80, 0.55],
    ]
)
# and we'll pick an arbitrary token from that example (the second one) to calculate attention scores for.
query = one_example[1]

# one score per token: the dot product of that token's vector with the query vector.
attention_scores = torch.empty(one_example.shape[0])

for i, x_i in enumerate(one_example):
    attention_scores[i] = torch.dot(x_i, query)

print(attention_scores)

# we've computed the attention scores for each token in the sequence.
# we can now apply a softmax to these scores to get the attention weights.

attention_weights = torch.softmax(attention_scores, dim=0)

# softmax makes the weights positive and sum to 1. with these inputs they come out between
# roughly 0.11 and 0.24, with the largest weight on the query token itself.
print(attention_weights)

# the context vector is the attention-weighted sum of all the token vectors.
context_vector = torch.zeros_like(query)

for i, x_i in enumerate(one_example):
    context_vector += attention_weights[i] * x_i

print(context_vector)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])
tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
tensor([0.4419, 0.6515, 0.5683])


In [79]:
# we can do this for _all_ tokens, instead of just one.
# note: this rebinds `attention_scores` and `attention_weights` from the single-query cell above.

# a matrix multiplication is a dot product for each row!
# entry [i, j] is the dot product of token i with token j, so row i holds the scores for query i.
attention_scores = one_example @ one_example.T

print(attention_scores)

# softmax over the last dimension normalizes each row (each query) separately.
attention_weights = torch.softmax(attention_scores, dim=-1)

print(attention_weights)

# each row is the context vector for one token: the attention-weighted sum of all token vectors.
all_context_vectors = attention_weights @ one_example

# row 1 of each printed tensor matches the single-query results from the previous cell.
print(all_context_vectors)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])
tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])
tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])
