In [5]:
import re

In [6]:
filePath = "TheVerdict.txt"
try:
    with open(filePath, 'r', encoding="utf-8") as file:
        rawText = file.read()
        print(f"sucessfully got the file {filePath}\nThis file has {len(rawText)} characters")
except:
    print("Error getting the file")

sucessfully got the file TheVerdict.txt
This file has 20479 characters


In [7]:
#Space only Tokenizer
SpaceTokenizedText = re.split("\s", rawText)
# "\s" is used and r"(\s)" is used as this will make an item for the white space also ' '
# This is further talked about in my notes under "2.2 Tokenizing Text"
print(SpaceTokenizedText[:50])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius--though', 'a', 'good', 'fellow', 'enough--so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that,', 'in', 'the', 'height', 'of', 'his', 'glory,', 'he', 'had', 'dropped', 'his', 'painting,', 'married', 'a', 'rich', 'widow,', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera.', '(Though', 'I']


In [8]:
# Space and punctuation Tokenizer
SpacePuncTokenizedText = re.findall(r'\w+|[,.:;?_!"()\']|--', rawText)
print(SpacePuncTokenizedText[:50])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself']


In [9]:
# Making a vocabulary
allWords = sorted(set(SpacePuncTokenizedText))
vocabSize = len(allWords)
print(f"vocab size: {vocabSize}")

vocab = {word: word_index for word_index, word in enumerate(allWords)}
print(vocab)

vocab size: 1148
{'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5, '--': 6, '.': 7, ':': 8, ';': 9, '?': 10, 'A': 11, 'Ah': 12, 'Among': 13, 'And': 14, 'Are': 15, 'Arrt': 16, 'As': 17, 'At': 18, 'Be': 19, 'Begin': 20, 'Burlington': 21, 'But': 22, 'By': 23, 'Carlo': 24, 'Chicago': 25, 'Claude': 26, 'Come': 27, 'Croft': 28, 'Destroyed': 29, 'Devonshire': 30, 'Don': 31, 'Dubarry_': 32, 'Emperors': 33, 'Florence': 34, 'For': 35, 'Gallery': 36, 'Gideon': 37, 'Gisburn': 38, 'Gisburns': 39, 'Grafton': 40, 'Greek': 41, 'Grindle': 42, 'Grindles': 43, 'HAD': 44, 'Had': 45, 'Hang': 46, 'Has': 47, 'He': 48, 'Her': 49, 'Hermia': 50, 'His': 51, 'How': 52, 'I': 53, 'If': 54, 'In': 55, 'It': 56, 'Jack': 57, 'Jove': 58, 'Just': 59, 'Lord': 60, 'Made': 61, 'Miss': 62, 'Money': 63, 'Monte': 64, 'Moon': 65, 'Mr': 66, 'Mrs': 67, 'My': 68, 'Never': 69, 'No': 70, 'Now': 71, 'Nutley': 72, 'Of': 73, 'Oh': 74, 'On': 75, 'Once': 76, 'Only': 77, 'Or': 78, 'Perhaps': 79, 'Poor': 80, 'Professional': 81, 'Renaissance

In [10]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.vocab = vocab
        self.inverseVocab = {word_index: word for word, word_index in self.vocab.items()}
    
    def encode(self, text):
        splitText = re.findall(r'\w+|[,.:;?_!"()\']|--', text)
        IDs = [self.vocab[token] for token in splitText]
        return IDs
    def decode(self, IDs):
        text = " ".join([self.inverseVocab[ID] for ID in IDs])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [11]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)
decText = tokenizer.decode(ids)
print(decText)
# There is one problem with this, there is an extra space after all the " and ' which is not accurate to the original text.

[1, 56, 2, 867, 1006, 615, 546, 760, 5, 1144, 609, 5, 1, 67, 7, 38, 868, 1126, 769, 810, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


## 2.4

In [12]:
#2.4
print(allWords)
allWords.extend(["<|endoftext|>", "<|unk|>"])
print(allWords)
vocab = {token: index for index,token in enumerate(allWords)}
print(len(vocab.keys()))

['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed', 'Devonshire', 'Don', 'Dubarry_', 'Emperors', 'Florence', 'For', 'Gallery', 'Gideon', 'Gisburn', 'Gisburns', 'Grafton', 'Greek', 'Grindle', 'Grindles', 'HAD', 'Had', 'Hang', 'Has', 'He', 'Her', 'Hermia', 'His', 'How', 'I', 'If', 'In', 'It', 'Jack', 'Jove', 'Just', 'Lord', 'Made', 'Miss', 'Money', 'Monte', 'Moon', 'Mr', 'Mrs', 'My', 'Never', 'No', 'Now', 'Nutley', 'Of', 'Oh', 'On', 'Once', 'Only', 'Or', 'Perhaps', 'Poor', 'Professional', 'Renaissance', 'Rickham', 'Riviera', 'Rome', 'Russian', 'Sevres', 'She', 'Stroud', 'Strouds', 'Suddenly', 'That', 'The', 'Then', 'There', 'They', 'This', 'Those', 'Though', 'Thwing', 'Thwings', 'To', 'Usually', 'Venetian', 'Victor', 'Was', 'We', 'Well', 'What', 'When', 'Why', 'Yes', 'You', '_I', '_am_', '_famille', '_felt_', '_has_', '_have_', '

In [13]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.vocab = vocab
        self.inverseVocab = {word_index: word for word, word_index in self.vocab.items()}
    
    def encode(self, text):
        splitText = re.findall(r'<\|[^|]+\|>|"\s*<\|[^|]+\|>"|[a-zA-Z0-9]+|[,.:;?_!"()\']|--|"', text)
        IDs = [self.vocab[token] if token in self.vocab.keys() else self.vocab["<|unk|>"] for token in splitText]
        return IDs
    def decode(self, IDs):
        text = " ".join([self.inverseVocab[ID] for ID in IDs])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [14]:
tokenizer2 = SimpleTokenizerV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
ids = tokenizer2.encode(text)
print(ids)
decText = tokenizer2.decode(ids)
print(decText)
# There is one problem with this, there is an extra space after all the " and ' which is not accurate to the original text.

[1149, 5, 374, 1144, 640, 993, 10, 1148, 55, 1006, 974, 1002, 736, 1006, 1149, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


## 2.5 BPE


In [15]:
import tiktoken
text = ("Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.")

tokenizer = tiktoken.get_encoding("gpt2")
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)
strings = tokenizer.decode(integers)
print(strings)
token_words = [tokenizer.decode_single_token_bytes(token) for token in integers]
print(token_words)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.
[b'Hello', b',', b' do', b' you', b' like', b' tea', b'?', b' ', b'<|endoftext|>', b' In', b' the', b' sun', b'lit', b' terr', b'aces', b' of', b' some', b'unknown', b'Place', b'.']


## 2.6

In [16]:
import tiktoken

with open("TheVerdict.txt", 'r', encoding="utf-8") as f:
    raw_text = f.read()

tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [22]:
enc_sample = enc_text[50:]
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1: context_size+1]
print(f"x: {x}\ny: \t{y}")

for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(f"{tokenizer.decode(context)} --> {tokenizer.decode([desired])}")

x: [290, 4920, 2241, 287]
y: 	[4920, 2241, 287, 257]
 and -->  established
 and established -->  himself
 and established himself -->  in
 and established himself in -->  a


In [25]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_len, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids)-max_len, stride):
            x = token_ids[i: i+max_len]
            y = token_ids[i+1: i+max_len+1]
            self.input_ids.append(torch.tensor(x))
            self.target_ids.append(torch.tensor(y))
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]
    

def create_dataloader_v1(txt: str, batch_size=4, max_len=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_len, stride)
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=shuffle,
                            num_workers=num_workers,
                            drop_last=drop_last)
    return dataloader

In [31]:
with open("TheVerdict.txt", 'r', encoding="utf-8") as f:
    txt = f.read()

dataloader = create_dataloader_v1(txt, batch_size=3, max_len=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464],
        [ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619]]), tensor([[ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619],
        [1464, 1807, 3619,  402]])]


## 2.7 

In [33]:
from torch import nn

In [42]:
vocab_size = 6
embedding_dim = 3

torch.manual_seed(123)
embedding_layer = nn.Embedding(vocab_size, embedding_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [None]:
print(embedding_layer(torch.Tensor([3]).int())) # this is just the index 3 ele in the above matrix
# torch.Tensor([3]).int() same as torch.tensor([3])

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [50]:
input_ids = torch.tensor([2,4,1,5])
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-1.1589,  0.3255, -0.6315],
        [ 0.9178,  1.5810,  1.3010],
        [-2.8400, -0.7849, -1.4096]], grad_fn=<EmbeddingBackward0>)


## 2.8

In [59]:
vocab_size = 50257
embedding_dim = 256
token_embedding_layer = nn.Embedding(vocab_size, embedding_dim)

max_length = 4
dataloader = create_dataloader_v1(txt, batch_size=8, max_len=max_length, stride=max_length, shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
# print(inputs)
# print(targets)
print(f"dims: {len(dataloader)} x {inputs.shape[0]} x {inputs.shape[1]}")

token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

dims: 160 x 8 x 4
torch.Size([8, 4, 256])


In [61]:
# For a GPT model’s absolute embedding approach, we just need to create another embedding layer that has the same embedding dimension as the token_embedding_layer

context_length = max_length
pos_embedding_layer = nn.Embedding(context_length, embedding_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings, pos_embeddings.shape)

tensor([[-0.7180, -1.6009,  0.3609,  ..., -2.2394, -1.4581, -0.8744],
        [ 1.5543, -2.1174,  0.4044,  ..., -1.3120, -0.6699,  1.2716],
        [-0.6629,  0.4562,  0.0575,  ...,  0.0279, -1.0018,  0.4836],
        [ 0.3225, -1.1543,  1.6842,  ..., -0.3874,  1.1326, -1.7115]],
       grad_fn=<EmbeddingBackward0>) torch.Size([4, 256])


In [62]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings, input_embeddings.shape)

tensor([[[-0.4466, -1.2089,  1.9116,  ..., -2.5171,  0.5177,  1.0508],
         [ 2.3725, -2.1144,  0.2816,  ..., -0.2947,  0.7927,  1.1478],
         [ 0.7618,  2.3193, -0.8711,  ...,  1.4673,  0.3900,  1.9076],
         [-0.2153,  2.6686,  2.2753,  ..., -0.6694,  0.0707, -0.6766]],

        [[ 0.4577, -1.1822,  1.7659,  ..., -4.6932, -0.5067, -0.9455],
         [ 0.1795, -3.4956,  0.5759,  ..., -2.8583, -0.7475,  0.1249],
         [-1.7056, -0.2779, -0.8310,  ...,  0.1600,  0.0992, -0.8924],
         [-0.0237, -1.3716,  0.8419,  ..., -1.3682,  0.8221, -4.5292]],

        [[-0.6946, -2.2963,  0.3489,  ..., -0.2686, -1.4820, -0.8296],
         [ 2.2095, -1.5307,  0.2493,  ..., -2.4683, -0.1960,  0.5276],
         [-1.9803,  1.2496,  0.2502,  ..., -0.2331, -3.1653, -0.4573],
         [ 0.7768, -2.5170,  1.2508,  ..., -0.2933,  0.6892, -2.6618]],

        ...,

        [[-1.5846, -2.6068, -0.6915,  ..., -2.2684, -0.4264, -1.8845],
         [ 1.2949, -3.3428, -0.5533,  ..., -0.7721, -0.51