# Working with Text
1) Text Embeddings.
2) Tokenization

In [1]:
import os
import urllib.request
import re

# Read the entire short story into memory as a single string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Report the text size and preview its first 99 characters.
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


Word Tokenize

In [2]:
import nltk
# Download the "punkt" resource; presumably needed by nltk.word_tokenize below.
# NOTE(review): newer NLTK releases may look for "punkt_tab" instead — confirm against the installed version.
nltk.download('punkt')
# Split the raw text into word and punctuation tokens.
preprocessed = nltk.word_tokenize(raw_text)
# Preview the first 30 tokens.
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


[nltk_data] Downloading package punkt to /home/y.khan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Total number of tokens in the tokenized text (duplicates included).
print(len(preprocessed))

4544


# Convert tokens to token IDs
1) Tokens are mapped to integer token IDs, which embedding layers can process later.

In [4]:
# Deduplicate the tokens and sort them so that the later ID assignment is reproducible.
all_words = sorted(set(preprocessed))
# Vocabulary size = number of distinct tokens.
vocab_size = len(all_words)
print(vocab_size)

1138


In [5]:
# Map each token to its position in the sorted vocabulary list (token -> integer id).
vocab = {token:integer for integer, token in enumerate(all_words)}

# Create tokenizer class

In [6]:
class WordTokenizer:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split with ``nltk.word_tokenize``. Tokens that are missing from
    the vocabulary are silently dropped when encoding.
    """

    def __init__(self, vocab):
        """Store the vocabulary.

        Args:
            vocab: Mapping from token string to integer id.
        """
        self.vocab = vocab

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: String to tokenize.

        Returns:
            Ids of the tokens that are present in the vocabulary, in order
            of appearance. Out-of-vocabulary tokens are skipped.
        """
        preprocessed = nltk.word_tokenize(text)
        ids = [self.vocab[s] for s in preprocessed if s in self.vocab]
        return ids

    def decode(self, ids):
        """Convert token ids back into a space-separated string.

        Args:
            ids: Iterable of integer token ids.

        Returns:
            The tokens for ``ids`` joined by single spaces.

        Raises:
            ValueError: If an id is not present in the vocabulary.
        """
        # Build the id -> token table once per call instead of re-listing the
        # vocabulary keys/values and scanning them linearly for every id.
        id_to_token = {}
        for token, token_id in self.vocab.items():
            # setdefault keeps the first token seen for an id, which matches
            # what a left-to-right scan over the values would return.
            id_to_token.setdefault(token_id, token)

        tokens = []
        for i in ids:
            if i not in id_to_token:
                raise ValueError(f"{i!r} is not in the vocabulary")
            tokens.append(id_to_token[i])
        return ' '.join(tokens)

In [7]:
# Instantiate the tokenizer with the vocabulary built from the story.
tokenizer = WordTokenizer(vocab)

# Sample sentence to turn into token ids.
text = "It's the last he painted, you know, Mrs. Gisburn said with pardonable pride."
ids = tokenizer.encode(text)
print(ids)

[67, 10, 999, 618, 550, 762, 17, 1134, 612, 17, 78, 49, 864, 1117, 770, 809, 19]


In [8]:
# Map the ids back to text; tokens are joined with single spaces, so punctuation ends up space-separated.
tokenizer.decode(ids)

"It 's the last he painted , you know , Mrs. Gisburn said with pardonable pride ."

In [9]:
# Encode/decode round trip in a single expression.
tokenizer.decode(tokenizer.encode(text))

"It 's the last he painted , you know , Mrs. Gisburn said with pardonable pride ."

# Adding Special Context Tokens

1) [BOS] - Beginning of sentence.
2) [EOS] - End of sentence. This is usually used to mark the boundary when concatenating different texts (e.g., two separate articles).
3) [PAD] - When training an LLM with a batch size greater than 1, the sentences in a batch can have different lengths. Padding tokens extend the shorter sentences to a common length.
4) [UNK] - Represents words that are not included in the vocabulary.

5) GPT-2 does not need any of these tokens mentioned above but only uses an <|endoftext|> token to reduce complexity.
6) GPT also uses the <|endoftext|> for padding (since we typically use a mask when training on batched inputs, we would not attend padded tokens anyways, so it does not matter what these tokens are)
7) GPT-2 does not use an [UNK] token. Instead, it uses byte-pair encoding (BPE) to break words down into subword units.

In [10]:
# Sorted unique corpus tokens, followed by the two special tokens at the end.
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]

# Token -> id mapping; each id is the token's position in all_tokens.
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [11]:
class WordTokenizer:
    """Regex-based word tokenizer that maps unknown tokens to ``<|unk|>``.

    Note: this definition shadows the earlier ``WordTokenizer`` class in this
    notebook; cells that run after it use this version.
    """

    def __init__(self, vocab):
        """Store the vocabulary.

        Args:
            vocab: Mapping from token string to integer id. It must contain
                the ``"<|unk|>"`` key, which ``encode`` uses as the fallback.
        """
        self.vocab = vocab
        self.special_tokens = ["<|endoftext|>", "<|unk|>"]

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: String to tokenize.

        Returns:
            One id per token; tokens that are not in the vocabulary get the
            id of ``"<|unk|>"``.
        """
        # Split on punctuation, double dashes and whitespace. The capture
        # group keeps the delimiters so punctuation becomes its own token.
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop the empty and whitespace-only fragments produced by the split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        # A single lookup with a default covers unknown tokens; a separate
        # "replace with <|unk|>" pass beforehand would be redundant.
        return [self.vocab.get(token, self.vocab["<|unk|>"]) for token in tokens]

    def decode(self, ids):
        """Convert token ids back into text.

        Args:
            ids: Iterable of integer token ids.

        Returns:
            The tokens joined by spaces, with the whitespace in front of
            punctuation characters removed. Ids that are not in the
            vocabulary are rendered as ``"<|unk|>"``.
        """
        rev_vocab = {v: k for k, v in self.vocab.items()}
        text = " ".join([rev_vocab.get(i, "<|unk|>") for i in ids])
        # Re-attach punctuation to the preceding token.
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [12]:
# Tokenizer built on the vocabulary that includes the special tokens.
tokenizer = WordTokenizer(vocab)

# Two independent sample texts.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# Join them with the <|endoftext|> marker in between.
text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [13]:
# Words that are missing from the vocabulary are encoded with the <|unk|> id.
print(tokenizer.encode(text))

[1139, 17, 377, 1134, 644, 987, 22, 1138, 66, 999, 969, 995, 738, 999, 1139, 19]


In [14]:
# Round trip: out-of-vocabulary words come back as the literal "<|unk|>" placeholder.
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

# Byte pair encoding

1) Handles out-of-vocabulary words by breaking them down into smaller subword units or individual characters.

In [15]:
# pip install tiktoken

In [16]:
# `import importlib` alone does not guarantee that the `metadata` submodule is
# loaded, so import the submodule explicitly (this still binds `importlib`).
import importlib.metadata
import tiktoken

# Print the installed tiktoken version for reproducibility.
print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.8.0


In [17]:
# Load the GPT-2 encoding from tiktoken; this rebinds `tokenizer`, replacing the WordTokenizer instance.
tokenizer = tiktoken.get_encoding("gpt2")

In [18]:
# The two adjacent string literals are concatenated with no space between
# them, so the encoded text contains "terracesof".
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

# allowed_special lists the special-token strings that may appear in the input text.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [19]:
# Decode the ids back into a string to check the round trip.
strings = tokenizer.decode(integers)

print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


# Data Sampling with sliding window

1) We train the LLM to predict the next token, one token at a time.

In [20]:
# Reload the story text from disk.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Encode the whole text with the current (tiktoken GPT-2) tokenizer and report the token count.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [24]:
from torch.utils.data import Dataset, DataLoader
import torch

class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    Every target window is its input window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and cut it into fixed-length windows.

        Args:
            txt: Text to tokenize.
            tokenizer: Object whose ``encode(txt, allowed_special=...)``
                returns a sliceable sequence of token ids.
            max_length: Number of tokens in each window.
            stride: Step between the start positions of consecutive windows.
        """
        encoded = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window is still full length.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [25]:
def create_dataloader(text, batch_size = 12, max_length = 256, stride = 128, shuffle=True, drop_last=True, num_worker = 0):
    """Build a DataLoader over sliding-window GPT-2 token chunks of ``text``.

    Args:
        text: Raw text to tokenize and window.
        batch_size: Number of samples per batch.
        max_length: Number of tokens in each window.
        stride: Step between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader(shuffle=...)``.
        drop_last: Forwarded to ``DataLoader(drop_last=...)``.
        num_worker: Forwarded to ``DataLoader(num_workers=...)``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDataset`` built from ``text``.
    """
    # The dataset is always tokenized with tiktoken's GPT-2 encoding.
    dataset = GPTDataset(
        text,
        tiktoken.get_encoding("gpt2"),
        max_length,
        stride,
    )

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_worker,
    )

Testing Dataloader with batch size of 1

In [26]:
# Reload the story text from disk.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
# batch_size=1 with stride=1: consecutive samples are shifted by a single token.
dataloader = create_dataloader(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

# Pull only the first batch for inspection.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


Creating Dataloader

In [27]:
# stride equals max_length here, so consecutive input windows do not overlap.
dataloader = create_dataloader(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

# Each batch is an (inputs, targets) pair; targets are the inputs shifted by one token.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


# Create Token Embeddings

1) Embed the tokens into continuous vector representations using an embedding layer.
2) Usually, these embedding layers are part of the LLM itself and are updated (trained) during model training

# Positional Embeddings
1) Positional embeddings are added to the token embeddings to form the input embeddings.

In [28]:
# Presumably the GPT-2 BPE vocabulary size (the <|endoftext|> id printed earlier is 50256) — confirm.
vocab_size = 50257
# Size of each embedding vector.
output_dim = 256

# Lookup table with one output_dim-sized vector per token id.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [33]:
max_length = 4
batch_size = 8

# Pass the batch_size variable instead of repeating the literal, so changing
# the value above actually takes effect. stride == max_length gives
# non-overlapping input windows.
dataloader = create_dataloader(raw_text, batch_size=batch_size, max_length=max_length, stride=max_length, shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [36]:
# Look up an embedding vector for every token id in the batch.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


GPT2 uses absolute positional embeddings

In [37]:
# One learnable embedding per position, covering max_length positions.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [39]:
# Look up the embeddings for positions 0 .. max_length - 1.
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [40]:
# The [4, 256] positional embeddings broadcast across the batch dimension of the [8, 4, 256] token embeddings.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
