Chapter 2: Working with Text Data

2.2: Tokenizing Text

In [2]:
import os
import urllib.request


# Download the short story once; later cells read it from the working directory.
if not os.path.exists("the-verdict.txt"):
    url = (
        "https://raw.githubusercontent.com/rasbt/"
        "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
        "the-verdict.txt"
    )
    file_path = "the-verdict.txt"
    # The function is spelled `urlretrieve`; the previous `urlretreive`
    # raised AttributeError before anything was downloaded.
    urllib.request.urlretrieve(url, file_path)

In [3]:
# Read the complete short story into a single string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_dataset = story_file.read()

In [4]:
raw_dataset

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [5]:
len(raw_dataset)

20479

In [6]:
import re

# Split on single whitespace characters; the capture group keeps each
# separator as its own element of the result list.
text = "Hello, My name is Yasharth."
whitespace_splitter = re.compile(r'(\s)')
result = whitespace_splitter.split(text)

print(result)

['Hello,', ' ', 'My', ' ', 'name', ' ', 'is', ' ', 'Yasharth.']


In [None]:
import re

# Tokenize into runs of word characters or single symbols that are neither
# word characters nor whitespace.
word_or_symbol = re.compile(r'\w+|[^\w\s]')
result = word_or_symbol.findall(raw_dataset)

#print(result)


In [None]:
len(result)

For now we will use the following regex-based tokenization; later we will switch to the `tiktoken` tokenizer.

In [None]:
# Split on punctuation, double dashes and whitespace (all kept through the
# capture group), then discard pieces that are empty after stripping.
split_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', raw_dataset)
preprocessed = list(filter(None, map(str.strip, split_pieces)))

In [None]:
len(preprocessed)

In [None]:
preprocessed[:20]

2.3: Converting Tokens into Token IDs

In [None]:
# The sorted set of unique tokens is the vocabulary; report its size.
unique_tokens = set(preprocessed)
all_words = sorted(unique_tokens)
vocab_size = len(all_words)
print(vocab_size)

In [None]:
# Map each token to its index in the sorted vocabulary list.
vocab = dict(zip(all_words, range(len(all_words))))
#vocab

In [None]:
class SimpleTokenizer:
    """Regex-based tokenizer backed by a fixed token-to-ID vocabulary.

    Args:
        vocab: Mapping from token string to integer ID.
    """

    def __init__(self,vocab):
        self.str_to_int = vocab
        # Inverse mapping, used by decode().
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self,text):
        """Split ``text`` into tokens and return the list of their IDs.

        Raises:
            KeyError: If a token is missing from the vocabulary.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        # Drop empty and whitespace-only pieces produced by the split.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self,ids):
        """Join the tokens for ``ids`` with spaces and tidy punctuation spacing.

        Raises:
            KeyError: If an ID is missing from the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the whitespace that the join placed in front of punctuation.
        # This must be re.sub with the pattern r'\s+...'; the earlier
        # re.split(r's\+...', r'\1', text) call raised TypeError.
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [None]:
tokenizer = SimpleTokenizer(vocab)

In [None]:
text = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""

In [None]:
ids = tokenizer.encode(text)
print(ids)

In [None]:
tokenizer.decode(ids)

In [None]:
tokenizer.decode(tokenizer.encode(text))

In [None]:
class SimpleTokenizerV2:
    """Regex-based tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Args:
        vocab: Mapping from token string to integer ID. If it has no
            ``<|unk|>`` entry, one is added to a copy of the mapping with the
            next free ID (largest existing ID + 1); the caller's dictionary
            is left unchanged.
    """

    def __init__(self,vocab):
        if "<|unk|>" in vocab:
            self.str_to_int = vocab
        else:
            # Without this entry encode() raised KeyError on the first
            # unknown token.
            next_id = max(vocab.values(), default=-1) + 1
            self.str_to_int = {**vocab, "<|unk|>": next_id}
        # Inverse mapping, used by decode().
        self.int_to_str = {i:s for s,i in self.str_to_int.items()}

    def encode(self,text):
        """Split ``text`` into tokens and return their IDs.

        Tokens that are not in the vocabulary get the ID of ``<|unk|>``.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        # Drop empty and whitespace-only pieces produced by the split.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        unknown_id = self.str_to_int["<|unk|>"]
        ids = [self.str_to_int.get(item, unknown_id) for item in preprocessed]
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces and tidy punctuation spacing."""
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [None]:
tokenizer = SimpleTokenizerV2(vocab)

In [None]:
tokenizer.encode(text)

In [None]:
print(text)

In [None]:
tokenizer.decode(tokenizer.encode(text))

## 2.5: Byte Pair Encoding

In [None]:
import tiktoken

In [None]:
tiktoken.__version__

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
tokenizer.decode(tokenizer.encode("Hello Yasharth"))

## Data sampling with a sliding window

In [None]:
# Re-read the story and tokenize it with the object currently bound to `tokenizer`.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

enc_text = tokenizer.encode(raw_text)
token_total = len(enc_text)
print(token_total)

In [None]:
enc_sample= enc_text[50:]

In [None]:
# The target window is the input window shifted one token to the right.
context_size = 4

input_window = slice(0, context_size)
target_window = slice(1, context_size + 1)
x = enc_sample[input_window]
y = enc_sample[target_window]

print(f"x: {x}")
print(f"y:      {y}")

In [None]:
#print(enc_text)

In [None]:
import torch
from torch.utils.data import Dataset,DataLoader

In [None]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensor pairs.

    Each target window is the matching input window shifted one token to the
    right.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object whose ``encode(text)`` returns a sequence of token IDs.
        max_length: Number of tokens in every input/target window.
        stride: Step between the start positions of consecutive windows.
    """

    def __init__(self,text,tokenizer,max_length,stride):
        self.input_ids = []
        self.target_ids =[]

        # The parameter is named `text`; the body previously read an
        # undefined name `txt` and raised NameError.
        token_ids = tokenizer.encode(text)

        # Stop early enough that the shifted target window still fits.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk= token_ids[i+1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.input_ids)

    def __getitem__(self,idx):
        """Return the ``(input_ids, target_ids)`` tensors of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader


# --------------------------------------------------
# 2. GPT-style Dataset (Listing 2.5)
# --------------------------------------------------
class GPTDatasetV1(Dataset):
    """Dataset of overlapping token windows with next-token targets.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object whose ``encode(txt)`` returns a sequence of token IDs.
        max_length: Number of tokens per window.
        stride: Offset between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        encoded = tokenizer.encode(txt)
        # Every start index leaves room for the target window, which ends
        # one token later than the input window.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start : start + max_length], dtype=torch.long)
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1 : start + max_length + 1], dtype=torch.long)
            for start in window_starts
        ]

    def __len__(self):
        """Number of windows held by the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Input and target tensors for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

# Tiny demo corpus for the dataset class defined above.
text = "hello world, this is a tiny gpt dataset"

# Window length and the step between consecutive windows.
max_length = 8
stride = 4

# Arguments are passed in (text, tokenizer, max_length, stride) order.
dataset = GPTDatasetV1(text, tokenizer, max_length, stride)

# Keep the original sample order so the printed batches are reproducible.
loader = DataLoader(dataset, batch_size=2, shuffle=False)

# Print every batch together with the decoded text of its first sample.
for batch_idx, (inputs, targets) in enumerate(loader):
    first_input_ids = inputs[0].tolist()
    first_target_ids = targets[0].tolist()

    print(f"\nBatch {batch_idx}")
    print("Input IDs:\n", inputs)
    print("Target IDs:\n", targets)
    print("Decoded input :", tokenizer.decode(first_input_ids))
    print("Decoded target:", tokenizer.decode(first_target_ids))


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken

# --------------------------------------------------
# 1. GPT-style Dataset (Listing 2.5)
# --------------------------------------------------
class GPTDatasetV1(Dataset):
    """Pairs of input/target token windows cut from one tokenized text.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object whose ``encode(txt)`` returns a sequence of token IDs.
        max_length: Number of tokens per window.
        stride: Offset between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # The text is encoded a single time; the windows are slices of it.
        ids = tokenizer.encode(txt)

        for begin in range(0, len(ids) - max_length, stride):
            input_span = slice(begin, begin + max_length)
            # The target covers the same number of tokens, one position later.
            target_span = slice(begin + 1, begin + max_length + 1)

            self.input_ids.append(torch.tensor(ids[input_span], dtype=torch.long))
            self.target_ids.append(torch.tensor(ids[target_span], dtype=torch.long))

    def __len__(self):
        """Number of windows held by the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Input and target tensors for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


# --------------------------------------------------
# 2. DataLoader factory (Listing 2.6)
# --------------------------------------------------
def create_dataloader_v1(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader over GPT-2-tokenized sliding windows of ``txt``.

    Args:
        txt: Raw text to tokenize.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window.
        stride: Offset between the starts of consecutive windows.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(windows, **loader_options)


# Short demo text, small enough to check the windows by eye.
sample_text = (
    "This is a tiny example text used to demonstrate how GPT-style "
    "dataloaders work with sliding windows."
)

# One window per batch, four tokens per window, advancing one token at a time.
demo_settings = dict(batch_size=1, max_length=4, stride=1, shuffle=False)
dataloader = create_dataloader_v1(sample_text, **demo_settings)

# Pull the first two batches from the loader.
data_iter = iter(dataloader)
first_batch = next(data_iter)
second_batch = next(data_iter)

for heading, batch in (("First batch:", first_batch), ("\nSecond batch:", second_batch)):
    print(heading)
    print(batch)


In [None]:







import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken

# --------------------------------------------------
# 1. GPT-style Dataset (Listing 2.5)
# --------------------------------------------------
class GPTDatasetV1(Dataset):
    """Sliding windows over a tokenized text, each paired with its shifted target.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object whose ``encode(txt)`` returns a sequence of token IDs.
        max_length: Number of tokens per window.
        stride: Offset between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(txt)

        self.input_ids = []
        self.target_ids = []

        for first in range(0, len(token_ids) - max_length, stride):
            last = first + max_length
            window = torch.tensor(token_ids[first:last], dtype=torch.long)
            # Same length as `window`, starting one token later.
            shifted = torch.tensor(token_ids[first + 1 : last + 1], dtype=torch.long)

            self.input_ids.append(window)
            self.target_ids.append(shifted)

    def __len__(self):
        """Number of windows held by the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Input and target tensors for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


# --------------------------------------------------
# 2. DataLoader factory (Listing 2.6)
# --------------------------------------------------
def create_dataloader_v1(
    txt,
    batch_size=4,
    max_length=8,
    stride=2,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Return a DataLoader over GPT-2-tokenized sliding windows of ``txt``.

    Args:
        txt: Raw text to tokenize.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window.
        stride: Offset between the starts of consecutive windows.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.
    """
    # The dataset tokenizes with the GPT-2 encoding from tiktoken.
    encoder = tiktoken.get_encoding("gpt2")
    window_dataset = GPTDatasetV1(txt, encoder, max_length, stride)

    return DataLoader(
        window_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


# Short demo text, small enough to check the windows by eye.
sample_text = (
    "This is a tiny example text "
    "used to demonstrate how GPT-style dataloaders "
    "work with sliding windows."
)

# Eight-token windows that advance two tokens at a time, one window per batch.
dataloader = create_dataloader_v1(
    sample_text, batch_size=1, max_length=8, stride=2, shuffle=False
)

# Pull the first two batches from the loader.
data_iter = iter(dataloader)
first_batch = next(data_iter)
second_batch = next(data_iter)

print("First batch:", first_batch, sep="\n")
print("\nSecond batch:", second_batch, sep="\n")


In [None]:
input_ids = torch.tensor([2,3,5,1])

In [None]:
#vocab_size =6
#output_dim = 3

In [None]:
# Toy dimensions for the embedding demo. They previously existed only as
# commented-out code, so `output_dim` was undefined on a fresh kernel and this
# cell raised NameError.
vocab_size = 6
output_dim = 3

# Seed first so the randomly initialised weights are reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size,output_dim)
print(embedding_layer.weight)

In [None]:
print(embedding_layer(torch.tensor([3])))

## Data sampling with a sliding window (contd.)

In [None]:
from torch.utils.data import Dataset
import torch

class GPTDatasetV1(Dataset):
    """Input/target window pairs cut from a tokenized text.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object whose ``encode(text)`` returns a sequence of token IDs.
        max_length: Number of tokens per window.
        stride: Offset between the starts of consecutive windows.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(text)

        # Build each (input, target) pair together; the target is the input
        # window moved one token to the right.
        window_pairs = [
            (
                torch.tensor(token_ids[start : start + max_length]),
                torch.tensor(token_ids[start + 1 : start + max_length + 1]),
            )
            for start in range(0, len(token_ids) - max_length, stride)
        ]

        self.input_ids = [inputs for inputs, _ in window_pairs]
        self.target_ids = [targets for _, targets in window_pairs]

    def __len__(self):
        """Number of windows held by the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Input and target tensors for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


In [None]:
def create_dataloader_v1(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Wrap GPT-2-tokenized sliding windows of ``txt`` in a DataLoader.

    Args:
        txt: Raw text to tokenize.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window.
        stride: Offset between the starts of consecutive windows.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.
    """
    bpe_tokenizer = tiktoken.get_encoding("gpt2")
    token_windows = GPTDatasetV1(txt, bpe_tokenizer, max_length, stride)

    batch_loader = DataLoader(
        token_windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return batch_loader

In [None]:
# Load the story text that the dataloader cells below consume.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

In [None]:
# Load the story text and show its size plus a short preview.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

character_count = len(raw_text)
preview = raw_text[:200]
print("Characters read:", character_count)
print("Preview:", preview)


In [None]:
print(raw_text[:1000])   # first 1000 characters

In [None]:
# Single-sample batches of eight-token windows, advancing four tokens per
# window, in the original order.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=8,
    stride=4,
    shuffle=False,
)

# Keep the iterator in `data_iter`; the next cell pulls the second batch from it.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

In [None]:
second_batch = next(data_iter)
print(second_batch)

In [None]:
# Eight windows per batch, eight tokens per window, advancing four tokens per
# window, in the original order.
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=8, stride=4, shuffle=False
)

data_iter = iter(dataloader)
input_ids, target_ids = next(data_iter)

# The batch tensor is two-dimensional: (samples, tokens per sample).
samples_in_batch, tokens_per_sample = input_ids.shape

print("===== FIRST BATCH =====")
print(f"Batch size     : {samples_in_batch}")
print(f"Sequence length: {tokens_per_sample}")

for heading, batch in (
    ("\n--- Input IDs (x) ---", input_ids),
    ("\n--- Target IDs (y) ---", target_ids),
):
    print(heading)
    print(batch)


In [None]:

tokenizer = tiktoken.get_encoding("gpt2")

# Decode every row of the input batch, then every row of the target batch.
decoded_sections = (
    ("\n--- Decoded Input Sequences ---", input_ids),
    ("\n--- Decoded Target Sequences ---", target_ids),
)
for heading, batch in decoded_sections:
    print(heading)
    for sample_idx, token_row in enumerate(batch):
        decoded_text = tokenizer.decode(token_row.tolist())
        print(f"[Sample {sample_idx}] {decoded_text}")


## Creating Token Embedding

In [None]:
input_ids = torch.tensor([7,2,1,5])

In [None]:
vocab_size =6
output_dim =3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size,output_dim)

In [None]:
print(vocab_size)

In [None]:
print(embedding_layer.weight)

In [None]:
embedding_layer(torch.tensor([3]))
#4th row

In [None]:
embedding_layer(torch.tensor([2])) 
#3rd row

In [None]:
print("Max token ID in batch:", input_ids.max().item())
print("Embedding vocab size :", embedding_layer.num_embeddings)


In [None]:
data_iter = iter(dataloader)
input_ids, target_ids = next(data_iter)


In [None]:
# The arrow was stored as mojibake ("â†’"); restore the intended character.
print("FIX CHECK →", input_ids.shape)


In [None]:
input_ids

In [None]:
# `embedding_layer` is the small toy layer, while `input_ids` now holds token
# IDs from the GPT-2 dataloader. An ID at or above `num_embeddings` makes the
# lookup raise IndexError, so only run it when every ID fits and report the
# mismatch otherwise.
if int(input_ids.max()) < embedding_layer.num_embeddings:
    toy_embeddings = embedding_layer(input_ids)
else:
    toy_embeddings = None
    print(
        f"Skipping lookup: max token ID {int(input_ids.max())} does not fit an "
        f"embedding table with {embedding_layer.num_embeddings} rows."
    )
toy_embeddings

In [None]:
input_ids

## Encoding word position

In [None]:
# The GPT-2 BPE vocabulary has 50,257 entries (token IDs 0..50256); the
# previous value 50527 had two digits transposed.
vocab_size = 50257
# Length of each token's embedding vector.
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
# Using the window length as the stride makes consecutive windows non-overlapping.
max_length = 4

dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)

# Take the first (inputs, targets) pair from the loader.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [None]:
print("Token ID's: \n ",inputs)

print("shape: \n",inputs.shape)

In [None]:
token_embedding=token_embedding_layer(inputs)

In [None]:
token_embedding.shape

The shape is (8, 4, 256): 8 sequences (rows) of 4 tokens each (columns), with a 256-dimensional embedding vector per token.

In [None]:
#token_embedding[0,0]

### Adding positional information

In [None]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [None]:
torch.arange(max_length) # 0 1 2 3

In [None]:
# Look up one embedding per position index 0 .. max_length - 1
# (positions 0, 1, 2, 3 when max_length is 4).
position_ids = torch.arange(max_length)
pos_embedding = pos_embedding_layer(position_ids)
print(pos_embedding.shape)  # (4, 256) for max_length=4 and output_dim=256
print(pos_embedding)

In [None]:
pos_embedding_layer.weight

In [None]:
token_embedding[0] +pos_embedding 

In [None]:
input_embedding = token_embedding + pos_embedding
input_embedding.shape

In [None]:
token_embedding[0] +pos_embedding 

In [None]:
token_embedding + pos_embedding