# Tokenizing text

We'll use an off-the-shelf tokenizer for doing Byte Pair Encoding (BPE).  We'll use `tiktoken` for this.

In [None]:
#| echo: true
#| output: false
%conda install -y tiktoken

Let's load a text and tokenize it:

In [4]:
import tiktoken

# Relative path to the text file used as the corpus throughout this notebook
# (read by `get_sample_text` via `load_text`).
filepath = '../data/dracula.txt'

def load_text(path, encoding="utf-8"):
    """Read an entire text file into a single string.

    Args:
        path: Location of the file to read.
        encoding (str): Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform's locale
            default (which is not UTF-8 on every system).

    Returns:
        str: The complete contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file contents are not valid in `encoding`.
    """
    # Explicit encoding: without it, open() falls back to the locale encoding
    # and the same file can decode differently (or fail) on another machine.
    with open(path, 'r', encoding=encoding) as f:
        return f.read()

def tokens_from_text(text: str):
    """Encode a string into token ids using tiktoken's "gpt2" encoding.

    Args:
        text (str): The text to tokenize.

    Returns:
        The token ids returned by the encoding's `encode` method.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    return gpt2_encoding.encode(text)

def text_from_tokens(tokens: list[int]):
    """Decode token ids back into text using tiktoken's "gpt2" encoding.

    Args:
        tokens (list[int]): Token ids to decode.

    Returns:
        The text returned by the encoding's `decode` method.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    return gpt2_encoding.decode(tokens)


This now allows us to load text and turn it into tokens (each identified by an integer) or the reverse: given a sequence of tokens, reconstruct the text from them:

In [17]:
def get_sample_text(num_chars:int = 40):
    """Return the leading `num_chars` characters of the file at `filepath`.

    The whole file is loaded with `load_text` and then sliced.

    Args:
        num_chars (int): Number of characters to keep from the start.

    Returns:
        The sliced prefix of the loaded text.
    """
    return load_text(filepath)[:num_chars]

# Take the default-length (40 character) prefix of the book and show it.
sample_text = get_sample_text()
print(sample_text)

# Encode the sample into token ids with the "gpt2" encoding.
tokens = tokens_from_text(sample_text)
print(tokens)

# Decode the ids again; the printed text should match the sample above.
text = text_from_tokens(tokens)
print(text)

The Project Gutenberg eBook of Dracula
 
[464, 4935, 20336, 46566, 286, 41142, 198, 220]
The Project Gutenberg eBook of Dracula
 


# Creating a dataset

We'll first make sure PyTorch is installed: `conda install pytorch cpuonly -c pytorch`.


In [18]:
import torch
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Sliding-window dataset of (input, target) token sequences.

    The text is tokenized once. Windows of `max_length` tokens are then taken
    every `stride` tokens. For each window the target is the same window moved
    one token to the right, i.e. the next token at every position.
    """

    def __init__(self, txt: str, tokenizer, max_length=16, stride=4):
        """
        Args:
            txt (str): The input text to tokenize and split into sequences.
            tokenizer: The tokenizer used to encode the text into token ids
                (must provide an `encode(str)` method).
            max_length (int): The context length, i.e., the number of tokens in each input sequence.
            stride (int): The step size between the start of consecutive sequences.

        Raises:
            ValueError: If `max_length` or `stride` is not a positive integer.
        """
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.tokenizer = tokenizer
        self.max_length = max_length  # context length for each input sequence
        self.stride = stride
        self.token_ids = self.tokenizer.encode(txt)
        self.length = len(self.token_ids)

        self.input_ids = []    # list of input windows, our "context" as input to the LLM
        self.target_ids = []   # list of target windows: the input window shifted right by one token

        # A window starting at `start` needs max_length + 1 tokens (the extra
        # one is the final target), so valid starts end before
        # length - max_length. Stepping by `stride` honours the documented
        # spacing; for texts shorter than that the range is simply empty.
        for start in range(0, self.length - self.max_length, self.stride):
            end = start + self.max_length
            self.input_ids.append(torch.tensor(self.token_ids[start:end]))
            self.target_ids.append(torch.tensor(self.token_ids[start + 1:end + 1]))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the `(input_ids, target_ids)` tensor pair at position `idx`."""
        return self.input_ids[idx], self.target_ids[idx]
    
def create_dataloader(
    txt: str,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """
    Build a DataLoader over a `MyDataset` made from the given text.

    The text is tokenized with tiktoken's "gpt2" encoding.

    Args:
        txt (str): The input text to tokenize and split into sequences.
        batch_size (int): Number of samples per batch.
        max_length (int): The context length, i.e., the number of tokens in each input sequence.
        stride (int): The step size between the start of consecutive sequences.
        shuffle (bool): Whether to shuffle the data at every epoch.
        drop_last (bool): Whether to drop the last incomplete batch.
        num_workers (int): Number of subprocesses to use for data loading.

    Returns:
        DataLoader: Loader wrapping the `MyDataset` built from `txt`.
    """
    windows = MyDataset(
        txt,
        tiktoken.get_encoding("gpt2"),
        max_length=max_length,
        stride=stride,
    )
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

Let's test our dataloader now:

In [None]:
# Use the first 1000 characters of the book as a small test corpus.
text = get_sample_text(1000)
print("sample_text: ", text)
# Small batch size and context length keep the printed tensors readable;
# drop_last=False keeps a final batch even if it is smaller than batch_size.
# NOTE: shuffle is left at its default (True), so the batch printed below may differ between runs.
dataloader = create_dataloader(txt=text, batch_size=2, max_length=8, stride=2, drop_last=False)
for batch in dataloader:
    # Each batch is an (inputs, targets) pair as returned by MyDataset.__getitem__, collated by the DataLoader.
    input_ids, target_ids = batch
    print("Input IDs:", input_ids)
    print("Target IDs:", target_ids)
    break  # Just show the first batch
# Summary of the loader configuration.
print("Total batches:", len(dataloader))
print("Batch size:", dataloader.batch_size)
print("Number of workers:", dataloader.num_workers)

sample_text:  The Project Gutenberg eBook of Dracula
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Dracula

Author: Bram Stoker

Release date: October 1, 1995 [eBook #345]
                Most recently updated: November 12, 2023

Language: English

Credits: Chuck Greif and the Online Distributed Proofreading Team


*** START OF THE PROJECT GUTENBERG EBOOK DRACULA ***




                                DRACULA

                                  _by_

                              Bram Stoker

                        [Illustration: colophon]

                                NEW YORK

