# Tokenization

_Tokenization_ is the process of converting a body of text into individual _tokens_, e.g., words and punctuation characters. This is the first step for most Natural Language Processing (NLP) tasks, including preparing data for training an LLM. Let's see how it's done!

## Some sample text

In [None]:
# Sample sentence used to demonstrate tokenization in the cells that follow.
text = "This is a test! Or is this not a test? Test it to be sure. :)"
# Show the text and its character count, one per line.
print(text, f"This sample text has {len(text)} characters.", sep="\n")

This is a test! Or is this not a test? Test it to be sure. :)
This sample text has 61 characters.


In [None]:
print( text.split() )

['This', 'is', 'a', 'test!', 'Or', 'is', 'this', 'not', 'a', 'test?', 'Test', 'it', 'to', 'be', 'sure.', ':)']


In [None]:
import re

In [None]:
# Split on whitespace and on each of . ? ! : ( ) -- the capture group keeps
# the punctuation marks as tokens. Empty and whitespace-only pieces are
# dropped, and the set removes duplicates before sorting.
tokens = sorted({
    piece
    for piece in re.split(r'([.?!:()]|\s)', text)
    if piece.strip()
})
print(tokens)

['!', ')', '.', ':', '?', 'Or', 'Test', 'This', 'a', 'be', 'is', 'it', 'not', 'sure', 'test', 'this', 'to']


In [None]:
# Map every token to its position in the sorted token list.
vocab = dict(zip(tokens, range(len(tokens))))
print(vocab.items())

dict_items([('!', 0), ("'", 1), ("'Error,'", 2), ("'The", 3), ("'fine,", 4), ("'hands-off", 5), ("'heavy", 6), ('.', 7), ('1', 8), (':', 9), ('<|unk|>, <|endoftext|>', 10), ('?', 11), ('A', 12), ('All', 13), ('Almost', 14), ('Am', 15), ('And', 16), ('And,', 17), ('Anstruther', 18), ('Anstruther,”', 19), ('Anstruther—what', 20), ('Art', 21), ('At', 22), ('Briefly,', 23), ('But', 24), ('But,', 25), ('Calhoun', 26), ("Calhoun's", 27), ('Calhoun,', 28), ('Calhoun,”', 29), ('Calhoun—I', 30), ('Calhoun—it', 31), ('Can', 32), ('Carter——”', 33), ('Christian', 34), ('Coronet', 35), ('Dairy', 36), ('David', 37), ('David,', 38), ('David,”', 39), ('Death', 40), ('Did', 41), ("Didn't", 42), ('Do', 43), ("Don't", 44), ('Drawn', 45), ('Fancy', 46), ('Five', 47), ('For', 48), ('From', 49), ('Glad', 50), ('God', 51), ('God,', 52), ('Good-by,', 53), ('Halfway', 54), ('Hardy', 55), ("Hardy's", 56), ('Hardy,', 57), ('Hardy,”', 58), ('Hardy—may', 59), ('He', 60), ('Heaven', 61), ('Helen', 62), ('Helen,', 6

In [None]:
vocab["a"]

8

In [None]:
# Load the short story that the vocabulary is built from.
# The encoding is stated explicitly because the text contains curly quotes
# and em dashes, which are mis-decoded when the platform default encoding
# is not UTF-8.
# NOTE(review): assumes ShortStory.txt is saved as UTF-8 -- confirm.
with open("ShortStory.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Preview the first 50 characters to check that the file was read.
print(raw_text[:50])

IF you please, ma'am, dinner is served.”

David Ha


In [None]:
# Split the story on whitespace and on each of . ? ! : ( ), keeping the
# punctuation marks as tokens and discarding empty/whitespace-only pieces.
tokens = [
    piece
    for piece in re.split(r'([.?!:()]|\s)', raw_text)
    if piece.strip()
]
# Append the special tokens: one for unknown words, one for end of text.
tokens += ["<|unk|>", "<|endoftext|>"]
print(len(tokens))

6936


In [None]:
# Keep one copy of each token, in sorted order, and report how many remain.
tokens = sorted(set(tokens))
print(len(tokens))

1917


In [None]:
print(tokens[:25])

['!', "'", "'Error,'", "'The", "'fine,", "'hands-off", "'heavy", '.', '1', ':', '<|endoftext|>', '<|unk|>', '?', 'A', 'All', 'Almost', 'Am', 'And', 'And,', 'Anstruther', 'Anstruther,”', 'Anstruther—what', 'Art', 'At', 'Briefly,']


In [None]:
# Vocabulary: each token mapped to its position in the sorted token list.
vocab = dict(zip(tokens, range(len(tokens))))
vocab.items()

dict_items([('!', 0), ("'", 1), ("'Error,'", 2), ("'The", 3), ("'fine,", 4), ("'hands-off", 5), ("'heavy", 6), ('.', 7), ('1', 8), (':', 9), ('<|endoftext|>', 10), ('<|unk|>', 11), ('?', 12), ('A', 13), ('All', 14), ('Almost', 15), ('Am', 16), ('And', 17), ('And,', 18), ('Anstruther', 19), ('Anstruther,”', 20), ('Anstruther—what', 21), ('Art', 22), ('At', 23), ('Briefly,', 24), ('But', 25), ('But,', 26), ('Calhoun', 27), ("Calhoun's", 28), ('Calhoun,', 29), ('Calhoun,”', 30), ('Calhoun—I', 31), ('Calhoun—it', 32), ('Can', 33), ('Carter——”', 34), ('Christian', 35), ('Coronet', 36), ('Dairy', 37), ('David', 38), ('David,', 39), ('David,”', 40), ('Death', 41), ('Did', 42), ("Didn't", 43), ('Do', 44), ("Don't", 45), ('Drawn', 46), ('Fancy', 47), ('Five', 48), ('For', 49), ('From', 50), ('Glad', 51), ('God', 52), ('God,', 53), ('Good-by,', 54), ('Halfway', 55), ('Hardy', 56), ("Hardy's", 57), ('Hardy,', 58), ('Hardy,”', 59), ('Hardy—may', 60), ('He', 61), ('Heaven', 62), ('Helen', 63), ('He

In [None]:
# Sentence to convert into token ids in the following cells.
phrase = "He tried to laugh, but failed rather badly;"
print(phrase)

He tried to laugh, but failed rather badly;


In [None]:
# Tokenize the phrase with the same pattern used to build the vocabulary.
# Commas and semicolons are not split characters, so they stay attached to
# the preceding word.
phrase = [
    piece
    for piece in re.split(r'([.?!:()]|\s)', phrase)
    if piece.strip()
]
print(phrase)

['He', 'tried', 'to', 'laugh,', 'but', 'failed', 'rather', 'badly;']


In [None]:
# Look up the vocabulary id of every token; a token that is not in the
# vocabulary raises KeyError.
ids = list(map(vocab.__getitem__, phrase))
print(ids)

[61, 1667, 1647, 977, 364, 652, 1297, 278]


In [None]:
# Inverse mapping (id -> token), used to turn ids back into text.
reverse_vocab = dict(zip(vocab.values(), vocab.keys()))
reverse_vocab.items()

dict_items([(0, '!'), (1, "'"), (2, "'Error,'"), (3, "'The"), (4, "'fine,"), (5, "'hands-off"), (6, "'heavy"), (7, '.'), (8, '1'), (9, ':'), (10, '<|endoftext|>'), (11, '<|unk|>'), (12, '?'), (13, 'A'), (14, 'All'), (15, 'Almost'), (16, 'Am'), (17, 'And'), (18, 'And,'), (19, 'Anstruther'), (20, 'Anstruther,”'), (21, 'Anstruther—what'), (22, 'Art'), (23, 'At'), (24, 'Briefly,'), (25, 'But'), (26, 'But,'), (27, 'Calhoun'), (28, "Calhoun's"), (29, 'Calhoun,'), (30, 'Calhoun,”'), (31, 'Calhoun—I'), (32, 'Calhoun—it'), (33, 'Can'), (34, 'Carter——”'), (35, 'Christian'), (36, 'Coronet'), (37, 'Dairy'), (38, 'David'), (39, 'David,'), (40, 'David,”'), (41, 'Death'), (42, 'Did'), (43, "Didn't"), (44, 'Do'), (45, "Don't"), (46, 'Drawn'), (47, 'Fancy'), (48, 'Five'), (49, 'For'), (50, 'From'), (51, 'Glad'), (52, 'God'), (53, 'God,'), (54, 'Good-by,'), (55, 'Halfway'), (56, 'Hardy'), (57, "Hardy's"), (58, 'Hardy,'), (59, 'Hardy,”'), (60, 'Hardy—may'), (61, 'He'), (62, 'Heaven'), (63, 'Helen'), (64,

In [None]:
# Turn the ids back into tokens and join them with single spaces.
print(" ".join(reverse_vocab[token_id] for token_id in ids))

He tried to laugh, but failed rather badly;


In [None]:
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Text is split on whitespace and on the characters ``. ? ! : ( )``; each
    of those punctuation marks becomes its own token and whitespace is
    discarded. Tokens that are not in the vocabulary are encoded as the
    ``<|unk|>`` token.
    """

    # Compiled once at class level instead of on every encode/decode call.
    _SPLIT_PATTERN = re.compile(r'([.?!:()]|\s)')
    # Whitespace directly before one of . ? ! : ( ) " ' ^
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([.?!:()"\'^])')
    _UNKNOWN_TOKEN = "<|unk|>"

    def __init__(self, vocab):
        """Store the vocabulary and build the inverse id-to-token mapping.

        Args:
            vocab: Mapping from token string to integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {index: token for token, index in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: String to tokenize.

        Returns:
            List of ids, one per token. Empty or whitespace-only text gives
            an empty list.

        Raises:
            KeyError: If a token is not in the vocabulary and the vocabulary
                has no ``<|unk|>`` entry to fall back on.
        """
        ids = []
        for piece in self._SPLIT_PATTERN.split(text):
            if not piece.strip():
                # Empty strings and whitespace separators are not tokens.
                continue
            if piece not in self.str_to_int:
                if self._UNKNOWN_TOKEN not in self.str_to_int:
                    # Name the offending word rather than surfacing a bare
                    # KeyError('<|unk|>') from the fallback lookup.
                    raise KeyError(
                        f"token {piece!r} is not in the vocabulary and the "
                        f"vocabulary has no {self._UNKNOWN_TOKEN!r} entry"
                    )
                piece = self._UNKNOWN_TOKEN
            ids.append(self.str_to_int[piece])
        return ids

    def decode(self, ids):
        """Convert a sequence of token ids back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        the characters ``. ? ! : ( ) " ' ^`` is removed.

        Args:
            ids: Iterable of ids present in the vocabulary.

        Returns:
            The reconstructed string; an empty string for no ids.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [None]:
tokenizer = SimpleTokenizer(vocab)

In [None]:
phrase= "He hesitated. For a full minute, he stood looking away from her; and then, with a slow, uncertain step, he went to the writing table by the window,"

In [None]:
# Encode the sentence with the tokenizer instance and show the resulting ids.
ids = tokenizer.encode(phrase)
print(ids)

[61, 854, 7, 49, 173, 743, 1082, 827, 1524, 1021, 270, 740, 847, 226, 1608, 1770, 173, 1462, 1687, 1521, 827, 1737, 1647, 1601, 1800, 1563, 366, 1601, 11]


In [None]:
# Decode the ids back into a string (this rebinds the name `text`).
text = tokenizer.decode(ids)
print(text)

He hesitated. For a full minute, he stood looking away from her; and then, with a slow, uncertain step, he went to the writing table by the <|unk|>


In [None]:
different_phrase= "And all day long he had sat at the handsome table and looked out over the gray river at the masts piercing the mists"

In [None]:
# Encode the second sentence and show its ids.
ids = tokenizer.encode(different_phrase)
print(ids)

In [None]:
# Decode the ids of the second sentence back into a string.
text = tokenizer.decode(ids)
print(text)

In [None]:
#End of Tokenizer Assignment

In [None]:
import tiktoken

In [None]:
tokenizer= tiktoken.get_encoding("gpt2")