In [1]:
#Read the text file and count total characters
with open("C:\\VarunVault\\LLMs\\CodeFiles\\data\\the-verdict.txt","r",encoding="utf-8") as f:
#with open("C:\\VarunVault\\LLMs\\CodeFiles\\data\\ML.txt","r",encoding="utf-8") as f:
    raw_data = f.read()
print("Total Characters:",len(raw_data))

Total Characters: 20480


In [2]:
#Tokenize the text data and count total tokens, keeping punctuation as separate tokens and removing extra spaces
import re
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_data)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print("Total Tokens:",len(preprocessed))

Total Tokens: 4690


In [3]:
#create a list of all unique tokens and sort them alphabetically to determine the vocabulary size:
all_words = sorted(set(preprocessed))
#vocab_size = len(all_words)
print("Vocabulary Size:",len(all_words))

Vocabulary Size: 1130


In [6]:
#After determining that the vocabulary size is 174 via the above code, we create the vocabulary and print its first 51 entries for illustration purposes:
vocab = {token:integer for integer,token in enumerate(all_words)}
for i,item in enumerate(vocab.items()):
    print(item)
    if i>=40:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)


In [7]:
#We create Tokenizer class which contains Encode and Decode methods to convert text to token IDs and vice versa:
class SimpleTokenizerV1:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s  for s,i in vocab.items() }

    def encode(self,text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        print("Preprocessed:",preprocessed)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        print("Preprocessed after stripping:",preprocessed)
        ids = [self.str_to_int[token] for token in preprocessed]
        return ids
    def decode(self,token_ids):
        text = " ".join([self.int_to_str[id] for id in token_ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text
    

In [8]:
#New tokenizer object calling Encoder
tokenizer = SimpleTokenizerV1(vocab)
text = "It's the last he painted, you know,"
ids=tokenizer.encode(text)
print("Token IDs:",ids)

Preprocessed: ['It', "'", 's', ' ', 'the', ' ', 'last', ' ', 'he', ' ', 'painted', ',', '', ' ', 'you', ' ', 'know', ',', '']
Preprocessed after stripping: ['It', "'", 's', 'the', 'last', 'he', 'painted', ',', 'you', 'know', ',']
Token IDs: [56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5]


In [9]:
#Calling Decoder
decoded_text = tokenizer.decode(ids)
print("Decoded Text:",decoded_text) 

Decoded Text: It' s the last he painted, you know,


In [10]:
#Testing outside vocab , you will get KeyError
test_text = "Hello world!"
print("Testing outside vocab:",tokenizer.encode(test_text))

Preprocessed: ['Hello', ' ', 'world', '!', '']
Preprocessed after stripping: ['Hello', 'world', '!']


KeyError: 'Hello'

In [None]:
#Augmment the vobac to handle unkown tokens by adding <UNK> token and end of text <endoftext>

all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|unk|>", "<|endoftext|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

In [None]:
#Length of new vocab will increase by 2
len(vocab)

In [None]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

In [None]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int 
            else "<|unk|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [None]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))

print(text)

In [None]:
tokenizer.encode(text)

In [None]:
tokenizer.decode(tokenizer.encode(text))

In [None]:
#Use BPE tokenizer from the tokenizers library
!pip3 install tiktoken

In [11]:
import tiktoken 
import importlib
print("tiktoken Version:",importlib.metadata.version("tiktoken"))

tiktoken Version: 0.12.0


In [12]:
tokenizer = tiktoken.get_encoding("cl100k_base")
print("Tokenizer Type:",tokenizer)

Tokenizer Type: <Encoding 'cl100k_base'>


In [13]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[9906, 11, 656, 499, 1093, 15600, 30, 220, 100257, 763, 279, 7160, 32735, 7317, 2492, 1073, 1063, 16476, 17826, 13]


In [None]:
strings = tokenizer.decode(integers)
print(strings)  

In [105]:
integers = tokenizer.encode("Hello my Name is Varun.")
print(integers)

[9906, 856, 4076, 374, 8909, 359, 13]


In [106]:
strings = tokenizer.decode(integers)
print(strings)  

Hello my Name is Varun.
