In [1]:
# Read the entire text file into a single string.
with open("data/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Quick sanity check: total length and the first 99 characters.
print("Total number of characters in data:", len(raw_text))
print(raw_text[:99])
    

Total number of characters in data: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


# Step 1 : Tokenizing the Input Data

---
<span style="color:#FF5733">
We now create tokens using a regex that splits on whitespace.
The split also treats punctuation such as "." and "," as separate tokens.
We then strip the whitespace from the result for a number of reasons (size, memory, processing power).
</span>

In [2]:
import re

# Toy sentence for trying out the splitting rule.
text = "Hello, world. Is this-- , a test?"

# The capture group makes re.split keep every delimiter (punctuation, "--",
# whitespace) as its own list item instead of discarding it.
re_words = re.split(r'([,.:;?_!"()\']|--|\s)', text)

In [3]:
# Show the raw split result; delimiters, whitespace items and empty strings are still in the list.
print(re_words)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'Is', ' ', 'this', '--', '', ' ', '', ',', '', ' ', 'a', ' ', 'test', '?', '']


In [4]:
# Keep only the items that are non-empty after stripping whitespace.
final_result = list(filter(str.strip, re_words))

In [5]:
# Show the cleaned token list (no empty or whitespace-only items).
print(final_result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', ',', 'a', 'test', '?']


---
<span style="color:#FF5733">
That was a toy example.
    Below we tokenize the actual dataset.
</span>

In [6]:
# Split the full text on punctuation, "--" and whitespace (delimiters are kept),
# then trim every piece and discard the ones that end up empty.
split_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [piece.strip() for piece in split_pieces if piece.strip()]

# Peek at the first 30 tokens.
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [7]:
# Total number of tokens produced from the dataset.
print(len(preprocessed))

4690


# Step 2 : Assign Token IDs to Words

---
<span style="color:#FF5733">
Now we create a vocabulary from the text and then assign each entry a numeric ID.
    1. We first create a set to get the unique values in the vocab
    2. Then we sort them
</span>

In [8]:
# Unique tokens in sorted order.
all_words = sorted(set(preprocessed))

# Size of the vocabulary built from the training text we chose for pre-training our LLM.
print(len(all_words))

1130


In [9]:
# Pair every token with its position in the sorted token list.
# The resulting dictionary (token -> integer id) is our vocabulary.
vocab = dict(zip(all_words, range(len(all_words))))


In [10]:
# Preview the first 11 (token, id) pairs of the vocabulary.
for position, entry in enumerate(vocab.items()):
    if position > 10:
        break
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)


---
<span style="color:#FF5733">
Now we create a tokenizer class for input text that arrives on the fly.
    It will contain two methods: encode and decode.

</span>

In [11]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Args:
        vocab: Mapping from token string to integer id.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Reverse lookup (id -> token) used by decode().
        self.int_to_str = {index: token for token, index in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids.

        The text is split on punctuation, "--" and whitespace; punctuation
        is kept as separate tokens, whitespace is dropped.

        Raises:
            KeyError: If any token is not present in the vocabulary.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert a sequence of ids back into a single string.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in ids)
        # The punctuation set includes ':' and ';' so that it matches the
        # single-character delimiters encode() splits on; without them a
        # round trip would produce "a ; b" instead of "a; b".
        return re.sub(r'\s+([,.:;?_!"()\'])', r'\1', text)

In [12]:
tokenizer = SimpleTokenizerV1(vocab)

# Sample passage; SimpleTokenizerV1.encode raises KeyError for any token
# that is missing from the vocabulary.
text = """"It's the last he painted, you know,"
         Mrs. Gisburn said with pardonable pride."""

ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [13]:
# Turn the ids back into a string; the original spacing is only approximately restored.
text = tokenizer.decode(ids)
print(text)

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


# ADDING  SPECIAL CONTEXT TOKENS

---
<span style="color:#FF5733">
What happens when LLM comes across a word that is not part of the vocabulary we generated from the training 
    datasets . We cannot have LLM fail due to this . This scenarios is handled by Adding special context
tokens
<span>


In this case we will add two additional tokens to the vocab.
1. <|unk|>  :- This is used whenever a word/token is encountered that is not part of the vocab
2. <|endoftext|> :- LLMs are trained on a large corpus of datasets (multiple books, articles, etc.); let's call each of these a training set. To clearly
                        distinguish between these different training sets, we add this token before each training set


In [14]:
# Token count of the dataset, shown again before the vocabulary is rebuilt.
print(len(preprocessed))

4690


In [15]:
# Unique tokens in sorted order; sorted() already returns a new list.
all_tokens = sorted(set(preprocessed))

In [16]:
# Append the special tokens after the sorted tokens so they sit at the end of
# the vocab list. The membership check keeps this cell idempotent: running it
# again does not append the same special token a second time.
all_tokens.extend(
    [token for token in ("<|endoftext|>", "<|unk|>") if token not in all_tokens]
)

In [18]:
# Rebuild the vocabulary (token -> integer id) from the extended token list.
vocab = {tok: idx for idx, tok in enumerate(all_tokens)}

In [19]:
# Preview the first 11 (token, id) pairs of the rebuilt vocabulary.
for position, entry in enumerate(vocab.items()):
    if position > 10:
        break
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)


In [20]:
# Show the last 5 (token, id) pairs; the special tokens were appended at the end of the token list.

list(vocab.items())[-5:]

[('younger', 1127),
 ('your', 1128),
 ('yourself', 1129),
 ('<|endoftext|>', 1130),
 ('<|unk|>', 1131)]

---
<span style="color:#FF5733">
We now create a new version of the SimpleTokenizer class, SimpleTokenizerV2.
As part of the updates to this class, we make sure that any new token/word that is not part of the vocab we created
will be replaced with the new token "<|unk|>", the one we added above. This way, even if a token/word is not part of the
dataset/training set, the LLM will not fail.
</span>

In [30]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Args:
        vocab: Mapping from token string to integer id. It needs a
            ``"<|unk|>"`` entry for unknown tokens to be encodable.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Reverse lookup (id -> token) used by decode().
        self.int_to_str = {index: token for token, index in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids.

        The text is split on punctuation, "--" and whitespace; punctuation
        is kept as separate tokens, whitespace is dropped. Any token that is
        not in the vocabulary is replaced by ``"<|unk|>"`` before lookup.

        Raises:
            KeyError: If an unknown token is met and the vocabulary has no
                ``"<|unk|>"`` entry.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        known_or_unk = [
            token if token in self.str_to_int else "<|unk|>"
            for token in tokens
        ]
        return [self.str_to_int[token] for token in known_or_unk]

    def decode(self, ids):
        """Convert a sequence of ids back into a single string.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in ids)
        # The punctuation set includes ':' and ';' so that it matches the
        # single-character delimiters encode() splits on; without them a
        # round trip would produce "a ; b" instead of "a; b".
        return re.sub(r'\s+([,.:;?_!"()\'])', r'\1', text)

In [31]:
tokenizer = SimpleTokenizerV2(vocab)

# Two unrelated sample texts.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# Join them with the end-of-text marker in between.
text = " <|endoftext|> ".join([text1, text2])
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


---
<span style="color:#FF5733">
The above two texts stand in for our training set.  
    
We have separated them using the token " <|endoftext|> " to mark a clear segregation.  
    
Also, in these two texts, the tokens "Hello" and "palace" are not part of the vocab, so they are now replaced with the "<|unk|>" token as well.  
    
See the example below.
</span>

In [34]:
# Encode the joined sample text; tokens missing from the vocab are mapped to the <|unk|> id.
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [35]:
# Round trip: encode, then decode, to see which tokens were replaced by <|unk|>.
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'