### **TOKENIZATION**

In [1]:
import re

# Directory holding the input text. The trailing slash is required because
# the file path is later built by plain string concatenation.
data_path = "./data/"

In [2]:
# Read the whole text file into a single string.
with open(data_path + "the_verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
        
# Sanity check: total length and the first 99 characters.
print("Total number of characters:", len(raw_text))
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [3]:
def split_into_tokens(text):
    """Split ``text`` on punctuation, double dashes and whitespace.

    The whole delimiter pattern sits inside one capturing group, so each
    matched delimiter is kept in the result as its own item. Empty strings
    and whitespace items are NOT filtered out here.

    Args:
        text: The string to split.

    Returns:
        list[str]: Pieces of ``text`` interleaved with the delimiters.
    """
    delimiters = r'([,.:;?_!\"()\']|--|\s)'
    pieces = re.split(delimiters, text)
    return pieces

def remove_whitespaces(text):
    """Strip every item and drop the ones that end up empty.

    Args:
        text: Iterable of strings. Despite the parameter name, callers in
            this notebook pass the list returned by ``split_into_tokens``.

    Returns:
        list[str]: The stripped, non-empty items in their original order.
    """
    kept = []
    for item in text:
        stripped = item.strip()
        if stripped:
            kept.append(stripped)
    return kept

In [4]:
# Tokenize the raw text and discard empty / whitespace-only pieces in one step.
preprocessed = remove_whitespaces(split_into_tokens(raw_text))

print(preprocessed[:99])
print(len(preprocessed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter']
4690


In [5]:
# Unique tokens in sorted order; the position of a token in this list is
# what the vocabulary built from it uses as the token id.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

In [6]:
# Map each token to its index in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

In [7]:
# for index, item in enumerate(vocab.items()):
#     print(item)
#     if (index == 50):
#         break;

In [8]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary."""

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse lookup.

        Args:
            vocab: Mapping from token string to integer id.
        """
        # token -> id, used by encode()
        self.str_to_int = vocab
        # id -> token, used by decode()
        self.int_to_str = dict((index, item) for item, index in vocab.items())

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            list[int]: One id per token, in order of appearance.

        Raises:
            KeyError: If a token is not present in the vocabulary.
        """
        tokens = remove_whitespaces(split_into_tokens(text))
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces; afterwards any whitespace
        directly in front of one of ``, . ? ! " ( ) '`` is removed. Because
        ``(`` and ``"`` are in that set, opening brackets and opening quotes
        are also attached to the preceding token.

        Args:
            ids: Iterable of integer token ids.

        Returns:
            str: The reconstructed text.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        words = (self.int_to_str[token_id] for token_id in ids)
        joined = " ".join(words)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)
        
        

In [9]:
# Build the tokenizer from the vocabulary derived from the text itself.
tokenizer = SimpleTokenizerV1(vocab)

In [10]:
# Encode the full text. Every token is known here because the vocabulary
# was built from this same text.
ids = tokenizer.encode(raw_text)
# NOTE(review): this prints every id; consider a slice such as ids[:50]
# to keep the cell output short.
print(ids)

[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 6, 1002, 115, 500, 435, 392, 6, 908, 585, 1077, 709, 508, 961, 1016, 663, 1016, 535, 987, 5, 568, 988, 538, 722, 549, 496, 5, 533, 514, 370, 549, 748, 5, 661, 115, 841, 1102, 5, 157, 397, 547, 568, 115, 1066, 727, 988, 84, 7, 3, 99, 53, 818, 1003, 585, 1120, 530, 208, 85, 734, 34, 7, 4, 1, 93, 538, 722, 549, 496, 1, 6, 987, 1077, 1089, 988, 1112, 242, 585, 7, 53, 244, 535, 67, 7, 37, 100, 6, 549, 602, 25, 897, 6, 326, 549, 1042, 116, 7, 1, 73, 297, 585, 2, 850, 498, 1016, 866, 988, 1059, 722, 697, 769, 2, 1083, 1051, 9, 239, 53, 359, 2, 970, 998, 722, 987, 5, 66, 7, 83, 6, 988, 646, 1016, 16, 584, 145, 53, 998, 722, 7, 1, 93, 1116, 5, 727, 67, 7, 100, 2, 850, 633, 5, 693, 586, 114, 847, 114, 177, 1002, 994, 1088, 827, 568, 156, 389, 1069, 722, 677, 7, 14, 585, 1077, 711, 731, 988, 67, 7, 101, 1097, 688, 7, 45, 711, 988, 410, 50, 28, 5, 180, 988, 602, 40, 36, 882, 5, 929, 663, 209, 38, 2, 850, 1, 65, 1, 1016, 856, 5, 1108, 976, 568, 539, 4

In [11]:
# Round-trip the ids back to text. The result is not identical to raw_text:
# tokens are re-joined with single spaces, so e.g. "--" ends up surrounded
# by spaces and the original line breaks are gone.
decoded_text = tokenizer.decode(ids)
print(decoded_text)

I HAD always thought Jack Gisburn rather a cheap genius -- though a good fellow enough -- so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera.( Though I rather thought it would have been Rome or Florence.)" The height of his glory" -- that was what the women called it. I can hear Mrs. Gideon Thwing -- his last Chicago sitter -- deploring his unaccountable abdication." Of course it' s going to send the value of my picture' way up ; but I don' t think of that, Mr. Rickham -- the loss to Arrt is all I think of." The word, on Mrs. Thwing' s lips, multiplied its _ rs _ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn' s" Moon-dancers" to say, with tears in her eyes :" We shall not look upon its like again"? Well

In [12]:
# Special tokens added to the vocabulary: a boundary marker placed between
# independent texts, and a placeholder for tokens missing from the vocabulary.
end_of_text = "<|endoftext|>"
unknown = "<|unk|>"

In [13]:
# Append the two special tokens after the sorted unique tokens so that they
# receive the two highest ids.
all_tokens = sorted(set(preprocessed))
all_tokens += [end_of_text, unknown]

vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [14]:
# Number of entries in the extended vocabulary.
len(vocab)

1132

In [15]:
# Show the last five (token, id) pairs; the special tokens sit at the end.
# Iterate directly: the index from enumerate() was never used, and binding
# it to `_` at top level rebinds the name IPython uses for the last output.
for item in list(vocab.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [16]:
class SimpleTokenizerV2(SimpleTokenizerV1):
    """Variant of ``SimpleTokenizerV1`` whose ``encode`` tolerates unknown tokens.

    Tokens missing from the vocabulary are replaced by ``"<|unk|>"`` before
    the id lookup; ``decode`` is inherited unchanged.
    """

    def encode(self, text):
        """Convert ``text`` into token ids, mapping unknown tokens to ``"<|unk|>"``.

        Args:
            text: The string to tokenize.

        Returns:
            list[int]: One id per token, in order of appearance.

        Raises:
            KeyError: If an unknown token is met and ``"<|unk|>"`` itself is
                not in the vocabulary.
        """
        known = self.str_to_int
        ids = []
        for token in remove_whitespaces(split_into_tokens(text)):
            if token not in known:
                # Fall back to the placeholder entry for anything unseen.
                token = "<|unk|>"
            ids.append(known[token])
        return ids

In [17]:
# Switch to the tokenizer that tolerates out-of-vocabulary words.
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace"

# Concatenate the two samples, marking the boundary with the special token.
text = text1 + f' {end_of_text} ' + text2

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace


In [18]:
# "Hello" and "palace" are not in the vocabulary, so both come out as the
# id of the <|unk|> placeholder.
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131]

In [19]:
# Round trip: unknown words come back as the literal <|unk|> placeholder,
# so the original words cannot be recovered.
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>'

In [21]:
# `import importlib` alone does not guarantee that the `metadata` submodule
# is loaded; import it explicitly so the attribute access below cannot fail
# with AttributeError on a fresh interpreter. This still binds `importlib`.
import importlib.metadata

import tiktoken

print("tiktoken version", importlib.metadata.version("tiktoken"))

tiktoken version 0.12.0


In [22]:
# Rebind `tokenizer` to tiktoken's "gpt2" encoding; from here on it is no
# longer one of the SimpleTokenizer classes defined in this notebook.
tokenizer = tiktoken.get_encoding("gpt2")

In [24]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces of the terraces"
    # NOTE(review): adjacent string literals are joined with no separator, so
    # the text contains "terracesof" -- confirm whether a space was intended.
    "of someunknownPlace"
)

# The <|endoftext|> marker is explicitly allowed as a special token here; in
# the recorded output it appears as a single id rather than several pieces.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 262, 8812, 2114, 1659, 617, 34680, 27271]
