In [None]:
# Read the complete text file into memory as one string. Both names are
# bound to the same string; the following cells work with `text`.
with open("./the-verdict.txt", mode="r", encoding="UTF-8") as file:
    text = raw = file.read()

# Sanity check: report how many characters were loaded.
print("Number of characters in raw text ::: ", len(text))

Number of characters in raw text :::  20479


In [20]:
import re

# Split the corpus on the listed punctuation marks and on whitespace. The
# capture group makes re.split keep every delimiter as its own list item.
words_list = re.split(
    r"([,.:;?\"()-]|\s)",
    text,
)

In [21]:
# Discard the empty strings and whitespace-only pieces left over from the split.
words_list = [piece for piece in words_list if piece.strip()]

In [None]:
# Every distinct token in sorted order, followed by the two special marker
# tokens so that they receive the highest positions in the list.
unique_words = sorted(set(words_list)) + ["<|unk|>", "<|end_of_text|>"]

In [24]:
# Map each token to its position in unique_words; that position is its integer ID.
vocab = {token: token_id for token_id, token in enumerate(unique_words)}

In [27]:
from abc import ABC, abstractmethod

class Tokenize(ABC):
    """Abstract base class for tokenizers driven by a vocabulary mapping.

    Stores the vocabulary in both directions; concrete subclasses must
    implement ``encode`` and ``decode``.
    """

    def __init__(self, vocab):
        """Keep the vocabulary and derive its inverse.

        Args:
            vocab: Mapping whose keys are token strings and whose values are
                the IDs assigned to them.
        """
        self.str_to_int = vocab
        # Inverse lookup used for decoding: ID -> token string.
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    @abstractmethod
    def encode(self, string):
        """Turn ``string`` into token IDs; implemented by subclasses."""

    @abstractmethod
    def decode(self, tokens):
        """Turn token IDs back into text; implemented by subclasses."""


class SimpleTokenizerV1(Tokenize):
    """Regex-based tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    The constructor is inherited unchanged from ``Tokenize``: pass it the
    vocabulary mapping (token string -> integer ID).
    """

    def encode(self, string):
        """Split ``string`` into tokens and return their integer IDs.

        Args:
            string: Text to tokenize.

        Returns:
            List of token IDs. A token that is not in the vocabulary is
            encoded with the ID of ``"<|unk|>"``.

        Raises:
            KeyError: If an out-of-vocabulary token is met and ``"<|unk|>"``
                itself is missing from the vocabulary.
        """
        # Keep this pattern identical to the one used when the vocabulary was
        # built; otherwise known tokens would be split differently here and
        # fall back to <|unk|>.
        pieces = re.split(r"([,.:;?\"()-]|\s)", string)

        # Drop empty strings and pure-whitespace pieces produced by the split.
        preprocessed = [piece for piece in pieces if piece.strip()]

        return [
            self.str_to_int[word if word in self.str_to_int else "<|unk|>"]
            for word in preprocessed
        ]

    def decode(self, tokens):
        """Turn a sequence of token IDs back into a readable string.

        Args:
            tokens: Iterable of token IDs taken from the vocabulary.

        Returns:
            The tokens joined by single spaces, with the space in front of
            punctuation marks removed.

        Raises:
            KeyError: If an ID is not present in the vocabulary.
        """
        sentence = " ".join(self.int_to_str[token] for token in tokens)

        # Remove the space that the join inserted in front of punctuation.
        # The full stop is part of the class because encode() splits it off
        # as its own token; without it "tea." would decode as "tea .".
        return re.sub(r"\s+([,.:;?!\"()'])", r"\1", sentence)


# Round-trip demo: encode a sample sentence to token IDs, then decode it.
# In the recorded output "Hello" comes back as <|unk|>, i.e. it was not
# part of the vocabulary.
tokenizer = SimpleTokenizerV1(vocab)
test_string = "Hello, do you like tea?"

# Text -> token IDs.
tokens = tokenizer.encode(test_string)
print(tokens)

# Token IDs -> text.
decoded_string = tokenizer.decode(tokens)
print(decoded_string)

[1190, 11, 396, 1184, 670, 1026, 16]
<|unk|>, do you like tea?
