# Encode & Decode

In [1]:
import re

In [2]:
class DataLoader:
    """Word-level tokenizer whose vocabulary is built from one text file.

    The file is read once and lower-cased. Its distinct tokens, in sorted
    order, define the integer ids used by ``encode`` and ``decode``.
    """

    def __init__(self, filename):
        """Read ``filename`` as UTF-8 and build both lookup tables.

        Sets ``filename``, ``text`` (the lower-cased file contents),
        ``vocabulary`` (sorted unique tokens), ``word2index`` and
        ``index2word``.
        """
        self.filename = filename

        with open(self.filename, 'r', encoding='utf-8') as handle:
            raw = handle.read()
        self.text = raw.lower()

        unique_tokens = set(self.split_text(self.text))
        self.vocabulary = sorted(unique_tokens)
        # A token's id is its position in the sorted vocabulary.
        self.index2word = dict(enumerate(self.vocabulary))
        self.word2index = {token: idx for idx, token in self.index2word.items()}

    @staticmethod
    def split_text(text):
        """Return the lower-cased tokens of ``text`` as a list of strings.

        The text is split on whitespace and on each of the characters
        ``, . : ; ? _ ! " ( ) '``. Because the pattern is a capturing
        group, the punctuation marks are kept as tokens of their own;
        empty and whitespace-only pieces are discarded.
        """
        pieces = []
        for fragment in re.split(r'([,.:;?_!"()\']|\s)', text.lower()):
            token = fragment.strip()
            if token:
                pieces.append(token)
        return pieces

    def encode(self, text):
        """Convert ``text`` into a list of vocabulary ids.

        Raises ``KeyError`` if a token is not present in ``word2index``.
        """
        lookup = self.word2index
        ids = []
        for token in self.split_text(text):
            ids.append(lookup[token])
        return ids

    def decode(self, tokens):
        """Convert an iterable of ids back into a single string.

        Words are joined with single spaces, then any whitespace that
        directly precedes one of ``, . : ; ? _ ! " ( ) '`` is removed.
        Raises ``KeyError`` if an id is not present in ``index2word``.
        """
        words = (self.index2word[token_id] for token_id in tokens)
        joined = " ".join(words)
        return re.sub(r'\s+([,.:;?_!"()\'])', r'\1', joined)

In [3]:
# Build the tokenizer from the text file; the path is relative to the
# directory the notebook is run from.
dataset = DataLoader('../one-day.txt')

# Show the length of the lower-cased text and the full word -> id table.
print('Total number of character: ', len(dataset.text))
print('Word to Index: ', dataset.word2index)

# Round-trip one sentence: text -> ids -> text.
# encode() raises KeyError if any token of the sentence is missing from
# the vocabulary built above.
sentence = '"Be careful," his mother says.'
ids = dataset.encode(sentence)

# The decoded string is lower-cased because encode() lower-cases its input.
print('Text: ', sentence)
print('Encode: ', ids)
print('Decode: ', dataset.decode(ids))

Total number of character:  3352
Word to Index:  {'"': 0, "'": 1, ',': 2, '.': 3, ':': 4, 'a': 5, 'about': 6, 'above': 7, 'across': 8, 'adventure': 9, 'after': 10, 'air': 11, 'along': 12, 'also': 13, 'an': 14, 'and': 15, 'ants': 16, 'apple': 17, 'are': 18, 'around': 19, 'as': 20, 'asleep': 21, 'at': 22, 'away': 23, 'back': 24, 'backpack': 25, 'bath': 26, 'be': 27, 'bed': 28, 'bees': 29, 'begins': 30, 'berries': 31, 'big': 32, 'birds': 33, 'bloom': 34, 'blue': 35, 'both': 36, 'bottle': 37, 'boy': 38, 'branches': 39, 'breakfast': 40, 'brings': 41, 'brushes': 42, 'butterfly': 43, 'buzz': 44, 'by': 45, 'calm': 46, 'careful': 47, 'carefully': 48, 'ceiling': 49, 'chirp': 50, 'clear': 51, 'climbs': 52, 'closes': 53, 'combs': 54, 'continues': 55, 'cookie': 56, 'cool': 57, 'dark': 58, 'day': 59, 'different': 60, 'dinner': 61, 'door': 62, 'down': 63, 'draws': 64, 'drinks': 65, 'eat': 66, 'eating': 67, 'eats': 68, 'eggs': 69, 'explorer': 70, 'eyes': 71, 'face': 72, 'falls': 73, 'fast': 74, 'faste