In [3]:
import os
import re

In [2]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [4]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4690


In [5]:
preprocessed[:10]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

In [8]:
sorted(preprocessed)[:10]

['!', '!', '!', '!', '!', '!', '!', '!', '!', '!']

In [9]:
all_words = set(sorted(preprocessed))
all_words

{'palm-trees',
 'resolve',
 'stand',
 'vindicated',
 'attack',
 'between',
 'real',
 'hall',
 'silent',
 'them',
 'flash',
 'five',
 'Are',
 '_',
 'me',
 'were',
 'Why',
 'arm',
 'to',
 'genius',
 'serious',
 'arms',
 'beaming',
 'gray',
 'holding',
 'years',
 'canvas',
 'later',
 'patient',
 'word',
 'reason',
 'ourselves',
 'represented',
 'or',
 'again',
 'platitudes',
 'used',
 'Was',
 'going',
 'down',
 'negatived',
 'grayish',
 'close',
 'hung',
 'died',
 'nervousness',
 'splash',
 'good',
 'vista',
 'Chicago',
 'spacious',
 'full',
 'without',
 'atom',
 'should',
 'placed',
 'silver',
 'amusing',
 '--',
 'For',
 'dingy',
 'foreseen',
 'since',
 'faith',
 'grew',
 'strain',
 'regrets',
 'mighty',
 'posing',
 'at',
 'landing',
 'surprised',
 'princely',
 'wincing',
 'cigar',
 'covered',
 'on',
 'event',
 'fashionable',
 'house',
 'lucky',
 'subject',
 'too',
 'tributes',
 'veins',
 'enabled',
 'be',
 'live',
 'escape',
 'immediately',
 'something',
 'drawn',
 'object',
 'stuff',
 

In [10]:
len(all_words)

1130

In [11]:
vocab = {token:integer for integer,token in enumerate(all_words)}

check_vocab = [x for x in vocab.items()][:10]
check_vocab

[('palm-trees', 0),
 ('resolve', 1),
 ('stand', 2),
 ('vindicated', 3),
 ('attack', 4),
 ('between', 5),
 ('real', 6),
 ('hall', 7),
 ('silent', 8),
 ('them', 9)]

In [None]:
class TokenizerV1():

    def __init__(self, vocabulary):
        self.str_to_int = vocabulary
        self.int_to_str = {idx: val for val, idx in vocabulary.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[x] for x in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join(self.int_to_str[id] for id in ids)
        return text

In [26]:
t = TokenizerV1(vocab)

print(t.str_to_int['resolve'])
print(t.int_to_str[1])

id = t.encode("It's the last he painted, you know, Mrs. Gisburn said with pardonable pride")
print(id)

txt = t.decode([908, 1086, 455, 802, 807, 150, 741, 1099, 109, 343, 1099, 966, 262, 849, 1036, 851, 616, 145])
print(txt)

1
resolve
[908, 1086, 455, 802, 807, 150, 741, 1099, 109, 343, 1099, 966, 262, 849, 1036, 851, 616, 145]
It ' s the last he painted , you know , Mrs . Gisburn said with pardonable pride


In [28]:
print(t.encode("encyclopedia"))

KeyError: 'encyclopedia'

In [29]:
unknown_token = "<|unk|>"
all_words_with_unknown = sorted(list(set(preprocessed)))
all_words_with_unknown.append(unknown_token)
vocab_with_unknown = {token:integer for integer,token in enumerate(all_words_with_unknown)}

In [31]:
for i, item in enumerate(list(vocab_with_unknown.items())[-5:]):
    print(item)

len(vocab_with_unknown)

('you', 1126)
('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|unk|>', 1130)


1131

In [34]:
class TokenizerV2(TokenizerV1):

    def __init__(self, vocabulary):
        super().__init__(vocabulary=vocabulary)
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        return [self.str_to_int[x] for x in preprocessed]
    
    def decode(self, ids):
        txt = " ".join(self.int_to_str[id] for id in ids)
        txt = re.sub(r'\s+([,.:;?_!"()\'])', r'\1', txt)
        return txt
        

In [None]:
t2 = TokenizerV2(vocab_with_unknown)

id = t2.encode("It's the last he painted, you know, Mrs. Gisburn said with pardonable pride")
print(id)

txt = t2.decode([56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 67, 7, 38, 851, 1108, 754, 793])
print(txt)

[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 67, 7, 38, 851, 1108, 754, 793]
It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride


In [40]:
id = t2.encode("I have encyclopedia")
print(id)

txt = t2.decode([53, 530, 1130])
print(txt)

[53, 530, 1130]
I have <|unk|>
