In [64]:
!pip install striprtf
import re



In [65]:
from striprtf.striprtf import rtf_to_text

# Read the RTF file and convert its markup to plain text in one step, so
# raw_text only ever holds the cleaned document.
with open("/text.rtf", "r", encoding="utf-8") as rtf_file:
    raw_text = rtf_to_text(rtf_file.read())

# Quick sanity check: overall size and the first 99 characters.
print("Total number of characters (clean): ", len(raw_text))
print("--- Preview ---")
print(raw_text[:99])

Total number of characters (clean):  1791
--- Preview ---
Research Statement
I want to understand the true limits of what neural networks can compute exactly


In [66]:
# Split on ASCII punctuation, double dashes and whitespace. The capturing
# group keeps the delimiters as tokens; empty and whitespace-only pieces are
# dropped.
# NOTE(review): typographic quotes (“ ” ’) are not in the character class, so
# they stay glued to the neighbouring word (see '“Learning' in the output).
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(preprocessed[:30])

['Research', 'Statement', 'I', 'want', 'to', 'understand', 'the', 'true', 'limits', 'of', 'what', 'neural', 'networks', 'can', 'compute', 'exactly', '.', 'A', 'recent', 'paper', 'from', 'the', 'University', 'of', 'Waterloo', 'titled', '“Learning', 'to', 'Add', ',']


In [67]:
# Deduplicate the tokens, then sort them so every token gets a stable position.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

181


In [68]:
# Map each token to its index in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

In [69]:
# Show the first 21 (token, id) pairs of the vocabulary.
for item in list(vocab.items())[:21]:
  print(item)

(',', 0)
('.', 1)
('?', 2)
('A', 3)
('AI', 4)
('Add', 5)
('Algorithmic', 6)
('Can', 7)
('Exactly', 8)
('Execute', 9)
('I', 10)
('If', 11)
('Instructions', 12)
('I’ll', 13)
('Multiply', 14)
('My', 15)
('Networks”', 16)
('Neural', 17)
('Research', 18)
('Statement', 19)
('This', 20)


In [70]:
class SimpleTokenizerV1:
  """Word-level tokenizer backed by a fixed token -> id vocabulary.

  Text is split on ASCII punctuation, double dashes and whitespace. Every
  resulting token must already exist in the vocabulary.
  """

  def __init__(self, vocab):
    """Store the vocabulary and build the reverse (id -> token) lookup.

    Args:
      vocab: dict mapping token strings to integer ids.
    """
    self.str_to_int = vocab
    self.int_to_str = {i: s for s, i in vocab.items()}

  def encode(self, text):
    """Convert ``text`` into a list of token ids.

    Args:
      text: the string to tokenize.

    Returns:
      A list of integer ids, one per token, in order of appearance.

    Raises:
      KeyError: if any token is missing from the vocabulary.
    """
    # The capturing group makes re.split return the delimiters as well, so
    # punctuation survives as standalone tokens.
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    # Drop the empty strings and whitespace-only pieces produced by the split.
    preprocessed = [
        item.strip() for item in preprocessed if item.strip()
    ]
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids

  def decode(self, ids):
    """Convert a list of token ids back into a string.

    Tokens are joined with single spaces, then the space in front of
    punctuation is removed.

    Args:
      ids: iterable of integer ids previously produced by ``encode``.

    Returns:
      The reconstructed text.

    Raises:
      KeyError: if an id is not part of the vocabulary.
    """
    text = " ".join([self.int_to_str[i] for i in ids])
    # encode() also emits ':' and ';' as separate tokens, so they are
    # re-attached here as well; otherwise "a: b" would round-trip to "a : b".
    text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
    return text

In [71]:
# Build the tokenizer from the vocabulary and encode the whole document.
# Every token of raw_text is present in vocab because vocab was built from
# the same text with the same split pattern.
tokenizer = SimpleTokenizerV1(vocab)
ids = tokenizer.encode(raw_text)
print(ids)

[18, 19, 10, 174, 165, 169, 158, 167, 101, 119, 176, 115, 114, 50, 55, 70, 1, 3, 142, 127, 76, 158, 21, 119, 22, 164, 180, 165, 5, 0, 14, 0, 34, 9, 6, 12, 8, 179, 17, 16, 147, 157, 146, 115, 114, 50, 99, 165, 27, 34, 112, 70, 0, 46, 121, 47, 145, 90, 92, 84, 59, 89, 49, 156, 1, 20, 93, 25, 133, 135, 119, 56, 0, 162, 129, 25, 128, 74, 102, 4, 130, 54, 70, 1, 20, 93, 110, 53, 165, 86, 87, 51, 99, 0, 177, 87, 51, 50, 67, 79, 32, 100, 28, 152, 34, 79, 165, 35, 118, 119, 89, 148, 0, 178, 138, 165, 41, 110, 65, 74, 60, 98, 97, 106, 0, 108, 119, 178, 171, 170, 166, 48, 165, 81, 158, 95, 66, 1, 3, 141, 74, 163, 58, 41, 157, 106, 39, 117, 26, 165, 37, 30, 76, 89, 125, 126, 94, 46, 140, 104, 159, 1, 15, 82, 93, 165, 109, 43, 28, 34, 111, 1, 10, 33, 117, 121, 91, 88, 143, 123, 32, 46, 64, 160, 44, 120, 168, 119, 131, 31, 57, 74, 153, 155, 40, 36, 1, 10, 174, 165, 72, 124, 176, 96, 119, 122, 50, 41, 73, 70, 2, 23, 96, 39, 117, 131, 2, 7, 175, 63, 25, 107, 78, 75, 119, 156, 74, 38, 46, 62, 32, 2, 2

In [72]:
# Map the ids back to text; the bare expression is shown as the cell output.
tokenizer.decode(ids)

'Research Statement I want to understand the true limits of what neural networks can compute exactly. A recent paper from the University of Waterloo titled “Learning to Add, Multiply, and Execute Algorithmic Instructions Exactly with Neural Networks” showed that shallow neural networks can learn to add and multiply exactly, but only by restructuring inputs into hand crafted input called templates. This is a powerful proof of concept, therefore paves a path for making AI perform computation exactly. This is much closer to how human children learn, where human children can easily generalize algorithms like addition subtraction and generalize to any number of input size, which proves to be much difficult for current large language models, most of which usually use tool call to get the job done. A reason for this could be that models are not able to approximate algorithm from input output pairs itself but rather memorize them. My goal is to move beyond addition and multiplication. I am not

In [73]:
# SimpleTokenizerV1.encode does a plain dictionary lookup, so a word that is
# missing from the vocabulary raises KeyError. The error is caught and printed
# so the demonstration stays visible without halting "Restart & Run All".
text = "This might not be in vocab, so will generate error"
try:
    unknown_ids = tokenizer.encode(text)
except KeyError as err:
    print(f"KeyError: {err}")
else:
    # Only reached if every word happens to be in the vocabulary.
    print(unknown_ids)

KeyError: 'might'

**Special Context Tokens**

In [79]:
# Rebuild the vocabulary with two special tokens appended after the sorted
# corpus tokens, so they receive the two highest ids.
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]

vocab = dict(zip(all_tokens, range(len(all_tokens))))
len(vocab)

183

In [80]:
# Show the last five vocabulary entries, which include the special tokens.
for item in list(vocab.items())[-5:]:
  print(item)

('which', 178)
('with', 179)
('“Learning', 180)
('<|endoftext|>', 181)
('<|unk|>', 182)


In [82]:
class SimpleTokenizerV2:
  """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

  Text is split on ASCII punctuation, double dashes and whitespace. Tokens
  that are not in the vocabulary are replaced by the ``<|unk|>`` token.
  """

  def __init__(self, vocab):
    """Store the vocabulary and build the reverse (id -> token) lookup.

    Args:
      vocab: dict mapping token strings to integer ids. It must contain
        ``"<|unk|>"`` for unknown tokens to be encodable.
    """
    self.str_to_int = vocab
    self.int_to_str = {i: s for s, i in vocab.items()}

  def encode(self, text):
    """Convert ``text`` into a list of token ids.

    Args:
      text: the string to tokenize.

    Returns:
      A list of integer ids, one per token; unknown tokens get the id of
      ``"<|unk|>"``.

    Raises:
      KeyError: if an unknown token is met and ``"<|unk|>"`` is not in the
        vocabulary.
    """
    # The capturing group makes re.split return the delimiters as well, so
    # punctuation survives as standalone tokens.
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    # Drop the empty strings and whitespace-only pieces produced by the split.
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    # The fallback is looked up lazily, only when a token is actually unknown.
    ids = [
        self.str_to_int[item if item in self.str_to_int else "<|unk|>"]
        for item in preprocessed
    ]
    return ids

  def decode(self, ids):
    """Convert a list of token ids back into a string.

    Tokens are joined with single spaces, then the space in front of
    punctuation is removed.

    Args:
      ids: iterable of integer ids previously produced by ``encode``.

    Returns:
      The reconstructed text; unknown words appear as ``<|unk|>``.

    Raises:
      KeyError: if an id is not part of the vocabulary.
    """
    text = " ".join([self.int_to_str[i] for i in ids])
    # encode() also emits ':' and ';' as separate tokens, so they are
    # re-attached here as well; otherwise "a: b" would round-trip to "a : b".
    text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
    return text

In [83]:
# Rebind `tokenizer` to the V2 tokenizer, which substitutes <|unk|> for
# out-of-vocabulary tokens.
tokenizer = SimpleTokenizerV2(vocab)

text1 = "This might not be in vocab, so will generate error"
text2 = "Let's check"

# Join the two samples with the <|endoftext|> marker between them.
text = " <|endoftext|> ".join((text1,text2))
print(text)

This might not be in vocab, so will generate error <|endoftext|> Let's check


In [84]:
# Encode the joined sample; tokens missing from vocab map to the <|unk|> id.
tokenizer.encode(text)

[20, 182, 117, 41, 88, 182, 0, 182, 182, 182, 182, 181, 182, 182, 182, 182]

In [85]:
# Round trip: unknown words come back as the literal "<|unk|>" placeholder.
tokenizer.decode(tokenizer.encode(text))

'This <|unk|> not be in <|unk|>, <|unk|> <|unk|> <|unk|> <|unk|> <|endoftext|> <|unk|> <|unk|> <|unk|> <|unk|>'