## Test Run

In [1]:
text_to_tokenize = """Here is some text to tokenize. It is long and not very usefule but does work as a test"""
tokens = text_to_tokenize.encode("utf-8") # raw bytes
ids = list(map(int, tokens))

In [2]:
def get_stats(ids):
    counts = {}
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

def merge(ids, pair, idx):
  newids = []
  i = 0
  while i < len(ids):
    if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
      newids.append(idx)
      i += 2
    else:
      newids.append(ids[i])
      i += 1
  return newids

In [3]:
# ---
vocab_size = 276 # the desired final vocabulary size
num_merges = vocab_size - 256
ids = list(tokens) # copy so we don't destroy the original list

merges = {} # (int, int) -> int
for i in range(num_merges):
  stats = get_stats(ids)
  if (len(stats) > 0):
    pair = max(stats, key=stats.get)
    idx = 256 + i
    print(f"merging {pair} into a new token {idx}")
    ids = merge(ids, pair, idx)
    merges[pair] = idx

merging (115, 32) into a new token 256
merging (32, 116) into a new token 257
merging (116, 32) into a new token 258
merging (101, 114) into a new token 259
merging (101, 32) into a new token 260
merging (105, 256) into a new token 261
merging (257, 101) into a new token 262
merging (257, 111) into a new token 263
merging (32, 97) into a new token 264
merging (72, 259) into a new token 265
merging (265, 260) into a new token 266
merging (266, 261) into a new token 267
merging (267, 115) into a new token 268
merging (268, 111) into a new token 269
merging (269, 109) into a new token 270
merging (270, 101) into a new token 271
merging (271, 262) into a new token 272
merging (272, 120) into a new token 273
merging (273, 116) into a new token 274
merging (274, 263) into a new token 275


In [4]:
merges

{(115, 32): 256,
 (32, 116): 257,
 (116, 32): 258,
 (101, 114): 259,
 (101, 32): 260,
 (105, 256): 261,
 (257, 101): 262,
 (257, 111): 263,
 (32, 97): 264,
 (72, 259): 265,
 (265, 260): 266,
 (266, 261): 267,
 (267, 115): 268,
 (268, 111): 269,
 (269, 109): 270,
 (270, 101): 271,
 (271, 262): 272,
 (272, 120): 273,
 (273, 116): 274,
 (274, 263): 275}

In [5]:
vocab = {idx: bytes([idx]) for idx in range(256)}
for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1]

def decode(ids):
  # given ids (list of integers), return Python string
  tokens = b"".join(vocab[idx] for idx in ids)
  text = tokens.decode("utf-8", errors="replace")
  return text

print(decode([128]))

�


In [6]:
def encode(text):
  # given a string, return list of integers (the tokens)
  tokens = list(text.encode("utf-8"))
  while len(tokens) >= 2:
    stats = get_stats(tokens)
    pair = min(stats, key=lambda p: merges.get(p, float("inf")))
    if pair not in merges:
      break # nothing else can be merged
    idx = merges[pair]
    tokens = merge(tokens, pair, idx)
  return tokens

print(encode(""))

[]


In [7]:
print(decode(encode("hello world")))

hello world


In [8]:
decode([32, 116])

' t'

In [9]:
t = encode("at the water park")
t

[97, 116, 257, 104, 260, 119, 97, 116, 259, 32, 112, 97, 114, 107]

## Full Run - Tokenize Shakespere

In [10]:
# download the TinyShakespeare dataset
!wget -O input.txt https://raw.githubusercontent.com/vvr-rao/my-mini-LLama/main/input_text/input.txt
!mkdir -p input_folder
!mv input.txt input_folder/

# load the dataset
with open('./input_folder/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

--2024-05-25 08:25:58--  https://raw.githubusercontent.com/vvr-rao/my-mini-LLama/main/input_text/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-05-25 08:25:59 (16.7 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [11]:
print(len(text))
print(text[:100])

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [12]:
def get_stats(ids):
    counts = {}
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

def merge(ids, pair, idx):
  newids = []
  i = 0
  while i < len(ids):
    if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
      newids.append(idx)
      i += 2
    else:
      newids.append(ids[i])
      i += 1
  return newids

In [13]:
tokens = text.encode("utf-8") # raw bytes
ids = list(map(int, tokens))

In [14]:
# ---
vocab_size = 296 # the desired final vocabulary size
num_merges = vocab_size - 256
ids = list(tokens) # copy so we don't destroy the original list

merges = {} # (int, int) -> int
for i in range(num_merges):
  stats = get_stats(ids)
  if (len(stats) > 0):
    pair = max(stats, key=stats.get)
    idx = 256 + i
    print(f"merging {pair} into a new token {idx}")
    ids = merge(ids, pair, idx)
    merges[pair] = idx

merging (101, 32) into a new token 256
merging (116, 104) into a new token 257
merging (116, 32) into a new token 258
merging (115, 32) into a new token 259
merging (100, 32) into a new token 260
merging (44, 32) into a new token 261
merging (111, 117) into a new token 262
merging (101, 114) into a new token 263
merging (105, 110) into a new token 264
merging (121, 32) into a new token 265
merging (97, 110) into a new token 266
merging (58, 10) into a new token 267
merging (111, 114) into a new token 268
merging (111, 32) into a new token 269
merging (101, 110) into a new token 270
merging (10, 10) into a new token 271
merging (97, 114) into a new token 272
merging (32, 257) into a new token 273
merging (111, 110) into a new token 274
merging (108, 108) into a new token 275
merging (104, 97) into a new token 276
merging (44, 10) into a new token 277
merging (46, 271) into a new token 278
merging (105, 259) into a new token 279
merging (101, 115) into a new token 280
merging (121, 262) 

In [27]:
len(merges), len(vocab), type(merges), type(vocab)

(40, 296, dict, dict)

In [30]:
#merge the vocabulary and save it
import pickle

vocab = {idx: bytes([idx]) for idx in range(256)}
for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1]

!mkdir -p vocab
file_name = f'./vocab/vocab.pkl'

with open(file_name, 'wb') as f:
    pickle.dump(vocab, f)

In [31]:

def decode(ids):
  # given ids (list of integers), return Python string
  tokens = b"".join(vocab[idx] for idx in ids)
  text = tokens.decode("utf-8", errors="replace")
  return text

In [32]:
def encode(text):
  # given a string, return list of integers (the tokens)
  tokens = list(text.encode("utf-8"))
  while len(tokens) >= 2:
    stats = get_stats(tokens)
    pair = min(stats, key=lambda p: merges.get(p, float("inf")))
    if pair not in merges:
      break # nothing else can be merged
    idx = merges[pair]
    tokens = merge(tokens, pair, idx)
  return tokens

In [33]:
decode([105, 259])

'is '

In [36]:
decode(encode("Wherefore art thou Romeo!! and wherefore are the tater tots?"))

'Wherefore art thou Romeo!! and wherefore are the tater tots?'