In [1]:
import pickle
import random
import time

import numpy as np
import pandas as pd

# I had to run "pip install -e ." in the directory cs336-assignment1
# for the following imports to work.
from cs336_basics.bpe import ENCODING, train_bpe
from cs336_basics.tokenizer import Tokenizer
from cs336_basics.utils import Timer

In [2]:
special_tokens = ["<|endoftext|>"]

# Tiny Stories

## Build Vocab

In [3]:
%%time
vocab, merges = train_bpe(
    "../../data/TinyStoriesV2-GPT4-train.txt", 10_000, special_tokens
)  # 1min

CPU times: user 16 s, sys: 216 ms, total: 16.2 s
Wall time: 1min 1s


In [5]:
# with open('../output/tiny_stories_bpe_vocab.pkl', 'wb') as file:
#     pickle.dump(vocab, file)
# with open('../output/tiny_stories_bpe_merges.pkl', 'wb') as file:
#     pickle.dump(merges, file)

with open("../../output/tiny_stories_bpe_vocab.pkl", "rb") as file:
    vocab = pickle.load(file)
with open("../../output/tiny_stories_bpe_merges.pkl", "rb") as file:
    merges = pickle.load(file)

In [6]:
# longest token in vocab
max(vocab.values(), key=lambda token: len(token)).decode(encoding=ENCODING)

' accomplishment'

## Compression Ratio

In [10]:
tokenizer_ts = Tokenizer.from_files(
    "../../output/tiny_stories_bpe_vocab.pkl",
    "../../output/tiny_stories_bpe_merges.pkl",
    special_tokens,
)

In [28]:
start = 0
end = 100_000
with open("../../data/TinyStoriesV2-GPT4-train.txt", "rb") as f:
    f.seek(start)
    chunk: str = f.read(end - start).decode(encoding=ENCODING)

print(special_tokens)
chunks_ts = chunk.split(special_tokens[0])
len(chunks_ts)

['<|endoftext|>']


126

In [13]:
res = []
for chunk in random.sample(chunks_ts[1:-1], k=10):
    tokens = tokenizer_ts.encode(chunk)
    compression_ratio = len(chunk.encode(encoding=ENCODING)) / len(tokens)
    res.append(compression_ratio)
print(f"compression_ratio={np.array(res).mean():.1f}")

compression_ratio=4.0


In [29]:
# Use OpenWebText Tokenizer on TinyStories
res = []
for chunk in random.sample(chunks_ts[1:-1], k=10):
    tokens = tokenizer_owt.encode(chunk)
    compression_ratio = len(chunk.encode(encoding=ENCODING)) / len(tokens)
    res.append(compression_ratio)
print(f"compression_ratio={np.array(res).mean():.1f}")

compression_ratio=3.9


## Throughput

In [14]:
target_size = 1000
start = 0
end = 10_000_000
with open("../../data/TinyStoriesV2-GPT4-train.txt", "rb") as f:
    f.seek(start)
    chunk: bytes = f.read(end - start)
data_size_bytes = len(chunk)

In [15]:
start = time.time()
for _ in tokenizer_ts.encode_iterable([chunk.decode(encoding=ENCODING)]):
    pass
end = time.time()
duration = end - start
throughput = data_size_bytes / duration
throughput  # 670KB/s

775867.5833202705

In [16]:
pd.Timedelta(seconds=825e9 / throughput)  # Pile dataset is 825GB

Timedelta('12 days 07:22:05.775861740')

# OWT

In [6]:
%%time
vocab, merges = train_bpe("../../data/owt_train.txt", 32_000, special_tokens)  # 1h7min

CPU times: user 48min 17s, sys: 12min 44s, total: 1h 1min 2s
Wall time: 1h 7min 15s


In [19]:
# with open('../../output/owt_bpe_vocab.pkl', 'wb') as file:
#     pickle.dump(vocab, file)
# with open('../../output/owt_bpe_merges.pkl', 'wb') as file:
#     pickle.dump(merges, file)

with open("../../output/owt_bpe_vocab.pkl", "rb") as file:
    vocab = pickle.load(file)
with open("../../output/owt_bpe_merges.pkl", "rb") as file:
    merges = pickle.load(file)

In [20]:
# longest token in vocab
max(vocab.values(), key=lambda token: len(token)).decode(encoding=ENCODING)

'ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ'

In [21]:
len(vocab)

32000

## Compression Ratio

In [23]:
tokenizer_owt = Tokenizer.from_files(
    "../../output/owt_bpe_vocab.pkl",
    "../../output/owt_bpe_merges.pkl",
    special_tokens,
)

In [24]:
start = 0
end = 100_000
with open("../../data/owt_train.txt", "rb") as f:
    f.seek(start)
    chunk: str = f.read(end - start).decode(encoding=ENCODING)

print(special_tokens)
chunks_owt = chunk.split(special_tokens[0])
len(chunks_ts)

['<|endoftext|>']


126

In [25]:
res = []
for chunk in random.sample(chunks_owt[1:-1], k=10):
    tokens = tokenizer_owt.encode(chunk)
    compression_ratio = len(chunk.encode(encoding=ENCODING)) / len(tokens)
    res.append(compression_ratio)
print(f"compression_ratio={np.array(res).mean():.1f}")

compression_ratio=4.4


In [26]:
# Use TinyStories Tokenizer on OpenWebText
res = []
for chunk in random.sample(chunks_owt[1:-1], k=10):
    tokens = tokenizer_ts.encode(chunk)
    compression_ratio = len(chunk.encode(encoding=ENCODING)) / len(tokens)
    res.append(compression_ratio)
print(f"compression_ratio={np.array(res).mean():.1f}")

compression_ratio=3.1
