In [2]:
from datasets import load_dataset
import os

os.makedirs("data", exist_ok=True)

data = load_dataset("wikitext", 'wikitext-103-v1', split="train")
data_1m = ""
with open("data/wikitext_1m.txt", "w") as f:
    for line in data["text"][:1_000_000]:
        line = line.replace("<unk>", "").strip()
        if line:
            f.write(line+"\n")
            data_1m += line+"\n"


Found cached dataset wikitext (/Users/vzouhar/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


In [3]:
from tokenizers import (
    models,
    pre_tokenizers,
    trainers,
    Tokenizer,
)
import collections
dists = {}

def get_bpe_unigram(vocab_size):
    print(vocab_size, "training")
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(vocab_size=vocab_size)
    tokenizer.train(["data/wikitext_1m.txt"], trainer=trainer)
    
    print(vocab_size, "encoding")
    encoding = tokenizer.encode(data_1m)
    return collections.Counter(encoding.tokens)


dists["bpe_8k"] = get_bpe_unigram(8_000)
dists["bpe_28k"] = get_bpe_unigram(28_000)

8000 training



8000 encoding
28000 training



28000 encoding


In [4]:
from transformers import AutoTokenizer

gpt2tokenizer = AutoTokenizer.from_pretrained("gpt2")
# TODO: replace this with .tokenize call which might be slower but will make the types the same as for the bpe
dists["gpt2"] = collections.Counter(gpt2tokenizer.tokenize(data_1m))

Token indices sequence length is longer than the specified maximum sequence length for this model (64274162 > 1024). Running this sequence through the model will result in indexing errors


In [20]:
import numpy as np

def renyi_entropy(P, alpha):
    scale = 1 / (1 - alpha)

    return scale * np.log2(np.sum([
        prob**alpha
        for prob in P
    ]))

def renyi_eff(P, alpha):
    return renyi_entropy(P, alpha)/np.log(len(P))

def shannon_entropy(P):
    P = np.array(P)
    return -np.sum(P * np.log2(P))

def shannon_eff(P):
    return shannon_entropy(P)/np.log(len(P))

def duplicate_bpe(P, N, k):
    return P[N:]+[x/k for x in P[:N]] * k

def table_line(P, extra=[]):
    out = extra
    out.append(f"{shannon_entropy(P):.2f}")
    out.append(f"{renyi_entropy(P, 0.5):.2f}")
    out.append(f"{renyi_entropy(P, 3):.2f}")
    out.append(f"{shannon_eff(P):.2f}")
    out.append(f"{renyi_eff(P, 0.5):.2f}")
    out.append(f"{renyi_eff(P, 3):.2f}")
    return "& " + " & ".join(out) + r"\\"

for tokenizer, freq in dists.items():
    total = sum(freq.values())
    P = [v/total for k, v in freq.most_common()]
    print(tokenizer)
    print(table_line(P, extra=["", ""]))
    print(table_line(duplicate_bpe(P, 100, 3), extra=["100", "3"]))
    print(table_line(duplicate_bpe(P, 100, 5), extra=["100", "5"]))
    print(table_line(duplicate_bpe(P, 500, 3), extra=["500", "3"]))
    print(table_line(duplicate_bpe(P, 500, 5), extra=["500", "5"]))
    print(r"\hdashline", "\n")

bpe_8k
&  &  & 10.16 & 11.82 & 6.17 & 1.13 & 1.32 & 0.69\\
& 100 & 3 & 10.86 & 12.01 & 7.75 & 1.21 & 1.33 & 0.86\\
& 100 & 5 & 11.18 & 12.14 & 8.48 & 1.24 & 1.34 & 0.94\\
& 500 & 3 & 11.16 & 12.28 & 7.75 & 1.23 & 1.35 & 0.85\\
& 500 & 5 & 11.62 & 12.56 & 8.49 & 1.26 & 1.36 & 0.92\\
\hdashline 

bpe_28k
&  &  & 10.65 & 13.29 & 5.83 & 1.04 & 1.30 & 0.57\\
& 100 & 3 & 11.37 & 13.40 & 7.42 & 1.11 & 1.31 & 0.72\\
& 100 & 5 & 11.71 & 13.48 & 8.15 & 1.14 & 1.32 & 0.80\\
& 500 & 3 & 11.59 & 13.54 & 7.42 & 1.13 & 1.32 & 0.72\\
& 500 & 5 & 12.02 & 13.71 & 8.16 & 1.17 & 1.33 & 0.79\\
\hdashline 

gpt2
&  &  & 10.76 & 13.66 & 5.93 & 1.00 & 1.27 & 0.55\\
& 100 & 3 & 11.49 & 13.77 & 7.52 & 1.07 & 1.28 & 0.70\\
& 100 & 5 & 11.83 & 13.83 & 8.26 & 1.10 & 1.29 & 0.77\\
& 500 & 3 & 11.70 & 13.89 & 7.52 & 1.09 & 1.29 & 0.70\\
& 500 & 5 & 12.13 & 14.04 & 8.26 & 1.12 & 1.30 & 0.77\\
\hdashline 

