In [4]:
from datasets import load_dataset
import os

os.makedirs("data", exist_ok=True)

data = load_dataset("wikitext", 'wikitext-103-v1', split="train")
data_1m = ""
with open("data/wikitext_1m.txt", "w") as f:
    for line in data["text"][:1_000_000]:
        line = line.replace("<unk>", "").strip()
        if line:
            f.write(line+"\n")
            data_1m += line+"\n"


Found cached dataset wikitext (/Users/vzouhar/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


In [5]:
from tokenizers import (
    models,
    pre_tokenizers,
    trainers,
    Tokenizer,
)
import collections
dists = {}

def get_bpe_unigram(vocab_size):
    print(vocab_size, "training")
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(vocab_size=vocab_size)
    tokenizer.train(["data/wikitext_1m.txt"], trainer=trainer)
    
    print(vocab_size, "encoding")
    encoding = tokenizer.encode(data_1m)
    return collections.Counter(encoding.tokens)


dists["bpe_8k"] = get_bpe_unigram(8_000)
dists["bpe_28k"] = get_bpe_unigram(28_000)

8000 training



8000 encoding
28000 training



28000 encoding


In [6]:
from transformers import AutoTokenizer

gpt2tokenizer = AutoTokenizer.from_pretrained("gpt2")
dists["gpt2"] = collections.Counter(gpt2tokenizer.tokenize(data_1m))

Token indices sequence length is longer than the specified maximum sequence length for this model (64274162 > 1024). Running this sequence through the model will result in indexing errors


In [32]:
import copy
import random
import numpy as np

def renyi_entropy(P, alpha):
    scale = 1 / (1 - alpha)

    return scale * np.log2(np.sum([
        prob**alpha
        for prob in P
    ]))

def renyi_eff(P, alpha):
    return renyi_entropy(P, alpha)/np.log(len(P))

def shannon_entropy(P):
    P = np.array(P)
    return -np.sum(P * np.log2(P))

def shannon_eff(P):
    return shannon_entropy(P)/np.log(len(P))

def table_line(P, extra=[]):
    out = extra
    out.append(f"{shannon_entropy(P):.2f}")
    out.append(f"{renyi_entropy(P, 0.5):.2f}")
    out.append(f"{renyi_entropy(P, 3):.2f}")
    out.append(f"{shannon_eff(P):.2f}")
    out.append(f"{renyi_eff(P, 0.5):.2f}")
    out.append(f"{renyi_eff(P, 3):.2f}")
    return "& " + " & ".join(out) + r"\\"

def freqs_to_p(freqs):
    total = sum(freqs.values())
    return [v/total for k, v in freqs.most_common()]


def drop_bpe(freqs, N, k):
    freqs = copy.deepcopy(freqs)

    # get top N
    freq_words = [k for k, v in freqs.most_common(N)]

    # sample k words
    dead_tokens = random.sample(freq_words, k=k)
    for token in dead_tokens:
        # remove existing token
        token_freq = freqs.pop(token)

        # add the old frequency to the individual characters
        for c in token:
            freqs[c] += token_freq

    # drop this special GPT token if it's an individual character if it's there
    if "Ġ" in freqs:
        freqs.pop("Ġ")
    return freqs_to_p(freqs)


def drop_bpe(freqs, N, k):
    # get top N
    freq_words = [k for k, v in freqs.most_common(N)]

    # sample k words
    dead_tokens = random.sample(freq_words, k=k)
    for token in dead_tokens:
        # remove existing token
        token_freq = freqs.pop(token)

        # add the old frequency to the individual characters
        for c in token:
            freqs[c] += token_freq

    return freqs_to_p(freqs)


random.seed(0)
for tokenizer, freqs in dists.items():
    print(tokenizer)
    print(table_line(freqs_to_p(freqs), extra=["", ""]))
    print(table_line(drop_bpe(freqs, 2_500, 500), extra=["2500", "500"]))
    print(table_line(drop_bpe(freqs, 2_500, 1000), extra=["2500", "1000"]))
    print(table_line(drop_bpe(freqs, None, 500), extra=[r"$\infty$", "500"]))
    print(table_line(drop_bpe(freqs, None, 1000), extra=[r"$\infty$", "1000"]))
    print(r"\hdashline", "\n")

bpe_8k
&  &  & 10.16 & 11.82 & 6.17 & 1.13 & 1.32 & 0.69\\
& 2500 & 500 & 8.42 & 11.11 & 5.11 & 0.94 & 1.24 & 0.57\\
& 2500 & 1000 & 7.72 & 10.66 & 5.10 & 0.87 & 1.20 & 0.58\\
& $\infty$ & 500 & 9.51 & 11.49 & 6.20 & 1.06 & 1.29 & 0.69\\
& $\infty$ & 1000 & 8.90 & 11.19 & 5.80 & 1.00 & 1.26 & 0.65\\
\hdashline 

bpe_28k
&  &  & 10.65 & 13.29 & 5.83 & 1.04 & 1.30 & 0.57\\
& 2500 & 500 & 9.16 & 12.74 & 5.53 & 0.90 & 1.25 & 0.54\\
& 2500 & 1000 & 7.75 & 12.23 & 4.66 & 0.76 & 1.20 & 0.46\\
& $\infty$ & 500 & 10.50 & 13.19 & 5.94 & 1.03 & 1.29 & 0.58\\
& $\infty$ & 1000 & 10.34 & 13.10 & 5.98 & 1.01 & 1.28 & 0.59\\
\hdashline 

gpt2
&  &  & 10.76 & 13.66 & 5.93 & 1.00 & 1.27 & 0.55\\
& 2500 & 500 & 9.55 & 13.22 & 5.90 & 0.89 & 1.23 & 0.55\\
& 2500 & 1000 & 8.27 & 12.77 & 5.17 & 0.77 & 1.19 & 0.48\\
& $\infty$ & 500 & 10.70 & 13.61 & 5.99 & 1.00 & 1.27 & 0.56\\
& $\infty$ & 1000 & 10.63 & 13.56 & 6.00 & 0.99 & 1.26 & 0.56\\
\hdashline 

