In [1]:
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting transformers[sentencepiece]
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━

In [2]:
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
!apt-get install git-lfs

Detected operating system as Ubuntu/jammy.
Checking for curl...
Detected curl...
Checking for gpg...
Detected gpg...
Detected apt version as 2.4.10
Running apt-get update... done.
Installing apt-transport-https... done.
Installing /etc/apt/sources.list.d/github_git-lfs.list...done.
Importing packagecloud gpg key... Packagecloud gpg key imported to /etc/apt/keyrings/github_git-lfs-archive-keyring.gpg
done.
Running apt-get update... done.

The repository is setup! You can now install packages.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 143 not upgraded.
Need to get 8489 kB of archives.
After this operation, 18.1 MB of additional disk space will be used.
Get:1 https://packagecloud.io/github/git-lfs/ubuntu jammy/main amd64 git-lfs amd64 3.6.1 [8489 kB]
Fetched 8489 kB in 2s (5081 kB/s)  
debconf: delaying package configuration, since a

In [3]:
from datasets import load_dataset

# This can take a few minutes to load, so grab a coffee or tea while you wait!

raw_data = load_dataset("roneneldan/TinyStories")

README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

(…)-00000-of-00004-2d5a1467fff1081b.parquet:   0%|          | 0.00/249M [00:00<?, ?B/s]

(…)-00001-of-00004-5852b56a2bd28fd9.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

(…)-00002-of-00004-a26307300439e943.parquet:   0%|          | 0.00/246M [00:00<?, ?B/s]

(…)-00003-of-00004-d243063613e5a057.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

(…)-00000-of-00001-869c898b519ad725.parquet:   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

In [4]:
raw_data["train"]

Dataset({
    features: ['text'],
    num_rows: 2119719
})

In [7]:
def get_training_corpus():
    dataset = raw_data["train"]
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        yield samples["text"]
training_corpus = get_training_corpus()

In [5]:
from transformers import AutoTokenizer, LlamaTokenizer

old_tokenizer =  AutoTokenizer.from_pretrained("Jae-star/llama-sc")
# old_tokenizer.add_special_tokens({"bos_token": "<s>", "eos_token": "</s>"})
old_tokenizer.pad_token = ""
print("Vocab size:", old_tokenizer.vocab_size)
print("Special tokens:", old_tokenizer.special_tokens_map)

tokenizer_config.json:   0%|          | 0.00/820 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

Vocab size: 32000
Special tokens: {'bos_token': '<s>', 'eos_token': '</s>'}


In [None]:
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 2000)





In [None]:
def evaluate_tokenizer(tokenizer, test_texts):
    metrics = {}
    
    # Compression metrics
    total_chars = sum(len(text) for text in test_texts)
    total_words = sum(len(text.split()) for text in test_texts)
    
    all_tokens = []
    for text in test_texts:
        tokens = tokenizer.encode(text)
        all_tokens.extend(tokens)
    
    total_tokens = len(all_tokens)
    
    metrics['chars_per_token'] = total_chars / total_tokens
    metrics['words_per_token'] = total_words / total_tokens
    metrics['tokens_per_word'] = total_tokens / total_words
    
    # Vocabulary utilization
    unique_tokens = set(all_tokens)
    metrics['vocab_utilization'] = len(unique_tokens) / tokenizer.vocab_size
    
    # Calculate unknown token rate if you have an UNK token ID
    if hasattr(tokenizer, 'unk_token_id'):
        unk_count = all_tokens.count(tokenizer.unk_token_id)
        metrics['unknown_token_rate'] = unk_count / total_tokens
    
    return metrics

In [None]:
shuffled_dataset = raw_data["train"].shuffle(seed=42)
samples = shuffled_dataset.take(10000)
print(evaluate_tokenizer(old_tokenizer, samples))
print(evaluate_tokenizer(tokenizer, samples))