In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
!apt-get install git-lfs

In [None]:
from datasets import load_dataset

# Identifier of the dataset to load.
DATASET_ID = "roneneldan/TinyStories"

# Heads-up: loading can take a few minutes, so this is a good moment for a
# coffee or tea break.
raw_data = load_dataset(DATASET_ID)

In [None]:
# Display the train split as the cell output (a bare last expression is
# rendered by the notebook) to sanity-check what was loaded.
raw_data["train"]

In [None]:
def get_training_corpus(dataset=None, batch_size=1000):
    """Yield the ``"text"`` column of a dataset in consecutive batches.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping with a ``"text"`` entry. Defaults to the
            notebook-level ``raw_data["train"]`` split.
        batch_size: Number of rows per yielded batch. Defaults to 1000.

    Yields:
        The ``"text"`` entry of each slice ``dataset[i : i + batch_size]``.
        The final batch may hold fewer than ``batch_size`` rows.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if dataset is None:
        # Resolved lazily so the default follows the current global value.
        dataset = raw_data["train"]
    for start_idx in range(0, len(dataset), batch_size):
        yield dataset[start_idx : start_idx + batch_size]["text"]


# A generator can be consumed only once; call get_training_corpus() again
# whenever a fresh pass over the data is needed.
training_corpus = get_training_corpus()

In [None]:
from transformers import AutoTokenizer, LlamaTokenizer

# Checkpoint whose tokenizer is loaded as the starting point.
BASE_TOKENIZER_ID = "Jae-star/llama-sc"
old_tokenizer = AutoTokenizer.from_pretrained(BASE_TOKENIZER_ID)

# NOTE(review): the padding token is set to an empty string here — confirm
# this is intended rather than a real token such as the EOS token.
old_tokenizer.pad_token = ""

# Quick look at the loaded tokenizer's configuration.
print("Vocab size:", old_tokenizer.vocab_size)
print("Special tokens:", old_tokenizer.special_tokens_map)

In [None]:
# Build a fresh batch generator here instead of reusing `training_corpus`:
# a generator is exhausted after one pass, so re-running this cell with the
# already-consumed object would train on no data.
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=2000)

In [None]:
def evaluate_tokenizer(tokenizer, test_texts):
    """Compute simple compression and coverage statistics for a tokenizer.

    Args:
        tokenizer: Object exposing ``encode(text)`` (returning a sequence of
            token ids) and a ``vocab_size`` attribute. ``unk_token_id`` is
            optional.
        test_texts: Iterable of strings. It is materialised once up front, so
            one-shot iterables such as generators are supported.

    Returns:
        dict with the keys ``'chars_per_token'``, ``'words_per_token'``,
        ``'tokens_per_word'`` and ``'vocab_utilization'``, plus
        ``'unknown_token_rate'`` when the tokenizer has a non-``None``
        ``unk_token_id``. A ratio whose denominator is zero (for example on
        empty input) is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    # The texts are traversed several times below; copy them into a list so a
    # generator argument is not silently exhausted after the first pass.
    texts = list(test_texts)

    def ratio(numerator, denominator):
        # Guarded division: empty input must not crash the evaluation.
        return numerator / denominator if denominator else 0.0

    metrics = {}

    # Compression metrics
    total_chars = sum(len(text) for text in texts)
    total_words = sum(len(text.split()) for text in texts)

    # Token counts include everything encode() returns for each text.
    all_tokens = []
    for text in texts:
        all_tokens.extend(tokenizer.encode(text))
    total_tokens = len(all_tokens)

    metrics['chars_per_token'] = ratio(total_chars, total_tokens)
    metrics['words_per_token'] = ratio(total_words, total_tokens)
    metrics['tokens_per_word'] = ratio(total_tokens, total_words)

    # Vocabulary utilization: share of the vocabulary seen at least once.
    metrics['vocab_utilization'] = ratio(len(set(all_tokens)), tokenizer.vocab_size)

    # Unknown-token rate, only when the tokenizer actually defines an UNK id.
    # The attribute may exist but be None, in which case no rate is reported.
    unk_token_id = getattr(tokenizer, 'unk_token_id', None)
    if unk_token_id is not None:
        metrics['unknown_token_rate'] = ratio(all_tokens.count(unk_token_id), total_tokens)

    return metrics

In [None]:
# Draw a reproducible sample of 10,000 rows from the train split.
shuffled_dataset = raw_data["train"].shuffle(seed=42)
samples = shuffled_dataset.take(10000)

# evaluate_tokenizer expects plain strings. Iterating a Dataset yields one
# dict per row, so extract the "text" column before passing it on.
sample_texts = list(samples["text"])

# Same texts for both tokenizers so the metrics are directly comparable.
print(evaluate_tokenizer(old_tokenizer, sample_texts))
print(evaluate_tokenizer(tokenizer, sample_texts))