In [None]:
# ⬛ SET‑UP ⬛
!pip install -q datasets transformers evaluate
from datasets import load_dataset, Dataset
from transformers import (AutoTokenizer, DataCollatorForLanguageModeling,
                          AutoModelForMaskedLM, TrainingArguments, Trainer)
import evaluate, json, os, re, itertools, random

# Plain-text corpus file; it is read below as one example per non-blank line.
CORPUS_PATH = "data/raw/toy_lang_dataset/text_corpus.txt"   # swap to bigger corpus
# Checkpoint name passed to from_pretrained() for both the tokenizer and the masked-LM model.
MODEL_NAME = "distilbert-base-uncased"

# ⬛ LOAD TEXT ⬛
# Decode the corpus as UTF-8 explicitly: without `encoding`, open() falls back
# to the platform locale (e.g. cp1252 on Windows) and can mis-decode or fail
# on non-ASCII text.
with open(CORPUS_PATH, encoding="utf-8") as f:
    # One example per non-blank line, with surrounding whitespace removed.
    lines = [line.strip() for line in f if line.strip()]

# Hold out 20% of the lines for evaluation; the fixed seed keeps the split
# reproducible across runs.
ds = Dataset.from_dict({"text": lines}).train_test_split(test_size=0.2, seed=42)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(batch):
    """Encode the "text" entries of a batch, truncating each to 128 tokens."""
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=128)
    return encoded


# Run the tokenizer over both splits in batches and drop the raw "text"
# column so only the tokenizer's output columns remain.
ds = ds.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)

# ⬛ MODEL ⬛
import inspect

model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
# Collator for the masked-LM objective; mlm_probability is the fraction of
# tokens selected as prediction targets in each batch.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword was later dropped. The install at the top of
# this cell is unpinned, so use whichever name the installed version accepts.
_eval_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

args = TrainingArguments(
    "models/vocab",
    per_device_train_batch_size=16,
    num_train_epochs=5,
    logging_steps=10,
    learning_rate=2e-5,
    save_total_limit=1,
    **{_eval_kwarg: "epoch"},  # evaluate on the held-out split after every epoch
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    data_collator=data_collator,
)
trainer.train()

# Persist the final weights and tokenizer at the top level of the output
# directory. Trainer only writes periodic `checkpoint-*` sub-folders on its
# own, so without this "models/vocab" would not be loadable as a model.
trainer.save_model()
tokenizer.save_pretrained(args.output_dir)

# ⬛ PERPLEXITY ⬛
import math

# The `evaluate` "perplexity" metric loads `model_id` as a *causal* LM, which
# does not fit the masked-LM checkpoint trained above, and scoring
# `lines[:100]` would mix training lines into the evaluation. Instead, report
# exp(mean masked-LM loss) on the held-out split via the trainer.
# NOTE: the collator re-samples masks on every pass, so this value varies
# slightly from run to run.
eval_metrics = trainer.evaluate()
ppl = math.exp(eval_metrics["eval_loss"])
print("Perplexity:", ppl)

# Create the output folder first so the write below cannot fail on a fresh checkout.
os.makedirs("results", exist_ok=True)
with open("results/vocab_perplexity.txt", "w", encoding="utf-8") as f:
    f.write(f"{ppl:.2f}")

# ⬛ GENERATE SIMPLE WORD‑LIST ⬛
def get_vocab(model, tokenizer, top_k=100):
    """Return the first ``top_k`` real tokens of the tokenizer vocabulary.

    Tokens are ordered by vocabulary id (not by corpus frequency). Special
    tokens and ``[unusedN]`` placeholders, which occupy the lowest ids in
    BERT-style vocabularies, are skipped so the result contains actual
    sub-words rather than ``[PAD]``, ``[unused0]``, ...

    Args:
        model: Unused; kept so existing callers keep working.
        tokenizer: Object exposing ``get_vocab()`` (token -> id mapping) and,
            optionally, ``all_special_tokens``.
        top_k: Maximum number of tokens to return.

    Returns:
        A list of at most ``top_k`` token strings in ascending id order.
    """
    special_tokens = set(getattr(tokenizer, "all_special_tokens", ()))
    by_id = sorted(tokenizer.get_vocab().items(), key=lambda item: item[1])
    real_tokens = [
        tok
        for tok, _ in by_id
        if tok not in special_tokens
        # Reserved placeholder slots such as "[unused12]" carry no text.
        and not (tok.startswith("[unused") and tok.endswith("]"))
    ]
    return real_tokens[:top_k]
# Show a 20-token preview of the word list.
print(get_vocab(model, tokenizer, top_k=20))
