In [None]:
# ⬛ SET‑UP ⬛
!pip install -q datasets transformers sacrebleu
import inspect
import json
import os
import random
import re

import sacrebleu
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Checkpoint to fine-tune and the raw text corpus it is fine-tuned on.
MODEL_NAME = "gpt2"  # small; good for quick tests
CORPUS_PATH = "data/raw/toy_lang_dataset/text_corpus.txt"

# GPT-2 has no dedicated padding token, so the end-of-sequence token is reused
# for padding.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# ⬛ LOAD & BUILD PROMPTS ⬛
def wrap(line):
    """Embed one corpus line in the fixed dialogue-generation prompt template.

    The result is the ``<prompt>`` instruction header, a newline, ``line``,
    another newline, and the closing ``<end>`` marker.
    """
    template = "<prompt>Generate a culturally relevant dialogue:\n{}\n<end>"
    return template.format(line)
with open(CORPUS_PATH) as f:
    prompts = [wrap(l.strip()) for l in f if l.strip()]
ds = Dataset.from_dict({"text": prompts}).train_test_split(test_size=0.2, seed=42)

def tokenize(batch):
    """Tokenize the ``"text"`` column of a batch, truncating to 128 tokens."""
    texts = batch["text"]
    return tokenizer(texts, max_length=128, truncation=True)


# Tokenize both splits in batches and drop the raw text column afterwards.
ds = ds.map(tokenize, remove_columns=["text"], batched=True)

# ⬛ MODEL ⬛
# Start from the pretrained checkpoint named by MODEL_NAME.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# mlm=False selects the causal (next-token) language-modeling objective rather
# than masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The pip install above is unpinned, so pick whichever keyword the installed
# TrainingArguments actually accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

args = TrainingArguments(
    "models/gpt2",  # output directory for checkpoints
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_steps=20,
    learning_rate=5e-5,
    save_total_limit=1,  # keep only the most recent checkpoint on disk
    **{_eval_strategy_key: "epoch"},  # evaluate on ds["test"] after every epoch
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    data_collator=data_collator,
)
trainer.train()

# ⬛ BLEU ⬛
def eval_bleu():
    """Compute corpus-level BLEU of beam-search generations on ``ds["test"]``.

    Each test example's token ids are fed to the model as the prompt, up to 50
    new tokens are generated with 5-beam search, and the decoded output is
    scored against the decoded example text.

    Puts ``model`` into eval mode as a side effect.

    Returns:
        The sacreBLEU corpus score (``.score`` of the ``corpus_bleu`` result).

    NOTE(review): the reference is the same text that is fed in as the prompt,
    and ``generate`` echoes the prompt in its output, so this score is biased
    upward. A held-out target per example would be needed for a meaningful
    BLEU -- confirm the intended evaluation protocol.
    """
    model.eval()  # disable dropout so generation is deterministic
    device = model.device  # the Trainer may have moved the model to an accelerator

    preds, refs = [], []
    for ex in ds["test"]:
        # The dataset already stores token ids; wrap them in a batch of one
        # instead of passing them through the tokenizer again.
        input_ids = torch.tensor([ex["input_ids"]], device=device)
        with torch.no_grad():
            gen_ids = model.generate(
                input_ids,
                attention_mask=torch.ones_like(input_ids),
                # max_length counts the prompt too; prompts here can be up to
                # 128 tokens, so bound the number of *new* tokens instead.
                max_new_tokens=50,
                do_sample=False,
                num_beams=5,
                pad_token_id=tokenizer.eos_token_id,
            )[0]
        # sacreBLEU tokenizes internally, so it needs plain strings.
        preds.append(tokenizer.decode(gen_ids, skip_special_tokens=True))
        refs.append(tokenizer.decode(ex["input_ids"], skip_special_tokens=True))

    # corpus_bleu expects a list of reference *streams*, each aligned with the
    # hypotheses; there is a single reference per hypothesis here.
    return sacrebleu.corpus_bleu(preds, [refs]).score

bleu = eval_bleu()
print("BLEU:", bleu)

# Create the output directory first so a finished training/evaluation run is
# not lost to a FileNotFoundError when results/ does not exist yet.
os.makedirs("results", exist_ok=True)
with open("results/gpt2_bleu.json", "w", encoding="utf-8") as f:
    json.dump({"bleu": bleu}, f, indent=2)

# ⬛ GENERATE SAMPLE EXERCISE ⬛
prompt = "<prompt>Generate a fill‑in‑the‑blank exercise using common greetings in the language.<end>"
gen = model.generate(tokenizer(prompt, return_tensors="pt").input_ids,
                     max_len_
