In [None]:
!nvidia-smi

!pip -q install -U transformers datasets peft accelerate einops
!pip -q install -U evaluate textstat gradio

Mon Oct 20 07:48:26 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   43C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Mount Google Drive at /content/drive; the raw sonnet text is read from
# /content/drive/MyDrive/... further down.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from pathlib import Path
import textwrap, json, re
import unicodedata
import numpy as np
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from datasets import Dataset, DatasetDict
from datasets import load_dataset
import math, gc, time, random, torch
from typing import Dict, Any, List
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

In [None]:
# Location of the raw sonnets text on the mounted Drive.
sonnets_txt_path = Path("/content/drive/MyDrive/bardify/shakespeares-sonnets_TXT_FolgerShakespeare.txt")
# errors="ignore" silently drops any bytes that are not valid UTF-8 instead of raising.
sonnets_txt = sonnets_txt_path.read_text(encoding="utf-8", errors="ignore")

# Cleaning the data

In [None]:
def remove_start_end(txt):
    """Trim the text before the first sonnet heading and after the sonnets proper.

    The start is the first line consisting only of the number ``1`` (optionally
    surrounded by whitespace). The end is the first occurrence, matched
    case-insensitively, of "Two Sonnets from The Passionate Pilgrim".

    Args:
        txt: Raw text of the sonnets file.

    Returns:
        The text from the "1" heading up to (not including) the appendix title.
        If no "1" heading line exists the start is left untouched; if the
        appendix title is absent the end is left untouched.
    """
    first_heading = re.search(r"(?m)^\s*1\s*$", txt)
    # re.search returns None when there is no heading line; calling .start()
    # on it would raise AttributeError, so only slice when a match exists.
    if first_heading is not None:
        txt = txt[first_heading.start():]
    # Keep only what precedes the appendix title (element 0 of the split).
    txt = re.split(r"\bTwo\s+Sonnets\s+from\s+The\s+Passionate\s+Pilgrim\b",
                   txt, flags=re.I)[0]
    return txt

# Stage 1 of cleaning: drop everything outside the numbered sonnets.
removed_st_end = remove_start_end(sonnets_txt)

In [None]:
def normalise_indents(txt):
    """Normalise line endings, strip per-line padding and collapse blank runs.

    Steps, in order:
      1. Convert ``\\r\\n`` and lone ``\\r`` to ``\\n``.
      2. Remove leading and trailing spaces/tabs on every line.
      3. Collapse three or more consecutive newlines into exactly two.
      4. Strip leading/trailing whitespace from the whole text.

    Per-line stripping runs before the blank-line collapse. Otherwise a line
    containing only spaces or tabs hides a run of blank lines from the collapse
    and then becomes empty itself, leaving three or more newlines in the result.

    Args:
        txt: Text to normalise.

    Returns:
        The normalised text.
    """
    txt = re.sub(r"\r\n?", "\n", txt)
    txt = re.sub(r"(?m)^[ \t]+", "", txt)
    txt = re.sub(r"(?m)[ \t]+$", "", txt)
    txt = re.sub(r"\n{3,}", "\n\n", txt)
    return txt.strip()

# Stage 2 of cleaning: uniform newlines, no per-line padding, no long blank runs.
normalised = normalise_indents(removed_st_end)

In [None]:
# Em dash (U+2014) used to replace ASCII double hyphens.
dash = "\u2014"

def hyphen_to_dash(txt):
    """NFC-normalise the text and turn ``--`` into an em dash.

    Rules, applied in order:
      1. ``--`` between two non-space characters becomes a bare em dash.
      2. ``--`` with one space/tab on each side becomes `` — `` (space, dash, space).
      3. ``--`` at the start of a line becomes a bare em dash.
      4. ``--`` at the end of a line (with any surrounding spaces/tabs) becomes
         `` —`` (single space, dash).

    Only spaces and tabs are treated as padding around the hyphens. Matching
    ``\\s`` there would also consume newlines, which merges neighbouring verse
    lines into one (``"foo --\\nbar"``) or swallows a blank separator line
    (``"foo--\\n\\nbar"``).

    Args:
        txt: Text that may contain double hyphens.

    Returns:
        The text with the same line structure and ``--`` replaced by em dashes.
    """
    txt = unicodedata.normalize("NFC", txt)
    txt = re.sub(r'(?<=\S)--(?=\S)', dash, txt)
    txt = re.sub(r'[ \t]--[ \t]', f' {dash} ', txt)
    # Line-start rule runs before the line-end rule so that a line consisting
    # only of "--" becomes a bare dash rather than gaining a leading space.
    txt = re.sub(r'(?m)^--', dash, txt)
    txt = re.sub(r'(?m)[ \t]*--[ \t]*$', f' {dash}', txt)
    return txt

# Stage 3 of cleaning: typographic dashes instead of "--".
dashes_added = hyphen_to_dash(normalised)

In [None]:
def separate_sonnets(txt):
    """Split numbered text into ``(number, body)`` pairs, one per sonnet.

    A heading is a line containing only a 1-3 digit number (optionally padded
    with whitespace). All lines up to the next heading belong to that sonnet.
    Whitespace runs inside each line are collapsed to a single space, and the
    joined body is stripped. Lines before the first heading are ignored, and
    headings with an empty body are skipped.

    Args:
        txt: Cleaned text containing numeric heading lines.

    Returns:
        A list of ``(int, str)`` tuples in order of appearance.
    """
    heading_re = re.compile(r"^\s*(\d{1,3})\s*$")
    sonnets = []
    curr_num = None
    curr_block = []

    def add_curr_sonnet():
        # Flush the sonnet collected so far, if it has a heading and a non-empty body.
        if curr_num is not None and curr_block:
            body = "\n".join(curr_block).strip()
            if body:
                sonnets.append((curr_num, body))

    # Iterate over the argument, not a notebook global, so the function works
    # on whatever text the caller passes in.
    for line in txt.splitlines():
        new_heading = heading_re.match(line)
        if new_heading:
            add_curr_sonnet()
            curr_num = int(new_heading.group(1))
            curr_block = []
        elif curr_num is not None:
            curr_block.append(re.sub(r"\s+", " ", line).strip())

    # The last sonnet has no following heading to trigger a flush.
    add_curr_sonnet()
    return sonnets

# List of (sonnet number, sonnet body) pairs used by every later stage.
separated = separate_sonnets(dashes_added)

In [None]:
# Join the sonnet bodies (numbers dropped) with one blank line between sonnets.
corpus = "\n\n".join([sonnet for _, sonnet in separated]).strip()
print("Corpus size (chars):", len(corpus))

Corpus size (chars): 93621


# Save cleaned sonnets to file

In [None]:
# Write the whole cleaned corpus to the runtime's local disk (/content), creating
# the directory if needed.
Path("/content/corpus").mkdir(parents=True, exist_ok=True)
(Path("/content/corpus") / "all_sonnets.txt").write_text(corpus, encoding="utf-8")

print("Saved:", "/content/corpus/all_sonnets.txt")

Saved: /content/corpus/all_sonnets.txt


# Train/validation split (10-fold cross-validation)

In [None]:
# Split at sonnet granularity so a single poem never straddles train and validation.
sonnets = [sonnet for _, sonnet in separated]
idx = np.arange(len(sonnets))
# Shuffled 10-fold split with a fixed random_state.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

BASE = Path("/content/corpus_cv")
BASE.mkdir(parents=True, exist_ok=True)

# fold_paths[k - 1] holds the (train.txt, valid.txt) path pair of fold k (folds are 1-based).
fold_paths = []
for fold, (tr, va) in enumerate(kf.split(idx), 1):
    fdir = BASE / f"f{fold}"
    fdir.mkdir(parents=True, exist_ok=True)
    # Sonnets are separated by a blank line; read_poems() splits on the same separator.
    (fdir / "train.txt").write_text("\n\n".join(sonnets[i] for i in tr), encoding="utf-8")
    (fdir / "valid.txt").write_text("\n\n".join(sonnets[i] for i in va), encoding="utf-8")
    fold_paths.append((str(fdir / "train.txt"), str(fdir / "valid.txt")))

print(f"Prepared such folds:\n")
for fold_path in fold_paths:
    print(f"{fold_path}\n")

Prepared such folds:

('/content/corpus_cv/f1/train.txt', '/content/corpus_cv/f1/valid.txt')

('/content/corpus_cv/f2/train.txt', '/content/corpus_cv/f2/valid.txt')

('/content/corpus_cv/f3/train.txt', '/content/corpus_cv/f3/valid.txt')

('/content/corpus_cv/f4/train.txt', '/content/corpus_cv/f4/valid.txt')

('/content/corpus_cv/f5/train.txt', '/content/corpus_cv/f5/valid.txt')

('/content/corpus_cv/f6/train.txt', '/content/corpus_cv/f6/valid.txt')

('/content/corpus_cv/f7/train.txt', '/content/corpus_cv/f7/valid.txt')

('/content/corpus_cv/f8/train.txt', '/content/corpus_cv/f8/valid.txt')

('/content/corpus_cv/f9/train.txt', '/content/corpus_cv/f9/valid.txt')

('/content/corpus_cv/f10/train.txt', '/content/corpus_cv/f10/valid.txt')



# Tokenisation

In [None]:
# Base checkpoint name; also passed to AutoModelForCausalLM.from_pretrained in the training cell.
train_model = "gpt2-medium"
tokeniser = AutoTokenizer.from_pretrained(train_model)

# If the tokenizer defines no pad token, reuse the end-of-text token so batches can be padded.
if tokeniser.pad_token is None:
    tokeniser.pad_token = tokeniser.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def token_len(text):
    """Return the number of tokens the notebook's `tokeniser` yields for `text`.

    Special tokens are not added, so the count covers the text itself only.
    """
    encoding = tokeniser(text, add_special_tokens=False)
    token_ids = encoding["input_ids"]
    return len(token_ids)

# (sonnet number, token count) for every sonnet; the maximum informs the block size used below.
lengths = [(num, token_len(txt)) for num, txt in separated]
max_sonnet_num, max_token_seq = max(lengths, key=lambda x: x[1])

print(f"Max token sequence length is {max_token_seq} (Sonnet {max_sonnet_num})")

Max token sequence length is 186 (Sonnet 99)


In [None]:
# Maximum tokens per training example; get_folds() truncates to this length.
# Presumably chosen to sit just above the longest sonnet measured above plus
# the "\n" + EOS suffix appended during tokenisation — confirm if the corpus changes.
BLOCK_SIZE = 192
# Same pad-token fallback as in the tokenizer cell; repeated so this cell also
# works when run on its own.
if tokeniser.pad_token is None:
    tokeniser.pad_token = tokeniser.eos_token

# mlm=False selects causal-LM collation (labels derived from input_ids).
# NOTE(review): with pad_token == eos_token the collator presumably masks every
# EOS position in the labels along with the padding, so the model may never be
# trained to emit end-of-text — confirm against the DataCollatorForLanguageModeling docs.
collator = DataCollatorForLanguageModeling(tokenizer=tokeniser, mlm=False)

In [None]:
def read_poems(path: str):
    """Load a UTF-8 text file and return its poems as a list of strings.

    The stripped file content is split on blank lines (``"\\n\\n"``); chunks
    that are empty or whitespace-only are discarded, the rest are returned
    unchanged and in file order.
    """
    content = Path(path).read_text(encoding="utf-8")
    poems = []
    for chunk in content.strip().split("\n\n"):
        if chunk.strip():
            poems.append(chunk)
    return poems

In [None]:
def get_folds(fold_k):
    """Build the tokenised train/validation datasets for one cross-validation fold.

    Args:
        fold_k: 1-based fold number; must satisfy ``1 <= fold_k <= len(fold_paths)``.

    Returns:
        A ``DatasetDict`` with ``"train"`` and ``"validation"`` splits, each
        holding a single ``input_ids`` column.

    Raises:
        IndexError: If ``fold_k`` is outside the valid range.
    """
    # Reject 0 and negative numbers explicitly: Python's negative indexing would
    # otherwise silently return a different fold (fold_k=0 -> the last fold).
    if not 1 <= fold_k <= len(fold_paths):
        raise IndexError(
            f"fold_k must be between 1 and {len(fold_paths)}, got {fold_k}"
        )

    train_path, valid_path = fold_paths[fold_k - 1]
    train_docs = read_poems(train_path)
    valid_docs = read_poems(valid_path)

    raw = DatasetDict({
        "train": Dataset.from_dict({"text": train_docs}),
        "validation": Dataset.from_dict({"text": valid_docs}),
    })

    def tokenize_chunk(batch):
        # Terminate every poem with a newline followed by the EOS token text.
        texts = [t + ("\n" + (tokeniser.eos_token or "")) for t in batch["text"]]
        # Anything beyond BLOCK_SIZE tokens is returned as extra rows rather
        # than discarded (return_overflowing_tokens=True).
        enc = tokeniser(
            texts,
            add_special_tokens=False,
            truncation=True,
            max_length=BLOCK_SIZE,
            return_overflowing_tokens=True,
            return_attention_mask=False,
        )
        return {"input_ids": enc["input_ids"]}

    # The "text" column is dropped because overflow can change the row count.
    return raw.map(tokenize_chunk, batched=True, remove_columns=["text"])

In [None]:
# Smoke check: build fold 1 and show the resulting split sizes.
lm = get_folds(1)
print(lm)

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 138
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 16
    })
})


# Training

In [None]:
# Train on the GPU when one is visible to torch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Keys of the hyper-parameter dict consumed by train_eval_one_fold_with().
HP_LR, HP_R, HP_ALPHA, HP_DROPOUT = "lr", "r", "alpha", "dropout"
HP_BATCH, HP_GACC, HP_EPOCHS = "batch_size", "grad_accum", "epochs"

def train_eval_one_fold_with(h, fold_k):
    """Fine-tune a fresh LoRA adapter on one fold and return validation perplexity.

    Args:
        h: Hyper-parameter mapping. Required keys: ``HP_LR``, ``HP_R``,
            ``HP_ALPHA``, ``HP_DROPOUT``. Optional keys with defaults:
            ``HP_BATCH`` (2), ``HP_GACC`` (8), ``HP_EPOCHS`` (3).
        fold_k: 1-based fold number, forwarded to ``get_folds``.

    Returns:
        ``exp(eval_loss)`` from the final evaluation, or NaN when the
        evaluation result has no ``"eval_loss"`` entry.
    """
    lm_dsets = get_folds(fold_k)

    # Bound up front so the cleanup in `finally` is valid even when loading fails.
    model = None
    trainer = None
    try:
        model = AutoModelForCausalLM.from_pretrained(
            train_model,
            dtype=torch.float16 if device == "cuda" else torch.float32
        ).to(device)

        lora_cfg = LoraConfig(
            r=h[HP_R],
            lora_alpha=h[HP_ALPHA],
            lora_dropout=h[HP_DROPOUT],
            target_modules=["c_attn", "c_fc", "c_proj"],
            bias="none",
            task_type="CAUSAL_LM",
            fan_in_fan_out=True
        )
        model = get_peft_model(model, lora_cfg)
        bs = h.get(HP_BATCH, 2)
        ga = h.get(HP_GACC, 8)
        ne = h.get(HP_EPOCHS, 3)

        args = TrainingArguments(
            output_dir=f"/content/bardify_cv_search/f{fold_k}",
            per_device_train_batch_size=bs,
            per_device_eval_batch_size=bs,
            gradient_accumulation_steps=ga,

            # A fold trains for only a few dozen optimizer steps (about 138
            # examples, effective batch 16, 3 epochs), so step intervals of
            # 100/200 never fire. Per-epoch logging and evaluation do.
            eval_strategy="epoch",
            logging_strategy="epoch",
            save_strategy="no",

            learning_rate=h[HP_LR],
            num_train_epochs=ne,
            weight_decay=0.01,
            report_to=[],

            lr_scheduler_type="cosine",
            # Warm-up is given only as a ratio. A positive warmup_steps takes
            # precedence over warmup_ratio, and the previous value of 100
            # exceeded the total step count, so the learning rate never
            # reached its configured peak.
            warmup_ratio=0.10,
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=lm_dsets["train"],
            eval_dataset=lm_dsets["validation"],
            data_collator=collator,
        )

        trainer.train()
        ev = trainer.evaluate()
        return math.exp(ev["eval_loss"]) if "eval_loss" in ev else float("nan")
    finally:
        # Release the model between folds, including when training raised, so
        # the next fold does not start with the previous model still allocated.
        del trainer, model
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()

In [None]:
def sample_hparams():
    """Return the hyper-parameter configuration for one search trial.

    Despite the name nothing is sampled: every call returns a new dict holding
    the same fixed values, keyed by the strings that the training function
    looks up.
    """
    return dict(
        lr=1e-4,
        r=64,
        alpha=128,
        dropout=0.05,
        batch_size=8,
        grad_accum=2,
        epochs=3,
    )

# Number of hyper-parameter configurations to evaluate; each one is trained on every fold.
trials_num = 1
folds = list(range(1, len(fold_paths) + 1))

# One record per trial, appended in execution order.
search_results = []

t_start = time.time()
for t in range(1, trials_num + 1):
    h = sample_hparams()
    print(f"\n=== Trial {t}/{trials_num} ===  {h}")

    perplexities = []
    for k in folds:
        print(f"  Fold {k} ...", end="", flush=True)
        ppl_k = train_eval_one_fold_with(h, k)
        perplexities.append(float(ppl_k))
        print(f" PPL={ppl_k:.3f}")

    mean_perplexity = sum(perplexities)/len(perplexities)
    # Population standard deviation (divides by the number of folds, not n - 1).
    std_perplexity  = (sum((x-mean_perplexity)**2 for x in perplexities)/len(perplexities))**0.5

    rec = {
        "trial": t,
        "hparams": h,
        "folds": folds,
        "perplexities": [round(x, 6) for x in perplexities],
        "mean_ppl": round(mean_perplexity, 6),
        "std_ppl": round(std_perplexity, 6),
        # NOTE(review): t_start is set once before the trial loop, so this is the
        # time elapsed since the search began, not the duration of this trial —
        # confirm that is intended before comparing trials by it.
        "seconds": round(time.time() - t_start, 1),
    }
    search_results.append(rec)
    print(f"==> Trial {t}: mean PPL = {mean_perplexity:.3f}  (std {std_perplexity:.3f})")

# Lower mean perplexity is better, so an ascending sort puts the best trial first.
search_results_sorted = sorted(search_results, key=lambda r: r["mean_ppl"])
print("\nTop 3 configs (by mean PPL):")
for r in search_results_sorted[:3]:
    print(r["hparams"], "mean:", r["mean_ppl"], "std:", r["std_ppl"])


=== Trial 1/1 ===  {'lr': 0.0001, 'r': 64, 'alpha': 128, 'dropout': 0.05, 'batch_size': 8, 'grad_accum': 2, 'epochs': 3}
  Fold 1 ...

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


 PPL=82.162
  Fold 2 ...

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


 PPL=79.056
  Fold 3 ...

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


 PPL=76.973
  Fold 4 ...

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


 PPL=81.864
  Fold 5 ...

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


 PPL=73.360
  Fold 6 ...

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


 PPL=68.819
  Fold 7 ...

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


 PPL=73.950
  Fold 8 ...

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


 PPL=76.942
  Fold 9 ...

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


 PPL=66.350
  Fold 10 ...

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


 PPL=82.415
==> Trial 1: mean PPL = 76.189  (std 5.293)

Top 3 configs (by mean PPL):
{'lr': 0.0001, 'r': 64, 'alpha': 128, 'dropout': 0.05, 'batch_size': 8, 'grad_accum': 2, 'epochs': 3} mean: 76.189156 std: 5.293379


# Generation

In [None]:
# NOTE(review): this import sits mid-notebook; AutoModelForCausalLM is already
# imported in the imports cell, only `pipeline` is new here.
from transformers import pipeline, AutoModelForCausalLM

# NOTE(review): no cell visible here writes /content/bardify_lora (the CV runs
# use save_strategy="no"), so this presumably relies on a model saved elsewhere —
# confirm, otherwise this cell fails after Restart & Run All.
gen_model = AutoModelForCausalLM.from_pretrained("/content/bardify_lora").to(device)
# The pipeline takes a device index: 0 for the GPU, -1 for CPU.
pipe = pipeline("text-generation", model=gen_model, tokenizer=tokeniser, device=0 if device=="cuda" else -1)

def make_prompt(topic):
    """Build the one-line verse prompt that introduces `topic`.

    Surrounding whitespace is removed from the topic; the returned prompt ends
    with a newline.
    """
    theme = topic.strip()
    return f"Upon the theme of '{theme}', in lofty measure write,\n"

def generate_poem(topic="time and roses", max_new_tokens=160, temperature=0.9, top_p=0.92, repetition_penalty=1.07):
    """Sample a continuation of the topic prompt and return it without the prompt.

    Args:
        topic: Theme inserted into the prompt by ``make_prompt``.
        max_new_tokens: Passed to the generation pipeline unchanged.
        temperature: Converted to ``float`` and passed to the pipeline.
        top_p: Converted to ``float`` and passed to the pipeline.
        repetition_penalty: Converted to ``float`` and passed to the pipeline.

    Returns:
        The first generated text with the leading prompt characters removed
        and surrounding whitespace stripped.
    """
    prompt = make_prompt(topic)
    sampling_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=float(temperature),
        top_p=float(top_p),
        repetition_penalty=float(repetition_penalty),
        eos_token_id=tokeniser.eos_token_id,
        # The EOS id doubles as the padding id during generation.
        pad_token_id=tokeniser.eos_token_id,
    )
    candidates = pipe(prompt, **sampling_kwargs)
    full_text = candidates[0]["generated_text"]
    # Drop the first len(prompt) characters so only the continuation is returned.
    continuation = full_text[len(prompt):]
    return continuation.strip()

# Quick check that generation runs end to end.
print(generate_poem("table"))

In [None]:
# Second draw for the same topic; generation samples (do_sample=True), so the
# text is expected to differ between runs.
print(generate_poem("table"))

In [None]:
def to_sonnet(text, target_lines = 14, max_width = 62):
    """Lay out free text as exactly `target_lines` lines of at most `max_width` characters.

    The text is first split after ``. ! ? : ; ,`` when followed by whitespace.
    Each piece is then wrapped at whitespace to at most `max_width` characters.
    Surplus lines are cut off; missing lines are filled with empty strings.

    A whitespace-free run longer than `max_width` is hard-split into
    `max_width`-sized chunks. The wrap pattern previously had no such fallback,
    so the regex scan skipped the leading characters of the run and they were
    silently lost.

    Args:
        text: Text to lay out.
        target_lines: Number of lines in the result.
        max_width: Maximum characters per line.

    Returns:
        A string of `target_lines` lines joined with ``"\\n"``.
    """
    parts = re.split(r'(?<=[.!?:;,])\s+', text)
    # First alternative: break at whitespace or end of piece. Second
    # alternative: hard break, used only when no whitespace break is possible.
    wrap_re = re.compile(
        rf'.{{1,{max_width}}}(?:\s+|$)|.{{1,{max_width}}}'
    )
    lines = []
    for p in parts:
        p = p.strip()
        if p:
            lines.extend(wrap_re.findall(p))
    lines = [l.strip() for l in lines if l.strip()]
    if len(lines) >= target_lines:
        lines = lines[:target_lines]
    else:
        lines += [""] * (target_lines - len(lines))
    return "\n".join(lines)

# Generate a shorter sample and lay it out with to_sonnet's default 14 lines.
sample = generate_poem("eternal summer", max_new_tokens=140)
print("\nResulting sonnet\n", to_sonnet(sample))