# **Renaissance Poems Dataset**
### GPT-2 Base, LoRA, Beam Search




In [None]:
!pip install transformers peft torch datasets accelerate

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import os
# Disable Weights & Biases logging through an environment variable.
# NOTE(review): the TrainingArguments cell output in this notebook reports that
# WANDB_DISABLED is deprecated (removal announced for v5) in favour of `report_to`.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
import torch
import pandas as pd
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import numpy as np
import math

In [None]:
# Smallest GPT-2 checkpoint (about 124M parameters according to the count printed below)
model_name = "gpt2"

# Fetch the pretrained tokenizer, then the language-model weights for the same checkpoint
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPT-2 ships without a padding token, so the end-of-text token doubles as padding
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

print(f"✅ Loaded {model_name} with {model.num_parameters():,} parameters")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✅ Loaded gpt2 with 124,439,808 parameters


In [None]:
# Set up LoRA Configurations
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Causal language modeling
    inference_mode=False,          # Training mode
    r=16,                          # Rank: lower = fewer parameters
    lora_alpha=32,                 # Scaling factor
    lora_dropout=0.1,              # Dropout for regularization
    target_modules=["c_attn"],     # Target attention layers
)

# Wrap the base model with LoRA adapters.
# NOTE: this rebinds `model`, so re-running the cell without reloading the base model
# would wrap an already-wrapped model.
model = get_peft_model(model, lora_config)

# Count only the weights that require gradients. A plain num_parameters() call returns
# every parameter (frozen base weights included), which contradicted the "Trainable" label.
print(f"✅ LoRA applied- Trainable parameters: {model.num_parameters(only_trainable=True):,}")
model.print_trainable_parameters()

✅ LoRA applied- Trainable parameters: 125,029,632
trainable params: 589,824 || all params: 125,029,632 || trainable%: 0.4717




In [None]:
# Load processed data file

def load_text_file(file_path):
    """Read a UTF-8 text file and return the poems it contains.

    Poems are delimited by the literal marker '<|endoftext|>'. Each piece is
    stripped of surrounding whitespace, and pieces that end up empty are
    discarded.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    poems = []
    for piece in raw_text.split('<|endoftext|>'):
        cleaned = piece.strip()
        if cleaned:
            poems.append(cleaned)
    return poems

# Load training and validation data
# NOTE(review): these file names differ from the 'train_poems.txt' / 'val_poems.txt'
# names used in the commented-out alternative below — confirm they match the files on disk.
train_poems = load_text_file('traini_poems.txt')
val_poems = load_text_file('vali_poems.txt')

# Other Option: Mount Google Drive and adjust file path
#train_poems = load_text_file('folder_name/train_poems.txt')
#val_poems = load_text_file('folder_name/val_poems.txt')

# Report how many poems each split contains
print(f"✅ Loaded {len(train_poems)} training poems")
print(f"✅ Loaded {len(val_poems)} validation poems")

✅ Loaded 285 training poems
✅ Loaded 15 validation poems


In [None]:
# Tokenize data

def tokenize_function(examples):
    """Tokenize a batch of poems for causal language-model training.

    Args:
        examples: Batch dict handed over by `Dataset.map(..., batched=True)`;
            its 'text' entry holds the poem strings.

    Returns:
        The tokenizer output for the batch, one variable-length entry per
        poem, truncated to at most 1024 tokens.

    Padding and label creation are deliberately left to the data collator:
    - Padding here would stretch every poem to the longest one in the whole
      map batch, whereas the collator pads each training batch only to its
      own longest sequence.
    - `DataCollatorForLanguageModeling(mlm=False)` builds the labels from
      `input_ids` itself (masking padding positions), so a 'labels' column
      created here would be overwritten, and, being unpadded, would stop the
      collator from stacking the batch.
    - `return_tensors` is omitted because `datasets` stores plain lists
      regardless of the tensor type returned from the mapped function.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=1024,   # GPT-2 context limit
    )

# Wrap each split's poems in a Dataset and tokenize it in batched mode in one chained step
train_dataset = Dataset.from_dict({'text': train_poems}).map(tokenize_function, batched=True)
val_dataset = Dataset.from_dict({'text': val_poems}).map(tokenize_function, batched=True)

print(f"✅ Tokenization complete")

Map:   0%|          | 0/285 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

✅ Tokenization complete


In [None]:
# Set up data collator
# Batches the tokenized examples for the Trainer; mlm=False selects the causal
# (next-token) objective instead of masked-token prediction.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Doing causal LM, not masked LM
)

## Perplexity


*   Intrinsic evaluation metric that measures how well a model predicts the next word in a sequence, given a set of prior words

*   Lower perplexity indicates better performance, since it means that the model is confident and accurate in making these predictions

*   Can only be used to compare models that share the same tokenizer (vocabulary) and are evaluated on the same held-out text



In [None]:
# Define function for perplexity
def compute_metrics(eval_pred):
    """Derive mean token-level cross-entropy and perplexity from evaluation outputs.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            logits (last axis = vocabulary, second-to-last = sequence
            position); ``labels`` holds the target token ids, where -100
            marks positions to ignore. NumPy arrays are converted to tensors;
            other inputs are used as given.

    Returns:
        dict with "perplexity" (exp of the mean loss) and "eval_loss" (the
        mean loss), both as Python floats.
    """
    logits, targets = eval_pred

    # Wrap NumPy inputs as tensors; anything else passes through untouched
    if isinstance(logits, np.ndarray):
        logits = torch.from_numpy(logits)
    if isinstance(targets, np.ndarray):
        targets = torch.from_numpy(targets)

    # Logits at position t score the token at position t + 1, so drop the last
    # logit step and the first label to align them, then flatten both.
    vocab_size = logits.size(-1)
    next_token_logits = logits[..., :-1, :].reshape(-1, vocab_size)
    next_token_targets = targets[..., 1:].reshape(-1)

    # Mean cross-entropy over every target that is not the ignore marker (-100)
    mean_loss = torch.nn.functional.cross_entropy(
        next_token_logits,
        next_token_targets,
        ignore_index=-100,
        reduction='mean',
    )

    return {
        "perplexity": torch.exp(mean_loss).item(),
        "eval_loss": mean_loss.item(),
    }

In [None]:
# Configure Training Arguments

training_args = TrainingArguments(
    output_dir="./lora_poetry_model",      # Where to save model
    overwrite_output_dir=True,             # Overwrite if exists
    num_train_epochs=3,                    # Number of training epochs
    per_device_train_batch_size=2,         # Train batch size (adjust for GPU)
    per_device_eval_batch_size=2,          # Eval batch size
    warmup_steps=100,                      # Warmup steps
    weight_decay=0.01,                     # Weight decay for regularization
    logging_dir="./logs",                  # Directory for logs
    logging_steps=10,                      # Log every 10 steps
    eval_strategy="steps",                 # Evaluate every eval_steps
    eval_steps=50,                         # Evaluate every 50 steps
    save_steps=100,                        # Save every 100 steps (a multiple of eval_steps)
    save_total_limit=3,                    # Keep at most 3 checkpoints on disk
    load_best_model_at_end=True,           # Load best model at the end
    metric_for_best_model="perplexity",    # Use perplexity for best model
    greater_is_better=False,               # Lower perplexity is better
    # The string "none" switches every logging integration off. Passing None does NOT
    # disable reporting: it falls back to the library default.
    report_to="none",
    # Name the label column explicitly: the Trainer warns that it cannot infer it for a
    # PEFT-wrapped model, and compute_metrics (hence "perplexity") needs labels collected.
    label_names=["labels"],
    learning_rate=4e-4,                    # Learning rate
)

print(f"✅ Training configuration set")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


✅ Training configuration set


In [None]:
# Initialize the trainer

# Bundle the model, hyper-parameters, datasets, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # `processing_class` is the current name of the argument that receives the tokenizer;
    # the older `tokenizer=` keyword is deprecated and slated for removal.
    processing_class=tokenizer,
)

print(f"✅ Trainer initialized")

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


✅ Trainer initialized


In [None]:
# Train the model
# Runs the training loop configured in training_args; the table shown below the cell
# lists training loss, validation loss and perplexity at each evaluation step.
trainer.train()

print("\n✅ Training completed")


Step,Training Loss,Validation Loss,Perplexity
50,3.7212,3.784583,45.568729
100,3.7821,3.807861,46.625134
150,3.7186,3.782579,45.349892
200,3.635,3.766504,44.615501
250,3.8242,3.770515,44.872063
300,3.7318,3.754681,44.057369
350,3.7775,3.750897,43.900299
400,3.5855,3.748414,43.86018



✅ Training completed


In [None]:
# Save final model
# The trainer's model and the tokenizer files are written to the same directory
# so they can be loaded together later.
trainer.save_model("./lora_poetry_final")
tokenizer.save_pretrained("./lora_poetry_final")

print("✅ Model saved to './lora_poetry_final'")

✅ Model saved to './lora_poetry_final'


In [None]:
# Test the model with a sample generation
# Switch the model to evaluation mode before generating
model.eval()

# Get the device the model is on (read from its first parameter) so inputs can be moved there
device = next(model.parameters()).device
print(f"Model device: {device}")

def generate_poem(prompt, max_length=85):
    """Continue `prompt` with beam search and return only the newly generated text.

    Args:
        prompt: Opening text of the poem. A newline is appended before
            generation to encourage the model to start a new verse line.
        max_length: Upper bound on the total sequence length in tokens
            (prompt tokens included), passed straight to `model.generate`.

    Returns:
        The generated continuation, without the prompt, with surrounding
        whitespace stripped.

    Relies on the notebook-level `model`, `tokenizer` and `device`.
    """
    # Add a line break after the prompt to encourage poetic structure
    full_prompt = f"{prompt}\n"

    encoded = tokenizer(full_prompt, return_tensors="pt")
    inputs = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)
    prompt_token_count = inputs.shape[1]

    # Kept from the original for reproducibility; note that it resets the global torch RNG
    # on every call. Sampling is disabled below.
    torch.manual_seed(42)

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            attention_mask=attention_mask,
            max_length=max_length,
            num_beams=5,                  # Recommended 3-10, higher = more thorough but perhaps less creative
            early_stopping=True,
            no_repeat_ngram_size=3,       # Reduces repetition, 3 is balanced
            repetition_penalty=1.2,       # Also reduces repetition
            do_sample=False,              # Off for beam search
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Drop the prompt at the token level. Slicing the decoded string by len(prompt) assumes
    # that decoding reproduces the prompt character-for-character, which is not guaranteed
    # (e.g. when the decoder cleans up spacing), and would then cut at the wrong place.
    new_tokens = outputs[0][prompt_token_count:]
    poem = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    return poem


Model device: cuda:0


In [None]:
# Test with Short Prompts
print("\nTest with Short Prompts:")
print("-" * 50)

# Incomplete opening phrases for the model to continue
short_prompts = ["Thou art as fair as", "A sonnet for the lady of"]

# Generate one poem per prompt (default max_length) and print it under a numbered heading
for i, prompt in enumerate(short_prompts):
    print(f"\nPoem {i+1}:")
    print(f"Prompt: '{prompt}'")
    poem = generate_poem(prompt)
    print(f"Generated:\n{poem}")
    print("-" * 40)


Test with Short Prompts:
--------------------------------------------------

Poem 1:
Prompt: 'Thou art as fair as'
Generated:
'Twas the sun, and the moon, and all the stars;
And thou art the fairest of them all.
'Tis true, then, that thou art fairest,
Because thou art a fairest woman,
And I am thy fairest friend,
For thou art my fairest love, and I love thee.

'Now, then
----------------------------------------

Poem 2:
Prompt: 'A sonnet for the lady of'
Generated:
'Tis the season, and thou shalt see it.
'Twas the sun, and the moon,
And the stars, and all the gods,
Which are in heaven, and in the earth;
And thou shall see them, and behold them,
As thou sawest them, as thou sawst them.
But thou shalt not see them again,
----------------------------------------


In [None]:
# Test with Longer Prompts
print("\nTest with Longer Prompts:")
print("-" * 50)

# Each prompt supplies the first two lines of a poem
longer_prompts = [
    "To the fairest maid I ever did behold,\nThy beauty shames the sun, so bright and bold",
    "The moon doth sail the night on silver wings,\nWhile mortals dream beneath its gentle light,",
]

for i, prompt in enumerate(longer_prompts):
    print(f"\nExtended Poem {i+1}:")
    print(f"Prompt: '{prompt}'")
    print("-" * 45)

    # A failure on one prompt is reported and the loop moves on to the next prompt
    try:
        poem = generate_poem(prompt, max_length=85)
        # Show a placeholder when the continuation is empty or whitespace only
        print(f"{poem}" if poem.strip() else "(No additional text generated)")
    except Exception as e:
        print(f"Error generating poem: {e}")

    print("-" * 45)


Test with Longer Prompts:
--------------------------------------------------

Extended Poem 1:
Prompt: 'To the fairest maid I ever did behold,
Thy beauty shames the sun, so bright and bold'
---------------------------------------------
That I could not see it.
But when I saw it, I knew that it was a beautiful thing.
And when I looked at it again, I said,
"You know that I love you?"
"Yes, I do."
"Then why do you love me?"
---------------------------------------------

Extended Poem 2:
Prompt: 'The moon doth sail the night on silver wings,
While mortals dream beneath its gentle light,'
---------------------------------------------
And when the sun doth shine upon the moon,
Or when the stars do not shine,
Nor when the moon does not rise, nor when the sky does not fall,
But when the heavens do not move,
Neither when the sea does not flow, neither when the wind does not blow,
---------------------------------------------
