### GPT-Neo 125M

#### model_name: EleutherAI/gpt-neo-125M
#### description: Open-source GPT alternative, trained on diverse text
#### pros: More diverse training data, Open source, Good for creative tasks
#### cons: Larger than DistilGPT-2, May need more compute

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset, concatenate_datasets # Keep concatenate_datasets if you plan to re-add Gutenberg later
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from safetensors.torch import load_file # For loading adapter_model.safetensors
import os




In [3]:
# --- 1. Configuration ---
# --- CHANGED MODEL_NAME TO GPT-NEO ---
MODEL_NAME = "EleutherAI/gpt-neo-1.3B"
DATASET_NAME = "merve/poetry"
OUTPUT_DIR = "./gpt_neo_renaissance_love_poems_lora" # Changed output directory for GPT-Neo
LEARNING_RATE = 2e-4
BATCH_SIZE = 4 # Adjust based on your GPU VRAM
NUM_EPOCHS = 3 # Can increase if desired, but watch for overfitting
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

In [4]:
# --- 2. Load and Filter Dataset ---
print(f"Loading dataset: {DATASET_NAME}")
dataset = load_dataset(DATASET_NAME)

def filter_renaissance_love(example):
    return example['age'] == 'Renaissance'

filtered_dataset = dataset['train'].filter(filter_renaissance_love)
print(f"Original dataset size: {len(dataset['train'])}")
print(f"Filtered dataset size (Renaissance Love Poems): {len(filtered_dataset)}")

Loading dataset: merve/poetry


Repo card metadata block was not found. Setting CardData to empty.


Filter:   0%|          | 0/573 [00:00<?, ? examples/s]

Original dataset size: 573
Filtered dataset size (Renaissance Love Poems): 315


In [5]:
# --- 3. Data Preparation (Tokenization) ---
print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# GPT-Neo also typically benefits from setting a pad_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    # Use 'content' as confirmed from dataset inspection
    return tokenizer(examples['content'], truncation=True, max_length=512)

tokenized_dataset = filtered_dataset.map(
    tokenize_function,
    batched=True,
    # Corrected column names based on previous error
    remove_columns=['author', 'content', 'poem name', 'age', 'type']
)

# Create data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Loading tokenizer: EleutherAI/gpt-neo-1.3B


Map:   0%|          | 0/315 [00:00<?, ? examples/s]

In [6]:
# --- 4. Load Model and Configure LoRA ---
print(f"Loading model: {MODEL_NAME}")
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Configure LoRA
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Loading model: EleutherAI/gpt-neo-1.3B
trainable params: 1,572,864 || all params: 1,317,148,672 || trainable%: 0.11941431012580485


In [7]:
# --- 5. Fine-tuning ---
print("Starting fine-tuning...")

# Ensure output directory exists for Trainer
os.makedirs(OUTPUT_DIR, exist_ok=True)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True, # Set to True to start fresh if directory exists
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=50,
    save_steps=500, # Save checkpoint every 500 steps
    save_total_limit=2, # Save only the last 2 checkpoints
    prediction_loss_only=True,
    fp16=torch.cuda.is_available(), # Enable mixed precision training if CUDA is available
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Add a try-except block to ensure saving even if training is interrupted
try:
    trainer.train()
    # Save the LoRA adapters after successful training
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"Fine-tuning complete! Model and tokenizer saved to {OUTPUT_DIR}")
except Exception as e:
    print(f"An error occurred during training: {e}")
    print("Attempting to save current model state...")
    try:
        model.save_pretrained(OUTPUT_DIR)
        tokenizer.save_pretrained(OUTPUT_DIR)
        print(f"Partial model and tokenizer saved to {OUTPUT_DIR}")
    except Exception as save_e:
        print(f"Error saving partial model: {save_e}")

Starting fine-tuning...


Step,Training Loss
50,3.1587
100,3.1446
150,3.1187
200,3.0622


Fine-tuning complete! Model and tokenizer saved to ./gpt_neo_renaissance_love_poems_lora


In [25]:
# --- 6. Text Generation (Evaluation) ---
print("\n--- Generating Sample Text ---")

# Load the base model and then apply the saved LoRA adapters
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
lora_model = get_peft_model(base_model, lora_config)

# Load the state dictionary from the .safetensors file
adapter_path = os.path.join(OUTPUT_DIR, "adapter_model.safetensors")

if not os.path.exists(adapter_path):
    print(f"Error: LoRA adapter file not found at {adapter_path}.")
    print("Please ensure fine-tuning completed successfully and the file was saved.")
    # Exit or handle the error more gracefully if you are running this as a script
    exit()

lora_state_dict = load_file(adapter_path, device="cpu")
lora_model.load_state_dict(lora_state_dict, strict=False)

# Make sure the model is in evaluation mode
lora_model.eval()

#prompt = "O, my love, you are like"
prompt = "O fairest rose that bloomed in summer's garden,"


# Encode with attention_mask
inputs = tokenizer.encode_plus(
    prompt,
    return_tensors='pt',
    padding='longest',
    truncation=True,
    max_length=tokenizer.model_max_length
)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Move to GPU if available
if torch.cuda.is_available():
    lora_model.to('cuda')
    input_ids = input_ids.to('cuda')
    attention_mask = attention_mask.to('cuda')

print(f"Prompt: {prompt}")

# Generate text using beam search (optimized for coherence)
output = lora_model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=50,
    num_return_sequences=1,
    no_repeat_ngram_size=4,
    repetition_penalty=1.2,
    num_beams=5,          # Using beam search for higher quality
    do_sample=False,      # Turn off sampling for pure beam search
    pad_token_id=tokenizer.eos_token_id
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Poem:\n", generated_text)

print("\n--- Project Complete ---")


--- Generating Sample Text ---
Prompt: O fairest rose that bloomed in summer's garden,
Generated Poem:
 O fairest rose that bloomed in summer's garden,
     The fairest flower that ever grew,
  And the fairest of them all, the fairest rose,
  Was the fairest that ever bloomed

--- Project Complete ---
