## GPT-Neo with Fine Tuning

In [2]:
# load and explore the poetry dataset
from datasets import load_dataset
import pandas as pd

In [4]:
# Load the poetry dataset
poetry_dataset = load_dataset("merve/poetry")

Repo card metadata block was not found. Setting CardData to empty.


In [8]:
# Convert to pandas 
train_df = pd.DataFrame(poetry_dataset['train'])

train_df.head()

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore


In [10]:
# Filter the dataset
renaissance_love = train_df[(train_df['age'] == 'Renaissance') & (train_df['type'] == 'Love')]

renaissance_love.shape

(243, 5)

In [28]:
# Save the filtered dataset
renaissance_love_poems = renaissance_love['content'].tolist()

In [30]:
# Clean and prepare the poetry texts
def clean_poem_text(text):
    """Clean poem text for training"""
    # Remove excessive whitespace and normalize line breaks
    text = text.replace('\
\
', '\
').replace('\
', '\
')
    # Remove multiple consecutive newlines
    import re
    text = re.sub(r'\
\s*\
', '\
\
', text)
    # Strip leading/trailing whitespace
    text = text.strip()
    return text

In [32]:
# Clean all poems
cleaned_poems = [clean_poem_text(poem) for poem in renaissance_love_poems]

In [34]:
cleaned_poems

['Why didst thou promise such a beauteous day,\r\nAnd make me travel forth without my cloak,\r\nTo let base clouds oertake me in my way,\r\nHiding thy bravery in their rotten smoke?\r\nTis not enough that through the cloud thou break,\r\nTo dry the rain on my storm-beaten face,\r\nFor no man well of such a salve can speak\r\nThat heals the wound and cures not the disgrace:\r\nNor can thy shame give physic to my grief;\r\nThough thou repent, yet I have still the loss:\r\nThe offenders sorrow lends but weak relief\r\nTo him that bears the strong offences cross.\r\n   Ah! but those tears are pearl which thy love sheds,\r\n   And they are rich and ransom all ill deeds.',
 'Weret aught to me I bore the canopy,\r\nWith my extern the outward honouring,\r\nOr laid great bases for eternity,\r\nWhich proves more short than waste or ruining;\r\nHave I not seen dwellers on form and favour\r\nLose all, and more, by paying too much rent,\r\nFor compound sweet forgoing simple savour,\r\nPitiful thriv

In [36]:
# Show statistics
poem_lengths = [len(content) for content in cleaned_poems]

print(f"Number of poems: {len(cleaned_poems)}")
print(f"Average poem length: {sum(poem_lengths) / len(poem_lengths):.1f} characters")
print(f"Shortest poem: {min(poem_lengths)} characters")
print(f"Longest poem: {max(poem_lengths)} characters")

Number of poems: 243
Average poem length: 863.2 characters
Shortest poem: 121 characters
Longest poem: 12638 characters


In [38]:
# Install PEFT (Parameter Efficient Fine-Tuning) for LoRA
%pip install peft==0.7.1
%pip install accelerate==0.25.0

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting peft==0.7.1
  Downloading peft-0.7.1-py3-none-any.whl.metadata (25 kB)
Downloading peft-0.7.1-py3-none-any.whl (168 kB)
   ---------------------------------------- 0.0/168.3 kB ? eta -:--:--
   ------------------------------------ --- 153.6/168.3 kB 4.5 MB/s eta 0:00:01
   ---------------------------------------- 168.3/168.3 kB 4.9 MB/s eta 0:00:00
Installing collected packages: peft
Successfully installed peft-0.7.1
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting accelerate==0.25.0
  Downloading accelerate-0.25.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
   ---------------------------------------- 0.0/265.7 kB ? eta -:--:--
   -------------------------------------- - 256.0/265.7 kB 5.2 MB/s eta 0:00:01
   ---------------------------------------- 265.7/265

In [40]:
# Set up LoRA fine-tuning configuration
from peft import LoraConfig, get_peft_model, TaskType
from transformers import TrainingArguments, Trainer
import torch




In [42]:
# Configure LoRA parameters
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Causal Language Modeling
    inference_mode=False,
    r=16,  # Rank - higher = more parameters but better quality
    lora_alpha=32,  # LoRA scaling parameter
    lora_dropout=0.1,  # Dropout for LoRA layers
    target_modules=["q_proj", "v_proj", "k_proj", "out_proj", "fc_in", "fc_out"]  # GPT-Neo specific modules
)

In [44]:
print("\
LoRA Configuration:")
print(f"  Task Type: {lora_config.task_type}")
print(f"  Rank (r): {lora_config.r}")
print(f"  Alpha: {lora_config.lora_alpha}")
print(f"  Dropout: {lora_config.lora_dropout}")
print(f"  Target modules: {lora_config.target_modules}"

SyntaxError: incomplete input (2467441758.py, line 7)

In [46]:
# Apply LoRA to GPT-Neo model
peft_model = get_peft_model(neo_model, lora_config)

NameError: name 'neo_model' is not defined

In [None]:
# Print trainable parameters
trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in peft_model.parameters())

print(f"\
Model Parameters After LoRA:")
print(f"  Trainable parameters: {trainable_params:,}")
print(f"  Total parameters: {total_params:,}")
print(f"  Percentage trainable: {100 * trainable_params / total_params:.2f}%")

In [None]:
# Show the model structure
print(f"\
LoRA Model Structure:")
print(peft_model.print_trainable_parameters())

In [None]:
# Create a custom dataset class for poetry training
from torch.utils.data import Dataset
import torch

In [None]:
class PoetryDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        
        # Tokenize the text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        # For causal language modeling, labels are the same as input_ids
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': encoding['input_ids'].flatten()
        }

In [None]:
# Create the dataset
train_dataset = PoetryDataset(training_texts, neo_tokenizer, max_length=512)

print(f"Dataset created with {len(train_dataset)} examples")
print(f"Max sequence length: 512 tokens")

In [None]:
# Test the dataset
sample = train_dataset[0]

print(f"\
Sample data shapes:")
print(f"  input_ids: {sample['input_ids'].shape}")
print(f"  attention_mask: {sample['attention_mask'].shape}")
print(f"  labels: {sample['labels'].shape}")