In [8]:
! pip install numpy pandas tensorflow nltk requests beautifulsoup4



In [9]:
! pip install transformers torch datasets




In [10]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
nltk.download('punkt')


import requests
from bs4 import BeautifulSoup
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Load the cleaned combined dataset
df = pd.read_csv('/content/all_poets_cleaned_poems.csv')

# Keep only real poems. A missing 'Cleaned_Content' value would otherwise be cast
# by astype(str) to the literal text "nan" and end up in the training corpus, and
# a blank entry would produce two separator tokens in a row.
poem_texts = df['Cleaned_Content'].dropna().astype(str)
poem_texts = poem_texts[poem_texts.str.strip() != ""]

# Combine all poems into a single string, with the <|endoftext|> separator token
# between consecutive poems
all_text = "<|endoftext|>".join(poem_texts.tolist())

# Save the formatted text file
with open("poetry_finetune_data.txt", "w", encoding="utf-8") as f:
    f.write(all_text)

print("Poetry data prepared and saved as 'poetry_finetune_data.txt'.")


Poetry data prepared and saved as 'poetry_finetune_data.txt'.


In [12]:
import pickle

# Load the cleaned dataset
all_poems = pd.read_csv('/content/all_poets_cleaned_poems.csv')

# Normalise the text column so the tokenizer only ever receives strings.
# Missing poems become empty strings (instead of being dropped) so that
# `sequences` stays aligned row-for-row with `all_poems`.
poem_texts = all_poems['Cleaned_Content'].fillna('').astype(str)

# Build the word-level vocabulary from the poems.
# NOTE: the GPT-2 fine-tuning cell later rebinds the name `tokenizer` to the
# GPT-2 tokenizer, so the pickle written below is the lasting copy of this one.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(poem_texts)

# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(poem_texts)

# Save the tokenizer
with open('poetry_tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print("Tokenization complete. Tokenizer saved as 'poetry_tokenizer.pkl'.")


Tokenization complete. Tokenizer saved as 'poetry_tokenizer.pkl'.


In [14]:
# Checkpoint to start from ("gpt2-medium" or "gpt2-large" are the larger variants)
model_name = "gpt2"

# Fetch the tokenizer and the language-model weights for that checkpoint
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Size the model's token embeddings to the tokenizer's vocabulary, in case
# tokens have been added to it
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

# Load the dataset for GPT-2 fine-tuning
def load_dataset(file_path, tokenizer, block_size=128, overwrite_cache=False):
    """Build a ``TextDataset`` for language-model fine-tuning from a text file.

    Args:
        file_path: Path of the plain-text file holding the training corpus.
        tokenizer: Tokenizer used to encode the file's contents.
        block_size: Block length handed to ``TextDataset`` (default 128).
        overwrite_cache: Forwarded to ``TextDataset``. Pass True after the
            text file has been regenerated so that features cached by an
            earlier run are rebuilt rather than reused. Defaults to False,
            which keeps the previous behaviour.

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        overwrite_cache=overwrite_cache,
    )

# Training examples built from the prepared poetry text file
train_dataset = load_dataset("poetry_finetune_data.txt", tokenizer)

# Batch builder for the Trainer; mlm=False turns masked language modeling off
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Run configuration: where checkpoints go, how long to train, and how often to save
training_args = TrainingArguments(
    output_dir="./gpt2_poetry_model",
    overwrite_output_dir=True,
    # Adjust the number of epochs based on performance
    num_train_epochs=15,
    # Reduce if running out of memory
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Wire model, arguments, collator and data together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Run the fine-tuning loop
trainer.train()

# Persist the results: model weights and config in one directory,
# tokenizer files in another
trainer.save_model("/content/fine_tuned_model")
tokenizer.save_pretrained("/content/tokenizer")

print("Done")


Step,Training Loss
500,5.0561
1000,4.5657
1500,4.1186
2000,3.7723
2500,3.4823
3000,3.1929
3500,2.9536
4000,2.7851
4500,2.6128
5000,2.5209


Done


In [19]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Reload the fine-tuned weights and the tokenizer files saved after training
tokenizer = GPT2Tokenizer.from_pretrained("/content/tokenizer")
model = GPT2LMHeadModel.from_pretrained("/content/fine_tuned_model")

# Prompt that every poem starts from, and how many poems to sample
seed_text = "I never asked for much"
num_poems = 20

# Encode the prompt; the attention mask is all ones, one per prompt token
input_ids = tokenizer.encode(seed_text, return_tensors='pt')
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

# Sampling configuration, collected in one place so it is easy to tune
generation_kwargs = {
    "attention_mask": attention_mask,  # passed explicitly
    "do_sample": True,  # sample instead of decoding greedily
    "num_return_sequences": num_poems,  # one returned sequence per poem
    "max_length": 150,  # upper bound, to encourage longer output
    "min_length": 50,  # lower bound, to avoid abrupt endings
    "temperature": 0.7,  # lower = less random, higher = more creative
    "top_k": 50,  # sample only from the 50 most likely tokens
    "top_p": 0.95,  # nucleus sampling threshold
    "repetition_penalty": 1.2,  # discourage repeated words or phrases
    "no_repeat_ngram_size": 2,  # never repeat a bigram
    "pad_token_id": tokenizer.eos_token_id,  # reuse EOS as the pad token
    "eos_token_id": tokenizer.eos_token_id,  # end-of-sequence token ID
}

# Generate the poems
output = model.generate(input_ids, **generation_kwargs)

# Decode every returned sequence first, then print the poems in order
decoded_poems = [
    tokenizer.decode(token_ids, skip_special_tokens=True) for token_ids in output
]
for i, generated_poem in enumerate(decoded_poems):
    print(f"Generated Poem {i + 1}:\n{generated_poem}\n")


Generated Poem 1:
I never asked for much in return from them but they gave me what was left of my old men and i found new ones more intelligent than myself so we went to bed together thinking about the whole business its all come true lets face it if you dont mind a little bit extra motivation maybe though whats with your big flat head like that some women wont want that then there might be problems im sure hes just as guilty of being an ass too selfish sometimes even selfish not wanting everything at once doesnt he realize thats his own reward surely better things have been faring well enough id buy him coffee now get out of here do something else

Generated Poem 2:
I never asked for much from you i was young and ignorant of everything the world seemed full only to me a desert of flowers bloomed like roses all afternoon long sun sat down in my room listening music on tv watching cats playing dropped dead sounds then suddenly stars appeared out thru space that didnt seem anywhere else 

In [18]:
 !zip -r poem_generator /content




  adding: content/ (stored 0%)
  adding: content/.config/ (stored 0%)
  adding: content/.config/default_configs.db (deflated 98%)
  adding: content/.config/.last_update_check.json (deflated 22%)
  adding: content/.config/active_config (stored 0%)
  adding: content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: content/.config/logs/ (stored 0%)
  adding: content/.config/logs/2024.10.03/ (stored 0%)
  adding: content/.config/logs/2024.10.03/13.24.37.784488.log (deflated 57%)
  adding: content/.config/logs/2024.10.03/13.24.26.842428.log (deflated 85%)
  adding: content/.config/logs/2024.10.03/13.23.55.907439.log (deflated 93%)
  adding: content/.config/logs/2024.10.03/13.24.27.913856.log (deflated 58%)
  adding: content/.config/logs/2024.10.03/13.24.16.945701.log (deflated 58%)
  adding: content/.config/logs/2024.10.03/13.24.38.390038.log (deflated 56%)
  adding: content/.config/configurations/ (stored 0%)
  adding: content/.config/configurations/config_default (deflated 15%)
  ad