<a href="https://colab.research.google.com/github/theouterlimitz/PoetryGenerator/blob/main/Rap_Shakespeare_Style_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Here is the updated code. It fine-tunes GPT-2 on both Shakespearean sonnets and rap lyrics, and the earlier beam search and temperature scaling modifications have been removed:

In [None]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl
!pip install transformers datasets torch PyMuPDF accelerate

import tensorflow as tf
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
import fitz
import tempfile
import torch

# Both the tokenizer and the language-model weights are loaded from the same
# pretrained checkpoint name, so it is spelled out once.
PRETRAINED_CHECKPOINT = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(PRETRAINED_CHECKPOINT)

def preprocess_text(text):
    """Flatten *text* onto one line.

    Each line-feed character is swapped for a single space; every other
    character (including carriage returns) is left untouched.
    """
    line_feed = "\n"
    return " ".join(text.split(line_feed))

def load_pdf_data(uploaded):
    """Extract and concatenate the text of every page of the given PDFs.

    Args:
        uploaded: Mapping whose keys are paths of PDF files to open (for
            example the dict returned by ``files.upload()`` in Colab).
            Only the keys are used.

    Returns:
        The text of all pages of all documents, joined in iteration order
        with no separator added. An empty string when ``uploaded`` is empty.
    """
    # Collect the pieces and join once instead of growing a string with
    # ``+=`` inside nested loops, which is quadratic in the worst case.
    page_texts = []
    for fn in uploaded:
        with fitz.open(fn) as doc:
            # Consume the pages while the document is still open.
            page_texts.extend(page.get_text() for page in doc)
    return "".join(page_texts)

def build_text_dataset(text, tokenizer, block_size=128):
    """Build a causal language-modeling dataset and collator from raw text.

    Args:
        text: The full training corpus as a single string.
        tokenizer: Tokenizer passed to both ``TextDataset`` and the collator.
        block_size: Block size handed to ``TextDataset``.

    Returns:
        A ``(dataset, data_collator)`` tuple; the collator is configured with
        ``mlm=False``.
    """
    import os  # local import: only needed for temp-file cleanup

    # TextDataset takes a file path rather than a string, so the corpus is
    # routed through a temporary file. TextDataset opens that file as UTF-8
    # (per the transformers implementation), so write it as UTF-8 explicitly
    # instead of relying on the platform's locale encoding.
    with tempfile.NamedTemporaryFile(
        mode='w', encoding='utf-8', delete=False
    ) as tmp_file:
        tmp_file.write(text)
        tmp_file_path = tmp_file.name

    try:
        dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=tmp_file_path,
            block_size=block_size,
        )
    finally:
        # delete=False keeps the file alive after the ``with`` block so that
        # TextDataset can open it by name; once the dataset has been built
        # (or has failed to build) the file is no longer needed.
        os.remove(tmp_file_path)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )

    return dataset, data_collator

def train_model(
    dataset,
    data_collator,
    model,
    *,
    output_dir="./results",
    num_train_epochs=200,
    per_device_train_batch_size=4,
):
    """Fine-tune *model* on *dataset* with the Hugging Face ``Trainer``.

    The model is trained in place; nothing is returned.

    Args:
        dataset: Training dataset passed to ``Trainer`` as ``train_dataset``.
        data_collator: Collator used to assemble batches.
        model: The model to train.
        output_dir: Directory for checkpoints; existing contents may be
            overwritten (``overwrite_output_dir=True``).
        num_train_epochs: Number of passes over the dataset. The default of
            200 matches the original hard-coded value; lower it for quick
            experiments.
        per_device_train_batch_size: Batch size per device.
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        save_steps=10_000,
        save_total_limit=2,  # keep at most two checkpoints on disk
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    trainer.train()

def generate_poem(model, tokenizer, prompt="", max_length=1000, num_return_sequences=1):
    """Generate text from *model*, continuing *prompt*.

    Args:
        model: Language model exposing ``generate``; it is moved to the GPU
            when one is available, otherwise to the CPU.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt: Text to continue. When empty, generation is seeded with the
            tokenizer's BOS (or EOS) token instead.
        max_length: Maximum total length passed to ``generate``.
        num_return_sequences: Forwarded to ``generate``. Only the first
            returned sequence is decoded. Requesting more than one sequence
            additionally needs ``num_beams > 1`` (or sampling), which this
            function does not enable.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens stripped.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Training leaves the model in train mode; switch to eval so dropout does
    # not perturb the otherwise deterministic decoding.
    model.eval()

    if not prompt:
        # An empty prompt encodes to zero tokens, leaving generate() nothing
        # to continue from; seed with the start/end-of-text token instead.
        prompt = tokenizer.bos_token or tokenizer.eos_token

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    # Single unpadded prompt: every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        # GPT-2 has no pad token; state the EOS fallback explicitly.
        pad_token_id=tokenizer.eos_token_id,
    )
    poem = tokenizer.decode(output[0], skip_special_tokens=True)
    return poem

# --- Corpus 1: Shakespeare sonnets (PDF upload via the Colab file picker) ---
from google.colab import files
uploaded_sonnets = files.upload()
sonnets_text = preprocess_text(load_pdf_data(uploaded_sonnets))

# --- Corpus 2: rap lyrics (second PDF upload) ---
uploaded_lyrics = files.upload()
rap_lyrics_text = preprocess_text(load_pdf_data(uploaded_lyrics))

# Merge both corpora into one training text, sonnets first.
combined_text = "".join((sonnets_text, rap_lyrics_text))

# Turn the merged text into a dataset plus collator, then fine-tune.
dataset, data_collator = build_text_dataset(combined_text, tokenizer)
train_model(dataset, data_collator, model)

# Sample one poem from the fine-tuned model and show it.
generated_poem = generate_poem(
    model,
    tokenizer,
    prompt="The moon shines dimly, weeping in the sky for the fallen, shards of light piercing dark dimensions",
)
print(generated_poem)

**Key Changes:**

- **Removed Beam Search and Temperature Scaling:** The `generate_poem` function now uses the default greedy decoding strategy, as the `num_beams` and `temperature` parameters have been removed.

<div class="md-recitation">
  Sources
  <ol>
  <li><a href="https://github.com/mhadihossaini/Custom_GPT2_Text_Generation">https://github.com/mhadihossaini/Custom_GPT2_Text_Generation</a></li>
  <li><a href="https://github.com/MeLLL-UFF/tuning_sentiment">https://github.com/MeLLL-UFF/tuning_sentiment</a> subject to MIT</li>
  <li><a href="https://discuss.huggingface.co/t/gpt2-training-from-scratch-in-german/1157">https://discuss.huggingface.co/t/gpt2-training-from-scratch-in-german/1157</a></li>
  </ol>
</div>