# Poetry Generation

In [21]:
import tensorflow as tf
import string
import requests
import pandas as pd

# Import necessary libraries
import tensorflow as tf
import string
import requests
import pandas as pd

# Fetch poetry data from the provided URL
response = requests.get('https://raw.githubusercontent.com/laxmimerit/poetry-data/master/adele.txt')
data = response.text.splitlines()


In [3]:
print('Length of data: ', len(data))

Length of data:  2400


In [4]:
# Import additional libraries
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

- Let’s do the tokenization

In [5]:
# Create a Tokenizer object
token = Tokenizer()

# Fit the Tokenizer on the poetry data
token.fit_on_texts(data)

# The Tokenizer is now configured based on the text data.
# It assigns a unique integer index to each unique word in the corpus.
# The word_index attribute of the Tokenizer contains the mapping of words to their respective indices.
# The word_counts attribute contains the frequency of each word in the corpus.


- Let’s encode tokenized words. It will convert text data to numerical tokens

In [6]:
# Convert the poetry text to sequences of integers using the Tokenizer
encoded_text = token.texts_to_sequences(data)

# The encoded_text is a list of lists where each inner list represents a line of poetry
# Each word in the poetry is replaced with its corresponding integer index assigned by the Tokenizer.

# Determine the vocabulary size for the one-hot encoding
# It is the total number of unique words in the corpus plus 1.
# The additional 1 is added to account for the reserved index 0 which is typically used for padding.
vocab_size = len(token.word_counts) + 1


- Prepare Training Data

In [7]:
# Create an empty list to store sequences of increasing lengths from each encoded line of poetry
datalist = []

# Iterate over each encoded line of poetry
for d in encoded_text:

    # Check if the length of the encoded line is greater than 1
    if len(d) > 1:

        # Iterate over a range of indices starting from 2 to the length of the encoded line
        for i in range(2, len(d)):

            # Append sequences of increasing lengths to datalist
            datalist.append(d[:i])

            # Print the sequence for visualization (optional, for debugging or understanding)
            print(d[:i])


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[197, 203, 438, 16, 1167]
[27, 6]
[27, 6, 218]
[27, 6, 218, 77]
[27, 6, 218, 77, 27]
[27, 6, 218, 77, 27, 6]
[27, 6, 218, 77, 27, 6, 218]
[27, 6, 218, 77, 27, 6, 218, 77]
[27, 6, 218, 77, 27, 6, 218, 77, 21]
[27, 6]
[27, 6, 218]
[27, 6, 218, 77]
[27, 6, 218, 77, 27]
[27, 6, 218, 77, 27, 6]
[27, 6, 218, 77, 27, 6, 218]
[27, 6, 218, 77, 27, 6, 218, 77]
[27, 6, 218, 77, 27, 6, 218, 77, 21]
[27, 6, 218, 77, 27, 6, 218, 77, 21, 11]
[27, 6]
[27, 6, 218]
[27, 6, 218, 77]
[27, 6, 218, 77, 27]
[27, 6, 218, 77, 27, 6]
[27, 6, 218, 77, 27, 6, 218]
[27, 6, 218, 77, 27, 6, 218, 77]
[27, 6, 218, 77, 27, 6, 218, 77, 21]
[27, 6, 218, 77, 27, 6, 218, 77, 21, 11]
[27, 6, 218, 77, 27, 6, 218, 77, 21, 11, 683]
[27, 6, 218, 77, 27, 6, 218, 77, 21, 11, 683, 129]
[27, 6, 218, 77, 27, 6, 218, 77, 21, 11, 683, 129, 8]
[27, 6, 218, 77, 27, 6, 218, 77, 21, 11, 683, 129, 8, 158]
[27, 6, 218, 77, 27, 6, 218, 77, 21, 11, 683, 129, 8, 158, 243]
[27, 6,

- Padding


- It will make sure all data points have the same length because text sentences could have variable lengths.

In [8]:
# Define the maximum length of sequences
max_length = 20

# Pad sequences in datalist to have a consistent length of max_length, adding padding at the beginning
sequences = pad_sequences(datalist, maxlen=max_length, padding='pre')

# Separate input sequences (X) and their corresponding output labels (y)
X = sequences[:, :-1]
y = sequences[:, -1]

# Convert the output labels to categorical one-hot encoding based on the vocabulary size
y = to_categorical(y, num_classes=vocab_size)

# Define the length of input sequences
seq_length = X.shape[1]


#LSTM for NLP and Poetry Generation:
1. Introduction:

LSTM (Long Short-Term Memory) is a type of recurrent neural network designed to capture long-term dependencies in sequential data, making it ideal for NLP tasks.
2. Applications in NLP:

Sequence modeling, language translation, and named entity recognition benefit from LSTM's ability to handle sequential data effectively.
3. Working Principle:

LSTMs use memory cells and gates to control information flow, facilitating the retention of context in textual sequences.
4. Poetry Generation:

Text is tokenized, and the LSTM is trained on poetic sequences to generate new text based on learned patterns.
5. Hyperparameters:

Embedding size and LSTM units influence the model's capacity to capture information.
6. Training Process:

Categorical crossentropy loss and optimization algorithms like Adam are used during training.
7. Challenges:

Overfitting and data quality impact the model's creativity and coherence in poetry generation.
8. Evaluation:

Perplexity is a common metric used to assess language model quality.
9. Future Improvements:

Attention mechanisms and transfer learning can enhance LSTM models for poetry generation.
10. Conclusion:

LSTMs are effective for NLP tasks and creative applications like poetry generation, with continuous research contributing to their advancements.

- LSTM Model Training

Add two LSTM cells, each having 100 input units.

In [9]:
# Create a Sequential model
model = Sequential()

# Add an Embedding layer to the model with input vocabulary size, output dimension 50, and input length as seq_length
model.add(Embedding(vocab_size, 50, input_length=seq_length))

# Add an LSTM layer with 100 units and return sequences for the entire sequence
model.add(LSTM(100, return_sequences=True))

# Add another LSTM layer with 100 units
model.add(LSTM(100))

# Add a Dense layer with 100 units and ReLU activation function
model.add(Dense(100, activation='relu'))

# Add the output Dense layer with vocabulary size and softmax activation function
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model with categorical crossentropy loss, Adam optimizer, and accuracy metric
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model on the input sequences (X) and output labels (y) with batch size 32 and 50 epochs
# Note: This step may take some time to complete
model.fit(X, y, batch_size=32, epochs=50)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7fb73ea572e0>

## Poetry Generation

In [10]:
# Set the length of each line in the generated poetry
poetry_length = 10

# Define a function to generate poetry given a seed text and the number of lines
def generate_poetry(seed_text, n_lines):
  for i in range(n_lines):
    text = []
    for _ in range(poetry_length):
      # Convert the seed text to sequences and pad them
      encoded = token.texts_to_sequences([seed_text])
      encoded = pad_sequences(encoded, maxlen=seq_length, padding='pre')

      # Predict the next word index using the trained model
      y_pred = np.argmax(model.predict(encoded), axis=-1)

      # Retrieve the predicted word from the index
      predicted_word = ""
      for word, index in token.word_index.items():
        if index == y_pred:
          predicted_word = word
          break

      # Update the seed text with the predicted word and add it to the text list
      seed_text = seed_text + ' ' + predicted_word
      text.append(predicted_word)

    # Update the seed text with the last word in the generated text
    seed_text = text[-1]

    # Combine the generated words into a line of poetry
    text = ' '.join(text)

    # Print the generated poetry line
    print(text)



- This function will take seed text and the number of lines we want to generate.



In [11]:
# Set the initial seed text for poetry generation
seed_text = 'i miss you'

# Call the generate_poetry function to generate 10 lines of poetry
generate_poetry(seed_text, 10)


i miss my mind melt my heart away dropped you
say i'm selfish i agree with you on my lips
to write had a lump in my roots it's in
the world of your right with by the way you
say i'm selfish i agree with you on my lips
to write had a lump in my roots it's in
the world of your right with by the way you
say i'm selfish i agree with you on my lips
to write had a lump in my roots it's in
the world of your right with by the way you


Another Code for poetry Generation

In [12]:
# Import necessary libraries
import requests  # For making HTTP requests
from bs4 import BeautifulSoup  # For web scraping
import string  # For string manipulation
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
from tensorflow.keras.preprocessing.text import Tokenizer  # For tokenizing text
from tensorflow.keras.utils import to_categorical  # For one-hot encoding
from tensorflow.keras.models import Sequential  # For creating a sequential model
from tensorflow.keras.layers import Dense, LSTM, Embedding  # For defining model layers
from tensorflow.keras.preprocessing.sequence import pad_sequences  # For padding sequences

In [13]:
# Function to extract text from an HTML file
def extract_text_from_html(file_path):
    # Open the HTML file in read mode with utf-8 encoding
    with open(file_path, 'r', encoding='utf-8') as file:
        # Create a BeautifulSoup object to parse the HTML content
        soup = BeautifulSoup(file, 'html.parser')

        # Extract text from HTML paragraphs (modify as needed based on your HTML structure)
        text = ' '.join([p.get_text() for p in soup.find_all('p')])

        # Return the extracted text
        return text


In [14]:
# Replace 'your_html_file_path' with the actual path to your HTML file
html_file_path = '/content/pg59824-images.html'

# Extract text from the HTML file and split it into lines
data = extract_text_from_html(html_file_path).splitlines()

In [15]:
# Tokenization
# Create a Tokenizer object
token = Tokenizer()

# Fit the Tokenizer on the text data to generate word indices
token.fit_on_texts(data)

# Convert the text data into sequences of integers based on the Tokenizer's word index
encoded_text = token.texts_to_sequences(data)

In [16]:
# Calculate vocabulary size, adding 1 to account for the reserved index 0
vocab_size = len(token.word_counts) + 1

# Initialize an empty list to store sequences
datalist = []

# Generate sequences by iterating through the encoded_text
for d in encoded_text:
    # Check if the sequence has more than one element
    if len(d) > 1:
        # Create sub-sequences by iterating from 2 to the length of the sequence
        for i in range(2, len(d)):
            datalist.append(d[:i])

# Set the maximum length of the sequences
max_length = 20

# Pad the sequences to ensure uniform length
sequences = pad_sequences(datalist, maxlen=max_length, padding='pre')

# Separate input (X) and output (y) sequences
X = sequences[:, :-1]
y = sequences[:, -1]

# Convert y to categorical representation
y = to_categorical(y, num_classes=vocab_size)

# Define the sequence length
seq_length = X.shape[1]


In [17]:
# Model definition using Keras Sequential API
model = Sequential()

# Add an Embedding layer with vocabulary size, embedding dimension, and input length
model.add(Embedding(vocab_size, 50, input_length=seq_length))

# Add an LSTM layer with 100 units and return sequences for the first layer
model.add(LSTM(100, return_sequences=True))

# Add another LSTM layer with 100 units
model.add(LSTM(100))

# Add a Dense layer with 100 units and ReLU activation function
model.add(Dense(100, activation='relu'))

# Add the output layer with softmax activation for multi-class classification
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model with categorical cross-entropy loss and Adam optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [18]:
# Train the model on the provided data
# X: Input sequences, y: Target labels
# Batch size is set to 32, and the model is trained for 50 epochs
model.fit(X, y, batch_size=32, epochs=50)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7fb731fe9510>

In [23]:
# Function for generating poetry lines using the trained model
def generate_poetry(seed_text, n_lines):
    poem_lines = []
    for i in range(n_lines):
        text = []
        for _ in range(poetry_length):
            # Encode the seed text to numeric sequences
            encoded = token.texts_to_sequences([seed_text])
            encoded = pad_sequences(encoded, maxlen=seq_length, padding='pre')

            # Predict the next word index using the trained model
            y_pred = np.argmax(model.predict(encoded), axis=-1)

            predicted_word = ""
            # Map the predicted index back to the word
            for word, index in token.word_index.items():
                if index == y_pred:
                    predicted_word = word
                    break

            # Update the seed text with the predicted word
            seed_text = seed_text + ' ' + predicted_word
            text.append(predicted_word)

        # Combine the generated words to form a line
        generated_line = ' '.join(text)
        poem_lines.append(generated_line)

    # Print the entire poem
    for line in poem_lines:
        print(line)




In [24]:
# Replace 'seed_text' with the desired seed text
seed_text = 'Where are you'
generate_poetry(seed_text, 5)



seeing out except and raise lolling juniper and not reported
it's long to live a miniature for his wife and
when a notion in the sky clouds and blossom in
the sky clouds and blossom in the mountain one i
didn't have if i here i band at toss the
