##**Load and Explore the Dataset**

In [None]:
import pandas as pd
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import kagglehub

#Download the dataset from Kaggle
path = kagglehub.dataset_download("tgdivy/poetry-foundation-poems")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/tgdivy/poetry-foundation-poems?dataset_version_number=1...


100%|██████████| 8.88M/8.88M [00:00<00:00, 75.3MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/tgdivy/poetry-foundation-poems/versions/1


In [None]:
#Loading the dataset
df = pd.read_csv(os.path.join(path, "PoetryFoundationData.csv"), nrows=1000)

In [None]:
#Cleaning the data
df['Poem'] = df['Poem'].str.replace(r'\s+', ' ', regex=True)
df['Title'] = df['Title'].str.replace(r'\s+', ' ', regex=True)
df['input'] = df['Title'] + ' *** ' + df['Poem']

#Exploring the columns
print(df.columns)

input_data = df['input'].values.tolist()

#Printing a portion of the corpus to verify
print(input_data[:3])

Index(['Unnamed: 0', 'Title', 'Poem', 'Poet', 'Tags', 'input'], dtype='object')
[" Objects Used to Prop Open a Window  ***  Dog bone, stapler, cribbage board, garlic press because this window is loose—lacks suction, lacks grip. Bungee cord, bootstrap, dog leash, leather belt because this window had sash cords. They frayed. They broke. Feather duster, thatch of straw, empty bottle of Elmer's glue because this window is loud—its hinges clack open, clack shut. Stuffed bear, baby blanket, single crib newel because this window is split. It's dividing in two. Velvet moss, sagebrush, willow branch, robin's wing because this window, it's pane-less. It's only a frame of air. ", ' The New Church  ***  The old cupola glinted above the clouds, shone among fir trees, but it took him an hour for the half mile all the way up the hill. As he trailed, the village passed him by, greeted him, asked about his health, but everybody hurried to catch the mass, left him leaning against fences, measuring the r

In [None]:
#Data Preprocessing:

#Importing the required libraries
import re
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

#Tokenizing the text(converting each word to a unique integer)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_data)
total_words = len(tokenizer.word_index) + 1

print(total_words)

30216


In [None]:
#Creating input sequences using sequences of words
input_sequences = []
for line in input_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, min(len(token_list), 50)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

#Calculating max_sequence_len
max_sequence_len = max(len(seq) for seq in input_sequences)

#Padding sequences and create predictors and labels
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
label = to_categorical(label, num_classes=total_words)

In [None]:
#Defining the batch size
batch_size = 32

#Generator function to yield batches of data
def data_generator(predictors, labels):
    dataset_size = len(predictors)
    indices = np.arange(dataset_size)
    np.random.shuffle(indices)
    for idx in indices:
        yield predictors[idx], labels[idx]

#Creating a TensorFlow Dataset from the generator function
dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(predictors, label),
    output_signature=(
        tf.TensorSpec(shape=(predictors.shape[1],), dtype=tf.int32),
        tf.TensorSpec(shape=(label.shape[1],), dtype=tf.float32)
    )
)

#Shuffle and batch the dataset
dataset = dataset.shuffle(buffer_size=10000).batch(batch_size).repeat()

#Splitting the dataset into training and validation sets
train_size = 100000
val_size = 20000

train_dataset = dataset.take(train_size // batch_size)
val_dataset = dataset.skip(train_size // batch_size).take(val_size // batch_size)

In [None]:
#Importing the required libraries
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
import pickle

#Defining the ModelCheckpoint callback
checkpoint_path = "model_checkpoint.keras"
checkpoint_callback = ModelCheckpoint(filepath=checkpoint_path,
                                      monitor='val_loss',
                                      save_best_only=True,
                                      mode='min',
                                      verbose=1)

#Defining EarlyStopping callback
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

In [None]:
#LSTM Model Development
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

def create_model():
    model = Sequential()
    model.add(Embedding(total_words, 50))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    model.add(Dense(total_words, activation='softmax'))
    return model

model = create_model()
model.build(input_shape=(None, max_sequence_len))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
#Training the model with batching
history = model.fit(train_dataset,
                    epochs=30,
                    verbose=1,
                    validation_data=val_dataset,
                    callbacks=[early_stopping_callback, checkpoint_callback])

Epoch 1/30
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - accuracy: 0.0619 - loss: 7.7105
Epoch 1: val_loss improved from inf to 6.78440, saving model to model_checkpoint.keras
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m458s[0m 144ms/step - accuracy: 0.0619 - loss: 7.7104 - val_accuracy: 0.0739 - val_loss: 6.7844
Epoch 2/30
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - accuracy: 0.0768 - loss: 6.7559
Epoch 2: val_loss improved from 6.78440 to 6.16784, saving model to model_checkpoint.keras
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m510s[0m 147ms/step - accuracy: 0.0768 - loss: 6.7558 - val_accuracy: 0.0955 - val_loss: 6.1678
Epoch 3/30
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step - accuracy: 0.0969 - loss: 6.2247
Epoch 3: val_loss improved from 6.16784 to 5.53495, saving model to model_checkpoint.keras
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [None]:
#Saving the model
model.save("trained_model.h5")

#Loading the trained model
model = tf.keras.models.load_model('trained_model.h5')



In [14]:
#Text Generation

def generate_poetry(seed_text, next_words, model, max_sequence_len, tokenizer):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

seed_texts = ["The sun", "Love's embrace", "Autumn leaves"]
for seed_text in seed_texts:
    generated_poetry = generate_poetry(seed_text, 20, model, max_sequence_len, tokenizer)
    print(f"Generated poetry with seed '{seed_text}':\n{generated_poetry}\n")

Generated poetry with seed 'The sun':
The sun has the species of tiny paper everywhere we have the angel is the platform and the race he had been

Generated poetry with seed 'Love's embrace':
Love's embrace summer after belly on one day my own harvest men never speak to us we spend the afternoon together watching

Generated poetry with seed 'Autumn leaves':
Autumn leaves this man is a new new twin daughters as not even all last names things always play your kiss me



In [15]:
#Evaluation and Experimentation:

#Defining the LSTM model
vocab_size = len(tokenizer.word_index) + 1
model2 = Sequential()
model2.add(Embedding(input_dim=vocab_size, output_dim=100))
model2.add(LSTM(50, return_sequences=True))
model2.add(LSTM(50))
model2.add(Dropout(0.4))  #Dropout layer to prevent overfitting
model2.add(Dense(vocab_size, activation='softmax'))  #Output layer for word prediction

#Compiling the model
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.build(input_shape=(None, max_sequence_len))
model2.summary()

In [16]:
#Lines of poetry that resemble the style of the training poems

def generate_poetry(seed_text, next_words, model, max_sequence_len, tokenizer):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

seed_texts = ["Whispers of night", "A lone star", "In dreams we wander"]
for seed_text in seed_texts:
    generated_poetry = generate_poetry(seed_text, 20, model, max_sequence_len, tokenizer)
    print(f"Generated poetry with seed '{seed_text}':\n{generated_poetry}\n")

Generated poetry with seed 'Whispers of night':
Whispers of night after another lives in the end of the world we had a bowl to bring the past to the sea

Generated poetry with seed 'A lone star':
A lone star down on the lake the world of its empty through the house that keeps me down to the mountains of

Generated poetry with seed 'In dreams we wander':
In dreams we wander to come in your roses a head filled of course of my mind where did you speak on it came



### Interpretation
The code utilizes an LSTM model to generate new poetry lines by learning patterns and dependencies in a dataset of poems. It involves preparing the dataset, creating sequences, training an LSTM, and iteratively predicting the next word based on a seed text to generate new text. The goal is to produce text that has similar characteristics to the training data, like style and structure.

By adjusting the model's parameters and experimenting with different seed texts, users can influence the creativity and fluency of the generated poetry.

The generated poetry lines resemble the style and structure of the original dataset because the LSTM model has learned syntactic and thematic patterns from it. When each seed text is provided, the model uses it as a starting point and builds upon it, generating coherent lines that match the tone, rhythm, and sometimes even the sentiment of the original poems.