# Poetry Generation (Tensorflow, Keras, LSTM)

In [4]:
import string
import requests
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.callbacks import EarlyStopping

In [5]:
filenames = ['dataset/adele.txt', 'dataset/lady-gaga.txt',
             'dataset/kanye-west.txt', 'dataset/eminem.txt']

In [6]:
with open('dataset/MERGED.txt', 'w', encoding='utf-8') as outfile:
    for fname in filenames:
        with open(fname, 'r', encoding='utf-8') as infile:
            outfile.write(infile.read() + '\n')

In [7]:
with open('dataset/MERGED.txt', 'r', encoding='utf-8') as file:
    data = file.read().splitlines()

In [8]:
len(data)

19212

In [None]:
' '.join(data)

## Building LSTM Model

In [10]:
token = Tokenizer()
token.fit_on_texts(data)
# token.word_counts, token.word_index

In [11]:
encoded_text = token.texts_to_sequences(data)
encoded_text[:3]

[[276, 26, 106, 3698], [153, 7, 64, 218, 2, 140], [28, 11, 993, 1730]]

In [12]:
x = ['Token Text']
token.texts_to_sequences(x)

[[4467, 1965]]

In [13]:
vocab_size = len(token.word_counts) + 1

## Prepare Training Data

In [14]:
datalist = []
for d in encoded_text:
    if len(d) > 1:
        for i in range(2, len(d)):
            datalist.append(d[:i])

In [15]:
datalist[4]

[153, 7, 64, 218]

## Padding

In [16]:
max_length = 20
sequences = pad_sequences(datalist, maxlen=max_length, padding='pre')
sequences[:4]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0, 276,  26],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0, 276,  26, 106],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0, 153,   7],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0, 153,   7,  64]])

In [17]:
sequences.shape

(127241, 20)

In [18]:
X = sequences[:, :-1]
X[:4]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0, 276],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0, 276,  26],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0, 153],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0, 153,   7]])

In [19]:
y = sequences[:, -1]
vocab_size

10571

In [20]:
y = to_categorical(y, num_classes=vocab_size)
y.shape, y

((127241, 10571),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

In [21]:
seq_length = X.shape[1]
seq_length

19

## LSTM Model Training

In [22]:
model = Sequential()
model.add(Embedding(vocab_size, 50))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
model.build(input_shape=(None, seq_length))

In [23]:
model.summary()

In [24]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, batch_size=128, epochs=100)

Epoch 1/100
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 34ms/step - accuracy: 0.0396 - loss: 6.7507
Epoch 2/100
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 32ms/step - accuracy: 0.0538 - loss: 5.9884
Epoch 3/100
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 32ms/step - accuracy: 0.0653 - loss: 5.7637
Epoch 4/100
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 32ms/step - accuracy: 0.0829 - loss: 5.5757
Epoch 5/100
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 39ms/step - accuracy: 0.1051 - loss: 5.3723
Epoch 6/100
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 35ms/step - accuracy: 0.1195 - loss: 5.2188
Epoch 7/100
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 32ms/step - accuracy: 0.1287 - loss: 5.0779
Epoch 8/100
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 32ms/step - accuracy: 0.1362 - loss: 4.9437
Epoch 9/100
[1m

<keras.src.callbacks.history.History at 0x1bba83c3110>

## Poetry Generation

In [25]:
poetry_length = 10

def generate_poetry(seed_text, n_lines):
    for i in range(n_lines):
        text = []
        for _ in range(poetry_length):
            encoded = token.texts_to_sequences([seed_text])[0]
            encoded = pad_sequences([encoded], maxlen=seq_length, padding='pre')

            y_pred = np.argmax(model.predict(encoded, verbose=0), axis=-1)

            predicted_word = ''
            for word, index in token.word_index.items():
                if index == y_pred:
                    predicted_word = word
                    break
            
            seed_text = seed_text + ' ' + predicted_word
            text.append(predicted_word)

        seed_text = text[-1]
        text = ' '.join(text)
        print(text)

In [26]:
token.texts_to_sequences(['hello my'])

[[961, 7]]

In [49]:
generate_poetry('', 2)

i know that i was born this way that i
know that i was born this way that i know
