# Poetry Generation (Tensorflow, Keras, LSTM)

In [1]:
import string
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
filenames = ['dataset/adele.txt', 'dataset/lady-gaga.txt',
             'dataset/kanye-west.txt', 'dataset/eminem.txt']

In [3]:
with open('dataset/MERGED.txt', 'w', encoding='utf-8') as outfile:
    for fname in filenames:
        with open(fname, 'r', encoding='utf-8') as infile:
            outfile.write(infile.read() + '\n')

In [4]:
with open('dataset/MERGED.txt', 'r', encoding='utf-8') as file:
    data = file.read().splitlines()

In [5]:
# len(data)
# 19212

## Building LSTM Model

In [6]:
token = Tokenizer()
token.fit_on_texts(data)
encoded_text = token.texts_to_sequences(data)
vocab_size = len(token.word_counts) + 1

## Prepare Training Data

In [8]:
datalist = []
for d in encoded_text:
    if len(d) > 1:
        for i in range(2, len(d)):
            datalist.append(d[:i])

In [16]:
datalist[4]

[153, 7, 64, 218]

## Padding

In [9]:
max_length = 50
sequences = pad_sequences(datalist, maxlen=max_length, padding='pre')
X = sequences[:, :-1]
y = sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)

In [10]:
seq_length = X.shape[1]
seq_length

49

In [11]:
__all__ = ['X', 'y', 'token', 'seq_length', 'vocab_size']