## Text Generation with RNN


In [1]:
import tensorflow as tf
from tensorflow import keras
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [2]:
def load_text(filename, encoding='utf-8'):
    """Read a whole text file into a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file contents as a str, or None if the file could not be read
        (the error is printed instead of being raised).
    """
    try:
        with open(filename, 'r', encoding=encoding) as file:
            return file.read()
    except Exception as e:
        # Best-effort loader: report the problem (missing file, decoding
        # error, ...) and signal the failure with an explicit None.
        print(e)
        return None
        
def create_tokenizer(text):
    """Build a character-level Tokenizer and fit it on `text`.

    Args:
        text: The text(s) handed to `Tokenizer.fit_on_texts`.

    Returns:
        The fitted Tokenizer instance.
    """
    char_tokenizer = Tokenizer(char_level=True)
    char_tokenizer.fit_on_texts(text)
    return char_tokenizer

def encode_text(tokenizer, text):
    """Convert text(s) to sequences of ids using a fitted tokenizer.

    Args:
        tokenizer: Object exposing `texts_to_sequences`.
        text: The texts to encode, passed through unchanged.

    Returns:
        Whatever `tokenizer.texts_to_sequences(text)` returns.
    """
    return tokenizer.texts_to_sequences(text)

In [3]:
# text = load_text('dataset/blake-poems.txt')
# tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# tokenizer.fit_on_texts(text)

# max_id = len(tokenizer.word_index)
# [encoded] = np.array(tokenizer.texts_to_sequences([text])) - 1
# dataset = tf.data.Dataset.from_tensor_slices(encoded)

# n_steps = 100
# window_length = n_steps + 1
# dataset = dataset.window(window_length, shift=1, drop_remainder=True)
# dataset = dataset.flat_map(lambda window:window.batch(window_length))

# batch_size = 32
# dataset = dataset.shuffle(10000).batch(batch_size)
# dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
# dataset = dataset.map(
#     lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

# dataset = dataset.prefetch(1)

In [5]:
# load text for training
filename = 'dataset/blake-poems.txt'
text = load_text(filename)
# load_text prints the error and returns None on failure; stop here with a
# clear message instead of failing later inside the tokenizer
if text is None:
    raise RuntimeError(f"could not load training text from {filename!r}")

# create a character-level tokenizer and fit it on the text
tokenizer = create_tokenizer(text)
# number of unique characters
max_id = len(tokenizer.word_index)
# shift the ids down by one so that they start from 0
[encoded_text] = np.array(encode_text(tokenizer, [text])) - 1
# create a dataset with one character id per element
dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [6]:
n_steps = 100
# each window holds n_steps input characters plus one extra character so the
# targets can be the inputs shifted by one position
window_size = n_steps + 1
batch_size = 32
# consecutive windows overlap almost entirely (shift=1), so a large buffer is
# needed for the shuffle to actually mix them
shuffle_buffer_size = 10000

# start again from the encoded characters so re-running this cell is safe
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)
# make overlapping windows and turn each nested window dataset into one tensor
windows = char_dataset.window(window_size, shift=1, drop_remainder=True)
windows = windows.flat_map(lambda window: window.batch(window_size))
# shuffle and batch the windows
batches = windows.shuffle(shuffle_buffer_size).batch(batch_size)
# separate the inputs (all but the last character) from the targets
# (all but the first character)
batches = batches.map(lambda window: (window[:, :-1], window[:, 1:]))
# encode each input character using a one-hot vector; targets stay as ids
batches = batches.map(lambda X, Y: (tf.one_hot(X, depth=max_id), Y))
dataset = batches.prefetch(1)

In [7]:
# two stacked GRU layers that return the full sequence, followed by a
# softmax over the max_id characters applied at every time step
gru_options = dict(return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
model_layers = [
    keras.layers.GRU(128, input_shape=[None, max_id], **gru_options),
    keras.layers.GRU(128, **gru_options),
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation="softmax")),
]
model = keras.models.Sequential(model_layers)
# targets are integer character ids, hence the sparse loss
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
# save the trained model to 'model_text_generation.h5' in the working directory
model.save('model_text_generation.h5')

In [15]:
def preprocess(texts):
    """One-hot encode a list of texts for the model.

    Uses the notebook-level `tokenizer` and `max_id`.

    Args:
        texts: List of strings to encode.

    Returns:
        The result of `tf.one_hot` over the zero-based character ids,
        with depth `max_id`.
    """
    char_ids = tokenizer.texts_to_sequences(texts)
    # shift the ids down by one, as was done for the training data
    zero_based_ids = np.array(char_ids) - 1
    return tf.one_hot(zero_based_ids, max_id)

X_new = preprocess(["where are you fro"])
# Sequential.predict_classes is no longer available in recent TensorFlow
# releases; the argmax over the class axis of predict() yields the same ids
Y_pred = np.argmax(model.predict(X_new), axis=-1)
# shift the ids back up by one (preprocess subtracted 1) and show the
# character predicted after the last input character
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

'm'

In [17]:
def next_char(text, temperature=1):
    """Sample the character that follows `text` according to the model.

    Uses the notebook-level `model`, `tokenizer` and `preprocess`.

    Args:
        text: The text generated so far.
        temperature: Positive scaling factor applied to the log-probabilities
            before sampling; smaller values favor the most likely characters.

    Returns:
        The sampled text returned by `tokenizer.sequences_to_texts` for the
        single sampled id.

    Raises:
        ValueError: If `temperature` is not strictly positive (dividing by it
            would otherwise produce infinite or NaN logits).
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0")
    X_new = preprocess([text])
    # keep only the distribution predicted for the last time step;
    # verbose=0 avoids one progress line per generated character
    y_proba = model.predict(X_new, verbose=0)[0, -1:, :]
    # log-probabilities serve as logits for the categorical sampler
    rescaled_logits = tf.math.log(y_proba) / temperature
    # + 1 undoes the shift applied in preprocess
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

def complete_text(text, n_chars=100, temperature=1):
    """Extend `text` one sampled character at a time.

    Args:
        text: The starting text.
        n_chars: Number of `next_char` calls to make.
        temperature: Passed through to `next_char`.

    Returns:
        The starting text followed by everything that was generated.
    """
    completed = text
    for _step in range(n_chars):
        # each new character is predicted from everything generated so far
        completed = completed + next_char(completed, temperature)
    return completed

In [19]:
# generate text starting from "t" (default n_chars=100) with a low temperature
print(complete_text("t", temperature=0.2))

the cloud & a fierd the skies.


iv.


the complains of graves & the skies.


iv.


i parting to the 
