<a href="https://colab.research.google.com/github/techtreasure/colab/blob/master/Character_level_text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction

This notebook trains a simple character-level LSTM in Keras and uses it to generate text one character at a time.

## Setup


In [1]:
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import random
import io


## Data preparation


In [2]:

# Fetch the corpus; `get_file` hands back the local path of the downloaded file.
path = keras.utils.get_file(
    "data.txt",
    origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt",
)

# Load the whole corpus as a single lower-cased string.
with io.open(path, encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()

# Turn line breaks into spaces so the text becomes one continuous stream.
text = text.replace("\n", " ")
print("Corpus length:", len(text))

# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
print("Total chars:", len(chars))

# Lookup tables: character -> integer id, and integer id -> character.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

# Cut the corpus into fixed-length, semi-redundant windows.
# Every window holds `maxlen` characters; consecutive windows start `step`
# characters apart, so neighbouring windows overlap.
maxlen = 40
step = 3

window_starts = range(0, len(text) - maxlen, step)
# The 40-character input sequences ...
sentences = [text[start : start + maxlen] for start in window_starts]
# ... and, for each of them, the character that follows it (the label).
next_chars = [text[start + maxlen] for start in window_starts]
print("Number of sequences:", len(sentences))

# One-hot encode the sequences so they can be fed to the LSTM model.
#   x[i, t, c] is True when character id `c` sits at position `t` of sequence `i`
#   y[i, c]    is True when character id `c` is the one following sequence `i`
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1



Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt
Corpus length: 600893
Total chars: 56
Number of sequences: 200285


## Build the model: a single LSTM layer


In [3]:
# Stack the network layer by layer:
# one-hot character window -> single LSTM layer -> softmax over the vocabulary.
model = keras.Sequential()
model.add(keras.Input(shape=(maxlen, len(chars))))  # visible layer
model.add(layers.LSTM(128))  # hidden layer
model.add(layers.Dense(len(chars), activation="softmax"))  # one probability per character

# The targets are one-hot rows, hence the categorical cross-entropy loss.
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss="categorical_crossentropy")


## Train the model


In [4]:
epochs = 40  # higher values can be tried as well
batch_size = 128

for epoch in range(epochs):
    # Train for one pass over the data, then look at what the model writes.
    model.fit(x, y, batch_size=batch_size, epochs=1)
    print()
    print("Epoch : %d" % epoch)

    # Seed text: a random window of `maxlen` characters taken from the corpus.
    start_index = random.randint(0, len(text) - maxlen - 1)
    sentence = text[start_index : start_index + maxlen]
    print('Starting part of the sentence: "' + sentence + '"')

    predicted_chars = []
    for _ in range(400):  # predict 400 characters, one at a time
        # One-hot encode the current window as a batch holding one sequence.
        window_ids = [char_indices[ch] for ch in sentence]
        x_pred = np.zeros((1, maxlen, len(chars)))
        x_pred[0, np.arange(len(window_ids)), window_ids] = 1.0

        # Greedy decoding: always keep the most probable character.
        preds = model.predict(x_pred, verbose=0)[0]
        next_char = indices_char[preds.argmax()]

        # Slide the window: drop its first character and append the new one,
        # which keeps the window length constant.
        sentence = sentence[1:] + next_char
        predicted_chars.append(next_char)

    generated = "".join(predicted_chars)
    print("Generated text sequence : ", generated)
    print()



Epoch : 0
Starting part of the sentence: "laced, and not yet a race, much less suc"
Generated text sequence :  h a profound and soul themselves to the self--and and soul themselves to the self--and and soul themselves to the self--and and soul themselves to the self--and and soul themselves to the self--and and soul themselves to the self--and and soul themselves to the self--and and soul themselves to the self--and and soul themselves to the self--and and soul themselves to the self--and and soul themselv


Epoch : 1
Starting part of the sentence: "ediocre everlasting,      sans genie et "
Generated text sequence :  in the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the stand to the sta

## Helper function for sampling


In [8]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature re-scaling.

    The probabilities are re-weighted proportionally to
    ``p_i ** (1 / temperature)`` and re-normalised, then a single draw is made
    with ``np.random.multinomial``. Temperatures below 1 sharpen the
    distribution, temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: Re-scaling factor. ``0`` is treated as the limiting case
            of greedy decoding and returns the index of the largest entry
            instead of dividing by zero.

    Returns:
        The selected index, as returned by ``np.argmax``.
    """
    EPSILON = 10e-16  # to avoid taking the log of zero
    preds = (np.asarray(preds) + EPSILON).astype('float64')

    # Limit for temperature -> 0: all probability mass ends up on the maximum.
    if temperature == 0:
        return np.argmax(preds)

    scaled = np.log(preds) / temperature
    # Shift so the largest value is 0 before exponentiating. The shift cancels
    # in the normalisation below, but it keeps at least one term equal to 1,
    # so very small temperatures cannot underflow every term to 0 (-> 0/0).
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)

    # One multinomial trial yields a one-hot row; its argmax is the drawn index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
epochs = 40
batch_size = 128

for epoch in range(epochs):
    # `fit` is called on the existing `model` object, so training resumes from
    # whatever weights it holds when this cell runs.
    model.fit(x, y, batch_size=batch_size, epochs=1)
    print()
    print("Epoch : %d" % epoch)

    # One random start position per epoch: every diversity value below starts
    # from the same seed text.
    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print("...Diversity:", diversity)

        sentence = text[start_index : start_index + maxlen]
        print('Starting part of the sentence: "' + sentence + '"')

        sampled_chars = []
        for _ in range(400):
            # One-hot encode the current window as a batch holding one sequence.
            window_ids = [char_indices[ch] for ch in sentence]
            x_pred = np.zeros((1, maxlen, len(chars)))
            x_pred[0, np.arange(len(window_ids)), window_ids] = 1.0
            preds = model.predict(x_pred, verbose=0)[0]

            # Draw the next character from the predicted distribution,
            # re-scaled by the current diversity (temperature).
            next_char = indices_char[sample(preds, diversity)]

            # Slide the window forward by one character.
            sentence = sentence[1:] + next_char
            sampled_chars.append(next_char)

        generated = "".join(sampled_chars)
        print("Generated text sequence : ", generated)
        print()


Epoch : 0
...Diversity: 0.2
Starting part of the sentence: " acquired, circumscribed needs, based up"
Generated text sequence :  on the head, and in the soul in the standard of the soul of the sense, is a particing the sense of the sense of the sense of the soul of the same always as the world, as a probably, and all the conscious of the substation of the sense of the world, and when it is a principle of the standard of the sense of the sense of the sense of the soul of the strength of the conscious of the most deceive of t

...Diversity: 0.5
Starting part of the sentence: " acquired, circumscribed needs, based up"
Generated text sequence :  on the whole opposite of some consideration of the more possible and his world, the christianity the strong and discove as the fact that the place of the present subsusments, it is esgress simply in englious and wised, in the scientific sense--itself and different afficic of the conscious with a most addear in the possible the herdence of the most