<a href="https://colab.research.google.com/github/tonyychen/Projects/blob/master/Portfolio%20Projects/Text_Generation_with_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf

In [2]:
#Download the dataset (Project Gutenberg plain-text file pg420.txt)
#Fetch over HTTPS so the corpus cannot be altered in transit; get_file returns the path of the local copy
path_to_file = tf.keras.utils.get_file('pg420.txt', 'https://www.gutenberg.org/cache/epub/420/pg420.txt')

Downloading data from http://www.gutenberg.org/cache/epub/420/pg420.txt


In [3]:
#read the file
#the context manager closes the handle as soon as the read finishes, and the explicit
#encoding keeps decoding independent of the platform's default locale
with open(path_to_file, 'r', encoding = 'utf-8') as corpus_file:
  text = corpus_file.read()

In [4]:
#clean up the file: keep only the span from start_text up to (not including) end_text
start_text = 'Produced by Dennis Amundson.'
end_text = 'End of Project Gutenberg'
start_idx = text.find(start_text)
#str.find returns -1 when the marker is absent; slicing from -1 would silently keep
#almost nothing, so fail loudly instead
if start_idx == -1:
  raise ValueError('Start marker not found in downloaded text: ' + repr(start_text))
#look for the end marker only after the start marker
end_idx = text.find(end_text, start_idx)
if end_idx == -1:
  #end marker absent (for example this cell was already run on the trimmed text):
  #keep everything through the end instead of dropping the last character via text[:-1]
  end_idx = len(text)
text = text[start_idx:end_idx]

In [5]:
#build the vocabulary used for encoding and decoding
char_set = list(set(text)) #every distinct character in the text
char_set.sort() #fixed ordering so a character always receives the same index
char_arr = np.array(char_set)
map_size = len(char_arr) #vocabulary size

In [6]:
#peek at the first five entries of the sorted vocabulary
char_set[:5]

['\n', ' ', '!', '"', '&']

In [7]:
#character -> index lookup used for encoding
char2idx = dict(zip(char_arr, range(len(char_arr))))
#index -> character lookup used for decoding (the array position is the index)
idx2char = char_arr

In [8]:
#encode the entire text as a list of vocabulary indices
text_encoded = list(map(char2idx.__getitem__, text))

In [9]:
#create training dataset: slicing the encoded list yields one element per encoded character
ds_train = tf.data.Dataset.from_tensor_slices(text_encoded)

In [10]:
#group the character stream into fixed-length chunks
#each chunk carries one extra character beyond sequence_length; trailing characters that do not fill a chunk are dropped
sequence_length = 30
chunk_length = sequence_length + 1
ds_train = ds_train.batch(chunk_length, drop_remainder = True)

In [11]:
#inspect the first chunk
for chunk in ds_train.take(1):
  print(chunk)

tf.Tensor(
[39 66 63 52 69 51 53 52  1 50 73  1 27 53 62 62 57 67  1 24 61 69 62 52
 67 63 62 10  0  0  0], shape=(31,), dtype=int32)


In [12]:
#split each chunk into an input sequence and an output sequence shifted one position ahead
def split_input_target(chunk):
  """Return (first sequence_length elements of chunk, chunk without its first element)."""
  return chunk[:sequence_length], chunk[1:]

ds_train = ds_train.map(split_input_target)

In [13]:
#inspect the first (input, output) pair
for input_target_pair in ds_train.take(1):
  print(input_target_pair)

(<tf.Tensor: shape=(30,), dtype=int32, numpy=
array([39, 66, 63, 52, 69, 51, 53, 52,  1, 50, 73,  1, 27, 53, 62, 62, 57,
       67,  1, 24, 61, 69, 62, 52, 67, 63, 62, 10,  0,  0], dtype=int32)>, <tf.Tensor: shape=(30,), dtype=int32, numpy=
array([66, 63, 52, 69, 51, 53, 52,  1, 50, 73,  1, 27, 53, 62, 62, 57, 67,
        1, 24, 61, 69, 62, 52, 67, 63, 62, 10,  0,  0,  0], dtype=int32)>)


In [14]:
#Finally shuffle the sequences and group them into training batches
shuffle_buffer_size = 10000
batch_size = 10
ds_train = ds_train.shuffle(shuffle_buffer_size)
ds_train = ds_train.batch(batch_size, drop_remainder = True) #incomplete final batch is dropped
ds_train = ds_train.prefetch(buffer_size = tf.data.experimental.AUTOTUNE)
ds_train

<PrefetchDataset shapes: ((10, 30), (10, 30)), types: (tf.int32, tf.int32)>

In [15]:
#The training set is ready; each batch has shape batch_size x sequence_length
#Next, build the RNN model: embedding -> LSTM -> dense layer with one output per vocabulary entry

input_dim = map_size
output_dim = 200
LSTM_units = 100
stateful = False

model = tf.keras.Sequential([
  tf.keras.layers.Embedding(input_dim = input_dim, output_dim = output_dim),
  tf.keras.layers.LSTM(LSTM_units, return_sequences = True, stateful = stateful),
  tf.keras.layers.Dense(map_size),
])

In [16]:
#print the layers with their output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 200)         15000     
_________________________________________________________________
lstm (LSTM)                  (None, None, 100)         120400    
_________________________________________________________________
dense (Dense)                (None, None, 75)          7575      
Total params: 142,975
Trainable params: 142,975
Non-trainable params: 0
_________________________________________________________________


In [17]:
#compile the model; from_logits=True tells the loss that the model outputs are unnormalized scores
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
model.compile(loss = loss_fn)

In [18]:
#Create a callback that saves only the model weights; the path carries an {epoch} placeholder
checkpoint_path = './training_checkpoints/ckpt_{epoch}'
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
  filepath = checkpoint_path,
  save_weights_only = True,
)

In [19]:
#Train the model, with the checkpoint callback attached
num_epochs = 20
history = model.fit(ds_train, epochs = num_epochs, callbacks = [checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [20]:
#To make predictions recursively the model needs to be stateful, so rebuild the same
#architecture with a fixed batch size of 1; the trained weights are loaded into it afterwards
stateful = True

model = tf.keras.Sequential([
  tf.keras.layers.Embedding(input_dim = input_dim, output_dim = output_dim, batch_input_shape = (1, None)),
  tf.keras.layers.LSTM(LSTM_units, return_sequences = True, stateful = stateful),
  tf.keras.layers.Dense(map_size),
])

#compile the model
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
model.compile(loss = loss_fn)

In [21]:
#print the rebuilt model's layers with their output shapes and parameter counts
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 200)            15000     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 100)            120400    
_________________________________________________________________
dense_1 (Dense)              (1, None, 75)             7575      
Total params: 142,975
Trainable params: 142,975
Non-trainable params: 0
_________________________________________________________________


In [22]:
#load trained weights
checkpoint_dir = './training_checkpoints'
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
#latest_checkpoint returns None when no checkpoint is found; stop with a clear message
#instead of handing None to load_weights
if latest_checkpoint is None:
  raise FileNotFoundError('No checkpoint found in ' + checkpoint_dir + '; run the training cell first.')
model.load_weights(latest_checkpoint)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f4f6012db70>

In [23]:
#reset the model's recurrent state before feeding the seed string
model.reset_states()
#Generate the first character: run the whole seed string through the model
input_string = "I am not sure what's next"
input_encoded = list(map(char2idx.__getitem__, input_string))
input_ds = tf.reshape(tf.convert_to_tensor(input_encoded), (1, -1)) #one row holding the whole seed
pred = model.predict(input_ds)

In [24]:
#drop the batch dimension, then draw one sample per row of logits
pred = tf.squeeze(pred, axis = 0)
pred_encoded = tf.squeeze(tf.random.categorical(pred, num_samples = 1), axis = -1)

In [25]:
#keep only the sample drawn for the last position of the seed; this encoded index is the first generated character
first_char = pred_encoded[-1].numpy()

In [26]:
#collect every generated (still encoded) char in a list, starting with the first one
generated_sentence = [first_char]

In [27]:
#Now we would predict the next 100 chars to come up with a sentence
#The model is stateful, so the context of everything fed so far lives in the LSTM state
#and only the newest char has to be passed in at each step
num_generated_chars = 100
input_char = first_char #set input char to the first_char predicted
for step in range(num_generated_chars):
  input_ds = tf.reshape(tf.convert_to_tensor(input_char), (1, 1)) #reshape to batch_size, sequence_size
  #call the model directly: Model.predict is designed for large batched inputs and adds
  #avoidable per-call overhead when invoked repeatedly on a single (1, 1) input
  pred = model(input_ds, training = False)

  pred = tf.squeeze(pred, axis = 0) #remove batch dimension from output
  pred_encoded = tf.random.categorical(pred, 1) #draw one sample from the categorical distribution given by the logits
  pred_encoded = tf.squeeze(pred_encoded, -1) #remove the sample dimension since it is always 1

  output_char = pred_encoded[-1].numpy() #only one position was fed in, so this is the single sampled index
  generated_sentence.append(output_char) #append the encoded output to generated_sentence
  input_char = output_char #feed the sampled char back in as the next input

In [28]:
#Finally, decode generated_sentence: look up every index in idx2char and concatenate the characters
generated_sentence = ''.join(idx2char[generated_sentence])

In [29]:
#show the decoded generated text
print(generated_sentence)

 more was
not," said the Praniantful graged hind juid to a fuching a sliople, and chads Maggettel.  A
