#### LSTM for text generation: Here I will try to predict new text based on the existing text data using Long-short-term-memory aka LSTM model.

In [42]:
# imports 
import os
from urllib.request import urlretrieve
import tensorflow as tf

##### Data: extracting the data from a website.It has 209 stories which are translated into english from german, making use of urlretrieve and os lib to download and structure the data

In [43]:
url = 'https://www.cs.cmu.edu/~spok/grimmtmp/'
dir_name = 'data'

def download_data(url, filename, download_dir):
    """Download a file if not present"""
    # Create directories if doesn't exist
    os.makedirs(download_dir, exist_ok=True)
    # If file doesn't exist download
    if not os.path.exists(os.path.join(download_dir,filename)):
        filepath, _ = urlretrieve(url + filename, os.path.join(download_dir,filename))
    else:
        filepath = os.path.join(download_dir, filename)
    return filepath

# Number of files and their names to download
num_files = 209
filenames = [format(i, '03d')+'.txt' for i in range(1,num_files+1)]

# Download each file
for fn in filenames:
    download_data(url, fn, dir_name)
    
# Check if all files are downloaded
for i in range(len(filenames)):
    file_exists = os.path.isfile(os.path.join(dir_name,filenames[i]))
    assert file_exists
print(f"{len(filenames)} files found.") 

209 files found.


##### Splitting the data: Now splitting the data into train,test and validation sets and printing the results

In [44]:
from sklearn.model_selection import train_test_split
# Fix the random seed so we get the same outptu everytime
random_state = 54321
filenames = [os.path.join(dir_name, f) for f in os.listdir(dir_name)]
# First separate train and valid+test data
train_filenames, test_and_valid_filenames = train_test_split(filenames, test_size=0.2, random_state=random_state)
# Separate valid+test data to validation and test data
valid_filenames, test_filenames = train_test_split(test_and_valid_filenames, test_size=0.5, random_state=random_state) 
# Print size of different subsets
for subset_id, subset in zip(('train', 'valid', 'test'), (train_filenames, valid_filenames, test_filenames)):
    print(f"Got {len(subset)} files in the {subset_id} dataset (e.g. {subset[:3]})")

Got 167 files in the train dataset (e.g. ['data\\117.txt', 'data\\133.txt', 'data\\069.txt'])
Got 21 files in the valid dataset (e.g. ['data\\023.txt', 'data\\078.txt', 'data\\176.txt'])
Got 21 files in the test dataset (e.g. ['data\\129.txt', 'data\\207.txt', 'data\\170.txt'])


##### Finding the vocabulary size:

In [115]:
# defining a bigram set
bigram_set = set()
# Go through each file in the training set
for fname in train_filenames:
    # This will hold all the text
    document = [] 
    with open(fname, 'r') as f:
        for row in f:
            # Convert text to lower case to reduce input dimensionality
            document.append(row.lower())
        # From the list of text we have create a single list having all stories
        document = " ".join(document)
        # Update the set with all bigrams found
        bigram_set.update([document[i:i+2] for i in range(0, len(document), 2)])
# Assign to a variable
n_vocab = len(bigram_set)
print(f"Found {n_vocab} unique bigrams")

Found 705 unique bigrams


In [119]:
train_ds=generate_tf_dataset(train_filenames,ngram_length,window_size,batch_size,shuffle=True)
train_filenames

tf.Tensor(b'There was once upon a time a shepherd boy whose fame spread\r\nfar and wide because of the wise answers which he gave to every\r\nquestion.  The king of the country heard of it likewise, but\r\ndid not believe it, and sent for the boy.  Then he said to\r\nhim, if you can give me an answer to three questions which I\r\nwill ask you, I will look on you as my own child, and you shall\r\ndwell with me in my royal palace.  The boy said, what are the\r\nthree questions.  The king said, the first is, how many drops\r\nof water are there in the ocean.  The shepherd boy answered, lord\r\nking, if you will have all the rivers on earth dammed up so that\r\nnot a single drop runs from them into the sea until I have\r\ncounted it, I will tell you how many drops there are in the sea.\r\nThe king said, the next question is, how many stars are there\r\nin the sky.  The shepherd boy said, give me a great sheet of\r\nwhite paper, and then he made so many fine points on it with a\r\npen that 

['data\\117.txt',
 'data\\133.txt',
 'data\\069.txt',
 'data\\195.txt',
 'data\\107.txt',
 'data\\183.txt',
 'data\\066.txt',
 'data\\074.txt',
 'data\\105.txt',
 'data\\087.txt',
 'data\\152.txt',
 'data\\068.txt',
 'data\\111.txt',
 'data\\168.txt',
 'data\\153.txt',
 'data\\166.txt',
 'data\\080.txt',
 'data\\088.txt',
 'data\\197.txt',
 'data\\042.txt',
 'data\\154.txt',
 'data\\185.txt',
 'data\\196.txt',
 'data\\120.txt',
 'data\\142.txt',
 'data\\186.txt',
 'data\\030.txt',
 'data\\155.txt',
 'data\\045.txt',
 'data\\188.txt',
 'data\\109.txt',
 'data\\191.txt',
 'data\\043.txt',
 'data\\001.txt',
 'data\\124.txt',
 'data\\085.txt',
 'data\\163.txt',
 'data\\100.txt',
 'data\\127.txt',
 'data\\032.txt',
 'data\\146.txt',
 'data\\156.txt',
 'data\\081.txt',
 'data\\016.txt',
 'data\\184.txt',
 'data\\075.txt',
 'data\\137.txt',
 'data\\194.txt',
 'data\\054.txt',
 'data\\009.txt',
 'data\\179.txt',
 'data\\206.txt',
 'data\\209.txt',
 'data\\034.txt',
 'data\\103.txt',
 'data\\11

##### A total of 705 words found, it will be much more if instead of character level bigram, word is taken as a unit

In [118]:
# defining the tf.data pipeline
def generate_tf_dataset(filenames,ngram_width,window_size,batch_size,shuffle=False):
    """
    Generate batched data
    """
    documents=[]
    for f in filenames:
        doc=tf.io.read_file(f)
        print(doc)
        print(type(doc))
        break
        doc=tf.strings.ngrams( # generating ngram from string
            tf.strings.bytes_split( # splititng word into char and creating a list of chars
                tf.strings.regex_replace( # replacing new line with space
                    tf.strings.lower(doc),"\n"," " # convert to lower case
                )
            ),ngram_width,separator=''
        )
        documents.append(doc.numpy().tolist())
        # documents is a list of list of strings, where each string is a story
        # generating a ragged tensor: A ragged tensor has dimensions used to accept arbitrarily sized inputs, 
        # in this case its not possible that all stories have same no of ngrams and there are long sequences
        # of ngrams representing the stories so using ragged tensor to store that
    documents=tf.ragged.constant(documents)
    # creating a dataset where each row in ragged tensor is sample
    doc_dataset = tf.data.Dataset.from_tensor_slices(documents)
    # removing the overlap here created by tf.strings.ngrams:
    # so taking nth ngram in the sequence
    doc_dataset=doc_dataset.map(lambda x:x[::ngram_width])
    # need to generate windows from text:
    # ex- ab,bc,cd,ef,fg,gh.... window_size=3,shift=1 gives-[ab,cd,ef],[cd,ef,gh]...
    # to create shorter, fixed-length windowed sequences from each story:
    doc_dataset = doc_dataset.flat_map(
        lambda x: tf.data.Dataset.from_tensor_slices(
            x
        ).window(
            size=window_size+1, shift=int(window_size * 0.75)
        ).flat_map(
            lambda window: window.batch(window_size+1, drop_remainder=True)
        )
    )
    # from each window generate input and output sequence: take all ngrams except last as input 
    # and all ngrams except first as output/target so at each time step,model predict next ngram 
    # given all previous ngrams, some overlap also needed
    doc_dataset = doc_dataset.map(lambda x: (x[:-1], x[1:]))
    # Shuffle the data if required
    doc_dataset = doc_dataset.shuffle(buffer_size=batch_size*10) if shuffle else doc_dataset
    # Batch the data
    doc_dataset = doc_dataset.batch(batch_size=batch_size)
#     Return the data
    return doc_dataset

In [73]:
# specify hyperparameters and generate train,test and validation data
ngram_length=2
batch_size=128
window_size=128
train_ds=generate_tf_dataset(train_filenames,ngram_length,window_size,batch_size,shuffle=True)
test_ds=generate_tf_dataset(test_filenames,ngram_length,window_size,batch_size)
valid_ds=generate_tf_dataset(valid_filenames,ngram_length,window_size,batch_size)

In [74]:
# for i in train_ds:
#     print(i)

In [120]:
train_ds

In [100]:
for r in ds:
    print(r[0][0][0])

tf.Tensor(b'th', shape=(), dtype=string)
tf.Tensor(b' u', shape=(), dtype=string)
tf.Tensor(b' s', shape=(), dtype=string)
tf.Tensor(b'wh', shape=(), dtype=string)
tf.Tensor(b'ea', shape=(), dtype=string)


In [75]:
# generating some data
ds = generate_tf_dataset(train_filenames, 2, window_size=10, batch_size=1).take(5)
for record in ds:
    print(record[0].numpy(), '->', record[1].numpy())

[[b'th' b'er' b'e ' b'wa' b's ' b'on' b'ce' b' u' b'po' b'n ']] -> [[b'er' b'e ' b'wa' b's ' b'on' b'ce' b' u' b'po' b'n ' b'a ']]
[[b' u' b'po' b'n ' b'a ' b'ti' b'me' b' a' b' s' b'he' b'ph']] -> [[b'po' b'n ' b'a ' b'ti' b'me' b' a' b' s' b'he' b'ph' b'er']]
[[b' s' b'he' b'ph' b'er' b'd ' b'bo' b'y ' b'wh' b'os' b'e ']] -> [[b'he' b'ph' b'er' b'd ' b'bo' b'y ' b'wh' b'os' b'e ' b'fa']]
[[b'wh' b'os' b'e ' b'fa' b'me' b' s' b'pr' b'ea' b'd\r' b' f']] -> [[b'os' b'e ' b'fa' b'me' b' s' b'pr' b'ea' b'd\r' b' f' b'ar']]
[[b'ea' b'd\r' b' f' b'ar' b' a' b'nd' b' w' b'id' b'e ' b'be']] -> [[b'd\r' b' f' b'ar' b' a' b'nd' b' w' b'id' b'e ' b'be' b'ca']]


##### Implementing the language model:

##### First defining the tokenization layer and integrating it into the model

In [76]:
import tensorflow.keras.layers as layers
import tensorflow.keras.models as models
import tensorflow.keras.backend as K
text_vectorizer=layers.TextVectorization(max_tokens=n_vocab,standardize=None,split=None,input_shape=(window_size,))
# train model on data
text_vectorizer.adapt(train_ds)

In [77]:
# print a few bigrams learnt by the text vectprization layer
text_vectorizer.get_vocabulary()[:10]

['', '[UNK]', 'e ', 'he', ' t', 'th', 'd ', ' a', ' h', ', ']

In [78]:
# train,test and valid sets need to be processed:converting from string to ngram ids
train_ds=train_ds.map(lambda x,y:(x,text_vectorizer(y)))
test_ds=test_ds.map(lambda x,y:(x,text_vectorizer(y)))
valid_ds=valid_ds.map(lambda x,y:(x,text_vectorizer(y)))

In [79]:
# Defining the model: It has previously trained Textvectorization layer, embedding layer,
# two LSTM layers, a fully connected layer with ReLU and a final prediction layer with softmax
lm_model=models.Sequential([
    text_vectorizer,layers.Embedding(n_vocab+2,96),
    layers.LSTM(512,return_state=False,return_sequences=True),
    layers.LSTM(256,return_state=False,return_sequences=True),
    layers.Dense(1024,activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(n_vocab,activation='softmax')
])

##### return_state=False means layer output only final output and if true,it return final output with state output, if its set true for LSTM it returns final output,cell state and hidden state  
##### return_sequences=True cause layer to output full output sequence opposed to final output

In [81]:
lm_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_2 (TextV  (None, 128)              0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, 128, 96)           67872     
                                                                 
 lstm_8 (LSTM)               (None, 128, 512)          1247232   
                                                                 
 lstm_9 (LSTM)               (None, 128, 256)          787456    
                                                                 
 dense_4 (Dense)             (None, 128, 1024)         263168    
                                                                 
 dropout_2 (Dropout)         (None, 128, 1024)         0         
                                                      

In [82]:
# Defining the perplexity metric:
class PerplexityMetric(tf.keras.metrics.Mean):
    
    def __init__(self, name='perplexity', **kwargs):
      super().__init__(name=name, **kwargs)
      self.cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')

    def _calculate_perplexity(self, real, pred):
      loss_ = self.cross_entropy(real, pred)
      
      # Calculating the perplexity steps: 
      step1 = K.mean(loss_, axis=-1)
      perplexity = K.exp(step1)
    
      return perplexity 

    def update_state(self, y_true, y_pred, sample_weight=None):            
      perplexity = self._calculate_perplexity(y_true, y_pred)
      super().update_state(perplexity)

##### Compile model using  
Sparse categorical cross-entropy as loss function  
Adam as optimizer  
Accuracy and perplexity as metrics

In [83]:
lm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
metrics=['accuracy', PerplexityMetric()])

In [84]:
%env TF_FORCE_GPU_ALLOW_GROWTH=true

env: TF_FORCE_GPU_ALLOW_GROWTH=true


In [85]:
physical_devices = tf.config.list_physical_devices('GPU')
try:
  tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
  # Invalid device or cannot modify virtual devices once initialized.
  pass

In [86]:
# training the model
lstm_history = lm_model.fit(train_ds, validation_data=valid_ds, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


##### As we can see the accuracy is increasing and perplexity is decreasing, it will be much better if I ran it for more epochs but due to resource constraints, I could not do that

In [87]:
# evaluating 
lm_model.evaluate(test_ds)



[2.9792096614837646, 0.3070947527885437, 20.25795555114746]

##### Building inference model: defining a recursive model that takes the current time step’s output of the model as the input to the next time step.The need is to generate new text, nothing available in the beginning. Therefore,need to make adjustments to trained model, Using functional API not the sequential API

In [91]:
# defining inference model:
inp=layers.Input(dtype=tf.string,shape=(1,))
text_vectorized_out = lm_model.get_layer('text_vectorization_2')(inp)
inp_state_c_lstm=layers.Input(shape=(512,))
inp_state_h_lstm=layers.Input(shape=(512,))
inp_state_c_lstm_1=layers.Input(shape=(256,))
inp_state_h_lstm_1=layers.Input(shape=(256,))
# Define embedding layer and output
emb_layer=lm_model.get_layer('embedding_2')
emb_out=emb_layer(text_vectorized_out)
# Defining a LSTM layers and output
lstm_layer=layers.LSTM(512,return_state=True,return_sequences=True)
lstm_out,lstm_state_c,lstm_state_h=lstm_layer(emb_out,initial_state=[inp_state_c_lstm,inp_state_h_lstm])
lstm_1_layer=tf.keras.layers.LSTM(256,return_state=True,return_sequences=True)
lstm_1_out,lstm_1_state_c,lstm_1_state_h=lstm_1_layer(lstm_out,initial_state=[inp_state_c_lstm_1,inp_state_h_lstm_1])
# Defining a Dense layer and output
dense_out=lm_model.get_layer('dense_4')(lstm_1_out)
# Defining the final Dense layer and output
final_out=lm_model.get_layer('dense_5')(dense_out)
# Copy the weights from the original model
lstm_layer.set_weights(lm_model.get_layer('lstm_8').get_weights())
lstm_1_layer.set_weights(lm_model.get_layer('lstm_9').get_weights())
# Define final model
infer_model=models.Model(
    inputs=[inp, inp_state_c_lstm, inp_state_h_lstm, inp_state_c_lstm_1, inp_state_h_lstm_1], 
    outputs=[final_out, lstm_state_c, lstm_state_h, lstm_1_state_c, lstm_1_state_h])
# Summary
infer_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_29 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization_2 (TextVect  multiple            0           ['input_29[0][0]']               
 orization)                                                                                       
                                                                                                  
 embedding_2 (Embedding)        multiple             67872       ['text_vectorization_2[3][0]']   
                                                                                                  
 input_30 (InputLayer)          [(None, 512)]        0           []                         

##### using new inference model to generate a story. defining an initial seed that will be used to generate a story.Taking the the first phrase from one of the test files. Then usig it to generate text recursively, by using the predicted bigram at time t as the input at time t+1. Running for 500 steps:

In [93]:
import numpy as np
text = ["When adam and eve were driven out of paradise, they were compelled to build a house for themselves on barren ground"]
seq = [text[0][i:i+2] for i in range(0, len(text[0]), 2)]
# build up model state using the given string
print(f"Making predictions from a {len(seq)} element long input")
vocabulary = infer_model.get_layer("text_vectorization_2").get_vocabulary()
index_word = dict(zip(range(len(vocabulary)), vocabulary))

# Reset the state of the model initially
infer_model.reset_states()
# Definin the initial state as all zeros
state_c = np.zeros(shape=(1,512))
state_h = np.zeros(shape=(1,512))
state_c_1 = np.zeros(shape=(1,256))
state_h_1 = np.zeros(shape=(1,256))
# Recursively update the model by assining new state to state
for c in seq:    
    #print(c)
    out, state_c, state_h, state_c_1, state_h_1 = infer_model.predict(
        [np.array([[c]]), state_c, state_h, state_c_1, state_h_1]
)
# Get final prediction after feeding the input string
wid = int(np.argmax(out[0],axis=-1).ravel())
word = index_word[wid]
text.append(word)
# Define first input to generate text recursively from
x = np.array([[word]])
for _ in range(500):    
    # Get the next output and state
    out, state_c, state_h, state_c_1, state_h_1  = infer_model.predict([x, state_c, state_h, state_c_1, state_h_1 ])
    # Get the word id and the word from out
    out_argsort = np.argsort(out[0], axis=-1).ravel()        
    wid = int(out_argsort[-1])
    word = index_word[wid]
    # If the word ends with space, we introduce a bit of randomness
    # Essentially pick one of the top 3 outputs for that timestep depending on their likelihood
    if word.endswith(' '):
        if np.random.normal()>0.5:
            width = 5
            i = np.random.choice(list(range(-width,0)), p=out_argsort[-width:]/out_argsort[-width:].sum())    
            wid = int(out_argsort[i])    
            word = index_word[wid]
    # Append the prediction
    text.append(word)
    # Recursively make the current prediction the next input
    x = np.array([[word]])
# Print the final output    
print('\n')
print('='*60)
print("Final text: ")
print(''.join(text))


Making predictions from a 58 element long input




Final text: 
 world, and the king's daughter the king's daughter.  the king's daughter that the king the king's daughter the king's daughter.  the king's daughter them, and the king's daughter the king's daughter the king's daughter the king's daughter." the king's daughter the king's daughter the king's daughter, and then there was the king's daughten the king's daughter the king's daughter th


##### Model is able to generate some meaningful text, it would be more better if I ran for more epochs ~ 100 but this is a small experiment only