#### LSTM for text generation: Here I will try to predict new text based on the existing text data using Long-short-term-memory aka LSTM model.

In [25]:
# imports 
import os
from urllib.request import urlretrieve
import tensorflow as tf

##### Data: extracting the data from a website.It has 209 stories which are translated into english from german, making use of urlretrieve and os lib to download and structure the data

In [26]:
url = 'https://www.cs.cmu.edu/~spok/grimmtmp/'
dir_name = 'data'

def download_data(url, filename, download_dir):
    """Download a file if not present"""
    # Create directories if doesn't exist
    os.makedirs(download_dir, exist_ok=True)
    # If file doesn't exist download
    if not os.path.exists(os.path.join(download_dir,filename)):
        filepath, _ = urlretrieve(url + filename, os.path.join(download_dir,filename))
    else:
        filepath = os.path.join(download_dir, filename)
    return filepath

# Number of files and their names to download
num_files = 209
filenames = [format(i, '03d')+'.txt' for i in range(1,num_files+1)]

# Download each file
for fn in filenames:
    download_data(url, fn, dir_name)
    
# Check if all files are downloaded
for i in range(len(filenames)):
    file_exists = os.path.isfile(os.path.join(dir_name,filenames[i]))
    assert file_exists
print(f"{len(filenames)} files found.") 

209 files found.


##### Splitting the data: Now splitting the data into train,test and validation sets and printing the results

In [27]:
from sklearn.model_selection import train_test_split
# Fix the random seed so we get the same outptu everytime
random_state = 54321
filenames = [os.path.join(dir_name, f) for f in os.listdir(dir_name)]
# First separate train and valid+test data
train_filenames, test_and_valid_filenames = train_test_split(filenames, test_size=0.2, random_state=random_state)
# Separate valid+test data to validation and test data
valid_filenames, test_filenames = train_test_split(test_and_valid_filenames, test_size=0.5, random_state=random_state) 
# Print size of different subsets
for subset_id, subset in zip(('train', 'valid', 'test'), (train_filenames, valid_filenames, test_filenames)):
    print(f"Got {len(subset)} files in the {subset_id} dataset (e.g. {subset[:3]})")

Got 167 files in the train dataset (e.g. ['data\\117.txt', 'data\\133.txt', 'data\\069.txt'])
Got 21 files in the valid dataset (e.g. ['data\\023.txt', 'data\\078.txt', 'data\\176.txt'])
Got 21 files in the test dataset (e.g. ['data\\129.txt', 'data\\207.txt', 'data\\170.txt'])


##### Finding the vocabulary size:

In [28]:
# defining a bigram set
bigram_set = set()
# Go through each file in the training set
for fname in train_filenames:
    # This will hold all the text
    document = [] 
    with open(fname, 'r') as f:
        for row in f:
            # Convert text to lower case to reduce input dimensionality
            document.append(row.lower())
        # From the list of text we have create a single list having all stories
        document = " ".join(document)
        # Update the set with all bigrams found
        bigram_set.update([document[i:i+2] for i in range(0, len(document), 2)])
# Assign to a variable
n_vocab = len(bigram_set)
print(f"Found {n_vocab} unique bigrams")

Found 705 unique bigrams


##### A total of 705 words found, it will be much more if instead of character level bigram, word is taken as a unit

In [29]:
# defining the tf.data pipeline
def generate_tf_dataset(filenames,ngram_width,window_size,batch_size,shuffle=False):
    """
    Generate batched data
    """
    documents=[]
    for f in filenames:
        doc=tf.io.read_file(f)
        doc=tf.strings.ngrams( # generating ngram from string
            tf.strings.bytes_split( # splititng word into char and creating a list of chars
                tf.strings.regex_replace( # replacing new line with space
                    tf.strings.lower(doc),"\n"," " # convert to lower case
                )
            ),ngram_width,separator=''
        )
        documents.append(doc.numpy().tolist())
        # documents is a list of list of strings, where each string is a story
        # generating a ragged tensor: A ragged tensor has dimensions used to accept arbitrarily sized inputs, 
        # in this case its not possible that all stories have same no of ngrams and there are long sequences
        # of ngrams representing the stories so using ragged tensor to store that
    documents=tf.ragged.constant(documents)
    # creating a dataset where each row in ragged tensor is sample
    doc_dataset = tf.data.Dataset.from_tensor_slices(documents)
    # removing the overlap here created by tf.strings.ngrams:
    # so taking nth ngram in the sequence
    doc_dataset=doc_dataset.map(lambda x:x[::ngram_width])
    # need to generate windows from text:
    # ex- ab,bc,cd,ef,fg,gh.... window_size=3,shift=1 gives-[ab,cd,ef],[cd,ef,gh]...
    # to create shorter, fixed-length windowed sequences from each story:
    doc_dataset = doc_dataset.flat_map(
        lambda x: tf.data.Dataset.from_tensor_slices(
            x
        ).window(
            size=window_size+1, shift=int(window_size * 0.75)
        ).flat_map(
            lambda window: window.batch(window_size+1, drop_remainder=True)
        )
    )
    # from each window generate input and output sequence: take all ngrams except last as input 
    # and all ngrams except first as output/target so at each time step,model predict next ngram 
    # given all previous ngrams, some overlap also needed
    doc_dataset = doc_dataset.map(lambda x: (x[:-1], x[1:]))
    # Shuffle the data if required
    doc_dataset = doc_dataset.shuffle(buffer_size=batch_size*10) if shuffle else doc_dataset
    # Batch the data
    doc_dataset = doc_dataset.batch(batch_size=batch_size)
    # Return the data
    return doc_dataset

In [30]:
# specify hyperparameters and generate train,test and validation data
ngram_length=2
batch_size=128
window_size=128
train_ds=generate_tf_dataset(train_filenames,ngram_length,window_size,batch_size,shuffle=True)
test_ds=generate_tf_dataset(test_filenames,ngram_length,window_size,batch_size)
valid_ds=generate_tf_dataset(valid_filenames,ngram_length,window_size,batch_size)

In [31]:
# generating some data
ds = generate_tf_dataset(train_filenames, 2, window_size=10, batch_size=1).take(5)
for record in ds:
    print(record[0].numpy(), '->', record[1].numpy())

[[b'th' b'er' b'e ' b'wa' b's ' b'on' b'ce' b' u' b'po' b'n ']] -> [[b'er' b'e ' b'wa' b's ' b'on' b'ce' b' u' b'po' b'n ' b'a ']]
[[b' u' b'po' b'n ' b'a ' b'ti' b'me' b' a' b' s' b'he' b'ph']] -> [[b'po' b'n ' b'a ' b'ti' b'me' b' a' b' s' b'he' b'ph' b'er']]
[[b' s' b'he' b'ph' b'er' b'd ' b'bo' b'y ' b'wh' b'os' b'e ']] -> [[b'he' b'ph' b'er' b'd ' b'bo' b'y ' b'wh' b'os' b'e ' b'fa']]
[[b'wh' b'os' b'e ' b'fa' b'me' b' s' b'pr' b'ea' b'd ' b'fa']] -> [[b'os' b'e ' b'fa' b'me' b' s' b'pr' b'ea' b'd ' b'fa' b'r ']]
[[b'ea' b'd ' b'fa' b'r ' b'an' b'd ' b'wi' b'de' b' b' b'ec']] -> [[b'd ' b'fa' b'r ' b'an' b'd ' b'wi' b'de' b' b' b'ec' b'au']]


##### Implementing the language model:

##### First defining the tokenization layer and integrating it into the model

In [32]:
import tensorflow.keras.layers as layers
import tensorflow.keras.models as models
import tensorflow.keras.backend as K
text_vectorizer=layers.TextVectorization(max_tokens=n_vocab,standardize=None,split=None,input_shape=(window_size,))
# train model on data
text_vectorizer.adapt(train_ds)

In [33]:
# print a few bigrams learnt by the text vectprization layer
text_vectorizer.get_vocabulary()[:10]

['', '[UNK]', 'e ', 'he', ' t', 'th', 'd ', ' a', ', ', ' h']

In [34]:
# train,test and valid sets need to be processed:converting from string to ngram ids
train_ds=train_ds.map(lambda x,y:(x,text_vectorizer(y)))
test_ds=test_ds.map(lambda x,y:(x,text_vectorizer(y)))
valid_ds=valid_ds.map(lambda x,y:(x,text_vectorizer(y)))

In [35]:
# Defining the model: It has previously trained Textvectorization layer, embedding layer,
# two LSTM layers, a fully connected layer with ReLU and a final prediction layer with softmax
lm_model=models.Sequential([
    text_vectorizer,layers.Embedding(n_vocab+2,96),
    layers.LSTM(512,return_state=False,return_sequences=True),
    layers.LSTM(256,return_state=False,return_sequences=True),
    layers.Dense(1024,activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(n_vocab,activation='softmax')
])

##### return_state=False means layer output only final output and if true,it return final output with state output, if its set true for LSTM it returns final output,cell state and hidden state  
##### return_sequences=True cause layer to output full output sequence opposed to final output

In [36]:
lm_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_4 (TextVe (None, 128)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 128, 96)           67872     
_________________________________________________________________
lstm_2 (LSTM)                (None, 128, 512)          1247232   
_________________________________________________________________
lstm_3 (LSTM)                (None, 128, 256)          787456    
_________________________________________________________________
dense_2 (Dense)              (None, 128, 1024)         263168    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128, 1024)         0         
_________________________________________________________________
dense_3 (Dense)              (None, 128, 705)         

In [37]:
# Defining the perplexity metric:
class PerplexityMetric(tf.keras.metrics.Mean):
    
    def __init__(self, name='perplexity', **kwargs):
      super().__init__(name=name, **kwargs)
      self.cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')

    def _calculate_perplexity(self, real, pred):
      loss_ = self.cross_entropy(real, pred)
      
      # Calculating the perplexity steps: 
      step1 = K.mean(loss_, axis=-1)
      perplexity = K.exp(step1)
    
      return perplexity 

    def update_state(self, y_true, y_pred, sample_weight=None):            
      perplexity = self._calculate_perplexity(y_true, y_pred)
      super().update_state(perplexity)

##### Compile model using  
Sparse categorical cross-entropy as loss function  
Adam as optimizer  
Accuracy and perplexity as metrics

In [38]:
lm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
metrics=['accuracy', PerplexityMetric()])

In [40]:
%env TF_FORCE_GPU_ALLOW_GROWTH=true

env: TF_FORCE_GPU_ALLOW_GROWTH=true


In [43]:
physical_devices = tf.config.list_physical_devices('GPU')
try:
  tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
  # Invalid device or cannot modify virtual devices once initialized.
  pass

In [44]:
# training the model
lstm_history = lm_model.fit(train_ds, validation_data=valid_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


##### As we can see the accuracy is increasing and perplexity is decreasing, it will be much better if I ran it for more epochs but due to resource constraints, I could not do that

In [45]:
# evaluating 
lm_model.evaluate(test_ds)



[3.1948318481445312, 0.2778160870075226, 25.049728393554688]

##### Building inference model: defining a recursive model that takes the current time step’s output of the model as the input to the next time step.The need is to generate new text, nothing available in the beginning. Therefore,need to make adjustments to trained model, Using functional API not the sequential API

In [53]:
# defining inference model:
inp=layers.Input(dtype=tf.string,shape=(1,))
text_vectorized_out = lm_model.get_layer('text_vectorization_4')(inp)
inp_state_c_lstm=layers.Input(shape=(512,))
inp_state_h_lstm=layers.Input(shape=(512,))
inp_state_c_lstm_1=layers.Input(shape=(256,))
inp_state_h_lstm_1=layers.Input(shape=(256,))
# Define embedding layer and output
emb_layer=lm_model.get_layer('embedding_1')
emb_out=emb_layer(text_vectorized_out)
# Defining a LSTM layers and output
lstm_layer=layers.LSTM(512,return_state=True,return_sequences=True)
lstm_out,lstm_state_c,lstm_state_h=lstm_layer(emb_out,initial_state=[inp_state_c_lstm,inp_state_h_lstm])
lstm_1_layer=tf.keras.layers.LSTM(256,return_state=True,return_sequences=True)
lstm_1_out,lstm_1_state_c,lstm_1_state_h=lstm_1_layer(lstm_out,initial_state=[inp_state_c_lstm_1,inp_state_h_lstm_1])
# Defining a Dense layer and output
dense_out=lm_model.get_layer('dense_2')(lstm_1_out)
# Defining the final Dense layer and output
final_out=lm_model.get_layer('dense_3')(dense_out)
# Copy the weights from the original model
lstm_layer.set_weights(lm_model.get_layer('lstm_2').get_weights())
lstm_1_layer.set_weights(lm_model.get_layer('lstm_3').get_weights())
# Define final model
infer_model=models.Model(
    inputs=[inp, inp_state_c_lstm, inp_state_h_lstm, inp_state_c_lstm_1, inp_state_h_lstm_1], 
    outputs=[final_out, lstm_state_c, lstm_state_h, lstm_1_state_c, lstm_1_state_h])
# Summary
infer_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
text_vectorization_4 (TextVecto multiple             0           input_17[0][0]                   
__________________________________________________________________________________________________
embedding_1 (Embedding)         multiple             67872       text_vectorization_4[4][0]       
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, 512)]        0                                            
____________________________________________________________________________________________

##### using new inference model to generate a story. defining an initial seed that will be used to generate a story.Taking the the first phrase from one of the test files. Then usig it to generate text recursively, by using the predicted bigram at time t as the input at time t+1. Running for 500 steps:

In [54]:
import numpy as np
text = ["When adam and eve were driven out of paradise, they were compelled to build a house for themselves on barren ground"]
seq = [text[0][i:i+2] for i in range(0, len(text[0]), 2)]
# build up model state using the given string
print(f"Making predictions from a {len(seq)} element long input")
vocabulary = infer_model.get_layer("text_vectorization_4").get_vocabulary()
index_word = dict(zip(range(len(vocabulary)), vocabulary))

# Reset the state of the model initially
infer_model.reset_states()
# Definin the initial state as all zeros
state_c = np.zeros(shape=(1,512))
state_h = np.zeros(shape=(1,512))
state_c_1 = np.zeros(shape=(1,256))
state_h_1 = np.zeros(shape=(1,256))
# Recursively update the model by assining new state to state
for c in seq:    
    #print(c)
    out, state_c, state_h, state_c_1, state_h_1 = infer_model.predict(
        [np.array([[c]]), state_c, state_h, state_c_1, state_h_1]
)
# Get final prediction after feeding the input string
wid = int(np.argmax(out[0],axis=-1).ravel())
word = index_word[wid]
text.append(word)
# Define first input to generate text recursively from
x = np.array([[word]])
for _ in range(500):    
    # Get the next output and state
    out, state_c, state_h, state_c_1, state_h_1  = infer_model.predict([x, state_c, state_h, state_c_1, state_h_1 ])
    # Get the word id and the word from out
    out_argsort = np.argsort(out[0], axis=-1).ravel()        
    wid = int(out_argsort[-1])
    word = index_word[wid]
    # If the word ends with space, we introduce a bit of randomness
    # Essentially pick one of the top 3 outputs for that timestep depending on their likelihood
    if word.endswith(' '):
        if np.random.normal()>0.5:
            width = 5
            i = np.random.choice(list(range(-width,0)), p=out_argsort[-width:]/out_argsort[-width:].sum())    
            wid = int(out_argsort[i])    
            word = index_word[wid]
    # Append the prediction
    text.append(word)
    # Recursively make the current prediction the next input
    x = np.array([[word]])
# Print the final output    
print('\n')
print('='*60)
print("Final text: ")
print(''.join(text))

Making predictions from a 58 element long input


Final text: 
When adam and eve were driven out of paradise, they were compelled to build a house for themselves on barren groundd the king, and the king, and the king, and the king's she was said, and they the king, and the king, and they he happen, and they he had the king, and the king's she was said, and then then the she was said, and then the king, and the king, and the king's she was said, and then the king, and the king, and the king, anyouive the will the king, and the king's she was said, and then the king, anyouive the will they they them if them, and the king's she was said, and then that there, and the king, and the king, and that there, and the king, anyouive the will that they the king's she was said, and then the king's she was said, and they the king, and the king, and the king, and the king, anyouive the will they they the king, and the king, and the king, and they he had that they was they they walk, and they had they 

##### Model is able to generate some meaningful text, it would be more better if I ran for more epochs ~ 100 but this is a small experiment only