In [1]:
import numpy as np
import keras
from keras.models import Sequential, load_model
from keras.layers import LSTM, GRU, Dense, Input, Bidirectional, Dropout
from keras.layers.core import  Activation
import tensorflow as tf

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be opened through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the corpus from Google Drive, lower-case it and build the character vocabulary.
path = '/content/drive/MyDrive/Lecture 14  26042023-20230426/1661-0.txt'
# Read through a context manager so the file handle is closed as soon as the
# text is in memory, and pin the encoding so that decoding does not depend on
# the locale default of the machine running the notebook.
# NOTE(review): the printed sample appears to start with a byte-order mark;
# encoding='utf-8-sig' would drop it, but that also shrinks the vocabulary by
# one character -- confirm before changing.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))
text = text.replace("\n", " ")  # We remove newline chars for nicer display
print(f"text[:100]:{text[:100]}")
# Sorted list of the distinct characters; a character's position is its class index
chars = sorted(set(text))
print("Total chars:", len(chars))
# Lookup tables: character -> index and index -> character
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

corpus length: 581888
text[:100]:﻿ project gutenberg's the adventures of sherlock holmes, by arthur conan doyle  this ebook is for th
Total chars: 72


In [4]:
# Slice the corpus into overlapping windows of `maxlen` characters, advancing
# one character at a time; every window is paired with the character that
# immediately follows it.
maxlen = 40

window_starts = range(len(text) - maxlen)
sentences = [text[start:start + maxlen] for start in window_starts]  # input sequences
next_chars = [text[start + maxlen] for start in window_starts]  # target characters

print("Number of sequences:", len(sentences))

Number of sequences: 581848


In [5]:
# One-hot encode the windows (x) and their target characters (y).
num_sequences, num_chars = len(sentences), len(chars)
x = np.zeros((num_sequences, maxlen, num_chars), dtype=bool)
y = np.zeros((num_sequences, num_chars), dtype=bool)
for row, (window, target) in enumerate(zip(sentences, next_chars)):
    # Flag, for every time step of the window, the slot of the character seen there
    for step, symbol in enumerate(window):
        x[row, step, char_indices[symbol]] = True
    # Flag the slot of the character that follows the window
    y[row, char_indices[target]] = True


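The input tensor `x` has shape `(num_sequences, maxlen, num_chars)` and the target tensor `y` has shape `(num_sequences, num_chars)`, where `num_sequences` is the total number of sequences, `maxlen` is the length of each sequence, and `num_chars` is the total number of unique characters in the input text.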

In [6]:
# Sanity check: show the dimensions of the encoded inputs and targets.
print(x.shape, y.shape) 

(581848, 40, 72) (581848, 72)


These one-hot encoded inputs and targets can be fed into the RNN for training

### Build the model

> Model 1: a single LSTM layer

This model has a single LSTM layer and can capture long-term dependencies in the input text. However, it may not be able to capture complex patterns in the data that require multiple layers.

The model architecture is as follows:

- The input layer takes in a sequence of characters with a maximum length of "maxlen" and a one-hot encoding of "len(chars)".


- The LSTM layer has 128 units. LSTM is a type of recurrent neural network layer that is good at processing sequential data.

- The output layer is a Dense layer with "len(chars)" units and a softmax activation function, which will output a probability distribution over the possible characters in the dataset.


**Dropout** is a regularization technique used in neural networks to prevent overfitting. It works by randomly dropping out (i.e., setting to zero) a certain percentage of the input units of a layer during each training iteration.

The idea behind dropout is that by randomly dropping out input units, the network becomes less sensitive to the specific weights of individual neurons. This reduces the network's tendency to overfit by forcing it to learn more robust features that are useful across multiple neurons.

In [8]:
# Model 1: Dropout -> LSTM(128) -> Dropout -> softmax over the character vocabulary.
model = keras.Sequential(
    [
        # Each sample is a window of `maxlen` time steps, one-hot encoded over
        # the len(chars) unique characters of the input data
        Input(shape=(maxlen, len(chars))),
        Dropout(0.2),
        LSTM(128),  # number of units in the layer
        Dropout(0.2),
        Dense(len(chars), activation="softmax"),
    ]
)
# File name under which the weights of this model are stored
model_file = 'NCP_model.h5'

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                loss='categorical_crossentropy',
                metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout (Dropout)           (None, 40, 72)            0         
                                                                 
 lstm (LSTM)                 (None, 128)               102912    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 72)                9288      
                                                                 
Total params: 112,200
Trainable params: 112,200
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
# Train model 1, report its metrics and store the learned weights.
epochs = 50
batch_size = 128  # the model will process 128 examples at a time during each training iteration.
model.fit(x, y,
          batch_size=batch_size,
          epochs=epochs,
          shuffle=True)
# Evaluate the model. x and y are the very arrays used for fitting, so these
# are training-set metrics -- no held-out data is involved -- and they are
# labelled accordingly.
score = model.evaluate(x, y, batch_size=batch_size, verbose=1)
print('Training loss:', score[0])
print('Training accuracy:', score[1])

model.save_weights(model_file)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test loss: 1.3033686876296997
Test accuracy: 0.5990825891494751


> Model 2: multiple LSTM hidden layers

This model has multiple LSTM hidden layers and can capture more complex patterns in the input data. However, it may be slower to train and more prone to overfitting.

In [12]:
# Model 2: two stacked LSTM layers (256 -> 128 units), each followed by dropout,
# then a softmax over the character vocabulary.
model2 = keras.Sequential(
    [
        Input(shape=(maxlen, len(chars))),
        # return_sequences=True makes this layer output the entire sequence,
        # which is then used as input to the second LSTM layer.
        LSTM(256, return_sequences=True),
        Dropout(0.2),
        LSTM(128),  # second LSTM
        Dropout(0.2),
        Dense(len(chars), activation="softmax")
    ]
)
# File name under which the weights of this model are stored
model_file2 = 'NCP2_model.h5'
model2.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                loss='categorical_crossentropy',
                metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 40, 256)           336896    
                                                                 
 dropout_2 (Dropout)         (None, 40, 256)           0         
                                                                 
 lstm_2 (LSTM)               (None, 128)               197120    
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 72)                9288      
                                                                 
Total params: 543,304
Trainable params: 543,304
Non-trainable params: 0
_________________________________________________________________
None


The **softmax function** is used to convert the outputs of the RNN into probabilities that can be used to make predictions.
It takes a vector of values as input and outputs a vector of the same length, in which each element is the predicted probability of the corresponding class (here, each possible next character). The values in the output vector are between 0 and 1, and they sum to 1, ensuring that the output represents a valid probability distribution.

**Cross-entropy**: measures the difference between the predicted probability distribution and the true probability distribution of the target variable. 
During training, the goal is to **minimize** the cross-entropy loss over the training set. 

**Accuracy:** number of correct predictions / total number of predictions.
It is most useful when the predicted classes are balanced.


In [13]:
# Fit the stacked-LSTM model on the full encoded corpus, then store its weights.
epochs, batch_size = 50, 128
model2.fit(
    x,
    y,
    epochs=epochs,
    batch_size=batch_size,
    shuffle=True,
)
model2.save_weights(model_file2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# > Model 3: **bidirectional RNN**
A bidirectional RNN consists of two LSTMs: one that processes the input sequence in the forward direction, and another that processes it in the backward direction. The final outputs of the two LSTMs are concatenated (2 × 128 = 256 features here), and this concatenated vector is fed to a single dense softmax layer.

In [24]:
# Model 3: a bidirectional LSTM (128 units per direction) followed by a softmax
# over the character vocabulary.
model3 = keras.Sequential(
    [
        Input(shape=(maxlen, len(chars))),
        Bidirectional(LSTM(128)),
        Dense(len(chars), activation="softmax"),
    ]
)
# File name under which the weights of this model are stored
model_file3 = 'NCP3_model.h5'

# Adam minimizes the categorical cross-entropy loss (multi-class classification)
model3.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
                loss='categorical_crossentropy',
                metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model3.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_2 (Bidirectio  (None, 256)              205824    
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 72)                18504     
                                                                 
Total params: 224,328
Trainable params: 224,328
Non-trainable params: 0
_________________________________________________________________
None


**Adam** is an adaptive learning rate optimization algorithm, which means that it dynamically adjusts the learning rate of each weight parameter based on the gradient statistics computed during training.
Here it is used to minimize the categorical cross-entropy loss function, which is a common choice for multi-class classification problems.

In [25]:
# Train model 3, report its metrics and store the learned weights.
epochs = 50
batch_size = 128
model3.fit(x, y,
          batch_size=batch_size,
          epochs=epochs,
          shuffle=True)
# Evaluate the model. x and y are the very arrays used for fitting, so these
# are training-set metrics -- no held-out data is involved -- and they are
# labelled accordingly.
score = model3.evaluate(x, y, batch_size=batch_size, verbose=1)
print('Training loss:', score[0])
print('Training accuracy:', score[1])

model3.save_weights(model_file3)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test loss: 1.1243928670883179
Test accuracy: 0.6505255699157715


> Model 4: GRU

In [7]:
# Model 4: a single GRU layer with 256 units followed by a softmax over the
# character vocabulary.
model4 = keras.Sequential(
    [
        Input(shape=(maxlen, len(chars))),
        GRU(256),
        Dense(len(chars), activation="softmax"),
    ]
)
# File name under which the weights of this model are stored
model_file4 = 'NCP4_model.h5'
model4.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                loss='categorical_crossentropy',
                metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model4.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 256)               253440    
                                                                 
 dense (Dense)               (None, 72)                18504     
                                                                 
Total params: 271,944
Trainable params: 271,944
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
# Train model 4, report its metrics and store the learned weights.
epochs = 50
batch_size = 128
model4.fit(x, y,
          batch_size=batch_size,
          epochs=epochs,
          shuffle=True,
          verbose=1)
# Evaluate the model. x and y are the very arrays used for fitting, so these
# are training-set metrics -- no held-out data is involved -- and they are
# labelled accordingly.
score = model4.evaluate(x, y, batch_size=batch_size, verbose=1)
print('Training loss:', score[0])
print('Training accuracy:', score[1])

model4.save_weights(model_file4)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test loss: 0.9451407194137573
Test accuracy: 0.6967730522155762


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=374496d5-c170-4e50-80a7-05bb8c49dee7' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>