# CSC8637 - Deep Learning - Task 3 - Language Model

### A character-level language model trained on the text of the book “Poirot Investigates” by Agatha Christie (code run on Google Colab)



In [1]:
# Mount Google Drive (uncomment the lines below when running on Google Colab)
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import modules
import os
import nltk
import tensorflow as tf
import numpy as np
import pandas
from tensorflow.keras.layers.experimental import preprocessing

In [3]:
# Load the ebook (format:txt,online)
text_data = tf.keras.utils.get_file('61262-0.txt', 'https://www.gutenberg.org/files/61262/61262-0.txt')

# Read and decode the file. The context manager closes the file handle as soon
# as the bytes are read instead of leaving it open until garbage collection.
# Binary mode plus an explicit UTF-8 decode keeps the text exactly as stored
# (no newline translation), so the vocabulary below is unchanged.
with open(text_data, 'rb') as ebook_file:
    book = ebook_file.read().decode(encoding='utf-8')

# Print the length of text
print('The ebook has {} characters'.format(len(book)))

# Print the unique characters
vocab = sorted(set(book))
print('There are {} unique characters'.format(len(vocab)))

Downloading data from https://www.gutenberg.org/files/61262/61262-0.txt
The ebook has 324542 characters
There are 105 unique characters


In [4]:
# Preview the first 524 characters of the downloaded text
header_preview = book[:524]
print(header_preview)

The Project Gutenberg EBook of Poirot Investigates, by Agatha Christie

This eBook is for the use of anyone anywhere in the United States and most
other parts of the world at no cost and with almost no restrictions
whatsoever.  You may copy it, give it away or re-use it under the terms of
the Project Gutenberg License included with this eBook or online at
www.gutenberg.org.  If you are not located in the United States, you'll have
to check the laws of the country where you are located before using this ebook.



In [5]:
# Lookup table: character -> integer id (ids follow the sorted vocabulary order)
chartoindex = dict(zip(vocab, range(len(vocab))))
# The whole book encoded as an array of integer ids
int_text = np.array([chartoindex[symbol] for symbol in book])

# Reverse lookup: integer id -> character (an array, so it can be indexed with id arrays)
indextochar = np.array(vocab)

In [6]:
# Show the complete character -> id table. Iterating the mapping itself (rather
# than zipping against a hard-coded range(105)) keeps the listing complete even
# if the vocabulary size changes.
print("Character to index: \n")
for char, index in chartoindex.items():
    print('  {:4s}: {:3d}'.format(repr(char), index))

# Show how the first 20 characters of the book map to their integer ids
print("\n Text to integer: \n")
print('{} to {}'.format(repr(book[:20]),int_text[:20]))

Character to index: 

  '\n':   0
  '\r':   1
  ' ' :   2
  '!' :   3
  '"' :   4
  '#' :   5
  '$' :   6
  '%' :   7
  '&' :   8
  "'" :   9
  '(' :  10
  ')' :  11
  '*' :  12
  ',' :  13
  '-' :  14
  '.' :  15
  '/' :  16
  '0' :  17
  '1' :  18
  '2' :  19
  '3' :  20
  '4' :  21
  '5' :  22
  '6' :  23
  '7' :  24
  '8' :  25
  '9' :  26
  ':' :  27
  ';' :  28
  '?' :  29
  '@' :  30
  'A' :  31
  'B' :  32
  'C' :  33
  'D' :  34
  'E' :  35
  'F' :  36
  'G' :  37
  'H' :  38
  'I' :  39
  'J' :  40
  'K' :  41
  'L' :  42
  'M' :  43
  'N' :  44
  'O' :  45
  'P' :  46
  'Q' :  47
  'R' :  48
  'S' :  49
  'T' :  50
  'U' :  51
  'V' :  52
  'W' :  53
  'X' :  54
  'Y' :  55
  'Z' :  56
  '[' :  57
  ']' :  58
  '_' :  59
  'a' :  60
  'b' :  61
  'c' :  62
  'd' :  63
  'e' :  64
  'f' :  65
  'g' :  66
  'h' :  67
  'i' :  68
  'j' :  69
  'k' :  70
  'l' :  71
  'm' :  72
  'n' :  73
  'o' :  74
  'p' :  75
  'q' :  76
  'r' :  77
  's' :  78
  't' :  79
  'u' :  80
  'v' 

In [7]:
# Number of characters fed to the model as one input sequence
length = 80
# NOTE(review): despite the name, this holds the total character count of the
# book, not the number of training sequences.
examples_per_epoch = len(book)

# Dataset that yields the encoded text one character id at a time
char_dt = tf.data.Dataset.from_tensor_slices(int_text)

In [8]:
# Group the character stream into chunks of length + 1 ids (one extra id so that
# input and target can be shifted by one step); an incomplete final chunk is dropped.
sequences = char_dt.batch(batch_size=length + 1, drop_remainder=True)

In [9]:
# Create input-target pairs
def input_target_pairs(k):
    """Split one sequence into an (input, target) pair offset by a single step.

    Args:
        k: A sliceable sequence (e.g. a chunk of character ids).

    Returns:
        A tuple ``(input_text, target_text)`` where ``input_text`` is ``k``
        without its last element and ``target_text`` is ``k`` without its
        first element, so ``target_text[i]`` is the item following
        ``input_text[i]``.
    """
    return k[:-1], k[1:]

# Turn every chunk into an (input, target) pair
data_text = sequences.map(map_func=input_target_pairs)

In [10]:
# Training batch size and the size of the shuffle buffer
batch_size = 128
buffer_size = 10000

# Shuffle the (input, target) pairs, then group them into fixed-size batches;
# an incomplete final batch is dropped.
dataset = (
    data_text
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)
dataset

<BatchDataset element_spec=(TensorSpec(shape=(128, 80), dtype=tf.int64, name=None), TensorSpec(shape=(128, 80), dtype=tf.int64, name=None))>

In [11]:
# Decode and show the first 13 elements of the character stream
print("Character_Stream: \n")
for char_id in char_dt.take(13):
    print(indextochar[char_id.numpy()])

# Decode and show the first 13 chunks as Python string literals
print("\nSequence: \n")
for chunk_ids in sequences.take(13):
    print(repr(''.join(indextochar[chunk_ids.numpy()])))

Character_Stream: 

﻿
T
h
e
 
P
r
o
j
e
c
t
 

Sequence: 

'\ufeffThe Project Gutenberg EBook of Poirot Investigates, by Agatha Christie\r\n\r\nThis e'
'Book is for the use of anyone anywhere in the United States and most\r\nother parts'
' of the world at no cost and with almost no restrictions\r\nwhatsoever.  You may co'
'py it, give it away or re-use it under the terms of\r\nthe Project Gutenberg Licens'
'e included with this eBook or online at\r\nwww.gutenberg.org.  If you are not locat'
"ed in the United States, you'll have\r\nto check the laws of the country where you "
'are located before using this ebook.\r\n\r\nTitle: Poirot Investigates\r\n\r\nAuthor: Aga'
'tha Christie\r\n\r\nRelease Date: January 28, 2020 [EBook #61262]\r\n\r\nLanguage: Englis'
'h\r\n\r\nCharacter set encoding: UTF-8\r\n\r\n*** START OF THIS PROJECT GUTENBERG EBOOK P'
'OIROT INVESTIGATES ***\r\n\r\n\r\n\r\n\r\nProduced by an anonymous Project Gutenberg volunt'
'eer.\r\n\r\n\r\n\r\n\r\n\r\n\r\n  POIROT INVEST

In [12]:
# Create the lstm model
def model_lstm(vocabul_size, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> LSTM -> Dense character model.

    Args:
        vocabul_size: Number of distinct characters; used as the embedding
            input size and as the width of the final Dense layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Fixed batch size declared in the input shape.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The final Dense layer has
        no activation, so the model emits one raw score per vocabulary entry
        at every sequence position.
    """
    # The sequence length is left as None so inputs of any length are accepted.
    embedding_layer = tf.keras.layers.Embedding(
        vocabul_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # return_sequences=True yields an output for every time step, not just the last.
    recurrent_layer = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
    )
    output_layer = tf.keras.layers.Dense(vocabul_size)
    return tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])

In [13]:
# Hyperparameters of the training model
batch_size = 128
vocabul_size = len(vocab)
embedding_dim = 256
rnn_units = 1024

# Instantiate the network used for training
lstm_txt_model = model_lstm(
    vocabul_size=vocabul_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=batch_size,
)

In [14]:
# Run one batch through the model and check the output shape:
# (batch size, sequence length, vocabulary size)
for input_example_batch, target_example_batch in dataset.take(1):
    prediction = lstm_txt_model(input_example_batch)
    expected_shape = (batch_size, length, vocabul_size)
    assert prediction.shape == expected_shape
    print(prediction.shape)

(128, 80, 105)


In [16]:
# Loss function
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw scores.

    Args:
        labels: Integer class ids (the target characters).
        logits: Unnormalised model outputs; ``from_logits=True`` makes the loss
            apply the softmax itself.

    Returns:
        The loss tensor returned by
        ``tf.keras.losses.sparse_categorical_crossentropy``.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

# Evaluate the loss on the example batch (the model has not been trained yet)
example_loss = loss(target_example_batch, prediction)
print("Prediction shape: ", prediction.shape)
print("Loss:      ", np.mean(example_loss.numpy()))

Prediction shape:  (128, 80, 105)
Loss:       4.654209


In [17]:
# Attach the Adam optimizer and the custom loss to the training model
lstm_txt_model.compile(loss=loss, optimizer='adam')

In [18]:
# Model checkpoints
#lstm_dir_checkpoints = 'lstm_checkpoints'
#checkpoint_prefix = os.path.join(lstm_dir_checkpoints, "checkpt_{epoch}") 
#checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,save_weights_only=True)

In [19]:
#history = lstm_txt_model.fit(dataset, epochs=180, callbacks=[checkpoint_callback]) # train the model for 6 times,55 epochs each 

Epoch 1/180
Epoch 2/180
Epoch 3/180
Epoch 4/180
Epoch 5/180
Epoch 6/180
Epoch 7/180
Epoch 8/180
Epoch 9/180
Epoch 10/180
Epoch 11/180
Epoch 12/180
Epoch 13/180
Epoch 14/180
Epoch 15/180
Epoch 16/180
Epoch 17/180
Epoch 18/180
Epoch 19/180
Epoch 20/180
Epoch 21/180
Epoch 22/180
Epoch 23/180
Epoch 24/180
Epoch 25/180
Epoch 26/180
Epoch 27/180
Epoch 28/180
Epoch 29/180
Epoch 30/180
Epoch 31/180
Epoch 32/180
Epoch 33/180
Epoch 34/180
Epoch 35/180
Epoch 36/180
Epoch 37/180
Epoch 38/180
Epoch 39/180
Epoch 40/180
Epoch 41/180
Epoch 42/180
Epoch 43/180
Epoch 44/180
Epoch 45/180
Epoch 46/180
Epoch 47/180
Epoch 48/180
Epoch 49/180
Epoch 50/180
Epoch 51/180
Epoch 52/180
Epoch 53/180
Epoch 54/180
Epoch 55/180
Epoch 56/180
Epoch 57/180
Epoch 58/180
Epoch 59/180
Epoch 60/180
Epoch 61/180
Epoch 62/180
Epoch 63/180
Epoch 64/180
Epoch 65/180
Epoch 66/180
Epoch 67/180
Epoch 68/180
Epoch 69/180
Epoch 70/180
Epoch 71/180
Epoch 72/180
Epoch 73/180
Epoch 74/180
Epoch 75/180
Epoch 76/180
Epoch 77/180
Epoch 78

In [20]:
# Rebuild the network with a batch size of 1 for text generation and restore
# the trained weights from the epoch-179 checkpoint (the one the author chose
# for its lower loss).
lstm_mod = model_lstm(
    vocabul_size=vocabul_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1,
)
# expect_partial() silences warnings about checkpoint values that are not restored
lstm_mod.load_weights('/content/lstm_checkpoints/checkpt_179').expect_partial()
lstm_mod.build(tf.TensorShape([1, None]))

In [21]:
# Persist the generation model as a single HDF5 file
lstm_mod.save(filepath='/content/lstm_checkpoints.h5')



In [22]:
# Function to generate text
def generate_text(model, input_string, num_generate=50, temperature=0.5):
    """Generate text by sampling characters from `model`, seeded with `input_string`.

    The seed is encoded with the notebook-level `chartoindex` mapping and fed
    to the model in one call; afterwards each sampled character is fed back as
    the next single-character input.

    Args:
        model: Callable model that accepts a batch of size 1, returns
            per-position scores over the vocabulary, and exposes
            `reset_states()`.
        input_string: Non-empty seed text. Every character must be a key of
            `chartoindex`.
        num_generate: Number of characters to generate. Defaults to 50, the
            value that was previously hard-coded.
        temperature: Positive value the scores are divided by before sampling.
            Defaults to 0.5, the value that was previously hard-coded.

    Returns:
        `input_string` followed by the `num_generate` sampled characters.

    Raises:
        ValueError: If `input_string` is empty, `temperature` is not positive,
            or `num_generate` is negative.
        KeyError: If `input_string` contains a character that is not in
            `chartoindex`.
    """
    # Validate up front so bad input fails with a clear message instead of an
    # obscure error from inside the model or the sampler.
    if not input_string:
        raise ValueError("input_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if num_generate < 0:
        raise ValueError("num_generate must be non-negative")

    unknown_chars = sorted({s for s in input_string if s not in chartoindex})
    if unknown_chars:
        raise KeyError(
            "characters not in the vocabulary: {!r}".format(unknown_chars))

    # Text to indexes, with a leading batch dimension of 1
    for_input = tf.expand_dims([chartoindex[s] for s in input_string], 0)

    # Result with predicted characters
    text_result = []

    # Clear any recurrent state left over from a previous call
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(for_input)
        # Remove the dimension from batch
        predictions = tf.squeeze(predictions, 0)

        # Scale the scores, then sample from the resulting categorical
        # distribution; only the sample for the last position is used.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Pass the predicted character back in as the next input
        for_input = tf.expand_dims([predicted_id], 0)
        text_result.append(indextochar[predicted_id])

    return input_string + ''.join(text_result)

In [23]:
# Ask the user for a seed text
lstm_pred = input("Enter your text: ")
# Continue the seed with the restored model and show the result
generated_text = generate_text(lstm_mod, input_string=lstm_pred)
print(generated_text)

Enter your text: This telegram has
This telegram has been a man. His right hand still grasped the base
