<a href="https://colab.research.google.com/github/urvashiramdasani/ML-DL-Python/blob/master/DL/18BCE247_DL7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Name : Urvashi Ramdasani

Division : EL3

Aim : Build a language model using RNN. Write functions to sample novel sentences and find the probability of input sentence.

In [17]:
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Embedding
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [18]:
data = """ Jack and Jill went up the hill .\n To fetch a pail of water .\n Jack fell down and broke his crown .\n And Jill came tumbling after .""".split("\n")
print(data, type(data))

[' Jack and Jill went up the hill .', ' To fetch a pail of water .', ' Jack fell down and broke his crown .', ' And Jill came tumbling after .'] <class 'list'>


In [19]:
tokenizer = Tokenizer(filters = '!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n') # all special characters except period

In [20]:
tokenizer.fit_on_texts(data)
vocabulary = tokenizer.word_index
print('Word Indices : ', vocabulary)

Word Indices :  {'.': 1, 'and': 2, 'jack': 3, 'jill': 4, 'went': 5, 'up': 6, 'the': 7, 'hill': 8, 'to': 9, 'fetch': 10, 'a': 11, 'pail': 12, 'of': 13, 'water': 14, 'fell': 15, 'down': 16, 'broke': 17, 'his': 18, 'crown': 19, 'came': 20, 'tumbling': 21, 'after': 22}


In [21]:
vocab_size = len(vocabulary) + 1
print('Vocabulary Size: ', vocab_size)

Vocabulary Size:  23


In [22]:
sequences = tokenizer.texts_to_sequences(data)
print(sequences)

[[3, 2, 4, 5, 6, 7, 8, 1], [9, 10, 11, 12, 13, 14, 1], [3, 15, 16, 2, 17, 18, 19, 1], [2, 4, 20, 21, 22, 1]]


In [23]:
# Generating lists with and without period

x = list()
y = list()

for i in range(len(sequences)):
  x.insert(i, sequences[i][:-1])
  y.insert(i, sequences[i])

print('List x = ', x)
print('List y = ', y)

List x =  [[3, 2, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14], [3, 15, 16, 2, 17, 18, 19], [2, 4, 20, 21, 22]]
List y =  [[3, 2, 4, 5, 6, 7, 8, 1], [9, 10, 11, 12, 13, 14, 1], [3, 15, 16, 2, 17, 18, 19, 1], [2, 4, 20, 21, 22, 1]]


In [24]:
max_len = max([len(sequence) for sequence in x])
print('Maximum length sequence : ', max_len)

Maximum length sequence :  7


In [25]:
x_padded = pad_sequences(x, maxlen = max_len, padding = 'pre')
print(x_padded)

[[ 3  2  4  5  6  7  8]
 [ 0  9 10 11 12 13 14]
 [ 3 15 16  2 17 18 19]
 [ 0  0  2  4 20 21 22]]


In [26]:
y_padded = pad_sequences(y, maxlen = max_len, padding = 'post')
print(y_padded)

[[ 2  4  5  6  7  8  1]
 [ 9 10 11 12 13 14  1]
 [15 16  2 17 18 19  1]
 [ 2  4 20 21 22  1  0]]


In [27]:
y_padded = to_categorical(y_padded, num_classes = vocab_size)
print(y_padded)

[[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [28]:
# Define the model

model = Sequential()
model.add(Embedding(input_dim = vocab_size, output_dim = 10))
model.add(SimpleRNN(units = 50, return_sequences = True))
model.add(Dense(units = vocab_size, activation = 'softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 10)          230       
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, None, 50)          3050      
_________________________________________________________________
dense_1 (Dense)              (None, None, 23)          1173      
Total params: 4,453
Trainable params: 4,453
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Compile the model

model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [30]:
# Fit the model

model.fit(x_padded, y_padded, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f07d4078750>

In [31]:
def prob_input_sentence(model, tokenizer, sentence):
  encoded = tokenizer.texts_to_sequences([sentence])[0]
  encoded.insert(0, 0)
  encoded = np.array(encoded)
  encoded = np.reshape(encoded, newshape = (1, -1))
  prob = model.predict_proba(encoded, verbose = 0)
  probability = 1
  for i in range(prob.shape[1] - 1):
    probability *= prob[0, i, encoded[0, i + 1]]
  print('The probability of sentence ', sentence, ' is ', probability)

In [32]:
prob_input_sentence(model, tokenizer, 'Jack and Jill went up')
prob_input_sentence(model, tokenizer, 'and Jill went up')
prob_input_sentence(model, tokenizer, 'went up the hill')
prob_input_sentence(model, tokenizer, 'Jack and Jill went up the hill')
prob_input_sentence(model, tokenizer, 'to fetch a pail')
prob_input_sentence(model, tokenizer, 'fetch a pail')
prob_input_sentence(model, tokenizer, 'to fetch a pail of water')
prob_input_sentence(model, tokenizer, 'Jack fell down and')
prob_input_sentence(model, tokenizer, 'Jack fell down and broke')
prob_input_sentence(model, tokenizer, 'fell down and broke')
prob_input_sentence(model, tokenizer, 'Jack fell down and broke his crown')
prob_input_sentence(model, tokenizer, 'and Jill came tumbling')
prob_input_sentence(model, tokenizer, 'Jill came tumbling after')
prob_input_sentence(model, tokenizer, 'and Jill came tumbling after')



The probability of sentence  Jack and Jill went up  is  4.046618947431747e-08
The probability of sentence  and Jill went up  is  0.0006520378597691507
The probability of sentence  went up the hill  is  1.302733400251648e-05
The probability of sentence  Jack and Jill went up the hill  is  2.0313087823372113e-10
The probability of sentence  to fetch a pail  is  0.014515784069070246
The probability of sentence  fetch a pail  is  2.0629078529688497e-06
The probability of sentence  to fetch a pail of water  is  0.009894810556752778
The probability of sentence  Jack fell down and  is  2.299133628315368e-06
The probability of sentence  Jack fell down and broke  is  1.3553367788220498e-07
The probability of sentence  fell down and broke  is  0.0001451889069790434
The probability of sentence  Jack fell down and broke his crown  is  6.122907999162922e-09
The probability of sentence  and Jill came tumbling  is  0.0020658467480818045
The probability of sentence  Jill came tumbling after  is  2.722

## Word Level Language Model

In [1]:
from tensorflow import keras
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Embedding
from keras.preprocessing.sequence import pad_sequences

In [2]:
data = """ Jack and Jill went up the hill .\n To fetch a pail of water .\n Jack fell down and broke his crown .\n And Jill came tumbling after .""".split('\n')
print(data, type(data))

[' Jack and Jill went up the hill .', ' To fetch a pail of water .', ' Jack fell down and broke his crown .', ' And Jill came tumbling after .'] <class 'list'>


In [3]:
tokenizer = Tokenizer(filters = '!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n')

In [4]:
tokenizer.fit_on_texts(data)
vocabulary = tokenizer.word_index
print('Word Indices : ', vocabulary)

Word Indices :  {'.': 1, 'and': 2, 'jack': 3, 'jill': 4, 'went': 5, 'up': 6, 'the': 7, 'hill': 8, 'to': 9, 'fetch': 10, 'a': 11, 'pail': 12, 'of': 13, 'water': 14, 'fell': 15, 'down': 16, 'broke': 17, 'his': 18, 'crown': 19, 'came': 20, 'tumbling': 21, 'after': 22}


In [5]:
vocab_size = len(vocabulary) + 1
print('Vocabulary Size: ', vocab_size)

Vocabulary Size:  23


In [6]:
sequences = tokenizer.texts_to_sequences(data)
print(sequences)

[[3, 2, 4, 5, 6, 7, 8, 1], [9, 10, 11, 12, 13, 14, 1], [3, 15, 16, 2, 17, 18, 19, 1], [2, 4, 20, 21, 22, 1]]


In [7]:
# Generating lists with and without period

x = list()
y = list()

for i in range(len(sequences)):
  x.insert(i, sequences[i][:-1])
  y.insert(i, sequences[i])

print('List x = ', x)
print('List y = ', y)

List x =  [[3, 2, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14], [3, 15, 16, 2, 17, 18, 19], [2, 4, 20, 21, 22]]
List y =  [[3, 2, 4, 5, 6, 7, 8, 1], [9, 10, 11, 12, 13, 14, 1], [3, 15, 16, 2, 17, 18, 19, 1], [2, 4, 20, 21, 22, 1]]


In [8]:
max_len = max([len(sequence) for sequence in x])
print('Maximum length sequence : ', max_len)

Maximum length sequence :  7


In [9]:
x_padded = pad_sequences(x, maxlen = max_len, padding = 'pre')
print(x_padded)

[[ 3  2  4  5  6  7  8]
 [ 0  9 10 11 12 13 14]
 [ 3 15 16  2 17 18 19]
 [ 0  0  2  4 20 21 22]]


In [10]:
y_padded = pad_sequences(y, maxlen = max_len, padding = 'post')
print(y_padded)

[[ 2  4  5  6  7  8  1]
 [ 9 10 11 12 13 14  1]
 [15 16  2 17 18 19  1]
 [ 2  4 20 21 22  1  0]]


In [11]:
y_padded = to_categorical(y_padded, num_classes = vocab_size)
print(y_padded)

[[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [12]:
# Define the model

model = Sequential()
model.add(Embedding(input_dim = vocab_size, output_dim = 10))
model.add(SimpleRNN(units = 50, return_sequences = True))
model.add(Dense(units = vocab_size, activation = 'softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 10)          230       
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, None, 50)          3050      
_________________________________________________________________
dense (Dense)                (None, None, 23)          1173      
Total params: 4,453
Trainable params: 4,453
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Compile the model

model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [14]:
# Fit the model

model.fit(x_padded, y_padded, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f07dbbd4590>

In [15]:
def sample_seq_wo_seed(model, tokenizer, n_words, vocab_size):
  encoded = []
  in_text = ''
  for i in range(n_words):
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    encoded.insert(0, 0)
    encoded = np.array(encoded)
    encoded = np.reshape(encoded, newshape = (1, -1))

    if i == 0:
      prob = model.predict_proba(encoded, verbose = 0)
      yhat = 0

      while yhat == 0:
        yhat = np.random.choice(range(vocab_size), p = prob.ravel())
      
      yhat = [yhat]
      yhat = np.array(yhat)
      yhat = np.reshape(yhat, newshape = (1, -1))
    
    else:
      yhat = np.append(yhat, 0)
      yhat = np.reshape(yhat, newshape = (1, -1))

      while yhat[0, i] == 0:
        yhat = model.predict_classes(encoded, verbose = 0)
    
    out_word = ''
    for word, index in tokenizer.word_index.items():
      if index == yhat[0, i]:
        out_word = word
        break
      
    in_text = in_text + out_word + ' '
  return in_text

In [16]:
print(sample_seq_wo_seed(model, tokenizer, 8, vocab_size))



to jill a pail of water . hill 
