In [63]:
import numpy as np
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Embedding

from tensorflow.keras.utils import to_categorical


In [64]:

data = """Once there was a dog.\nOne day, he found a big juicy bone.
Dog immediately grabbed it between his mouth and took it home.
On his way home, he crossed a river.
That night, he went home hungry."""
print(data)

Once there was a dog.
One day, he found a big juicy bone.
Dog immediately grabbed it between his mouth and took it home.
On his way home, he crossed a river.
That night, he went home hungry.


In [65]:
data_splitted = data.split('\n')
data_splitted

['Once there was a dog.',
 'One day, he found a big juicy bone.',
 'Dog immediately grabbed it between his mouth and took it home.',
 'On his way home, he crossed a river.',
 'That night, he went home hungry.']

In [66]:
tokenizer = Tokenizer(filters='!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~')

# Initializing vocabulary
tokenizer.fit_on_texts(data_splitted)
print(tokenizer.word_index)

vocab_length = len(tokenizer.word_index) + 1
vocab_length

{'a': 1, 'he': 2, 'it': 3, 'his': 4, 'home': 5, 'once': 6, 'there': 7, 'was': 8, 'dog.': 9, 'one': 10, 'day': 11, 'found': 12, 'big': 13, 'juicy': 14, 'bone.': 15, 'dog': 16, 'immediately': 17, 'grabbed': 18, 'between': 19, 'mouth': 20, 'and': 21, 'took': 22, 'home.': 23, 'on': 24, 'way': 25, 'crossed': 26, 'river.': 27, 'that': 28, 'night': 29, 'went': 30, 'hungry.': 31}


32

In [67]:
sequences = tokenizer.texts_to_sequences(data_splitted)
sequences

[[6, 7, 8, 1, 9],
 [10, 11, 2, 12, 1, 13, 14, 15],
 [16, 17, 18, 3, 19, 4, 20, 21, 22, 3, 23],
 [24, 4, 25, 5, 2, 26, 1, 27],
 [28, 29, 2, 30, 5, 31]]

In [68]:
X = []
y = []

for i in range(len(sequences)):
  X.append(sequences[i][:-1])

y = sequences

print(X)
print(y)

[[6, 7, 8, 1], [10, 11, 2, 12, 1, 13, 14], [16, 17, 18, 3, 19, 4, 20, 21, 22, 3], [24, 4, 25, 5, 2, 26, 1], [28, 29, 2, 30, 5]]
[[6, 7, 8, 1, 9], [10, 11, 2, 12, 1, 13, 14, 15], [16, 17, 18, 3, 19, 4, 20, 21, 22, 3, 23], [24, 4, 25, 5, 2, 26, 1, 27], [28, 29, 2, 30, 5, 31]]


In [69]:
for x in X:
  x.insert(0,0)

for op in y:
  op.insert(0,0)

X
y

[[0, 6, 7, 8, 1, 9],
 [0, 10, 11, 2, 12, 1, 13, 14, 15],
 [0, 16, 17, 18, 3, 19, 4, 20, 21, 22, 3, 23],
 [0, 24, 4, 25, 5, 2, 26, 1, 27],
 [0, 28, 29, 2, 30, 5, 31]]

In [70]:
# finding the max length of input sequences
max_len = 0; 
for x in X:
  max_len = max(max_len,len(x))

max_len

11

In [71]:
X = pad_sequences(X,max_len,padding='pre')
X

array([[ 0,  0,  0,  0,  0,  0,  0,  6,  7,  8,  1],
       [ 0,  0,  0,  0, 10, 11,  2, 12,  1, 13, 14],
       [ 0, 16, 17, 18,  3, 19,  4, 20, 21, 22,  3],
       [ 0,  0,  0,  0, 24,  4, 25,  5,  2, 26,  1],
       [ 0,  0,  0,  0,  0,  0, 28, 29,  2, 30,  5]], dtype=int32)

In [72]:
y = pad_sequences(y,max_len,padding='pre')
y

array([[ 0,  0,  0,  0,  0,  0,  6,  7,  8,  1,  9],
       [ 0,  0,  0, 10, 11,  2, 12,  1, 13, 14, 15],
       [16, 17, 18,  3, 19,  4, 20, 21, 22,  3, 23],
       [ 0,  0,  0, 24,  4, 25,  5,  2, 26,  1, 27],
       [ 0,  0,  0,  0,  0, 28, 29,  2, 30,  5, 31]], dtype=int32)

In [73]:
y = to_categorical(y,num_classes = vocab_length)
y

array([[[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

    

In [74]:
y.shape


(5, 11, 32)

In [75]:
model = Sequential()
model.add(Embedding(input_dim=vocab_length, output_dim=10))
model.add(SimpleRNN(50,return_sequences=True))
model.add(Dense(units=vocab_length, activation='softmax'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 10)          320       
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, None, 50)          3050      
_________________________________________________________________
dense_2 (Dense)              (None, None, 32)          1632      
Total params: 5,002
Trainable params: 5,002
Non-trainable params: 0
_________________________________________________________________


In [76]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
model.fit(X,y,epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7f0df412a7d0>

In [77]:
def prob_of_sentence(model,tokenizer,sentence):

  # converting sentence into numerical form
  encoded_sentence = tokenizer.texts_to_sequences([sentence])[0]
  print(encoded_sentence)

  # adding 0 as X<0>
  encoded_sentence.insert(0,0)
  print(encoded_sentence)

  encoded_sentence = np.array(encoded_sentence).reshape((1,-1))
  print(encoded_sentence)

  prob = model.predict(encoded_sentence)
  print(prob.shape)

  probability = 1
  for i in range(0,prob.shape[1] - 1):
    probability *= prob[0,i,encoded_sentence[0,i+1]]
  print(probability)

In [78]:
prob_of_sentence(model,tokenizer,"fell down")

[]
[0]
[[0]]
(1, 1, 32)
1


ii) Sentence Generation

In [81]:
def sample_all_wo_seed(model,tokenizer,n_words,vocab_length):
  encoded_sentence = []
  inp_text = ''

  for i in range(n_words):
    print('-'*50)
    print('Input text : ', inp_text)

    # converting sentence into numerical form
    encoded_sentence = tokenizer.texts_to_sequences([inp_text])[0]

    # adding 0 as X<0>
    encoded_sentence.insert(0,0)
    
    encoded_sentence = np.array(encoded_sentence).reshape((1,-1))
    print("For i : {} Encoded is : {}".format(i, encoded_sentence))

    if i == 0:
      prob = model.predict(encoded_sentence, verbose= 0)
      y_hat = 0
      while y_hat == 0:
        y_hat = np.random.choice(range(vocab_length),p=prob.ravel())
        y_hat = np.array(y_hat).reshape((1,-1))
      print("For i : {} yhat in if is : {}".format(i, y_hat))

    else:
      prob = model.predict(encoded_sentence, verbose= 0)
      print(prob.shape)
      y_hat = np.append(y_hat,0)
      y_hat = np.array(y_hat).reshape((1,-1))

      while y_hat[0][i] == 0:
        y_hat[0][i] = np.random.choice(range(vocab_length),p=prob[0][i].ravel())
      print("For i : {} yhat in else is : {}".format(i, y_hat))

    output_word = ""
    for word, index in tokenizer.word_index.items():
      if index == y_hat[0][i]:
        output_word = word
        break
    inp_text += output_word + ' '

    print('-'*50)

  return inp_text
print('\n\n' + color.BOLD + sample_all_wo_seed(model,tokenizer,3,vocab_length) + color.END)


--------------------------------------------------
Input text :  
For i : 0 Encoded is : [[0]]
For i : 0 yhat in if is : [[16]]
--------------------------------------------------
--------------------------------------------------
Input text :  dog 
For i : 1 Encoded is : [[ 0 16]]
(1, 2, 32)
For i : 1 yhat in else is : [[16 17]]
--------------------------------------------------
--------------------------------------------------
Input text :  dog immediately 
For i : 2 Encoded is : [[ 0 16 17]]
(1, 3, 32)
For i : 2 yhat in else is : [[16 17 18]]
--------------------------------------------------


[1mdog immediately grabbed [0m


In [87]:
def sample_all_wo_seed_with_hp(model,tokenizer,n_words,vocab_length):
  encoded_sentence = []
  inp_text = ''

  for i in range(n_words):
    print('-'*50)
    print('Input text : ', inp_text)

    # converting sentence into numerical form
    encoded_sentence = tokenizer.texts_to_sequences([inp_text])[0]

    # adding 0 as X<0>
    encoded_sentence.insert(0,0)
    
    encoded_sentence = np.array(encoded_sentence).reshape((1,-1))
    print("For i : {} Encoded is : {}".format(i, encoded_sentence))

    if i == 0:
      prob = model.predict(encoded_sentence, verbose= 0)
      y_hat = 0
      while y_hat == 0:
        y_hat = np.random.choice(range(vocab_length),p=prob.ravel())
        y_hat = np.array(y_hat).reshape((1,-1))
      print("For i : {} yhat in if is : {}".format(i, y_hat))

    else:
      prob = model.predict(encoded_sentence, verbose= 0)
      print(prob.shape)
      y_hat = np.append(y_hat,0)
      y_hat = np.array(y_hat).reshape((1,-1))

      # while y_hat[0][i] == 0:
      
      y_hat[0][i] = np.argmax(prob[0][i].ravel()[1:] , axis=0)
      print("For i : {} yhat in else is : {}".format(i, y_hat))

    output_word = ""
    for word, index in tokenizer.word_index.items():
      if index == y_hat[0][i]:
        output_word = word
        break
    inp_text += output_word + ' '

    print('-'*50)

  return inp_text
print('\n\n' + color.BOLD + sample_all_wo_seed_with_hp(model,tokenizer,5,vocab_length) + color.END)


--------------------------------------------------
Input text :  
For i : 0 Encoded is : [[0]]
For i : 0 yhat in if is : [[2]]
--------------------------------------------------
--------------------------------------------------
Input text :  he 
For i : 1 Encoded is : [[0 2]]
(1, 2, 32)
For i : 1 yhat in else is : [[ 2 16]]
--------------------------------------------------
--------------------------------------------------
Input text :  he dog 
For i : 2 Encoded is : [[ 0  2 16]]
(1, 3, 32)
For i : 2 yhat in else is : [[ 2 16 17]]
--------------------------------------------------
--------------------------------------------------
Input text :  he dog immediately 
For i : 3 Encoded is : [[ 0  2 16 17]]
(1, 4, 32)
For i : 3 yhat in else is : [[ 2 16 17  2]]
--------------------------------------------------
--------------------------------------------------
Input text :  he dog immediately he 
For i : 4 Encoded is : [[ 0  2 16 17  2]]
(1, 5, 32)
For i : 4 yhat in else is : [[ 2 16 17 