#LSTM for next word prediction


In [57]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer





In [58]:

# Training corpus: a short cricket match report. It is the only text the
# tokenizer is fitted on and the only source of training sequences.
text = """The US set a 19-run target after their super over but the Green Shirts could only manage 13.

The teams went to the super over after the home side levelled the score in the last ball of the 20th over.

The US earlier began cautiously against the Green Team as Pakistan got dispirited by the home side’s start.Naseem Shah dismissed Steven Taylor on the first ball of his second over before Andries Gous’s edge on the next delivery dropped short of Iftikhar Ahmed in slips and went to the covers for four.

Gous smashed Naseem for another boundary and ended the power play at 44 for the loss of one wicket.

Pakistan kept searching for their second wicket as Babar Azam brought on Shadab Khan but Gous and Monank Patel comfortably kept the score ticking for their side.

Earlier, Pakistan were held to a total of 159-7 as the co-hosts eyed an upset win.

The US chased down a target of 195 to beat fellow non-Test nation Canada in their first Group A encounter but were now up against a far superior bowling attack.

Nevertheless, the US could be proud of their efforts in the field against Pakistan given they reduced the 2009 T20 World Cup winners to 26-3 after winning the toss.

Left-arm spinner Nosthush Kenjige, who opened the bowling, took 3-30 — including two wickets in two balls — from his maximum four overs and left-arm paceman Saurabh Netravalkar a miserly 2-18.

Pakistan were in dire straits at 26-3 inside five overs before a partnership of 72 between skipper Babar (44) and Shadab (40).

Shadab and Azam Khan fell in successive balls to Kenjige.

Pakistan, runners-up to England at the last T20 World Cup in Australia two years ago, were faltering again at 98-5 before tailender Shaheen Shah Afridi’s unbeaten 23 boosted the total.

Earlier, Mohammad Rizwan was superbly caught one-handed at slip by Steven Taylor off Netravalkar before Usman Khan holed out off Kenjige to leave Pakistan 14-2."""



In [59]:
# Fit a Keras tokenizer on the corpus so that every distinct word gets an integer id.
tokenized_text = Tokenizer()
tokenized_text.fit_on_texts([text])

# Vocabulary size used for the embedding and output layers: one slot per word
# in word_index plus one extra slot for index 0.
vocabulary = tokenized_text.word_index
total_words = 1 + len(vocabulary)

In [60]:
# Inspect the word -> integer id mapping learned by the tokenizer.
tokenized_text.word_index

{'the': 1,
 'to': 2,
 'in': 3,
 'of': 4,
 'a': 5,
 'pakistan': 6,
 'and': 7,
 'their': 8,
 'for': 9,
 'at': 10,
 'us': 11,
 'over': 12,
 'before': 13,
 'were': 14,
 'after': 15,
 'but': 16,
 'earlier': 17,
 'against': 18,
 'as': 19,
 'on': 20,
 'shadab': 21,
 'khan': 22,
 '3': 23,
 'kenjige': 24,
 'two': 25,
 'target': 26,
 'super': 27,
 'green': 28,
 'could': 29,
 'went': 30,
 'home': 31,
 'side': 32,
 'score': 33,
 'last': 34,
 'ball': 35,
 'by': 36,
 'naseem': 37,
 'shah': 38,
 'steven': 39,
 'taylor': 40,
 'first': 41,
 'his': 42,
 'second': 43,
 'four': 44,
 'gous': 45,
 '44': 46,
 'one': 47,
 'wicket': 48,
 'kept': 49,
 'babar': 50,
 'azam': 51,
 'total': 52,
 'up': 53,
 'bowling': 54,
 't20': 55,
 'world': 56,
 'cup': 57,
 '26': 58,
 'left': 59,
 'arm': 60,
 '—': 61,
 'balls': 62,
 'overs': 63,
 'netravalkar': 64,
 '2': 65,
 'off': 66,
 'set': 67,
 '19': 68,
 'run': 69,
 'shirts': 70,
 'only': 71,
 'manage': 72,
 '13': 73,
 'teams': 74,
 'levelled': 75,
 '20th': 76,
 'began': 77

In [61]:
# Build the training examples for next-word prediction.
# Each line of the corpus is converted to token ids and expanded into all of
# its prefixes of length >= 2: the first word predicts the second, the first
# two words predict the third, and so on. The last id of every prefix is the
# target; the ids before it are the context.
training_data = []
for line in text.split('\n'):
    line_tokens = tokenized_text.texts_to_sequences([line])[0]
    for end in range(2, len(line_tokens) + 1):
        training_data.append(line_tokens[:end])

# Show every generated prefix, one per row.
for prefix in training_data:
    print(prefix)

[1, 11]
[1, 11, 67]
[1, 11, 67, 5]
[1, 11, 67, 5, 68]
[1, 11, 67, 5, 68, 69]
[1, 11, 67, 5, 68, 69, 26]
[1, 11, 67, 5, 68, 69, 26, 15]
[1, 11, 67, 5, 68, 69, 26, 15, 8]
[1, 11, 67, 5, 68, 69, 26, 15, 8, 27]
[1, 11, 67, 5, 68, 69, 26, 15, 8, 27, 12]
[1, 11, 67, 5, 68, 69, 26, 15, 8, 27, 12, 16]
[1, 11, 67, 5, 68, 69, 26, 15, 8, 27, 12, 16, 1]
[1, 11, 67, 5, 68, 69, 26, 15, 8, 27, 12, 16, 1, 28]
[1, 11, 67, 5, 68, 69, 26, 15, 8, 27, 12, 16, 1, 28, 70]
[1, 11, 67, 5, 68, 69, 26, 15, 8, 27, 12, 16, 1, 28, 70, 29]
[1, 11, 67, 5, 68, 69, 26, 15, 8, 27, 12, 16, 1, 28, 70, 29, 71]
[1, 11, 67, 5, 68, 69, 26, 15, 8, 27, 12, 16, 1, 28, 70, 29, 71, 72]
[1, 11, 67, 5, 68, 69, 26, 15, 8, 27, 12, 16, 1, 28, 70, 29, 71, 72, 73]
[1, 74]
[1, 74, 30]
[1, 74, 30, 2]
[1, 74, 30, 2, 1]
[1, 74, 30, 2, 1, 27]
[1, 74, 30, 2, 1, 27, 12]
[1, 74, 30, 2, 1, 27, 12, 15]
[1, 74, 30, 2, 1, 27, 12, 15, 1]
[1, 74, 30, 2, 1, 27, 12, 15, 1, 31]
[1, 74, 30, 2, 1, 27, 12, 15, 1, 31, 32]
[1, 74, 30, 2, 1, 27, 12, 15, 1, 31,


---
Now we have sequences of varying lengths, but our model needs feature vectors of the same length. So we perform padding so that every sequence is equal in length to the longest sequence.

---

In [62]:
# Length of the longest training sequence; every sequence is padded up to it.
# `default=0` keeps the result defined when there are no training sequences.
max_sequence_len = max((len(seq) for seq in training_data), default=0)

# Left-pad ('pre') with zeros so the real tokens, and in particular the target
# word in the last position, stay right-aligned.
# pad_sequences (imported in the first cell) already returns a NumPy array,
# so no extra np.array() conversion is needed.
input_sequences = pad_sequences(training_data, maxlen=max_sequence_len, padding='pre')

# Output the padded sequences
#print(input_sequences)

In [63]:
# Inspect the first padded training sequence.
input_sequences[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        1, 11], dtype=int32)

---
Now, for every sequence, the last word is the target, since it is the one to be predicted, and all the other words are features that help the model predict that last word.

---

In [64]:

# Split every padded row: all ids except the last one are the model input (X),
# and the last id is the word to predict (y).
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [65]:
# Inspect the input features (context ids) of the third training example.
X[2]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, 11,
       67], dtype=int32)

In [66]:
# Target id of the third training example, i.e. the word that follows X[2].
y[2]

5

---
Finally, the y values need to undergo one-hot encoding, i.e., we get a one-hot vector for each value of y.

---

In [67]:
# One-hot encode the targets.
# Row k of the identity matrix is the one-hot vector for class k, so indexing
# np.eye with the label ids yields a float64 matrix with one row per training
# example and one column per vocabulary index (total_words columns).
#
# This replaces sklearn's OneHotEncoder(sparse=False): the `sparse` argument
# was renamed to `sparse_output` in scikit-learn 1.2 and later removed, so the
# old call fails on current versions.
#
# The label ids are re-read from the last column of input_sequences instead of
# from `y`, so re-running this cell never tries to encode an already encoded
# matrix.
y = np.eye(total_words)[input_sequences[:, -1]]



In [68]:
# Inspect the one-hot target vector of the 11th training example.
y[10]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [69]:
# Baseline architecture: embedding -> single LSTM -> softmax over the vocabulary.
# Note: the following cell rebuilds `model` with a deeper stack before training,
# so this baseline is only inspected here, never fitted.
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 52, 100)           19700     
                                                                 
 lstm_4 (LSTM)               (None, 150)               150600    
                                                                 
 dense_3 (Dense)             (None, 197)               29747     
                                                                 
Total params: 200047 (781.43 KB)
Trainable params: 200047 (781.43 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [70]:


# Deeper variant (the one that gets trained): two stacked LSTM layers, each
# followed by dropout for regularization, and a softmax output layer.
model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_len - 1),
    # return_sequences=True passes the output at every timestep on to the next LSTM.
    LSTM(150, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dropout(0.2),
    # One output unit per vocabulary index.
    Dense(total_words, activation='softmax'),
])

In [71]:
# Multi-class objective over the one-hot targets; accuracy is reported as a metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the returned History object so the per-epoch loss/accuracy can be
# inspected or plotted later; assigning it also stops its bare repr from being
# echoed as the cell output.
history = model.fit(X, y, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7c0fb419f160>

In [72]:
# Predict the word that follows a seed phrase.
input_text = "Pakistan kept searching for their second"

# Encode the phrase as token ids and left-pad it to the input length the model
# was built with (max_sequence_len - 1).
token_list = pad_sequences(
    tokenized_text.texts_to_sequences([input_text]),
    maxlen=max_sequence_len - 1,
    padding='pre',
)

# The class with the highest predicted probability is the id of the next word.
predicted = model.predict(token_list).argmax(axis=-1)
predicted




array([48])

In [73]:
# Map the predicted id back to its word.
# `predicted` holds one id per input row; take the first one as a plain int so
# the lookup does not rely on comparing an int against an array.
predicted_index = int(np.ravel(predicted)[0])

# Tokenizer.index_word is the inverse of word_index, so no manual scan over the
# vocabulary is needed. The "" default keeps output_word defined when the id
# has no word (e.g. index 0), instead of leaving it unset or stale from an
# earlier run.
output_word = tokenized_text.index_word.get(predicted_index, "")
output_word

'wicket'