# Data Formation
Data is in the text format. We have to convert it into dataframe


In [77]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [78]:
text = open("/content/tare zameen par.txt", 'r').read()

In [79]:
# Tokenizing the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
max_word_index = max(tokenizer.word_index.values())
max_word_index

168

In [80]:
# Getting sentences in the form of tokens
input_sequences = []
for sentence in text.split('\n'): # Spliting the sentences
  token_sent = tokenizer.texts_to_sequences([sentence])[0] # Transforming word sentences to the tokens
  for i in range (1, len(token_sent)):
   input_sequences.append(token_sent [:i+1]) #



In [81]:
input_sequences

[[35, 36],
 [35, 36, 2],
 [35, 36, 2, 6],
 [35, 36, 2, 6, 37],
 [35, 36, 2, 6, 37, 3],
 [35, 36, 2, 6, 37, 3, 38],
 [39, 3],
 [39, 3, 40],
 [39, 3, 40, 5],
 [39, 3, 40, 5, 41],
 [39, 3, 40, 5, 41, 15],
 [39, 3, 40, 5, 41, 15, 42],
 [43, 44],
 [43, 44, 45],
 [43, 44, 45, 46],
 [43, 44, 45, 46, 20],
 [43, 44, 45, 46, 20, 16],
 [47, 15],
 [47, 15, 48],
 [47, 15, 48, 49],
 [47, 15, 48, 49, 17],
 [47, 15, 48, 49, 17, 50],
 [47, 15, 48, 49, 17, 50, 16],
 [11, 12],
 [11, 12, 8],
 [11, 12, 8, 2],
 [11, 12, 8, 2, 13],
 [11, 12, 8, 2, 13, 14],
 [11, 12, 8, 2, 13, 14, 9],
 [2, 7],
 [2, 7, 6],
 [2, 7, 6, 51],
 [2, 7, 6, 51, 5],
 [2, 7, 6, 51, 5, 52],
 [2, 7, 6, 51, 5, 52, 3],
 [2, 7, 6, 51, 5, 52, 3, 53],
 [54, 55],
 [54, 55, 56],
 [54, 55, 56, 21],
 [54, 55, 56, 21, 57],
 [54, 55, 56, 21, 57, 18],
 [54, 55, 56, 21, 57, 18, 16],
 [54, 55, 56, 21, 57, 18, 16, 17],
 [22, 4],
 [22, 4, 58],
 [22, 4, 58, 21],
 [22, 4, 58, 21, 59],
 [22, 4, 58, 21, 59, 18],
 [22, 4, 58, 21, 59, 18, 16],
 [22, 4, 58, 21,

In [82]:
# max length among the sentences came out to be 10
max_len = max([len(x) for x in input_sequences])
max_len

10

# Data Preprocessing

In [83]:
import pandas as pd
from  tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding = 'pre')

In [84]:
# Padding the sentences
x = padded_input_sequences[:,:-1]
y = padded_input_sequences[:,-1]


In [85]:
x.shape

(260, 9)

In [86]:
y.shape

(260,)

In [87]:
# This is the dataset required for predictions
data = pd.DataFrame(padded_input_sequences[:,:], columns = ['x1','x2','x3','x4','x5','x6','x7','x8','x9','y',])
data

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,y
0,0,0,0,0,0,0,0,0,35,36
1,0,0,0,0,0,0,0,35,36,2
2,0,0,0,0,0,0,35,36,2,6
3,0,0,0,0,0,35,36,2,6,37
4,0,0,0,0,35,36,2,6,37,3
...,...,...,...,...,...,...,...,...,...,...
255,0,0,1,160,5,161,162,163,164,165
256,0,0,0,0,0,0,0,0,1,166
257,0,0,0,0,0,0,0,1,166,167
258,0,0,0,0,0,0,1,166,167,10


In [88]:
# Hence, Multiclass Classification problem

# Model Creation (LSTM)


In [89]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam


In [90]:
Y = to_categorical(y, num_classes = max_word_index +1)
Y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [96]:
# Creating the model with 1 LSTM layer(355 nodes),
# 'max_word_index +1' is the output dimension of the LSTM,
# input length = max length of the sentence - 1, because the last word is in the prediction (y) column,
# no. of embeddings = 240
model = Sequential ()
model.add(Embedding(max_word_index +1, 240, input_length = max_len - 1 ))
model.add((LSTM(355)))
model.add(Dense(max_word_index+1,activation = 'sigmoid'))

In [97]:
model.compile(loss = 'categorical_crossentropy', optimizer = Adam(), metrics = ['accuracy'])

In [98]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 9, 240)            40560     
                                                                 
 lstm_5 (LSTM)               (None, 355)               846320    
                                                                 
 dense_5 (Dense)             (None, 169)               60164     
                                                                 
Total params: 947044 (3.61 MB)
Trainable params: 947044 (3.61 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [99]:
model.fit(x,Y,epochs = 100, batch_size= 32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7ed860f07010>

# Prediction

In [None]:

import numpy as np
while True:

  text = input("Enter the incomplete text:  ")
  token_text = tokenizer.texts_to_sequences([text])[0]
  complete_text =[]

  for i in range(4): # next 4 words are getting predicted
    padded_text = pad_sequences([token_text],maxlen = max_len-1, padding = 'pre')
    answer = np.argmax(model.predict(padded_text))
    if text == '0':break

    for word,index in tokenizer.word_index.items():
        if index == answer:
          complete_text.append(word)

    token_text.append(index)

  print(text +" "+ " ".join(complete_text), "\n")


Enter the incomplete text:  Dekho inhein yeh
Dekho inhein yeh hain oas ki boodein 

Enter the incomplete text:  Yeh hain buzurgon
Yeh hain buzurgon ke dil ki duayein 

Enter the incomplete text:  Khilne ki zid
Khilne ki zid par kaliyan jaise jaise 

