# Data Formation
The data is in plain-text format. We have to convert it into a DataFrame.


In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Load the whole corpus into memory as a single string.
# The context manager guarantees the file handle is closed even if reading fails,
# and the explicit encoding keeps the result independent of the platform default.
with open("/content/Common_email_sentences.txt", 'r', encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [3]:
# Tokenizing the data: fit a tokenizer on the whole corpus so that every word
# gets an integer index, then record the largest index that was assigned.
# NOTE(review): later cells size the embedding/output layers as
# `max_word_index + 1`, presumably because index 0 is left for padding — confirm.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
max_word_index = max(tokenizer.word_index.values())
max_word_index

436

In [4]:
# Build the training samples: each corpus line is converted to its token ids,
# and every prefix of that line holding at least two tokens becomes one sample
# (the last token of a prefix is the word to predict from the ones before it).
input_sequences = []
for line in text.split('\n'):
    line_tokens = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        line_tokens[:end] for end in range(2, len(line_tokens) + 1)
    )



In [5]:
# Inspect the generated prefixes. This displays the entire list; slicing it
# (e.g. input_sequences[:10]) would keep the notebook output short.
input_sequences

[[3, 39],
 [3, 39, 16],
 [3, 39, 16, 61],
 [3, 39, 16, 61, 190],
 [3, 39, 16, 61, 190, 2],
 [3, 39, 16, 61, 190, 2, 107],
 [3, 5],
 [3, 5, 6],
 [3, 5, 6, 1],
 [3, 5, 6, 1, 78],
 [3, 5, 6, 1, 78, 21],
 [44, 2],
 [44, 2, 8],
 [44, 2, 8, 4],
 [44, 2, 8, 4, 54],
 [44, 2, 8, 4, 54, 79],
 [13, 73],
 [13, 73, 62],
 [3, 65],
 [3, 65, 148],
 [3, 65, 148, 1],
 [3, 65, 148, 1, 89],
 [3, 65, 148, 1, 89, 29],
 [3, 65, 148, 1, 89, 29, 17],
 [58, 2],
 [58, 2, 13],
 [58, 2, 13, 123],
 [58, 2, 13, 123, 317],
 [58, 2, 13, 123, 317, 80],
 [58, 2, 13, 123, 317, 80, 21],
 [3, 149],
 [3, 149, 8],
 [3, 149, 8, 7],
 [3, 149, 8, 7, 150],
 [3, 63],
 [3, 63, 4],
 [3, 63, 4, 37],
 [13, 19],
 [13, 19, 15],
 [13, 19, 15, 20],
 [13, 19, 15, 20, 14],
 [13, 19, 15, 20, 14, 2],
 [13, 19, 15, 20, 14, 2, 40],
 [13, 19, 15, 20, 14, 2, 40, 41],
 [13, 19, 15, 20, 14, 2, 40, 41, 108],
 [318, 191],
 [318, 191, 1],
 [318, 191, 1, 319],
 [318, 191, 1, 319, 90],
 [318, 191, 1, 319, 90, 2],
 [318, 191, 1, 319, 90, 2, 192],
 [44, 

In [6]:
# Length (in tokens, label included) of the longest training sample;
# every sample is padded to this length in the next step.
max_len = max(map(len, input_sequences))
max_len

15

# Data Preprocessing

In [7]:
import pandas as pd
from  tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad every sample at the front ('pre') up to max_len, so all rows have the
# same width and the word to predict always sits in the last column.
padded_input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_len,
    padding='pre',
)

In [8]:
# Split each padded row into the model input (every column but the last)
# and the label (the last column, i.e. the next word's index).
x, y = padded_input_sequences[:, :-1], padded_input_sequences[:, -1]


In [9]:
# Shape of the input matrix: (number of samples, max_len - 1)
x.shape

(4069, 14)

In [10]:
# Shape of the label vector: one next-word index per sample
y.shape

(4069,)

In [11]:
# This is the dataset required for predictions: one row per training sample,
# with the padded context tokens in x1..xN and the next-word index in Y.
# The column names are derived from the array width instead of being typed out,
# so the cell keeps working when a different corpus changes the padded length.
n_feature_columns = padded_input_sequences.shape[1] - 1
data = pd.DataFrame(
    padded_input_sequences,
    columns=[f'x{position}' for position in range(1, n_feature_columns + 1)] + ['Y'],
)
data

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,Y
0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,39
1,0,0,0,0,0,0,0,0,0,0,0,0,3,39,16
2,0,0,0,0,0,0,0,0,0,0,0,3,39,16,61
3,0,0,0,0,0,0,0,0,0,0,3,39,16,61,190
4,0,0,0,0,0,0,0,0,0,3,39,16,61,190,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4064,0,0,0,0,0,0,0,9,12,11,1,48,23,8,122
4065,0,0,0,0,0,0,9,12,11,1,48,23,8,122,88
4066,0,0,0,0,0,9,12,11,1,48,23,8,122,88,68
4067,0,0,0,0,9,12,11,1,48,23,8,122,88,68,8


In [12]:
# Hence, this is a multiclass classification problem: the label Y is one of the word indices.

# Model Creation (LSTM)


In [27]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

In [14]:
# One-hot encode the labels: each next-word index becomes a vector of length
# max_word_index + 1 with a single 1 at that index.
Y = to_categorical(y, num_classes=max_word_index + 1)
Y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [22]:
# Creating the model: Embedding -> single LSTM layer -> Dense output layer.
# - Embedding: 'max_word_index + 1' is the number of embeddings (input vocabulary size),
#   each mapped to a 336-dimensional vector.
# - input length = max length of the sentence - 1, because the last word is in the
#   prediction (y) column.
# - LSTM: 636 units.
# - Dense: one output per word index. Softmax is used because each sample has exactly
#   one correct next word, so the outputs must form a probability distribution over
#   the classes, which is what the categorical cross-entropy loss expects.
model = Sequential()
model.add(Embedding(max_word_index + 1, 336, input_length=max_len - 1))
model.add(LSTM(636))
model.add(Dense(max_word_index + 1, activation='softmax'))

In [23]:
# Configure training: Adam optimizer with its default settings, categorical
# cross-entropy as the loss, and accuracy reported as the metric.
model.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [24]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 14, 336)           146832    
                                                                 
 lstm_1 (LSTM)               (None, 636)               2475312   
                                                                 
 dense_1 (Dense)             (None, 437)               278369    
                                                                 
Total params: 2900513 (11.06 MB)
Trainable params: 2900513 (11.06 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# Train on the padded inputs and one-hot labels: 50 epochs, 64 samples per batch
model.fit(x, Y, batch_size=64, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7fe8b02dcc40>

# Prediction

In [None]:
# Interactive next-word suggestion loop.
# Type an incomplete sentence to get the three highest-scoring next words;
# submit an empty line to leave the loop.
while True:

  # Stored under its own name so the training corpus held in `text` is not overwritten.
  user_text = input("Enter the incomplete text:  ")
  if not user_text.strip():
    break

  token_text = tokenizer.texts_to_sequences([user_text])[0]
  complete_text = []

  n = 1 # next n words are getting predicted
  for i in range(n):
    # Pad to the same width the model was trained on
    padded_text = pad_sequences([token_text], maxlen = max_len-1, padding = 'pre')
    # verbose = 0 keeps Keras' progress bar from flooding the output on every prediction
    answer = model.predict(padded_text, verbose = 0)

    # argsort is ascending, so the last three entries are the three highest-scoring
    # word indices, ordered from third best to best.
    top_3_indices = np.argsort(answer)[0][-3:]

    # index_word is the tokenizer's index -> word mapping; an index with no word
    # (such as the padding index 0) is skipped.
    top_3_words = []
    for index in top_3_indices:
      word = tokenizer.index_word.get(int(index))
      if word is not None:
        top_3_words.append(word)
    complete_text.append(top_3_words)

    # Extend the context with the single best prediction so the next iteration
    # (when n > 1) predicts the word that follows it.
    token_text.append(int(top_3_indices[-1]))

  print(complete_text, "\n")

Enter the incomplete text:  i am writing to
[['request', 'express', 'confirm']] 

Enter the incomplete text:  i am writing to request
[['clarification', 'an', 'your']] 

Enter the incomplete text:  i am writing to request an
[['on', 'extension', 'update']] 

Enter the incomplete text:  i am writing to request an extension
[['you', 'to', 'on']] 

Enter the incomplete text:  Thank you for your
[['prompt', 'understanding', 'attention']] 

Enter the incomplete text:  thank you for your understanding
[['of', 'during', 'and']] 

Enter the incomplete text:  thank you for your understanding during
[['to', 'a', 'this']] 

Enter the incomplete text:  I hope this email finds
[['well', 'is', 'you']] 

Enter the incomplete text:  I hope this email finds you
[['good', 'well', 'in']] 

