<a href="https://colab.research.google.com/github/varun1608/Automatic-Sentence-Completion/blob/main/AutoSentenceCompletion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***STEP 1: IMPORT LIBRARIES***


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

***STEP 2: LOAD YOUR FILE***


In [None]:
# Colab-specific upload step; `google.colab` is only importable inside a Colab runtime.
from google.colab import files
# NOTE(review): the next step opens "dataset.txt", so the uploaded file presumably
# has to carry exactly that name — confirm. `uploaded` itself is not read again
# in the visible notebook.
uploaded = files.upload()

***STEP 3: OPEN AND PRE-PROCESS THE DATA***

In [None]:
# Read the whole corpus. The context manager guarantees the handle is closed,
# even if reading raises.
with open("dataset.txt", "r", encoding="utf8") as file:
    lines = file.readlines()

# Join the lines once, separated by a single space.
data = ' '.join(lines)

# Remove line breaks, carriage returns, the byte-order mark and curly quotes.
# They are deleted outright (replaced by the empty string); the space inserted
# by the join above is what keeps the last word of one line apart from the
# first word of the next.
data = (
    data.replace('\n', '')
        .replace('\r', '')
        .replace('\ufeff', '')
        .replace('“', '')
        .replace('”', '')
)

# Collapse every run of whitespace into a single space.
data = ' '.join(data.split())
data[:500]

***STEP 4: HYPERPARAMETER TUNING***

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

def create_model(learning_rate=0.001, embedding_dim=10, lstm_units=100, dense_units=100):
    """Build and compile a single-LSTM next-word classifier for the grid search.

    The network is Embedding -> LSTM -> Dense(relu) -> Dense(softmax) over
    three-token inputs. The module-level ``vocab_size`` is read when the
    function is called, so it must already be defined at that point.

    Args:
        learning_rate: Step size passed to the Adam optimizer.
        embedding_dim: Width of the embedding vectors.
        lstm_units: Number of units in the LSTM layer.
        dense_units: Number of units in the hidden Dense layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=3))
    model.add(LSTM(lstm_units))
    model.add(Dense(dense_units, activation="relu"))
    model.add(Dense(vocab_size, activation="softmax"))
    # The scikit-learn wrapper scores candidates through the model's accuracy
    # metric, so the metric has to be compiled in; without it the search has
    # no score to rank parameter settings by.
    model.compile(
        loss="categorical_crossentropy",
        optimizer=Adam(learning_rate=learning_rate),
        metrics=["accuracy"],
    )
    return model

# Expose the Keras builder through the scikit-learn estimator interface.
model = KerasClassifier(build_fn=create_model, verbose=0)

# Candidate values per hyperparameter; the grid search tries every combination
# (3 * 3 * 3 * 3 = 81 settings, each cross-validated with 3 folds).
param_grid = dict(
    learning_rate=[0.001, 0.01, 0.1],
    embedding_dim=[10, 50, 100],
    lstm_units=[50, 100, 200],
    dense_units=[50, 100, 200],
)

# NOTE: X, y and vocab_size are first assigned in the Step 5 cell, which sits
# below this one, so this cell only works after Step 5 has been executed.
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_result = grid.fit(X, y)

best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)


***STEP 5: IMPLEMENT TOKENIZATION AND MAKE ADDITIONAL ADJUSTMENTS***

In [None]:
# Fit the word-level tokenizer on the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Persist the fitted tokenizer for the prediction step. The context manager
# flushes and closes the file instead of leaving the handle open.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# The whole corpus as one flat list of integer word ids.
sequence_data = tokenizer.texts_to_sequences([data])[0]

# One more than the number of distinct words, because Keras' Tokenizer starts
# numbering words at 1 and the output layer needs a slot for every index.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

# Sliding windows of four consecutive ids: three context words plus the word
# that follows them.
sequences = [sequence_data[i - 3:i + 1] for i in range(3, len(sequence_data))]
print("The Length of sequences are: ", len(sequences))

# reshape(-1, 4) keeps the array two-dimensional even when the corpus is too
# short to produce a single window, so the column slicing below stays valid.
sequences = np.array(sequences, dtype=int).reshape(-1, 4)

# First three columns are the inputs, the fourth is the target word id.
X = sequences[:, :3]
y = sequences[:, 3]
print("Data: ", X[:10])
print("Response: ", y[:10])

# One-hot encode the targets to match the softmax / categorical cross-entropy
# setup used by the models in this notebook.
y = to_categorical(y, num_classes=vocab_size)
y[:5]

***STEP 6: NEURAL ARCHITECTURE SEARCH***

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import random

def create_model(learning_rate=0.001, embedding_dim=10, lstm_units=1000, dense_units=1000, num_lstm_layers=2):
    """Build and compile a stacked-LSTM next-word classifier for the random search.

    The network is Embedding -> ``num_lstm_layers`` LSTM layers -> Dense(relu)
    -> Dense(softmax) over three-token inputs. The module-level ``vocab_size``
    is read when the function is called. This definition reuses the name of
    the builder from the hyperparameter-tuning step and therefore replaces it
    once this cell has run.

    Args:
        learning_rate: Step size passed to the Adam optimizer.
        embedding_dim: Width of the embedding vectors.
        lstm_units: Number of units in every LSTM layer.
        dense_units: Number of units in the hidden Dense layer.
        num_lstm_layers: Total number of LSTM layers in the stack (>= 1).

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``num_lstm_layers`` is smaller than 1.
    """
    if num_lstm_layers < 1:
        raise ValueError("num_lstm_layers must be at least 1")

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=3))

    # Every LSTM except the last one must emit the full sequence so the next
    # LSTM has a sequence to consume. Looping num_lstm_layers - 1 times makes
    # the total layer count match the parameter (the final LSTM is added below).
    for _ in range(num_lstm_layers - 1):
        model.add(LSTM(lstm_units, return_sequences=True))

    model.add(LSTM(lstm_units))
    model.add(Dense(dense_units, activation="relu"))
    model.add(Dense(vocab_size, activation="softmax"))

    # The scikit-learn wrapper scores candidates through the model's accuracy
    # metric, so it has to be compiled in for the search to rank settings.
    model.compile(
        loss="categorical_crossentropy",
        optimizer=Adam(learning_rate=learning_rate),
        metrics=["accuracy"],
    )
    return model

# Expose the Keras builder through the scikit-learn estimator interface.
model = KerasClassifier(build_fn=create_model, verbose=0)

# Values each hyperparameter may take; the search samples combinations from
# these lists instead of trying all of them.
param_dist = {
    'learning_rate': [0.001, 0.01, 0.1],
    'embedding_dim': [10, 50, 100],
    'lstm_units': [100, 500, 1000],
    'dense_units': [100, 500, 1000],
    'num_lstm_layers': [1, 2, 3]
}

n_iter_search = 10
# random_state pins which parameter combinations get sampled, so a re-run
# evaluates the same candidates. Weight initialisation is not seeded anywhere
# in this notebook, so the scores themselves can still vary between runs.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=3,
    random_state=42,
)
random_search_result = random_search.fit(X, y)

print("Best: %f using %s" % (random_search_result.best_score_, random_search_result.best_params_))


***STEP 7: CREATING A MODEL***

In [None]:
# Final network: 10-dimensional embedding over three-token inputs, two stacked
# 1000-unit LSTMs (the first returns the full sequence for the second), a
# 1000-unit ReLU layer, and a softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=3),
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])
model.summary()

***STEP 8: PLOT THE MODEL***

In [None]:
import tensorflow as tf
from tensorflow import keras
# Draw the layer graph of `model` into plot.png, labelling each layer with its name.
# NOTE(review): plot_model presumably needs pydot and Graphviz installed in the
# runtime — confirm before relying on this cell outside Colab.
keras.utils.plot_model(model, to_file='plot.png', show_layer_names=True)

***STEP 9: TRAIN THE MODEL***

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Write next_words.h5 only when the monitored training loss improves, so the
# file on disk always holds the best weights seen so far.
checkpoint = ModelCheckpoint(
    filepath="next_words.h5",
    monitor='loss',
    save_best_only=True,
    verbose=1,
)

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)
model.fit(x=X, y=y, batch_size=64, epochs=70, callbacks=[checkpoint])

***STEP 10: LET'S PREDICT***

In [None]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Reload the checkpoint written by the training step.
model = load_model('next_words.h5')

# Reload the tokenizer saved during tokenization; the context manager closes
# the file handle once unpickling is done.
# NOTE: pickle.load can execute arbitrary code, so only load a token.pkl that
# this notebook produced itself.
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def Predict_Next_Words(model, tokenizer, text):
  """Predict the word most likely to follow ``text`` and print it.

  Args:
      model: Object with a ``predict`` method that returns one score per
          vocabulary index for a batch of token-id sequences.
      tokenizer: Object providing ``texts_to_sequences`` and ``word_index``
          (and optionally ``index_word``), e.g. a fitted Keras ``Tokenizer``.
      text: The context, either as a string or as a list of words.

  Returns:
      The predicted word, or an empty string when the winning index has no
      word assigned to it.

  Raises:
      ValueError: If none of the words in ``text`` are known to the tokenizer,
          which would otherwise send an empty sequence to the model.
  """
  token_ids = tokenizer.texts_to_sequences([text])[0]
  if len(token_ids) == 0:
      raise ValueError("None of the input words are in the tokenizer vocabulary.")

  # Batch containing the single encoded context.
  sequence = np.array([token_ids])
  preds = int(np.argmax(model.predict(sequence)))

  # Prefer the tokenizer's own index -> word table for a direct lookup; fall
  # back to inverting word_index when that table is not available.
  index_word = getattr(tokenizer, "index_word", None)
  if index_word is None:
      index_word = {index: word for word, index in tokenizer.word_index.items()}
  predicted_word = index_word.get(preds, "")

  print(predicted_word)
  return predicted_word
# Interactive prompt: keep predicting the next word until the user enters "0".
while True:
  text = input("Enter your line: ")

  if text == "0":
      print("Execution completed.....")
      break

  try:
      # split() without an argument ignores repeated, leading and trailing
      # whitespace, so empty strings never take up one of the three context
      # slots. Only the last three words are used as context.
      text = text.split()[-3:]
      print(text)

      Predict_Next_Words(model, tokenizer, text)

  except Exception as e:
      # Report the problem and keep the prompt alive for the next attempt.
      print("Error occurred: ", e)