<a href="https://colab.research.google.com/github/youssefhesham200/predict_next_word_nlp/blob/master/predict_next_word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tarfile
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gensim
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
import gensim.downloader as api
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('words')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
# Download a pretrained word2vec model ("word2vec-google-news-300") through
# gensim's downloader API (`api` is gensim.downloader).
# load_embeddings() reads this module-level name, so run this cell before it.
word2vec_model =  api.load("word2vec-google-news-300")


In [None]:
# Sanity check on the loaded vectors: show the nearest neighbours of "nice".
word2vec_model.most_similar("nice")

[('good', 0.6836091876029968),
 ('lovely', 0.6676310896873474),
 ('neat', 0.6616737246513367),
 ('fantastic', 0.6569240689277649),
 ('wonderful', 0.6561347246170044),
 ('terrific', 0.6552367806434631),
 ('great', 0.6454657912254333),
 ('awesome', 0.6404187679290771),
 ('nicer', 0.6302445530891418),
 ('decent', 0.5993332862854004)]

In [None]:
def load_embeddings(word_index, embedding_dim, model=None):
    """Build an embedding matrix whose rows line up with a word index.

    Row ``i`` receives the vector of the word mapped to index ``i``. Rows that
    no word maps to stay all-zero (row 0 when indices start at 1 — presumably
    the case for a Keras ``Tokenizer`` word index; verify against the caller).

    Lookup order for each word:
      1. the word itself, if the model knows it;
      2. the mean of its whitespace-separated parts, if all parts are known;
      3. a random vector drawn with ``np.random.normal(scale=0.6)``. The draw
         is unseeded here; seed NumPy beforehand for reproducible matrices.

    Args:
        word_index: Mapping of word -> integer row index.
        embedding_dim: Length of every embedding vector; it has to match the
            length of the vectors returned by ``model.get_vector``.
        model: Object exposing ``key_to_index`` (membership test) and
            ``get_vector(word)``. Defaults to the module-level
            ``word2vec_model``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(word_index) + 1, embedding_dim)``.
    """
    if model is None:
        model = word2vec_model
    known = model.key_to_index

    num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, embedding_dim))

    for word, i in word_index.items():
        if word in known:
            embedding_matrix[i] = model.get_vector(word)
            continue

        parts = word.split()
        # A whitespace-only key splits into []; all([]) is True and the mean of
        # an empty list is NaN, so require at least one part before averaging.
        if parts and all(part in known for part in parts):
            embedding_matrix[i] = np.mean([model.get_vector(part) for part in parts], axis=0)
        else:
            # Still out-of-vocabulary: fall back to a random vector.
            embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return embedding_matrix

In [None]:
def extract_data(archive_path="enronsentv1 (1).tar.gz", dest_dir="."):
  """Unpack a gzip-compressed tar archive.

  Args:
      archive_path: Path of the ``.tar.gz`` file. Defaults to the dataset
          archive name this notebook uses.
      dest_dir: Directory to extract into. Defaults to the current working
          directory, which is where ``extractall()`` wrote before.

  Returns:
      None. The archive members are written to ``dest_dir``.
  """
  # The context manager closes the archive even when extraction raises.
  with tarfile.open(archive_path, "r:gz") as tar:
    # NOTE(review): extractall() writes whatever member paths the archive
    # declares; only run this on archives from a trusted source.
    tar.extractall(path=dest_dir)

  
def loading_data(folder_path, num_train_files=11, num_val_files=5):
  """Read corpus files from a folder and split their paragraphs by file.

  Files are taken in sorted name order. The first ``num_train_files`` feed the
  training list and the following ``num_val_files`` feed the validation list;
  any remaining files are never opened. A paragraph is any chunk separated by
  a blank line (``'\\n\\n'``).

  Args:
      folder_path: Directory holding the corpus files.
      num_train_files: Number of files used for training (default 11, the
          count the original ``counter <= 10`` check produced).
      num_val_files: Number of files used for validation (default 5, the
          count the original ``10 < counter <= 15`` check produced).

  Returns:
      ``(training, validation)``: two lists of paragraph strings.
  """
  training = []
  validation = []

  # os.listdir() returns entries in arbitrary order; sorting makes the
  # train/validation split reproducible across machines and runs.
  filenames = sorted(os.listdir(folder_path))

  # Slice up front so files beyond the split are never read (the old loop
  # opened and printed one extra file before hitting its `break`).
  selected = filenames[:num_train_files + num_val_files]

  for position, filename in enumerate(selected):
    with open(os.path.join(folder_path, filename), "r", encoding='latin-1') as f:
      # Split the contents into paragraphs
      para = f.read().split('\n\n')
    print(len(para))

    if position < num_train_files:
      training.extend(para)
    else:
      validation.extend(para)

  return training, validation

In [None]:
def clean(paragraphs, stop_words=None, vocabulary=None):
    """Lowercase each paragraph and keep only known, non-stopword tokens.

    Every paragraph is split on whitespace. A token is kept (lower-cased) when
    its lower-cased form is not a stopword and is present in the vocabulary.
    The stopword test is case-insensitive, so capitalised stopwords such as
    "The" are removed as well.

    Args:
        paragraphs: Iterable of paragraph strings.
        stop_words: Optional iterable of stopwords. Defaults to NLTK's English
            stopword list.
        vocabulary: Optional iterable of accepted words. Defaults to NLTK's
            ``words`` corpus.

    Returns:
        List of cleaned paragraph strings, one per input paragraph (possibly
        empty strings), with kept tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    if vocabulary is None:
        vocabulary = nltk.corpus.words.words()

    # Build both lookups once: calling stopwords.words() per token reloads the
    # list every time and scans it linearly.
    stop_set = {word.lower() for word in stop_words}
    vocab_set = set(vocabulary)

    cleaned_paragraphs = []
    for paragraph in paragraphs:
        kept = []
        for token in paragraph.split():
            lowered = token.lower()
            if lowered not in stop_set and lowered in vocab_set:
                kept.append(lowered)
        cleaned_paragraphs.append(' '.join(kept))
    return cleaned_paragraphs

In [None]:
def prepare_data(sentences_train, sentences_val, seq_len):
    """Turn sentences into fixed-length context windows and next-word targets.

    A ``Tokenizer`` is fitted on the training sentences only and then used to
    encode both splits. For every encoded sentence, each position ``i`` from
    ``seq_len`` onwards yields one sample: the ``seq_len`` ids before ``i`` as
    the input and the id at ``i`` as the target. Sentences with at most
    ``seq_len`` ids yield no samples.

    Args:
        sentences_train: Training texts (also used to fit the tokenizer).
        sentences_val: Validation texts.
        seq_len: Number of preceding token ids in each input window.

    Returns:
        ``(train_inputs, train_outputs, val_inputs, val_outputs, word_index,
        tokenizer)`` where the inputs are arrays reshaped to
        ``(-1, seq_len, 1)``, the outputs are lists of token ids, and
        ``word_index`` is ``tokenizer.word_index``.
    """
    tok = Tokenizer()
    tok.fit_on_texts(sentences_train)

    def _sliding_windows(encoded_sentences):
        # Collect (context, target) pairs for one split.
        contexts, targets = [], []
        for ids in encoded_sentences:
            for end in range(seq_len, len(ids)):
                contexts.append(ids[end - seq_len:end])
                targets.append(ids[end])
        # Add a trailing singleton axis: (samples, seq_len, 1).
        return np.asarray(contexts).reshape(-1, seq_len, 1), targets

    train_x, train_y = _sliding_windows(tok.texts_to_sequences(sentences_train))
    val_x, val_y = _sliding_windows(tok.texts_to_sequences(sentences_val))

    return train_x, train_y, val_x, val_y, tok.word_index, tok

In [None]:
# Unpack the dataset archive (see extract_data).
extract_data()

In [None]:
# Read the extracted corpus files and split their paragraphs into training and
# validation lists. NOTE(review): "/content/enronsent" is a hard-coded absolute
# path (presumably the Colab working directory) — adjust when running elsewhere.
paragraphs_training, paragraphs_validation = loading_data("/content/enronsent")

13797
12521
8173
11867
9143
7534
12781
12755
9625
8980
13261
12562
11314
11370
11565
13337
17


In [None]:
# Lowercase and filter the training paragraphs (see clean).
clean_training = clean(paragraphs_training)

In [None]:
# Apply the same cleaning to the validation paragraphs.
clean_valid = clean(paragraphs_validation)

In [None]:
# Tokenize both splits and build (context window, next word) samples.
# The window length 10 is also the padding length used in generate_next_word(),
# so change the two together.
train_inputs, train_outputs, val_inputs, val_outputs, word_index, tokenizer = prepare_data(clean_training, clean_valid, 10)

In [None]:
# Build the embedding matrix for the tokenizer vocabulary.
# 300 is presumably the vector size of "word2vec-google-news-300" — TODO confirm.
embedding_matrix = load_embeddings(word_index, 300)

In [None]:
# Note: the accuracy obtained here is expected. The dataset is small for an
# auto-complete task compared with other datasets, so the model still overfits
# despite the attempts made to reduce it.

from tensorflow.keras.optimizers import Adam

# Build the LSTM model.
# The Embedding layer is initialised from the pretrained matrix: its row count
# is the vocabulary size and its column count is the vector length.
# The sequence length is read from the prepared inputs rather than repeated as
# a literal, so it cannot drift from the value passed to prepare_data().
# NOTE(review): train_inputs/val_inputs carry a trailing axis of size 1 (from
# prepare_data's reshape) — confirm the installed Keras version accepts that
# for an Embedding input.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1],
                              weights=[embedding_matrix], input_length=train_inputs.shape[1]),

    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),

    # One output unit per embedding-matrix row (i.e. per token index).
    tf.keras.layers.Dense(embedding_matrix.shape[0], activation='softmax')
])

# Targets are integer token ids, hence the sparse categorical loss.
model.compile(optimizer= Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model; keep the History object so the loss/accuracy curves can be
# inspected afterwards instead of being discarded as the cell's output.
history = model.fit(train_inputs, np.array(train_outputs), epochs=35, batch_size = 256,  validation_data=(val_inputs,  np.array(val_outputs)))

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7fcc11e55450>

In [None]:
def generate_next_word(model, tokenizer, text, seq_len=10):
    """Predict the word most likely to follow ``text``.

    Args:
        model: Trained model whose ``predict`` returns one score per token
            index for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        text: The text typed so far.
        seq_len: Length the encoded text is padded/truncated to; it must match
            the sequence length the model was trained with (default 10).

    Returns:
        The predicted next word as a string.

    Raises:
        KeyError: If the best-scoring index has no entry in
            ``tokenizer.index_word``.
    """
    sequence = tokenizer.texts_to_sequences([text])[0]
    # Pad the sequence to the same length as the training sequences
    sequence = tf.keras.preprocessing.sequence.pad_sequences([sequence], maxlen=seq_len)
    prediction = model.predict(sequence)[0]

    # Index 0 is the padding value and is never assigned to a word by the Keras
    # Tokenizer, so looking it up in index_word would raise KeyError.
    # Work on a copy and exclude that slot before taking the argmax.
    scores = np.array(prediction, dtype=float)
    scores[0] = -np.inf
    predicted_index = int(np.argmax(scores))

    # Convert the index to the predicted word
    return tokenizer.index_word[predicted_index]


In [None]:

# Interactive demo: the user types a word, the model proposes the next one, and
# accepted proposals are appended until the user rejects one or enters "-1".
sentence = " "

while True:
  word = input("Enter Next word (-1 to terminate)")
  if word == "-1":
    break

  sentence += word + " "

  # Keep suggesting words for as long as the user answers "yes".
  while True:
    predicted_word = generate_next_word(model, tokenizer, sentence)
    decision = input(f"Is your next word: “{predicted_word}”")
    if decision.lower() != "yes":
      break
    sentence += predicted_word + " "

  # "-1" ends the session at any suggestion prompt, not only the first one
  # (previously a later "-1" was treated as a plain rejection).
  if decision == "-1":
    break

  print("Sorry, ")

print("Your final Sentence is '" + sentence + "'")

Enter Next word (-1 to terminate)financial
Is your next word: “quick”no
Sorry, 
Enter Next word (-1 to terminate)account
Is your next word: “look”yes
Is your next word: “incremental”yes
Is your next word: “transmission”no
Sorry, 
Enter Next word (-1 to terminate)-1
Your final Sentence is ' financial account look incremental '
