# In [None]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from keras.callbacks import EarlyStopping
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import os
import multiprocessing

# Function to preprocess the text
def preprocess_text(text):
    """Turn raw text into a list of normalized word tokens.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``.
    English stopwords and tokens that are not purely alphanumeric are
    dropped; every surviving token is Porter-stemmed and the stem is then
    passed through the WordNet lemmatizer.

    Args:
        text: The raw input string.

    Returns:
        The processed tokens, in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()
    wordnet = WordNetLemmatizer()

    processed = []
    for word in word_tokenize(text.lower()):
        # Skip stopwords first, then anything containing punctuation.
        if word in english_stopwords or not word.isalnum():
            continue
        # Stem first, then lemmatize the resulting stem.
        processed.append(wordnet.lemmatize(porter.stem(word)))
    return processed

# Load and preprocess text files in batches
def load_and_preprocess(file_paths, batch_size):
    """Read text files and preprocess their contents in worker processes.

    Each file is read in full (UTF-8) and its text is submitted to a
    ``multiprocessing.Pool`` running ``preprocess_text``. Results are
    collected every time ``batch_size`` jobs are outstanding, and once more
    at the end for any remainder.

    Args:
        file_paths: Paths of the text files to load.
        batch_size: Number of submitted jobs after which their results are
            gathered before more files are read.

    Returns:
        A list containing one ``preprocess_text`` result per input file, in
        the same order as ``file_paths``.
    """
    tokens = []
    if not file_paths:
        # Nothing to process: do not start worker processes for no work.
        return tokens

    # Use the pool as a context manager so the worker processes are always
    # shut down, even if reading a file or a preprocessing job raises.
    # Every result is fetched with .get() before the block exits, so no
    # outstanding work is lost when the pool is torn down.
    with multiprocessing.Pool() as pool:
        pending = []
        for file_path in file_paths:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
            pending.append(pool.apply_async(preprocess_text, args=(text,)))
            if len(pending) >= batch_size:
                tokens.extend(result.get() for result in pending)
                pending = []
        # Collect whatever is left from the final, partially filled batch.
        tokens.extend(result.get() for result in pending)
    return tokens

# Predict the next word based on the input sentence using LSTM model
def predict_next_word_lstm(tokenizer, model, input_sentence, max_sequence_length):
    """Return the word the model scores highest as the next word.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and an
            ``index_word`` mapping from integer index to word.
        model: Trained model whose ``predict`` returns one score per
            vocabulary index for the padded input.
        input_sentence: The text to continue.
        max_sequence_length: Length the encoded input is padded/truncated to.

    Returns:
        The predicted next word.

    Raises:
        ValueError: If the model output contains no word classes (fewer
            than two scores), so there is nothing to choose from.
    """
    input_sequence = tokenizer.texts_to_sequences([input_sentence])
    padded_input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length)
    # Flatten so the arg-max is taken over all scores, as before.
    scores = np.ravel(model.predict(padded_input_sequence))
    if scores.size < 2:
        raise ValueError("Model output has no word classes to choose from.")
    # Index 0 is reserved for padding by the Keras Tokenizer and never maps
    # to a word, so looking it up in index_word would raise KeyError.
    # Pick the best score among the real word indices (1..) instead.
    predicted_index = int(np.argmax(scores[1:])) + 1
    predicted_word = tokenizer.index_word[predicted_index]
    return predicted_word

def main():
    """Train a next-word LSTM on the corpus and predict for one user input.

    Steps:
      1. Load and preprocess the corpus files into token lists.
      2. Fit a tokenizer and encode the tokens as integer sequences.
      3. Build supervised (context, next word) pairs with a sliding window.
      4. Train an Embedding -> LSTM -> softmax model on those pairs.
      5. Read a sentence from stdin and print the predicted next word.

    Raises:
        ValueError: If the corpus is too small to build a single
            (context, next word) training example.
    """
    batch_size = 10
    # Maximum number of preceding tokens used as context for one example.
    # Bounding it keeps the padded input width (and memory use) independent
    # of how long each corpus file is.
    context_window = 5
    file_paths = ["C:\\Users\\varsh\\Downloads\\human_chat.txt"]
    tokens = load_and_preprocess(file_paths, batch_size)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(tokens)
    sequences = tokenizer.texts_to_sequences(tokens)

    # Turn every encoded document into (context, next word) pairs: for each
    # position the preceding tokens are the input and the token at that
    # position is the label. Without explicit labels the model cannot be
    # trained at all.
    contexts = []
    next_words = []
    for sequence in sequences:
        for position in range(1, len(sequence)):
            start = max(0, position - context_window)
            contexts.append(sequence[start:position])
            next_words.append(sequence[position])
    if not contexts:
        raise ValueError("Not enough text to build any training examples.")

    max_sequence_length = max(len(context) for context in contexts)
    padded_sequences = pad_sequences(contexts, maxlen=max_sequence_length)
    targets = np.array(next_words)

    vocab_size = len(tokenizer.word_index) + 1
    embedding_dim = 100
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_sequence_length))
    model.add(LSTM(128))
    model.add(Dense(vocab_size, activation='softmax'))
    # Targets are integer word indices, so use the sparse variant of the
    # loss instead of one-hot encoding a vocab_size-wide label matrix.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=3)

    model.fit(
        padded_sequences,
        targets,
        epochs=10,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
    )

    input_sentence = input("Enter a word or a sentence: ")
    # Normalize the user's text exactly like the training corpus (stopword
    # removal, stemming, lemmatization) so its words match the vocabulary.
    normalized_sentence = " ".join(preprocess_text(input_sentence))

    predicted_word_lstm = predict_next_word_lstm(tokenizer, model, normalized_sentence, max_sequence_length)
    print("Predicted next word (LSTM):", predicted_word_lstm)

# Run the training/prediction pipeline only when this file is executed as a
# script, not when it is imported.
# NOTE(review): load_and_preprocess starts a multiprocessing.Pool; under the
# "spawn" start method worker processes re-import the main module, so this
# guard also keeps them from re-running main() — confirm for the target OS.
if __name__ == "__main__":
    main()
