# In [None]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from keras.callbacks import EarlyStopping
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import tkinter as tk
import os
import multiprocessing

class CustomKeyboard:
    """Accumulates the pieces of text a user submits through the keyboard."""

    def __init__(self):
        # Submitted inputs, in the order they were recorded.
        self.user_input_dataset = list()

    def record_user_input(self, user_input):
        """Add ``user_input`` to the end of the recorded dataset."""
        self.user_input_dataset += [user_input]

    def get_user_input_dataset(self):
        """Return the list of recorded inputs (the live list, not a copy)."""
        recorded = self.user_input_dataset
        return recorded

def predict_next_word_lstm(tokenizer, model, input_sentence):
    """Predict the word most likely to follow ``input_sentence``.

    Args:
        tokenizer: Fitted Keras ``Tokenizer`` whose vocabulary matches the
            model's output layer.
        model: Trained Keras model that maps a padded sequence of word
            indices to a probability distribution over the vocabulary.
        input_sentence: Raw text typed so far.

    Returns:
        The predicted word, or ``None`` when the sentence contains no word
        known to the tokenizer or when the most probable index has no word
        attached (index 0 is reserved for padding).
    """
    input_sequence = tokenizer.texts_to_sequences([input_sentence])
    if not input_sequence[0]:
        # Nothing the model knows about: let the caller fall back.
        return None

    # The expected sequence length is a property of the model itself, so it
    # is read from the model's input shape rather than from a global.
    max_sequence_length = model.input_shape[1]
    padded_input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length)

    probabilities = model.predict(padded_input_sequence)
    predicted_index = int(np.argmax(probabilities[0]))
    # ``.get`` avoids a KeyError for indices without a word (e.g. padding).
    return tokenizer.index_word.get(predicted_index)

def predict_from_normal_dataset(input_sentence, tokenizer, model):
    """Predict the next word after cleaning ``input_sentence`` first.

    The sentence is passed through ``preprocess_text`` (lower-casing,
    stop-word and non-alphanumeric removal) before being fed to the model.

    Args:
        input_sentence: Raw text typed so far.
        tokenizer: Fitted Keras ``Tokenizer`` whose vocabulary matches the
            model's output layer.
        model: Trained Keras model that maps a padded sequence of word
            indices to a probability distribution over the vocabulary.

    Returns:
        The predicted word, or ``None`` when no token of the cleaned sentence
        is known to the tokenizer or when the most probable index has no word
        attached (index 0 is reserved for padding).
    """
    # Preprocess input sentence into a list of tokens
    preprocessed_sentence = preprocess_text(input_sentence)
    # Map the tokens to vocabulary indices
    tokenized_sentence = tokenizer.texts_to_sequences([preprocessed_sentence])
    if not tokenized_sentence[0]:
        return None

    # Pad to the sequence length the model expects, taken from the model's
    # own input shape rather than from a global.
    max_sequence_length = model.input_shape[1]
    padded_input_sequence = pad_sequences(tokenized_sentence, maxlen=max_sequence_length)

    # Predict using LSTM model
    probabilities = model.predict(padded_input_sequence)
    predicted_index = int(np.argmax(probabilities[0]))
    # ``.get`` avoids a KeyError for indices without a word (e.g. padding).
    return tokenizer.index_word.get(predicted_index)

def preprocess_text(text):
    """Tokenize ``text`` and keep only its meaningful words.

    The text is lower-cased and split with NLTK's ``word_tokenize``; English
    stop words and tokens that are not purely alphanumeric are discarded.

    Returns:
        The surviving tokens as a list, in their original order.
    """
    words = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))
    return [
        word
        for word in words
        if word not in english_stop_words and word.isalnum()
    ]

def train_lstm_model(dataset, max_context_length=20):
    """Fit a tokenizer and an LSTM next-word model on ``dataset``.

    Every text is turned into (context, next word) training pairs: for each
    position in a text, the preceding words (at most ``max_context_length``
    of them) form the input and the word at that position is the target.

    Args:
        dataset: Iterable of texts; each text is either a string or a list
            of tokens.
        max_context_length: Upper bound on the number of preceding words fed
            to the model. Keeps the input width bounded when a text is very
            long (for example an entire file).

    Returns:
        A ``(tokenizer, model)`` tuple: the fitted ``Tokenizer`` and the
        trained Keras model.

    Raises:
        ValueError: If ``max_context_length`` is smaller than 1, or if no
            text in ``dataset`` contains at least two words, in which case
            there is nothing to predict from.
    """
    if max_context_length < 1:
        raise ValueError("max_context_length must be at least 1")

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(dataset)
    sequences = tokenizer.texts_to_sequences(dataset)

    # Build supervised pairs: a window of preceding word indices -> next index.
    contexts = []
    targets = []
    for sequence in sequences:
        for position in range(1, len(sequence)):
            window_start = max(0, position - max_context_length)
            contexts.append(sequence[window_start:position])
            targets.append(sequence[position])

    if not contexts:
        raise ValueError(
            "dataset must contain at least one text with two or more words"
        )

    max_sequence_length = max(len(context) for context in contexts)
    padded_sequences = pad_sequences(contexts, maxlen=max_sequence_length)
    target_indices = np.array(targets)

    vocab_size = len(tokenizer.word_index) + 1
    embedding_dim = 100
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_sequence_length))
    model.add(LSTM(128))
    model.add(Dense(vocab_size, activation='softmax'))
    # Targets are integer word indices, so the sparse variant of the loss is
    # used instead of one-hot encoding the whole vocabulary.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Holding out 20% of a handful of samples would leave nothing to train
    # or validate on, so validation is only enabled with enough data.
    validation_split = 0.2 if len(target_indices) >= 5 else 0.0
    model.fit(
        padded_sequences,
        target_indices,
        epochs=10,
        batch_size=32,
        validation_split=validation_split,
    )

    return tokenizer, model

def load_and_preprocess(file_paths, batch_size):
    """Read text files and preprocess them in parallel worker processes.

    Files are read in the calling process; each file's text is handed to a
    worker that runs ``preprocess_text`` on it. Results are collected every
    ``batch_size`` submissions so that only a bounded number of raw texts is
    in flight at once.

    Args:
        file_paths: Iterable of paths to UTF-8 encoded text files.
        batch_size: Number of submitted files after which pending results
            are collected.

    Returns:
        A list with one entry per file, in input order; each entry is the
        token list produced by ``preprocess_text`` for that file.
    """
    file_paths = list(file_paths)
    if not file_paths:
        # Avoid spawning worker processes when there is nothing to do.
        return []

    tokens = []
    # The context manager shuts the worker processes down on exit; all
    # results are fetched inside the block, before that happens.
    with multiprocessing.Pool() as pool:
        pending = []
        for file_path in file_paths:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
            pending.append(pool.apply_async(preprocess_text, args=(text,)))
            if len(pending) >= batch_size:
                tokens.extend(result.get() for result in pending)
                pending = []
        tokens.extend(result.get() for result in pending)
    return tokens

def main():
    """Train the next-word model and run the keyboard window.

    The model is first trained on the preprocessed corpus files. A small
    Tkinter window then lets the user record sentences and request a
    next-word prediction. After the window is closed, a model is trained
    on the recorded sentences, provided any were recorded.
    """
    keyboard = CustomKeyboard()
    file_paths = ["C:\\Users\\varsh\\Downloads\\train.csv"]

    # Load and preprocess text files
    batch_size = 10
    tokens = load_and_preprocess(file_paths, batch_size)

    # Train LSTM model using the processed data
    tokenizer, model = train_lstm_model(tokens)

    def record_input():
        """Store the current entry text and clear the entry field."""
        user_input = text_entry.get()
        # Blank submissions carry no training signal, so they are skipped.
        if user_input.strip():
            keyboard.record_user_input(user_input)
        text_entry.delete(0, tk.END)

    def predict_next_word():
        """Show the model's next-word prediction for the current entry text."""
        input_sentence = text_entry.get()
        predicted_word = predict_next_word_lstm(tokenizer, model, input_sentence)
        if predicted_word is None:
            predicted_word = predict_from_normal_dataset(input_sentence, tokenizer, model)
        if predicted_word is None:
            # Both predictors gave up; concatenating None would raise.
            prediction_label.config(text="No prediction available")
        else:
            prediction_label.config(text="Predicted next word: " + predicted_word)

    root = tk.Tk()
    root.title("Custom Keyboard")
    root.geometry("400x200")

    text_entry = tk.Entry(root, width=50)
    text_entry.pack(pady=10)

    record_button = tk.Button(root, text="Record Input", command=record_input)
    record_button.pack()

    predict_button = tk.Button(root, text="Predict Next Word", command=predict_next_word)
    predict_button.pack()

    prediction_label = tk.Label(root, text="")
    prediction_label.pack()

    # Blocks until the window is closed.
    root.mainloop()

    # Train LSTM model using the recorded user input dataset. Training on an
    # empty dataset fails, so it is skipped when nothing was recorded.
    user_input_dataset = keyboard.get_user_input_dataset()
    if user_input_dataset:
        tokenizer, model = train_lstm_model(user_input_dataset)

# Run the application only when this file is executed as a script. The guard
# also matters because the file uses multiprocessing: on platforms that start
# workers by re-importing the main module, it keeps them from re-running main().
if __name__ == "__main__":
    main()
