In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the corpus as a list of lines with surrounding whitespace removed
# (blank lines are kept as empty strings).
# The encoding is passed explicitly so decoding does not depend on the
# platform default; 'utf-8-sig' additionally drops a leading byte-order mark
# if the file has one, so it cannot end up attached to the first word.
with open('/kaggle/input/next-word-prediction/1661-0.txt', 'r', encoding='utf-8-sig') as f:
    lines = [line.strip() for line in f]

In [None]:
from sklearn.model_selection import train_test_split
# No split is performed in this cell: no features (X) or targets (y) exist yet
# at this point in the notebook, so calling train_test_split here raised a
# NameError on a fresh kernel and stopped "Run All". If a hold-out set is
# wanted, split only after the input/target arrays have been built.

In [None]:
# Example data preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Corpus: one string per line of the source text
text_data = lines

# Build the vocabulary from the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)

# Lookup tables in both directions (word -> id and id -> word)
word_to_index = tokenizer.word_index
index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))

# Integer-encode every line
sequences = tokenizer.texts_to_sequences(text_data)

# For each line, every non-empty proper prefix is an input and the token
# immediately after that prefix is its target.
input_sequences = [
    encoded[:cut]
    for encoded in sequences
    for cut in range(1, len(encoded))
]
output_sequences = [
    encoded[cut]
    for encoded in sequences
    for cut in range(1, len(encoded))
]

# Left-pad every prefix to the length of the longest one
max_sequence_length = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Network: word embeddings -> two stacked LSTM layers -> softmax over the vocabulary.
# The size is the number of known words plus one (presumably to leave room for
# id 0, which the padded inputs use as filler).
vocab_size = len(word_to_index) + 1
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=max_sequence_length),
    LSTM(128, return_sequences=True),  # emits the full sequence for the next LSTM
    LSTM(128),                         # emits only the final state
    Dense(vocab_size, activation='softmax'),
])

In [None]:
# Targets are integer word ids (not one-hot vectors), hence the sparse loss
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Show the padded input sequences as this cell's output for a quick visual check
input_sequences

In [None]:
# Turn the collected target word ids into a NumPy array for training
output_sequences = np.asarray(output_sequences)

In [None]:
# Train on the padded prefixes (inputs) and their next-word ids (targets)
model.fit(
    x=input_sequences,
    y=output_sequences,
    batch_size=500,
    epochs=150,
)

In [None]:
# Predict the word most likely to follow a phrase typed by the user.
# NOTE(review): input() needs an interactive session; confirm this cell is not
# expected to run unattended.
input_text = input()
input_sequence = tokenizer.texts_to_sequences([input_text])[0]

if not input_sequence:
    # Blank input, or none of the typed words are in the tokenizer vocabulary:
    # the padded sequence would be all filler, so a prediction would not
    # reflect the input at all.
    predicted_word_id = None
    predicted_word = None
    print("No known words in the input; cannot predict a next word.")
else:
    # Same left-padding and fixed length as the training inputs
    input_sequence = pad_sequences([input_sequence], maxlen=max_sequence_length, padding='pre')

    # One input row goes in, so take row 0 of the output and pick the
    # highest-scoring id; int() gives a plain Python int for the dict lookup.
    probabilities = model.predict(input_sequence)[0]
    predicted_word_id = int(np.argmax(probabilities))
    predicted_word = index_to_word.get(predicted_word_id)

    if predicted_word is None:
        # The winning id has no vocabulary entry (e.g. the padding id)
        print("Model predicted an index with no vocabulary entry; no next word available.")
    else:
        print(f"Predicted next word: {predicted_word}")