In [2]:
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from transformers import pipeline
import os


In [3]:
# Load and preprocess dataset
DATASET_PATH = 'metamorphosis_clean.txt'
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
    corpus = f.read().lower().split("\n")

In [4]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [5]:
# Create input sequences
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i+1])


In [6]:
# Pad sequences
max_sequence_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = np.array(y)  # No need for one-hot encoding with sparse categorical loss

In [7]:
# Build Improved BiLSTM Model
model = Sequential([
    Embedding(total_words, 256, input_length=max_sequence_length - 1),
    Bidirectional(LSTM(256, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(128)),
    Dropout(0.2),
    Dense(total_words, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [9]:
# Train model
epochs = 5
model.fit(X, y, epochs=epochs, verbose=1)

Epoch 1/5
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 158ms/step - accuracy: 0.2280 - loss: 3.6963
Epoch 2/5
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 155ms/step - accuracy: 0.2442 - loss: 3.5521
Epoch 3/5
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 158ms/step - accuracy: 0.2559 - loss: 3.4752
Epoch 4/5
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 156ms/step - accuracy: 0.2680 - loss: 3.3604
Epoch 5/5
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 162ms/step - accuracy: 0.2828 - loss: 3.2711


<keras.src.callbacks.history.History at 0x299e7b37b60>

In [10]:
# Save model and tokenizer
model.save('bilstm_model.h5')
with open('tokenizer1.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print("✅ Training completed! Model and tokenizer saved.")




✅ Training completed! Model and tokenizer saved.


In [11]:
# Load BiLSTM Model
bilstm_model = load_model('bilstm_model.h5')
print("✅ BiLSTM model loaded successfully!")




✅ BiLSTM model loaded successfully!


In [12]:
# Load tokenizer
with open('tokenizer1.pkl', 'rb') as f:
    tokenizer = pickle.load(f)
print("✅ Tokenizer loaded successfully!")

✅ Tokenizer loaded successfully!


In [13]:
# Load BERT fill-mask pipeline
fill_mask = pipeline("fill-mask", model="bert-base-uncased")





Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [14]:
# Load dataset vocabulary
if os.path.exists(DATASET_PATH):
    with open(DATASET_PATH, 'r', encoding='utf-8') as f:
        dataset_words = set(f.read().split())
else:
    dataset_words = set()

In [15]:
# Inappropriate words filter
BAD_WORDS = {"damn", "hell", "shit", "fuck", "bitch", "bastard", "ass", "asshole", "dumbass", "jackass", 
             "motherfucker", "cock", "piss", "crap", "slut", "whore", "dick", "cunt", "nigger", 
             "retard", "faggot", "twat", "wanker", "moron", "idiot", "stupid"}

# Ensure valid words
def is_valid_word(word):
    return word.lower() not in BAD_WORDS

In [16]:
# Predict next word using BiLSTM
def predict_next_word_bilstm(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length-1, padding='pre')
    prediction = bilstm_model.predict(padded_sequence)
    predicted_word = tokenizer.index_word.get(np.argmax(prediction), "unknown")
    return predicted_word if is_valid_word(predicted_word) else "[filtered]"

In [17]:
# Predict next word using BERT
def predict_next_word_bert(text):
    masked_text = text + " [MASK]."
    predictions = fill_mask(masked_text)
    for pred in predictions:
        word = pred['token_str']
        if is_valid_word(word):
            return word
    return "[filtered]"

In [18]:
# Hybrid model prediction
def predict_next_word(text):
    words = text.split()
    last_word = words[-1] if words else ""
    if last_word in dataset_words:
        return predict_next_word_bilstm(text)
    else:
        new_word = predict_next_word_bert(text)
        if new_word != "[filtered]":
            dataset_words.add(new_word)
            with open(DATASET_PATH, 'a', encoding='utf-8') as f:
                f.write(f" {new_word}")  # Save valid words
        return new_word

In [19]:
# Predict multiple words
def Predict_Next_Words(text, num_words):
    predicted_sentence = text
    for _ in range(num_words):
        next_word = predict_next_word(predicted_sentence)
        predicted_sentence += " " + next_word.strip()
    return predicted_sentence

In [20]:
# Test the model
if __name__ == "__main__":
    input_text = "The book was"
    num_predictions = 5
    result = Predict_Next_Words(input_text, num_predictions)
    print(f"\n🔹 Input: {input_text}\n✅ Predicted Sentence: {result}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 581ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step

🔹 Input: The book was
✅ Predicted Sentence: The book was and then he had been


In [22]:
# Take user input
user_input = input("Enter a starting phrase: ")
num_words_to_predict = int(input("Enter the number of words to predict: "))

predicted_sentence = Predict_Next_Words(user_input, num_words_to_predict)
print(f"\n🔹 Input: {user_input}\n✅ Predicted Sentence: {predicted_sentence}")

ValueError: invalid literal for int() with base 10: 'Just from each'