In [3]:
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [4]:
file  = open('data.txt', 'r', encoding = "utf8")

In [5]:
lines=[]
for i in file:
    lines.append(i)

In [6]:
lines

['\n',
 '\n',
 '\n',
 '\n',
 '                        THE ADVENTURES OF SHERLOCK HOLMES\n',
 '\n',
 '                               Arthur Conan Doyle\n',
 '\n',
 '\n',
 '\n',
 '                                Table of contents\n',
 '\n',
 '               A Scandal in Bohemia\n',
 '               The Red-Headed League\n',
 '               A Case of Identity\n',
 '               The Boscombe Valley Mystery\n',
 '               The Five Orange Pips\n',
 '               The Man with the Twisted Lip\n',
 '               The Adventure of the Blue Carbuncle\n',
 '               The Adventure of the Speckled Band\n',
 "               The Adventure of the Engineer's Thumb\n",
 '               The Adventure of the Noble Bachelor\n',
 '               The Adventure of the Beryl Coronet\n',
 '               The Adventure of the Copper Beeches\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '                              A SCANDAL IN BOHEMIA\n',
 '\n',
 '\n',
 '\n',
 '\n'

In [7]:
data=""
for i in lines:
    data = ' '.join(lines)

In [9]:
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')

In [10]:
data = data.split()
data = ' '.join(data)

In [14]:
data[:1000]

"THE ADVENTURES OF SHERLOCK HOLMES Arthur Conan Doyle Table of contents A Scandal in Bohemia The Red-Headed League A Case of Identity The Boscombe Valley Mystery The Five Orange Pips The Man with the Twisted Lip The Adventure of the Blue Carbuncle The Adventure of the Speckled Band The Adventure of the Engineer's Thumb The Adventure of the Noble Bachelor The Adventure of the Beryl Coronet The Adventure of the Copper Beeches A SCANDAL IN BOHEMIA Table of contents Chapter 1 Chapter 2 Chapter 3 CHAPTER I To Sherlock Holmes she is always the woman. I have seldom heard him mention her under any other name. In his eyes she eclipses and predominates the whole of her sex. It was not that he felt any emotion akin to love for Irene Adler. All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind. He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false positio

In [15]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

In [16]:
pickle.dump(tokenizer, open('token.pkl', 'wb'))

In [17]:
sequence_data = tokenizer.texts_to_sequences([data])[0]

In [18]:
sequence_data[:10]

[1, 1561, 5, 129, 34, 647, 4498, 4499, 226, 5]

In [20]:
len(sequence_data)

105879

In [22]:
vocab_size = len(tokenizer.word_index)+1
vocab_size

8200

In [23]:
sequence=[]
for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequence.append(words)

In [24]:
len(sequence)

105876

In [25]:
sequence = np.array(sequence)
sequence

array([[   1, 1561,    5,  129],
       [1561,    5,  129,   34],
       [   5,  129,   34,  647],
       ...,
       [  28,    1, 8198, 8199],
       [   1, 8198, 8199, 3187],
       [8198, 8199, 3187, 3186]])

In [30]:
X=[]
y=[]

for i in sequence:
    X.append(i[0:3])
    y.append(i[3])

In [31]:
X=np.array(X)
y=np.array(y)

In [32]:
y = to_categorical(y, num_classes=vocab_size)

In [33]:
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [34]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length = 3))
model.add(LSTM(1000, return_sequences = True))
model.add(LSTM(1000))
model.add(Dense(1000, activation='relu'))
model.add(Dense(vocab_size, activation="softmax"))



In [35]:
model.summary()

In [36]:
from tensorflow.keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint("next_words.h5", monitor="loss", verbose = 1, save_best_only=True)
model.compile(loss="categorical_crossentropy", optimizer = Adam(learning_rate = 0.001))
model.fit(X, y, epochs=2, batch_size=64, callbacks=[checkpoint])

Epoch 1/2
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 336ms/step - loss: 6.6619
Epoch 1: loss improved from inf to 6.34005, saving model to next_words.h5




[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m577s[0m 337ms/step - loss: 6.6617
Epoch 2/2
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 353ms/step - loss: 5.7852
Epoch 2: loss improved from 6.34005 to 5.75356, saving model to next_words.h5




[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m586s[0m 354ms/step - loss: 5.7852


<keras.src.callbacks.history.History at 0x2a10891c790>

In [None]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

model = load_model('next_words.h5')
tokenizer = pickle.load(open('token.pkl', 'rb'))

def predict_top_words(model, tokenizer, text, top_n=5):
    sequence = tokenizer.texts_to_sequences([text])
    sequence = np.array(sequence)

    sequence = sequence[:, -3:]
    while sequence.shape[1] < 3:
        sequence = np.insert(sequence, 0, 0, axis=1)

    preds = model.predict(sequence, verbose=0)[0]
    
    top_indices = np.argsort(preds)[-top_n:][::-1]

    predicted_words = []
    for idx in top_indices:
        for word, index in tokenizer.word_index.items():
            if index == idx:
                predicted_words.append(word)
                break
    return predicted_words

while True:
    text = input("Enter your line (or '1' to exit): ")
    if text == "1":
        break
    else:
        last_words = text.strip().lower().split()[-3:]  # last 3 words
        predicted_words = predict_top_words(model, tokenizer, last_words, top_n=5)
        
        print("\nTop 5 predictions:")
        for i, word in enumerate(predicted_words, 1):
            full_sentence = ' '.join(last_words) + ' ' + word
            print(f"{i}. {full_sentence}")
        print()









Top 5 predictions:
1. i am going to
2. i am going that
3. i am going in
4. i am going of
5. i am going a


Top 5 predictions:
1. do you know? have
2. do you know? had
3. do you know? not
4. do you know? know
5. do you know? think


Top 5 predictions:
1. hey?? up?? i
2. hey?? up?? and
3. hey?? up?? the
4. hey?? up?? in
5. hey?? up?? to


Top 5 predictions:
1. fuck you have
2. fuck you was
3. fuck you had
4. fuck you are
5. fuck you is

