In [1]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
text_data = [
    "hello",
    "how are you doing",
    "what is your name",
    "where are you from",
    "hi, how are you doing?",
    "i'm fine. how about yourself?",
    "i'm pretty good. thanks for asking.",
    "no problem. so how have you been?",
    "i've been great. what about you?",
    "i've been good. i'm in school right now."
    "what school do you go to?",
    "i go to pcc.",
    "do you like it there?",
    "it's okay. it's a really big campus."
    "good luck with school.",
    "how's it going?",
    "i'm doing well. how about you?"
    "never better, thanks.",
    "so how have you been lately?",
    "i've actually been pretty good. you?",
    "i'm actually in school right now.",
    "which school do you attend?",
    "i'm attending pcc right now.",
    "are you enjoying it there?"
]
# text_file = open("dialogs.txt")
# text_data = text_file.read()
# print(text_data)


In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

In [4]:
input_sequences = []
next_words = []
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence[:-1])
        next_words.append(n_gram_sequence[-1])

In [5]:
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [6]:
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]

In [7]:
# Build the model
model = Sequential()
model.add(Embedding(total_words, 60, input_length=max_sequence_len-1))
model.add(LSTM(200))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(predictors, label, epochs=600, verbose=2)
model.save('PANDA.h5')

Epoch 1/600
4/4 - 2s - loss: 4.0906 - accuracy: 0.0500 - 2s/epoch - 540ms/step
Epoch 2/600
4/4 - 0s - loss: 4.0610 - accuracy: 0.1500 - 58ms/epoch - 14ms/step
Epoch 3/600
4/4 - 0s - loss: 4.0159 - accuracy: 0.1600 - 57ms/epoch - 14ms/step
Epoch 4/600
4/4 - 0s - loss: 3.9145 - accuracy: 0.1000 - 63ms/epoch - 16ms/step
Epoch 5/600
4/4 - 0s - loss: 3.7688 - accuracy: 0.1100 - 56ms/epoch - 14ms/step
Epoch 6/600
4/4 - 0s - loss: 3.7241 - accuracy: 0.1100 - 58ms/epoch - 14ms/step
Epoch 7/600
4/4 - 0s - loss: 3.7163 - accuracy: 0.1300 - 57ms/epoch - 14ms/step
Epoch 8/600
4/4 - 0s - loss: 3.7009 - accuracy: 0.1400 - 51ms/epoch - 13ms/step
Epoch 9/600
4/4 - 0s - loss: 3.6848 - accuracy: 0.1000 - 54ms/epoch - 14ms/step
Epoch 10/600
4/4 - 0s - loss: 3.6748 - accuracy: 0.1000 - 48ms/epoch - 12ms/step
Epoch 11/600
4/4 - 0s - loss: 3.6559 - accuracy: 0.1000 - 50ms/epoch - 12ms/step
Epoch 12/600
4/4 - 0s - loss: 3.6449 - accuracy: 0.1000 - 50ms/epoch - 13ms/step
Epoch 13/600
4/4 - 0s - loss: 3.6384 -

In [8]:
def predict_next_word(seed_text):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_index = np.argmax(predicted)  # Get the index with the highest probability
    predicted_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted_index:
            predicted_word = word
            break
    return predicted_word

In [9]:
print('I\'m PANDA , Paradigm-based Artificial Neural Dialogue Agent , A Language Model which is able to predict next words')

I'm PANDA , Paradigm-based Artificial Neural Dialogue Agent , A Language Model which is able to predict next words


In [None]:
while True:
    user_input = input("user > ")
    response = predict_next_word(user_input)
    print("next word > ", response)

user >  hi


next word >  how


user >  hello


next word >  i'm


user >  hello how


next word >  are


user >  hello how are


next word >  you


user >  what


next word >  is


user >  what is 


next word >  your


user >  what is your


next word >  your


user >  hi


next word >  how


In [None]:
# text = "how"

# for i in range(5):
#     response = predict_next_word(text)
#     text = f'{text} {response}'
#     print(text)

# print(res)