In [4]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences

In [5]:
# text_data = [
#     "hello",
#     "how are you doing",
#     "what is your name",
#     "where are you from",
#     "hi, how are you doing?",
#     "i'm fine. how about yourself?",
#     "i'm pretty good. thanks for asking.",
#     "no problem. so how have you been?",
#     "i've been great. what about you?",
#     "i've been good. i'm in school right now."
#     "what school do you go to?",
#     "i go to pcc.",
#     "do you like it there?",
#     "it's okay. it's a really big campus."
#     "good luck with school.",
#     "how's it going?",
#     "i'm doing well. how about you?"
#     "never better, thanks.",
#     "so how have you been lately?",
#     "i've actually been pretty good. you?",
#     "i'm actually in school right now.",
#     "which school do you attend?",
#     "i'm attending pcc right now.",
#     "are you enjoying it there?"
# ]
text_file = open("dialogs.txt")
text_data = text_file.read()
print(text_data)


In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

In [7]:
input_sequences = []
next_words = []
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence[:-1])
        next_words.append(n_gram_sequence[-1])

In [8]:
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [9]:
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]

In [10]:
# Build the model
model = Sequential()
model.add(Embedding(total_words, 60, input_length=max_sequence_len-1))
model.add(LSTM(200))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(predictors, label, epochs=600, verbose=2)
model.save('PANDA.h5')

Epoch 1/600
4/4 - 3s - loss: 4.0883 - accuracy: 0.0600 - 3s/epoch - 700ms/step
Epoch 2/600
4/4 - 0s - loss: 4.0528 - accuracy: 0.1000 - 73ms/epoch - 18ms/step
Epoch 3/600
4/4 - 0s - loss: 3.9836 - accuracy: 0.1000 - 63ms/epoch - 16ms/step
Epoch 4/600
4/4 - 0s - loss: 3.8161 - accuracy: 0.1000 - 81ms/epoch - 20ms/step
Epoch 5/600
4/4 - 0s - loss: 3.7889 - accuracy: 0.1000 - 71ms/epoch - 18ms/step
Epoch 6/600
4/4 - 0s - loss: 3.6778 - accuracy: 0.1000 - 65ms/epoch - 16ms/step
Epoch 7/600
4/4 - 0s - loss: 3.7146 - accuracy: 0.0700 - 63ms/epoch - 16ms/step
Epoch 8/600
4/4 - 0s - loss: 3.7083 - accuracy: 0.0400 - 71ms/epoch - 18ms/step
Epoch 9/600
4/4 - 0s - loss: 3.6857 - accuracy: 0.0400 - 74ms/epoch - 19ms/step
Epoch 10/600
4/4 - 0s - loss: 3.6794 - accuracy: 0.0400 - 69ms/epoch - 17ms/step
Epoch 11/600
4/4 - 0s - loss: 3.6566 - accuracy: 0.0600 - 70ms/epoch - 18ms/step
Epoch 12/600
4/4 - 0s - loss: 3.6420 - accuracy: 0.0600 - 56ms/epoch - 14ms/step
Epoch 13/600
4/4 - 0s - loss: 3.6408 -

In [11]:
# def predict_next_word(seed_text):
#     token_list = tokenizer.texts_to_sequences([seed_text])[0]
#     token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
#     predicted = model.predict(token_list, verbose=0)
#     predicted_index = np.argmax(predicted)  # Get the index with the highest probability
#     predicted_word = ""
#     for word, index in tokenizer.word_index.items():
#         if index == predicted_index:
#             predicted_word = word
#             break
#     return predicted_word

def completion(seed_text, num_words=5):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences(
        [token_list], 
        maxlen=max_sequence_len - 1,
        padding='pre'
    )
    predicted_words = []

    for _ in range(num_words):
        predicted = model.predict(token_list, verbose=0)
        predicted_index = np.argmax(predicted)
        predicted_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                predicted_word = word
                break
        predicted_words.append(predicted_word)
        token_list = np.append(
            token_list[:, 1:], 
            [[predicted_index]], 
            axis=1
        )

    response = ' '.join(predicted_words)
    return response

In [12]:
print('I\'m PANDA , Paradigm-based Artificial Neural Dialogue Agent , A Language Model which is able to predict next words')

I'm PANDA , Paradigm-based Artificial Neural Dialogue Agent , A Language Model which is able to predict next words


In [13]:
import os
import sys
import time


def typed(response):
    for char in response:
        sys.stdout.write(char)
        sys.stdout.flush()
        time.sleep(0.01)
    sys.stdout.write('\n')
    
    
os.system('cls')
while True:
    user_input = input("\n ↳ ")
    response = completion(user_input)
    typed(response)


i'm fine how about you
you enjoying it it it
enjoying it it it it
it's okay it's a really
enjoying it it how are
i'm fine how about you
to to big better better
i'm fine how about you
i'm fine how about you


KeyboardInterrupt: Interrupted by user

In [None]:
# text = "how"

# for i in range(5):
#     response = predict_next_word(text)
#     text = f'{text} {response}'
#     print(text)

# print(res)