In [40]:
import pandas as pd
from pyvi import ViTokenizer, ViPosTagger
import nltk
import numpy as np
import warnings
import time
warnings.filterwarnings('ignore')

In [41]:
# Topic files of the training corpus; the part before ".txt" is later used as the tag.
filename = [
    "bạn bè.txt",
    "các câu hỏi phức tạp.txt",
    "du lịch.txt",
    "gia đình.txt",
    "giải trí.txt",
    "học tập.txt",
    "nghề nghiệp.txt",
    "nghỉ lễ.txt",
    "người yêu.txt",
    "robot.txt",
    "shoping.txt",
    "sở thích.txt",
    "tdtu.txt",
    "thông tin cá nhân.txt",
    "trò chuyện về đi ăn.txt",
    "tán gẫu.txt",
    "đất nước.txt",
    "địa chỉ.txt",
]

In [42]:
# import train data
# Each usable line holds a question and an answer separated by the '__eou__'
# marker; the topic (file name without extension) is stored as the tag.
EOU = '__eou__'
temp_ques = []
temp_ans = []
tag = []
for name in filename:
    with open('dataset/' + name, encoding='utf-8') as f:
        lines = f.readlines()
    topic = name.split(".")[0]
    for line in lines:
        if line.startswith(EOU):
            continue
        # Split on the marker itself. str.strip('__eou__') would strip the
        # *characters* '_', 'e', 'o', 'u' and eat the first letters of
        # sentences that start with them.
        part = [piece.strip() for piece in line.split(EOU)]
        if len(part) < 2:
            # No separator (e.g. a blank line): nothing to pair up, skip it
            # instead of failing with an IndexError.
            continue
        temp_ques.append(ViTokenizer.tokenize(part[0].lower()))
        temp_ans.append(ViTokenizer.tokenize(part[1].lower()))
        tag.append(topic)

In [43]:
# Gather the parallel lists into one table: a row per question/answer pair.
columns = {'Question': temp_ques, 'Answer': temp_ans, 'Tag': tag}
data = pd.DataFrame(columns)
data.head()

Unnamed: 0,Question,Answer,Tag
0,thích đánh_lộn không ?,ngon nhà_vô,bạn bè
1,solo yasua không,chấp lun 2 mạng đầu,bạn bè
2,mai đi picnic không ?,mai bận học rồi,bạn bè
3,mai học ca mấy vậy ?,mai học ca 3,bạn bè
4,còn tiền không ?,còn chết liền,bạn bè


In [44]:
# Copy the two text columns out of the frame as NumPy arrays.
ques, ans = (np.array(data[column]) for column in ("Question", "Answer"))

In [45]:
# Positions of the pairs whose answer is an empty string.
lst_empty_answer_index = [pos for pos in range(len(ans)) if ans[pos] == ""]

In [46]:
# Drop every question/answer pair whose answer is empty.
# All positions are removed in a single np.delete call: deleting them one by
# one shifts the remaining elements left, so every index after the first
# would point at the wrong row.
empty_positions = np.asarray(lst_empty_answer_index, dtype=int)
ques = np.delete(ques, empty_positions)
ans = np.delete(ans, empty_positions)

In [47]:
def clean_sentences(sentences):
    """Lower-case, strip punctuation and normalise whitespace, in place.

    Parameters
    ----------
    sentences : mutable sequence of str (e.g. list or NumPy object array)
        Each element is overwritten with its cleaned version.

    Returns
    -------
    The same ``sentences`` object that was passed in, after cleaning.
    """
    # Single characters are filtered one by one.
    Punc = {'.', ',', '-', '“', '”', ':', '(', ')', '"', '!', '&', ';', '?', '*', ']', '>', '…', '’'}
    # Multi-character marks can never equal a single character, so they are
    # removed as substrings before the per-character filter runs.
    multi_char_punc = ('...', "``", "''")

    for i, sent in enumerate(sentences):
        sent = sent.lower()

        for mark in multi_char_punc:
            sent = sent.replace(mark, "")
        sent = "".join(char for char in sent if char not in Punc)

        # split()/join collapses whitespace runs of any length and trims both
        # ends; chained replace("   ")/replace("  ") left runs of 5+ spaces.
        sent = " ".join(sent.split())

        sentences[i] = sent
    return sentences

In [48]:
# Clean questions and answers. clean_sentences overwrites its argument and
# returns it, so clean_ques / clean_ans are the same objects as ques / ans.
clean_ques = clean_sentences(ques)
clean_ans = clean_sentences(ans)

In [49]:
# Frequency of every whitespace-separated token, questions first, then answers.
word2count = {}
for corpus in (clean_ques, clean_ans):
    for sentence in corpus:
        for token in sentence.split():
            word2count[token] = word2count.get(token, 0) + 1

In [50]:
# Number of distinct tokens seen in questions and answers.
len(word2count)

4766

In [51]:
# Keep tokens that occur at least `thresh` times and number them 0, 1, 2, ...
# in the order they were first counted.
thresh = 1
kept_words = [token for token, freq in word2count.items() if freq >= thresh]
word2index = {token: position for position, token in enumerate(kept_words)}
word_num = len(word2index)

In [52]:
# Vocabulary size after the frequency threshold (special tokens not added yet).
len(word2index)

4766

In [53]:
# Wrap every answer in the decoder start/end markers.
# The guard keeps the cell idempotent: running it again does not add a second
# pair of markers. (A cleaned answer cannot start with '<BOS> ' by itself,
# because clean_sentences removes '>'.)
for i in range(len(clean_ans)):
    if not clean_ans[i].startswith('<BOS> '):
        clean_ans[i] = '<BOS> ' + clean_ans[i] + ' <EOS>'

In [54]:
# Number of answers (training pairs) after wrapping.
len(clean_ans)

5769

In [55]:
# Append the special tokens after the regular vocabulary:
# <BOS>/<EOS> mark the start/end of an answer, <OUT> stands for unknown words.
tokens = ['<BOS>', '<EOS>', '<OUT>']
for token in tokens:
    # setdefault assigns the next free index only when the token is missing,
    # so re-running this cell leaves the existing indices untouched.
    word2index.setdefault(token, len(word2index))

In [56]:
# Vocabulary size including the three special tokens.
len(word2index)

4769

In [57]:
# Reverse lookup (index -> token), used to turn predictions back into text.
index2word = {position: token for token, position in word2index.items()}
len(index2word)

4769

In [60]:
# Encode each cleaned question as a list of vocabulary indices;
# tokens missing from the vocabulary map to the <OUT> index.
encoder_input = [
    [
        word2index[token] if token in word2index else word2index["<OUT>"]
        for token in question.split()
    ]
    for question in clean_ques
]

In [62]:
# One encoded sequence per question.
len(encoder_input)

5769

In [61]:
# Encode each wrapped answer (<BOS> ... <EOS>) as a list of vocabulary indices;
# tokens missing from the vocabulary map to the <OUT> index.
decoder_input = [
    [
        word2index[token] if token in word2index else word2index["<OUT>"]
        for token in answer.split()
    ]
    for answer in clean_ans
]

In [59]:
# One encoded sequence per answer; should equal the number of questions.
len(decoder_input)

5769

In [77]:
from keras.preprocessing.sequence import pad_sequences
# Fixed sequence length fed to the model.
MAX_LEN = 20
# Pad / truncate every index sequence to exactly MAX_LEN entries, both at the
# end of the sequence.
# NOTE(review): padded positions hold 0 (see the arrays displayed below), but 0
# is also the index word2index assigned to its first word, so padding and that
# word are indistinguishable to the model — consider reserving 0 for padding.
# NOTE(review): truncating='post' cuts the tail, so an answer longer than
# MAX_LEN tokens loses its trailing <EOS>.
encoder_input = pad_sequences(encoder_input, MAX_LEN, padding='post', truncating='post')
decoder_input = pad_sequences(decoder_input, MAX_LEN, padding='post', truncating='post')

In [65]:
# Targets are the decoder inputs shifted left by one step (the leading <BOS>
# is dropped), so at every position the model is asked for the next token.
decoder_final_output = [sequence[1:] for sequence in decoder_input]

In [66]:
# Peek at the first three shifted target sequences (one entry shorter than the inputs).
decoder_final_output[:3]

[array([ 986, 2522, 4767,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]),
 array([2523, 2252,  453,  490,  474, 4767,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]),
 array([   5,  803,    8,   15, 4767,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])]

In [67]:
# The shift removed one position; pad the targets back to MAX_LEN at the end
# so they line up step-for-step with decoder_input.
decoder_final_output = pad_sequences(decoder_final_output, MAX_LEN, padding='post', truncating='post')
# Show the first three padded target rows.
decoder_final_output[:3]

array([[ 986, 2522, 4767,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [2523, 2252,  453,  490,  474, 4767,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   5,  803,    8,   15, 4767,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]])

In [68]:
from keras.utils import to_categorical
# One-hot encode every target index over the whole vocabulary, as required by
# the 'categorical_crossentropy' loss used for training.
# NOTE(review): the result is a dense (samples, MAX_LEN, vocab) array — see the
# shape printed below — which is memory-hungry; integer targets with
# 'sparse_categorical_crossentropy' would avoid it. Confirm before changing.
decoder_final_output = to_categorical(decoder_final_output, len(word2index))

In [69]:
# (number of pairs, MAX_LEN, vocabulary size)
decoder_final_output.shape

(5769, 20, 4769)

In [70]:
from keras.layers import Embedding
from keras.layers import Input, Dense, LSTM, TimeDistributed
from keras.models import Model

In [71]:
# Define input
# Two integer-index inputs of length MAX_LEN: the question (encoder side) and
# the <BOS>-prefixed answer (decoder side).
enc_inp = Input(shape=(MAX_LEN, ))
dec_inp = Input(shape=(MAX_LEN, ))

VOCAB_SIZE = len(word2index)  # regular words plus the three special tokens
HIDDEN_DIM = 300              # number of LSTM units in encoder and decoder
embedding_dimention = 100     # size of each word vector

# Define embedding layer
# A single trainable embedding, applied to both inputs in the model cell.
# NOTE(review): word2index uses indices 0 .. VOCAB_SIZE - 1, so the extra row
# implied by VOCAB_SIZE + 1 looks unused — confirm whether index 0 was meant
# to be reserved for padding.
embed = Embedding(VOCAB_SIZE + 1, output_dim = embedding_dimention, input_length = MAX_LEN, trainable = True)

In [75]:
# Define encoder layers
# The encoder LSTM reads the embedded question; only its final hidden and cell
# states (h, c) are passed on, the per-step outputs enc_op are not used.
enc_embed = embed(enc_inp)
enc_lstm = LSTM(HIDDEN_DIM, return_sequences = True, return_state = True)
enc_op, h, c = enc_lstm(enc_embed)
enc_states = [h, c]

# Define decoder layers
# The decoder LSTM reads the embedded answer (same embedding layer) and starts
# from the encoder's final states.
dec_embed = embed(dec_inp)
dec_lstm = LSTM(HIDDEN_DIM, return_sequences = True, return_state = True)
dec_op, _, _ = dec_lstm(dec_embed, initial_state = enc_states)

# Softmax over the vocabulary, applied to every decoder time step.
dense = Dense(VOCAB_SIZE, activation = "softmax")

dense_op = dense(dec_op)

# Training model: [question, answer input] -> next-token distribution per step.
# NOTE: the inference cells further down pick layers by position
# (model.layers[2..5]), so the order in which layers are created here matters.
model = Model([enc_inp, dec_inp], dense_op)

In [88]:
# Train model
# Cross-entropy against the one-hot targets, Adam optimizer, accuracy as metric.
model.compile(loss='categorical_crossentropy',metrics=['acc'],optimizer='adam')

BATCH_SIZE = 32
EPOCHS = 100

# Teacher forcing: the decoder is fed the true answer and trained to predict
# the same answer shifted by one step.
# NOTE(review): no validation data is passed, so the reported accuracy is
# measured on the training set only.
model.fit([encoder_input, decoder_input],decoder_final_output,epochs=EPOCHS,batch_size=BATCH_SIZE)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x136decbd978>

In [89]:
# Save the trained model under the name "LSTM_model" (the log below shows it
# is written as a directory with an assets folder).
model.save("LSTM_model")

INFO:tensorflow:Assets written to: LSTM_model\assets


In [122]:
import keras.models
# Reload the trained model from the HDF5 file.
# NOTE(review): this cell appears before the cell that writes "LSTM_model.h5"
# (execution counts 122 vs 121), so a fresh Restart & Run All fails here unless
# the file already exists — move the save cell above this one.
model = keras.models.load_model("LSTM_model.h5")

In [121]:
# Save the model as a single HDF5 file; the load cell reads this file.
model.save("LSTM_model.h5")

In [123]:
# Print the layer list; the inference cells below index model.layers by
# position, so use this to check which layer sits at which index.
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 20, 100)      477000      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, 20, 300), (N 481200      embedding[0][0]       

In [124]:
#Load encoder model
enc_inp = model.input[0]
enc_op, h, c = model.layers[3].output
enc_states = [h, c]

enc_model = Model([enc_inp], enc_states)

In [125]:
#Load decoder model to predict next word
decoder_state_input_h = Input(shape=(HIDDEN_DIM,))
decoder_state_input_c = Input(shape=(HIDDEN_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_inp = model.input[1]
embed = model.layers[2] # Embedding layer
dec_lstm = model.layers[4] # Decoder layer
dense = model.layers[5] # Dense

dec_embed = embed(dec_inp)
decoder_outputs, state_h, state_c = dec_lstm(dec_embed , initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]
# output = dense(decoder_outputs)

dec_model = Model([dec_inp]+ decoder_states_inputs, [decoder_outputs] + decoder_states)

In [126]:
def clean_one_sent(sent):
    """Clean and word-segment one user sentence for inference.

    The sentence is lower-cased, stripped of punctuation and surplus
    whitespace, then segmented with ViTokenizer.

    Parameters
    ----------
    sent : str
        Raw text typed by the user.

    Returns
    -------
    str
        The cleaned, tokenized sentence.
    """
    # Reuse the training-time cleaner instead of a copy of its body, so the
    # two code paths cannot drift apart.
    sent = clean_sentences([sent])[0]

    # NOTE(review): the training data is tokenized before cleaning, while here
    # cleaning comes first — confirm both orders yield the same tokens.
    sent = ViTokenizer.tokenize(sent)

    return sent

In [127]:
def _encode_question(sentence):
    """Turn a cleaned sentence into a (1, MAX_LEN) array of vocabulary indices."""
    out_index = word2index['<OUT>']
    # Unknown words map to <OUT>; dict.get replaces the former bare `except`,
    # which would also have hidden unrelated errors.
    ids = [word2index.get(token, out_index) for token in sentence.split()]
    # Same padding and truncation settings as the training data
    # (pad_sequences truncates at the front unless told otherwise).
    return pad_sequences([ids], MAX_LEN, padding='post', truncating='post')


def _generate_answer(encoded_question):
    """Greedily decode an answer, one token at a time, until <EOS> or the length cap."""
    stat = enc_model.predict(encoded_question)

    # The decoder starts from the <BOS> token and the encoder's final states.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2index['<BOS>']

    decoded_translation = ''
    while True:
        dec_outputs, h, c = dec_model.predict([target_seq] + stat)
        # dec_model stops at the LSTM; project onto the vocabulary here.
        word_probs = dense(dec_outputs)
        sampled_word_index = int(np.argmax(word_probs[0, -1, :]))
        sampled_word = index2word[sampled_word_index]

        if sampled_word == '<EOS>':
            break
        decoded_translation += sampled_word + ' '
        # Safety cap in case the model never emits <EOS>.
        if len(decoded_translation.split()) > MAX_LEN + 1:
            break

        # Feed the chosen token and the updated states back into the decoder.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_word_index
        stat = [h, c]
    return decoded_translation


# Interactive chat: read a question, print the generated answer, stop on 'quit'.
while True:
    prepro1 = clean_one_sent(input("Question : "))
    if prepro1 == 'quit':
        # Leave immediately instead of sending 'quit' through the model.
        break
    print("Answer : ", _generate_answer(_encode_question(prepro1)))

Question : bạn có crush chưa?
Answer :  có rồi bạn 
Question : quit
Answer :  
