In [None]:
import pandas as pd
import numpy as np
import string
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import save_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *

In [None]:
import bs4
import requests

# Download the Vietnamese Wikisource page for "Truyện Kiều".
response = requests.get(
    "https://vi.wikisource.org/wiki/Truy%E1%BB%87n_Ki%E1%BB%81u",
    timeout=30,  # do not let a stalled connection hang the kernel forever
)
# requests.get() never returns None, so an `is not None` test cannot detect a
# failed download. Raise on 4xx/5xx instead of parsing an error page as the poem.
response.raise_for_status()

html = bs4.BeautifulSoup(response.text, 'html.parser')
title = html.select("#firstHeading")[0].text
paragraphs = html.select("p")

# Only the first five <p> elements are used; their text is joined with newlines.
intro = '\n'.join(para.text for para in paragraphs[0:5])

# One entry per line of text, skipping the first two lines
# (presumably page front matter rather than verse -- confirm against the page).
truyenkieu = intro.split("\n")[2:]

In [None]:
# Characters deleted from every line: ASCII punctuation, digits, quotes, newline.
punct = string.punctuation + string.digits + "''" + "\n"


def clean_data(text):
    """Return a lower-cased copy of `text` with markup and punctuation removed.

    The text is lower-cased first, then a fixed list of HTML fragments, tabs
    and apostrophes is replaced in order, and finally every character listed
    in `punct` is deleted. Spaces are left untouched.
    """
    substitutions = (
        ('[<p style="text-align: center;">', ""),
        ('<br/>\n', ""),
        ('</p>, <p style="text-align: center;">', " "),
        ('</p>]', ""),
        ('\t', ""),
        ('\'', ""),
    )
    cleaned = text.lower()
    # Order matters: later patterns are matched against the already-edited text.
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    # The third maketrans argument lists the characters to delete.
    return cleaned.translate(str.maketrans(" ", " ", punct))
# Clean every scraped line and keep only the non-empty results.
# The list is built in a single pass: calling corpus.remove() while looping
# over corpus skips the element that follows each removal, so runs of
# consecutive empty lines were not fully removed.
corpus = [cleaned for cleaned in map(clean_data, truyenkieu) if cleaned]

In [None]:
# Build next-word training pairs: for every line, each proper prefix of its
# word list (length 1 .. n-1) is an input and the word right after it is the label.
prefix_label_pairs = [
    (words[:cut], words[cut])
    for words in map(str.split, corpus)
    for cut in range(1, len(words))
]
train_sentences = [prefix for prefix, _ in prefix_label_pairs]
train_labels = [target for _, target in prefix_label_pairs]


In [None]:
# Show the first ten (input prefix, next word) pairs as a sanity check.
preview_template = "input data : {} ===> output data: {}"
for idx in range(10):
    print(preview_template.format(train_sentences[idx], train_labels[idx]))

input data : ['trăm'] ===> output data: năm
input data : ['trăm', 'năm'] ===> output data: trong
input data : ['trăm', 'năm', 'trong'] ===> output data: cõi
input data : ['trăm', 'năm', 'trong', 'cõi'] ===> output data: người
input data : ['trăm', 'năm', 'trong', 'cõi', 'người'] ===> output data: ta
input data : ['chữ'] ===> output data: tài
input data : ['chữ', 'tài'] ===> output data: chữ
input data : ['chữ', 'tài', 'chữ'] ===> output data: mệnh
input data : ['chữ', 'tài', 'chữ', 'mệnh'] ===> output data: khéo
input data : ['chữ', 'tài', 'chữ', 'mệnh', 'khéo'] ===> output data: là


In [None]:
# Hyperparameters used by the tokenizer, padding and model cells below.
hidden_size: int = 256  # number of units in the recurrent layer
embedding: int = 300    # output dimension of the Embedding layer
max_len: int = 7        # length every input sequence is padded/truncated to
vocab_size: int = 2410  # passed to Tokenizer(num_words=...)

In [None]:
# Fit a word-level tokenizer on the cleaned corpus. "<OOV>" stands in for any
# word that is unknown or falls outside the num_words cap.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(corpus)

# Number of classes/embedding rows: one per word_index entry, plus one because
# word indices start at 1 (0 is the value pad_sequences fills with).
num_word = len(tokenizer.word_index) + 1

# Convert each word-list prefix into a list of integer ids and preview a few.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_sequences[:5]

[[141], [141, 74], [141, 74, 27], [141, 74, 27, 498], [141, 74, 27, 498, 4]]

In [None]:
# Preview only the first 20 vocabulary entries; printing the whole word_index
# floods the notebook output with thousands of items.
dict(list(tokenizer.word_index.items())[:20])

{'<OOV>': 1, 'một': 2, 'đã': 3, 'người': 4, 'nàng': 5, 'lòng': 6, 'lời': 7, 'cho': 8, 'là': 9, 'cũng': 10, 'có': 11, 'rằng': 12, 'ra': 13, 'lại': 14, 'hoa': 15, 'tình': 16, 'mới': 17, 'còn': 18, 'đâu': 19, 'ai': 20, 'chẳng': 21, 'mà': 22, 'thì': 23, 'mình': 24, 'biết': 25, 'này': 26, 'trong': 27, 'đến': 28, 'đường': 29, 'nhà': 30, 'càng': 31, 'nào': 32, 'ngày': 33, 'trời': 34, 'thân': 35, 'như': 36, 'vào': 37, 'sao': 38, 'mặt': 39, 'khi': 40, 'vàng': 41, 'duyên': 42, 'xa': 43, 'về': 44, 'tay': 45, 'sinh': 46, 'trước': 47, 'làm': 48, 'chàng': 49, 'thôi': 50, 'thấy': 51, 'chi': 52, 'nghe': 53, 'những': 54, 'sau': 55, 'hai': 56, 'nỗi': 57, 'từ': 58, 'nước': 59, 'hương': 60, 'nói': 61, 'trông': 62, 'xuân': 63, 'hồng': 64, 'phải': 65, 'ta': 66, 'con': 67, 'thương': 68, 'gió': 69, 'thế': 70, 'đây': 71, 'tiếng': 72, 'chưa': 73, 'năm': 74, 'mấy': 75, 'tơ': 76, 'ở': 77, 'với': 78, 'nghĩ': 79, 'chút': 80, 'gì': 81, 'xưa': 82, 'nhau': 83, 'đi': 84, 'bên': 85, 'giờ': 86, 'công': 87, 'được': 88, 'm

In [None]:
# Bring every id sequence to exactly max_len entries: shorter ones are
# zero-padded on the left, longer ones lose their leading ids.
train_padded_sequences = pad_sequences(
    train_sequences,
    maxlen=max_len,
    padding="pre",
    truncating="pre",
)
train_padded_sequences[:5]

array([[  0,   0,   0,   0,   0,   0, 141],
       [  0,   0,   0,   0,   0, 141,  74],
       [  0,   0,   0,   0, 141,  74,  27],
       [  0,   0,   0, 141,  74,  27, 498],
       [  0,   0, 141,  74,  27, 498,   4]], dtype=int32)

In [None]:
# Replace every target word with its tokenizer id (each label becomes a list of ids).
# NOTE: train_labels is overwritten in place, so re-running this cell without
# rebuilding the word labels first would feed ids, not words, to the tokenizer.
train_labels = tokenizer.texts_to_sequences(texts=train_labels)
train_labels[:5]

[[74], [27], [498], [4], [66]]

In [None]:
# One-hot encode the label ids into vectors of width num_word, matching the
# size of the model's softmax output layer.
train_labels = ku.to_categorical(
    train_labels,
    num_classes=num_word,
)
train_labels[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Simple RNN

In [None]:
# Hyperparameters for the Simple RNN section.
hidden_size: int = 256  # number of units in the SimpleRNN layer
embedding: int = 300    # output dimension of the Embedding layer
max_len: int = 7        # length of the padded input sequences
# NOTE(review): the earlier configuration cell uses vocab_size = 2410 and the
# model below is sized with num_word instead -- confirm which value is intended.
vocab_size: int = 2386

In [None]:
# Next-word model: Embedding -> SimpleRNN -> Dense(relu) -> softmax over num_word classes.
Simple_RNN = Sequential()
# Declare the real input length explicitly. pad_sequences produces sequences of
# max_len ids, so the previous `input_length=max_len-1` described one timestep
# too few; that Embedding argument is also deprecated in Keras 3.
Simple_RNN.add(Input(shape=(max_len,)))
Simple_RNN.add(Embedding(num_word, embedding))
Simple_RNN.add(SimpleRNN(hidden_size))
Simple_RNN.add(Dense(1000, activation='relu'))
# One output probability per token id (0 .. num_word-1), matching the one-hot labels.
Simple_RNN.add(Dense(num_word, activation='softmax'))



In [None]:
# Train with categorical cross-entropy against the one-hot labels and track accuracy.
Simple_RNN.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)
# Keep the History object returned by fit() so the training curves can be inspected.
Simple_RNN_history = Simple_RNN.fit(
    train_padded_sequences,
    train_labels,
    epochs=2,
)

Epoch 1/2
[1m611/611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 66ms/step - acc: 0.0140 - loss: 7.1084
Epoch 2/2
[1m611/611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 59ms/step - acc: 0.0360 - loss: 6.3034
