In [1]:
import json
with open('preprocessed_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [2]:
words = []
types = []
for sentence in data:
    word_seq, type_seq = zip(*sentence)
    words.append(list(word_seq))
    types.append(list(type_seq))

In [3]:
print(words[:10])
print(types[:10])

[['Cha con', 'đổi', 'nhà', ',', 'thủ tục', 'có', 'rắc rối', '?.'], ['Hướng dẫn', 'chi tiết', 'cách', 'làm', 'visa', 'đi', 'Úc', 'tự túc', 'nhanh', 'nhất', '.'], ['Hà Nội', ':', 'Cụ bà', '97', 'tuổi', 'với', 'hành trình', 'đi', 'đòi', 'nhà', 'qua', 'hai', 'thập kỷ', '.'], ["'", 'Mạnh', 'tay', "'", 'với', 'nạn', 'cho', 'vay', 'nặng', 'lãi', '.'], ['BIDV', 'vạch', 'rõ', '6', 'hệ lụy khi', 'Viện Kiểm', 'sát', 'đề nghị', 'thu hồi', '2', '.', '550', 'tỷ đồng', '.'], ['Ngành', 'Tư pháp', 'Đà Nẵng', 'cần', 'sẵn sàng', 'nội lực', 'và', 'tâm thế', 'cho', 'năm 2018', 'thật', 'xuất sắc', '!.'], ['Đà Nẵng', ':', 'Dân', 'tố', 'doanh nghiệp', 'lấy', 'đất', 'đã', 'bán xây', 'công trình', '.'], ['Xuân', 'này', 'với', 'ngành', 'Tư pháp', 'Đà Nẵng', '....'], ['Dự án nhà', 'liền kề', 'nhưng', 'lại', 'bán', 'đất nền', '.'], ['Thời hạn', 'thực hiện', 'yêu cầu', 'chứng thực', '.']]
[['NOUN', 'VERB', 'NOUN', 'PUNCT', 'NOUN', 'VERB', 'ADJ', 'PUNCT'], ['VERB', 'ADJ', 'NOUN', 'VERB', 'NOUN', 'VERB', 'PROPN', 'VE

In [4]:
# Create word_to_index mapping
word_to_index = {}
index = 1  # Start index from 1, leaving 0 for padding
for sentence in words:
    for word in sentence:
        if word not in word_to_index:
            word_to_index[word] = index
            index += 1

# Add special tokens for padding and unknown words
word_to_index['<PAD>'] = 0
word_to_index['<UNK>'] = index

# You can optionally reverse the dictionary for index_to_word mapping
index_to_word = {index: word for word, index in word_to_index.items()}

# Create type_to_index mapping
type_to_index = {}
index = 0  # Start index from 0
for sentence in types:
    for type in sentence:
        if type not in type_to_index:
            type_to_index[type] = index
            index += 1

# You can optionally reverse the dictionary for index_to_type mapping
index_to_type = {index: type for type, index in type_to_index.items()}


In [5]:
vocab_size = len(word_to_index)
max_length = 50

# Convert words and types to numerical sequences
word_sequences = [[word_to_index[word] for word in sentence] for sentence in words]
type_sequences = [[type_to_index[type] for type in sentence] for sentence in types]


In [6]:
X = []
Y = []
for word_seq, type_seq in zip(word_sequences, type_sequences):
    for i in range(1, len(word_seq)):
        X.append(word_seq[:i] + type_seq[:i])
        Y.append(word_seq[i])

In [7]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

X_padded = pad_sequences(X, maxlen=max_length, padding='pre')
Y_categorical = to_categorical(Y, num_classes=vocab_size)

2024-04-11 21:37:21.971400: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding

model = Sequential()
model.add(Embedding(vocab_size, 100)) #  input_length=max_length
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))

In [9]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_padded, Y_categorical, epochs=50, batch_size=16, validation_split=0.2)

2024-04-11 21:38:00.018096: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 5563781760 exceeds 10% of free system memory.


Epoch 1/50
[1m5014/5014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.0997 - loss: 7.8763

2024-04-11 21:42:05.237057: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1391014800 exceeds 10% of free system memory.


[1m5014/5014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m222s[0m 44ms/step - accuracy: 0.0997 - loss: 7.8763 - val_accuracy: 0.1179 - val_loss: 7.5720
Epoch 2/50
[1m5014/5014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 39ms/step - accuracy: 0.1234 - loss: 6.9495 - val_accuracy: 0.1302 - val_loss: 7.5487
Epoch 3/50
[1m5014/5014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 40ms/step - accuracy: 0.1405 - loss: 6.4231 - val_accuracy: 0.1346 - val_loss: 7.6580
Epoch 4/50
[1m5014/5014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 39ms/step - accuracy: 0.1598 - loss: 5.9693 - val_accuracy: 0.1409 - val_loss: 7.7613
Epoch 5/50
[1m2962/5014[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m1:17[0m 38ms/step - accuracy: 0.1793 - loss: 5.5392

In [None]:
def generate_text(seed_sequence, length=10):
    for _ in range(length):
        encoded = [word_to_index[word] for word in seed_sequence]
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        predicted_index = model.predict_classes(encoded, verbose=0)
        predicted_word = index_to_word[predicted_index[0]]
        seed_sequence += ' ' + predicted_word
    return seed_sequence

In [None]:
seed_sequence = 'Hướng dẫn chi tiết cách làm'
generated_text = generate_text(seed_sequence)
print("Generated text:", generated_text)