In [33]:
#install pyvi package
!pip install pyvi



In [34]:
import json
import os
import pickle
from random import randint

import gensim
import numpy as np
import pandas as pd
from pyvi import ViTokenizer

In [35]:
DATA_TRAIN_PATH = '/kaggle/input/vietnamese-text-classification-dskt/VNeseTextClassification/trainning.txt'
DATA_TEST_PATH = '/kaggle/input/vietnamese-text-classification-dskt/VNeseTextClassification/testing.txt'
LABEL_TEST_PATH = '/kaggle/input/vietnamese-text-classification-dskt/VNeseTextClassification/testing_labels.txt'
STOP_WORDS = '/kaggle/input/vietnamese-text-classification-dskt/stopwords-nlp-vi.txt'
SPECIAL_CHARACTER = '0123456789%@$.,=+-!;/()*"&^:#|\n\t\''

In [36]:
class FileReader(object):
    """Read auxiliary resources (stopword list, gensim dictionary) from disk.

    Args:
        file_path: Path of the file to read.
        encoder: Text encoding used to open the file; ``None`` means ``'utf-8'``.
    """

    def __init__(self, file_path, encoder=None):
        self.file_path = file_path
        self.encoder = encoder if encoder is not None else 'utf-8'

    def read_stopwords(self):
        """Return the stopwords stored one per line in ``file_path`` as a set.

        Every line is stripped of surrounding whitespace and its inner spaces
        are replaced by underscores (presumably so that multi-word stopwords
        compare equal to underscore-joined segmented tokens).
        """
        with open(self.file_path, 'r', encoding=self.encoder) as f:
            return {line.strip().replace(' ', '_') for line in f}

    def load_dictionary(self):
        """Load a gensim ``Dictionary`` stored in text format at ``file_path``."""
        # The bare name `corpora` is never imported in this notebook, so go
        # through the `gensim` package that is imported at the top.
        return gensim.corpora.Dictionary.load_from_text(self.file_path)

In [37]:
class FileStore(object):
    """Persist Python objects to ``file_path`` as JSON or as a pickle.

    Args:
        file_path: Destination path.
        data: Object written by :meth:`store_json`.
    """

    def __init__(self, file_path, data = None):
        self.file_path = file_path
        self.data = data

    def store_json(self):
        """Serialize ``self.data`` to ``file_path`` as JSON."""
        with open(self.file_path, 'w') as outfile:
            json.dump(self.data, outfile)

    def save_pickle(self,  obj):
        """Pickle ``obj`` to ``file_path`` using the highest pickle protocol.

        Uses the Python 3 ``pickle`` module (``cPickle`` only exists on
        Python 2). The deprecated ``Pickler.fast`` switch is not used.
        """
        # `with` guarantees the handle is closed even if pickling raises.
        with open(self.file_path, 'wb') as outfile:
            pickle.dump(obj, outfile, protocol=pickle.HIGHEST_PROTOCOL)

In [38]:
class NLP(object):
    """Pre-process one text: word segmentation, normalisation, stopword removal.

    Args:
        text: The raw text to process. Non-string values (e.g. ``None`` or a
            NaN coming from a DataFrame) yield an empty result.
    """

    # Stopword sets keyed by file path and shared by all instances, so the
    # stopword file is parsed once instead of once per document. Treat the
    # cached sets as read-only.
    _stopwords_cache = {}

    def __init__(self, text = None):
        self.text = text
        self.__set_stopwords()

    def __set_stopwords(self):
        """Attach the stopword set for ``STOP_WORDS`` (loaded on first use)."""
        cache = self._stopwords_cache
        if STOP_WORDS not in cache:
            cache[STOP_WORDS] = FileReader(STOP_WORDS).read_stopwords()
        self.stopwords = cache[STOP_WORDS]

    def segmentation(self):
        """Return ``self.text`` segmented by ``ViTokenizer.tokenize``."""
        return ViTokenizer.tokenize(self.text)

    def split_words(self):
        """Return the segmented tokens, lower-cased and stripped of special characters.

        Returns an empty list when ``self.text`` is not a string. Tokens that
        consist only of special characters come back as empty strings.
        """
        # Check the type up front: the tokenizer call is what fails on
        # non-string input, so guarding only the split would never help.
        if not isinstance(self.text, str):
            return []
        return [token.strip(SPECIAL_CHARACTER).lower() for token in self.segmentation().split()]

    def get_words_feature(self):
        """Return the cleaned tokens joined by single spaces, without stopwords or empty tokens."""
        return ' '.join(
            word for word in self.split_words()
            if word and word not in self.stopwords
        )

In [39]:
class TrainDataLoader(object):
    """Load a tab-separated training file with one ``label<TAB>content`` row per line.

    Args:
        data_path: Path of the UTF-8 encoded, header-less TSV file.
    """

    def __init__(self, data_path):
        self.df = pd.read_csv(data_path, delimiter='\t', header=None, names=['label', 'content'], encoding="utf-8")

    def get_data(self):
        """Return ``(data, label)``: pre-processed texts and their labels, in file order."""
        # Iterate the columns directly; `iterrows()` builds a Series per row
        # and is needlessly slow for a plain two-column scan.
        data = [NLP(content).get_words_feature() for content in self.df['content']]
        label = self.df['label'].tolist()
        return data, label

In [40]:
class TestDataLoader:
    """Load test contents and labels from two line-aligned text files.

    Args:
        content_path: File with one document per line.
        label_path: File with one label per line, aligned with ``content_path``.
        encoding: Encoding used to open both files.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """

    def __init__(self, content_path, label_path, encoding='utf-8'):
        loaded = []
        for path in (content_path, label_path):
            with open(path, 'r', encoding=encoding) as handle:
                loaded.append(handle.readlines())
        self.content, self.labels = loaded

        if len(self.content) != len(self.labels):
            raise ValueError("Số lượng nội dung và nhãn không khớp.")

    def get_data(self):
        """Return ``(data, labels)``: pre-processed documents and stripped labels."""
        pairs = list(zip(self.content, self.labels))
        data = [NLP(text=raw_text.strip()).get_words_feature() for raw_text, _ in pairs]
        labels = [raw_label.strip() for _, raw_label in pairs]
        return data, labels

In [41]:
# Pre-process the training corpus, then preview the second document and its label.
X_data, y_data = TrainDataLoader(DATA_TRAIN_PATH).get_data()
print(X_data[1], y_data[1], sep='\n')

ngầm_hóa công_trình hạ_tầng kỹ_thuật ubnd tp chấp_thuận giao khu quản_lý giao_thông đô_thị phối_hợp quản_lý chuyên_ngành di_dời tái_lập công_trình hạ_tầng kỹ_thuật phạm_vi qui_hoạch xây_dựng dự_án phương_án ngầm_hóa riêng_biệt công_trình cáp_quang bưu_điện truyền_thông điện_lực cấp_nước ubnd tp chấp_thuận bưu_điện tp nghiên_cứu đầu_tư khai_thác xây_dựng đường_ống ngầm kỹ_thuật cáp_quang bưu_điện
__CTXH__


In [42]:
# Pre-process the test corpus, then preview the second document and its label.
X_test, y_test = TestDataLoader(DATA_TEST_PATH, LABEL_TEST_PATH).get_data()
print(X_test[1], y_test[1], sep='\n')

nước_giải hạn tận_dụng cầu_thang đem đơn_giản giải_nhiệt mùa khô hoành_hành bắt_đầu_vào mùa khô bắc vương_vấn chút hơi lạnh cư_dân miền nam đối_mặt nóng nắng đổ mùa khô dự_báo nóng điện sinh_hoạt hăm_he đòi tăng_giá giải_nhiệt bớt chủ đau_đầu phụ_thuộc thiết_bị máy_móc hiện_đại giải_pháp truyền_thống kiến_trúc_sư thầy_địa_lý khuyên cố_gắng lẽ đơn_giản thủy trị hỏa giải_pháp dễ_dàng ứng_dụng nằm ý_tưởng mùa khô gia_đình chuẩn xây_dựng đô_thị diện_tích sinh_hoạt nước_giải_nhiệt gọi hòn non_bộ đơn_giản vuông góc sân_nhà dây súng cây_cảnh dịu mát bớt nóng tường vách bê_tông ngồn_ngộn cũ cải_thiện không_gian sống tận_dụng cầu_thang góc bể cá chậu thả hoa tươi đơn_giản hao tốn điện trở_nên dịu mát hẳn góc thư giãn chung_cư sân góc sân tận_dụng góc cầu_thang mát góc sân_nhà tận_dụng góc sân_nhà tận_dụng hiên sân_nhà mát không_gian sống
__DS__


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary and weights on the training texts only, then use
# the same fitted vectorizer to encode both the training and the test texts.
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=8000).fit(X_data)

tfidf_X_data, tfidf_X_test = (tfidf_vect.transform(texts) for texts in (X_data, X_test))

In [44]:
# Number of components kept by the TruncatedSVD step; lstm_model() reads the
# same value for the width of its input layer.
n_components = 150

In [45]:
from sklearn.decomposition import TruncatedSVD

# Fit the SVD projection on the training TF-IDF matrix only; `fit` returns the
# estimator itself, so construction and fitting can be chained.
svd = TruncatedSVD(n_components=n_components, random_state=2004).fit(tfidf_X_data)
# (Pickling the fitted `svd` to disk was left disabled in this notebook.)

tfidf_X_data_svd = svd.transform(tfidf_X_data)
tfidf_X_test_svd = svd.transform(tfidf_X_test)

In [46]:
from keras import models
from keras.models import *
from keras.layers import *
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam, SGD
from keras import optimizers
from sklearn.model_selection import train_test_split

In [47]:
def lstm_model(input_dim=None, num_classes=10):
    """Build the (uncompiled) LSTM + dense classifier.

    Args:
        input_dim: Length of each input feature vector. ``None`` (the default)
            uses the notebook-level ``n_components``.
        num_classes: Width of the softmax output layer. Defaults to 10.

    Returns:
        A Keras ``Model`` taking vectors of length ``input_dim`` and producing
        ``num_classes`` softmax scores.
    """
    if input_dim is None:
        input_dim = n_components

    input_layer = Input(shape=(input_dim,))
    # Each feature vector is presented to the LSTM as a sequence of length 1.
    # NOTE(review): with a single timestep there is no earlier step for
    # `recurrent_dropout` to act on — confirm this design is intended.
    layer = Reshape((1, input_dim))(input_layer)
    layer = LSTM(256, activation='relu', dropout=0.4, recurrent_dropout=0.3)(layer)

    # Dense stack, each dense layer followed by Dropout(0.3).
    for units in (512, 128, 256, 64):
        layer = Dense(units, activation='relu')(layer)
        layer = Dropout(0.3)(layer)

    output_layer = Dense(num_classes, activation='softmax')(layer)
    return models.Model(input_layer, output_layer)

In [48]:
# Build the network and print its layer-by-layer summary.
model = lstm_model()
model.summary()

In [49]:
from sklearn import preprocessing
import numpy
# Map the string labels to integer class ids. Despite the `_one_hot` suffix,
# LabelEncoder produces integer ids, which is the form that the
# 'sparse_categorical_crossentropy' loss used for training takes.
encoder = preprocessing.LabelEncoder()
y_data_one_hot = encoder.fit_transform(y_data)
# Reuse the mapping learned from the training labels. Re-fitting on the test
# labels could assign different ids whenever the two label sets differ;
# `transform` instead raises on a label that was never seen in training.
y_test_one_hot = encoder.transform(y_test)
# (Saving `encoder.classes_` to disk with numpy.save was left disabled in this notebook.)

In [50]:
# Inspect the integer-encoded test labels.
print(y_test_one_hot)

[1 1 4 ... 5 0 3]


In [51]:
# encoder.classes_ = numpy.load('/content/drive/My Drive/NLP/Model/classes.npy')

In [60]:
from keras.callbacks import ReduceLROnPlateau
# Choose the optimizer and learning rate.
learning_rate = 1e-4

# optimizer = 'adam'  # or another optimizer, e.g. Adam(learning_rate=0.001)
optimizer = SGD(learning_rate=learning_rate)
loss = 'sparse_categorical_crossentropy'

# Compile the model. The metric list is passed inline instead of being bound to
# a notebook-level name `metrics`, because that name is also used for the
# `sklearn.metrics` module and the two bindings would overwrite each other
# depending on the order in which cells are run.
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Multiply the learning rate by 0.1 after 5 epochs without improvement in validation accuracy.
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, patience=5)

In [61]:
from sklearn import metrics
from sklearn.metrics import classification_report

def train(model, X_data, y_data, X_test, y_test, n_epochs=50, callbacks=None, label_encoder=None):
    """Fit ``model`` and report train/validation accuracy plus a test classification report.

    Args:
        model: A compiled Keras model.
        X_data: Training features; 5% is held out for validation.
        y_data: Integer-encoded training labels.
        X_test: Test features.
        y_test: Integer-encoded test labels.
        n_epochs: Number of training epochs.
        callbacks: Keras callbacks for ``model.fit``. ``None`` (the default)
            uses the notebook-level ``reduce_lr`` callback.
        label_encoder: Fitted encoder used to turn class ids back into label
            names for the report. ``None`` (the default) uses the
            notebook-level ``encoder``.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    # Imported locally so this function does not depend on what the
    # notebook-level name `metrics` happens to be bound to.
    from sklearn.metrics import accuracy_score

    if callbacks is None:
        callbacks = [reduce_lr]
    if label_encoder is None:
        label_encoder = encoder

    # Split the training data into a training part and a validation part.
    X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.05, random_state=2019)
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=n_epochs, batch_size=512, callbacks=callbacks)

    # Class id = index of the highest softmax score.
    train_predictions = model.predict(X_train).argmax(axis=-1)
    val_predictions = model.predict(X_val).argmax(axis=-1)
    test_predictions = model.predict(X_test).argmax(axis=-1)

    print("Train accuracy: ", accuracy_score(y_train, train_predictions))
    print("Validation accuracy: ", accuracy_score(y_val, val_predictions))

    print(classification_report(label_encoder.inverse_transform(y_test), label_encoder.inverse_transform(test_predictions)))
    return history

In [54]:
# Number of training epochs passed to train() below.
n_epochs = 10

In [65]:
# Train on the SVD-reduced TF-IDF features and keep the Keras training history.
history = train(
    model=model,
    X_data=tfidf_X_data_svd,
    y_data=y_data_one_hot,
    X_test=tfidf_X_test_svd,
    y_test=y_test_one_hot,
    n_epochs=n_epochs,
)

Epoch 1/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.8351 - loss: 0.5394 - val_accuracy: 0.8840 - val_loss: 0.3587 - learning_rate: 1.0000e-06
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.8381 - loss: 0.5523 - val_accuracy: 0.8840 - val_loss: 0.3587 - learning_rate: 1.0000e-06
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.8342 - loss: 0.5607 - val_accuracy: 0.8840 - val_loss: 0.3587 - learning_rate: 1.0000e-06
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.8295 - loss: 0.5648 - val_accuracy: 0.8840 - val_loss: 0.3587 - learning_rate: 1.0000e-06
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.8193 - loss: 0.5949 - val_accuracy: 0.8840 - val_loss: 0.3587 - learning_rate: 1.0000e-06
Epoch 6/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [66]:
# File the trained model is written to and later reloaded from.
save_model_path = 'lstm_model.h5'

In [67]:
# Persist the trained model to `save_model_path`.
model.save(save_model_path)

In [70]:
# Reload the saved model through the explicitly imported `models` module
# rather than the star-imported `load_model` name.
model = models.load_model(save_model_path)