In [583]:
from random import randint
import os
import json
import settings
import pickle
from pyvi import ViTokenizer
# from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from gensim import corpora, matutils
from sklearn.metrics import classification_report


## 1. Load Data

Dữ liệu sẽ được lưu vào biến để xử lý, dữ liệu đã được chia thành 2 phần data-train và data-test.

In [584]:
class FileReader(object):
    """Read one file in the different formats used by this notebook.

    Args:
        filePath: Path of the file that every ``read*`` / ``load*`` method opens.
        encoder: Text encoding used by :meth:`read` and :meth:`content`.
            ``None`` (the default) selects ``'utf-16le'``.
    """

    def __init__(self, filePath, encoder = None):
        self.filePath = filePath
        # Identity test: only a missing encoder falls back to the default.
        self.encoder = encoder if encoder is not None else 'utf-16le'

    def read(self):
        """Return the full text of the file decoded with ``self.encoder``."""
        with open(self.filePath, encoding=self.encoder) as f:
            return f.read()

    def content(self):
        """Return the file text; thin alias of :meth:`read`."""
        return self.read()

    def read_json(self):
        """Parse the file as JSON and return the decoded Python object.

        The file is opened as UTF-8 explicitly instead of relying on the
        platform default encoding, which differs between machines. JSON that
        was written with ``json.dump``'s default ``ensure_ascii=True`` is plain
        ASCII and therefore decodes identically.
        """
        with open(self.filePath, encoding='utf-8') as f:
            return json.load(f)

    def read_stopwords(self):
        """Return the stop words of a UTF-8 file with one entry per line.

        Each line is stripped of surrounding whitespace and inner spaces are
        replaced by ``'_'``. Blank lines are kept as the empty string, exactly
        as before.

        Returns:
            set[str]: The normalised stop words.
        """
        with open(self.filePath, 'r', encoding="utf-8") as f:
            return {line.strip().replace(' ', '_') for line in f}

    def load_dictionary(self):
        """Load a gensim ``corpora.Dictionary`` from its text serialisation."""
        return corpora.Dictionary.load_from_text(self.filePath)

In [585]:
class FileStore(object):
    """Write JSON, gensim dictionaries and pickles to one target path.

    Args:
        filePath: Destination path used by every ``store_*`` / ``save_*`` method.
        data: Object serialised by :meth:`store_json` (unused by the other
            methods).
    """

    def __init__(self, filePath, data = None):
        self.filePath = filePath
        self.data = data

    def store_json(self):
        """Serialise ``self.data`` to ``self.filePath`` as JSON.

        ``json.dump`` keeps its default ``ensure_ascii=True``, so non-ASCII
        characters are written as ``\\uXXXX`` escapes and the file content is
        pure ASCII; the explicit encoding only removes the dependency on the
        platform default.
        """
        with open(self.filePath, 'w', encoding='utf-8') as outfile:
            json.dump(self.data, outfile)

    def store_dictionary(self, dict_words, no_below=20, no_above=0.3):
        """Build a gensim dictionary from tokenised documents and save it as text.

        Args:
            dict_words: Iterable of token lists, one list per document.
            no_below: Passed to ``Dictionary.filter_extremes``; per gensim's
                API, tokens present in fewer documents than this are dropped.
            no_above: Passed to ``Dictionary.filter_extremes``; per gensim's
                API, tokens present in more than this fraction of documents
                are dropped.
        """
        dictionary = corpora.Dictionary(dict_words)
        dictionary.filter_extremes(no_below=no_below, no_above=no_above)
        dictionary.save_as_text(self.filePath)

    def save_pickle(self,  obj):
        """Pickle ``obj`` to ``self.filePath`` with the highest protocol.

        The file is opened in a ``with`` block so the handle is closed even
        when pickling fails part-way.
        """
        with open(self.filePath, 'wb') as outfile:
            fastPickler = pickle.Pickler(outfile, pickle.HIGHEST_PROTOCOL)
            # NOTE(review): `fast` is documented as deprecated and disables the
            # pickle memo, so it must not be used with self-referential
            # objects. Kept so the written bytes stay as before.
            fastPickler.fast = 1
            fastPickler.dump(obj)

In [586]:
class DataLoader(object):
    """Load a sample of labelled documents from a ``<root>/<category>/<file>`` tree.

    Args:
        dataPath: Root directory; each sub-directory name is used as the
            category label of the files it contains.
    """

    def __init__(self, dataPath):
        self.dataPath = dataPath

    def __get_files(self):
        """Populate ``self.files`` as ``{category: [file paths]}``.

        Paths are built with ``os.path.join`` so ``dataPath`` works with or
        without a trailing separator, and entries of ``dataPath`` that are not
        directories are skipped instead of aborting the scan.
        """
        files = {}
        for title in os.listdir(self.dataPath):
            folder = os.path.join(self.dataPath, title)
            if not os.path.isdir(folder):
                continue
            files[title] = [os.path.join(folder, name) for name in os.listdir(folder)]
        self.files = files

    def get_json(self):
        """Read a random-sized prefix of every category's files.

        For each category a limit ``rand = randint(100, 150)`` is drawn and the
        first ``rand + 1`` files (in ``os.listdir`` order) are read, i.e.
        between 101 and 151 files, or all of them when the category is smaller.

        Returns:
            list[dict]: One ``{'category': ..., 'content': ...}`` record per
            file read.
        """
        self.__get_files()
        data = []
        for topic, paths in self.files.items():
            rand = randint(100, 150)
            # The slice keeps the original inclusive bound (indices 0..rand).
            for path in paths[:rand + 1]:
                data.append({
                    'category': topic,
                    'content': FileReader(filePath=path).content()
                })
        return data

## 2. Feature Extraction

Sau khi đã có tập dữ liệu, tiến hành một số bước lựa chọn thuộc tính đầu vào cho bài toán phân lớp.

- Word segmentation: `NLP.segmentation()`
- Remove stopwords: `NLP.get_words_feature()`
- Xây dựng từ điển các từ: `FeatureExtraction` (phương thức `__build_dictionary`)
- Khởi tạo vector thuộc tính với Bag of Words: `FeatureExtraction.get_dense()`


In [587]:
class NLP(object):
    """Tokenise one text and drop stop words.

    Args:
        text: The raw text to process. ``None`` yields an empty token list.
    """

    # Stop-word sets already read from disk, keyed by the stop-word file path,
    # so the file is parsed once instead of once per document. The cached set
    # is shared between instances and must be treated as read-only.
    _stopwords_cache = {}

    def __init__(self, text = None):
        self.text = text
        self.__set_stopwords()

    def __set_stopwords(self):
        """Set ``self.stopwords`` from ``settings.STOP_WORDS`` (cached per path)."""
        path = settings.STOP_WORDS
        cache = NLP._stopwords_cache
        if path not in cache:
            cache[path] = FileReader(path).read_stopwords()
        self.stopwords = cache[path]

    def segmentation(self):
        """Return ``self.text`` run through ``ViTokenizer.tokenize``.

        NOTE(review): presumably the tokenizer joins the syllables of a
        multi-syllable word with ``'_'`` (the stop-word loader normalises
        spaces the same way) - confirm against pyvi.
        """
        return ViTokenizer.tokenize(self.text)

    def split_words(self):
        """Return the lower-cased tokens of the segmented text.

        Each whitespace-separated token is stripped of the characters in
        ``settings.SPECIAL_CHARACTER``. Missing text, or a ``TypeError`` raised
        while segmenting or stripping, yields ``[]``; segmentation now runs
        inside the ``try`` so the fallback actually covers it.
        """
        if self.text is None:
            return []
        try:
            text = self.segmentation()
            return [x.strip(settings.SPECIAL_CHARACTER).lower() for x in text.split()]
        except TypeError:
            return []

    def get_words_feature(self):
        """Return the tokens of :meth:`split_words` that are not stop words."""
        split_words = self.split_words()
        return [word for word in split_words if word not in self.stopwords]

In [588]:
class FeatureExtraction(object):
    """Turn ``{'category', 'content'}`` records into bag-of-words vectors.

    Args:
        data: Sequence of dicts with a ``'content'`` text and a ``'category'``
            label.
    """

    def __init__(self, data):
        self.data = data
        # Loaded lazily on first use and then reused for every document.
        self.dictionary = None

    def __build_dictionary(self):
        """Tokenise every document of ``self.data`` and store the dictionary
        at ``settings.DICTIONARY_PATH``."""
        print ('Building dictionary')
        dict_words = []
        total = len(self.data)
        for i, text in enumerate(self.data, start=1):
            print ("Step {} / {}".format(i, total))
            dict_words.append(NLP(text = text['content']).get_words_feature())
        FileStore(filePath=settings.DICTIONARY_PATH).store_dictionary(dict_words)

    def __load_dictionary(self):
        """Ensure ``self.dictionary`` is set, building the file first if missing.

        The dictionary is read from disk only once per instance; previously it
        was re-read for every document passed to :meth:`get_dense`.
        """
        # `is not None` rather than truthiness: an empty dictionary has
        # length 0 and would otherwise be reloaded on every call.
        if getattr(self, 'dictionary', None) is not None:
            return
        if not os.path.exists(settings.DICTIONARY_PATH):
            self.__build_dictionary()
        self.dictionary = FileReader(settings.DICTIONARY_PATH).load_dictionary()

    def __build_dataset(self):
        """Fill ``self.features`` / ``self.labels`` from ``self.data``."""
        self.features = []
        self.labels = []
        total = len(self.data)
        for i, d in enumerate(self.data, start=1):
            print ("Step {} / {}".format(i, total))
            self.features.append(self.get_dense(d['content']))
            self.labels.append(d['category'])

    def get_dense(self, text):
        """Return the dense bag-of-words vector of ``text``.

        Returns:
            list: One count per dictionary term (length ``len(self.dictionary)``).
        """
        self.__load_dictionary()
        words = NLP(text).get_words_feature()
        # Bag of words
        vec = self.dictionary.doc2bow(words)
        # corpus2dense is given a one-document corpus; after the transpose,
        # row 0 is that document's vector.
        dense = list(matutils.corpus2dense([vec], num_terms=len(self.dictionary)).T[0])
        return dense

    def get_data_and_label(self):
        """Return ``(features, labels)`` for every record in ``self.data``."""
        self.__build_dataset()
        return self.features, self.labels

## 3. Phân lớp văn bản Naive Bayes

Sau khi đã có được vector thuộc tính bằng phương pháp **Bag of Words**, ta tiến hành phân loại văn bản.


In [589]:
class Classifier(object):
    """Fit an estimator on the training split and report on the test split.

    Args:
        features_train: Training feature vectors.
        labels_train: Training labels.
        features_test: Test feature vectors.
        labels_test: Test labels.
        estimator: Object exposing ``fit`` / ``predict``. ``None`` (the
            default) creates a fresh ``GaussianNB(priors=None)`` per instance,
            so two classifiers never share one fitted model.
    """

    def __init__(self, features_train = None, labels_train = None, features_test = None, labels_test = None,  estimator = None):
        self.features_train = features_train
        self.features_test = features_test
        self.labels_train = labels_train
        self.labels_test = labels_test
        # `is not None` rather than truthiness: some estimators define
        # __len__ and may evaluate as falsy.
        self.estimator = estimator if estimator is not None else GaussianNB(priors=None)

    def training(self):
        """Fit the estimator, then print the classification report."""
        self.estimator.fit(self.features_train, self.labels_train)
        self.__training_result()

    def save_model(self, filePath):
        """Pickle this instance's estimator to ``filePath``.

        Uses ``self.estimator`` rather than a module-level name, so the method
        works whatever variable the caller stored the classifier in.
        """
        FileStore(filePath=filePath).save_pickle(obj=self.estimator)

    def __training_result(self):
        """Print ``classification_report`` for predictions on the test split."""
        y_true, y_pred = self.labels_test, self.estimator.predict(self.features_test)
        print(classification_report(y_true, y_pred))

In [590]:
if __name__ == '__main__':
    # 1. Sample the raw train/test folders and write each sample to JSON.
    json_train = DataLoader(dataPath=settings.DATA_TRAIN_PATH).get_json()
    FileStore(filePath=settings.DATA_TRAIN_JSON, data=json_train).store_json()
    json_test = DataLoader(dataPath=settings.DATA_TEST_PATH).get_json()
    FileStore(filePath=settings.DATA_TEST_JSON, data=json_test).store_json()

    # 2. Read the stored samples back from the JSON files.
    train_loader = FileReader(filePath=settings.DATA_TRAIN_JSON)
    test_loader = FileReader(filePath=settings.DATA_TEST_JSON)
    data_train = train_loader.read_json()
    data_test = test_loader.read_json()

    # 3. Bag-of-words features. The training split goes first, so a missing
    #    dictionary file is built from the training documents.
    features_train, labels_train = FeatureExtraction(data=data_train).get_data_and_label()
    features_test, labels_test = FeatureExtraction(data=data_test).get_data_and_label()

    # 4. Train, report and persist the model.
    est = Classifier(features_train=features_train, features_test=features_test, labels_train=labels_train, labels_test=labels_test)
    est.training()
    model_path = 'trained_model/gaussiannb_nb_model.pk'
    # Create the output folder up front; otherwise opening the pickle file
    # fails on a fresh checkout only after all the training work is done.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    est.save_model(filePath=model_path)
    print ('Finish processing.')

Step 1 / 1321
Step 2 / 1321
Step 3 / 1321
Step 4 / 1321
Step 5 / 1321
Step 6 / 1321
Step 7 / 1321
Step 8 / 1321
Step 9 / 1321
Step 10 / 1321
Step 11 / 1321
Step 12 / 1321
Step 13 / 1321
Step 14 / 1321
Step 15 / 1321
Step 16 / 1321
Step 17 / 1321
Step 18 / 1321
Step 19 / 1321
Step 20 / 1321
Step 21 / 1321
Step 22 / 1321
Step 23 / 1321
Step 24 / 1321
Step 25 / 1321
Step 26 / 1321
Step 27 / 1321
Step 28 / 1321
Step 29 / 1321
Step 30 / 1321
Step 31 / 1321
Step 32 / 1321
Step 33 / 1321
Step 34 / 1321
Step 35 / 1321
Step 36 / 1321
Step 37 / 1321
Step 38 / 1321
Step 39 / 1321
Step 40 / 1321
Step 41 / 1321
Step 42 / 1321
Step 43 / 1321
Step 44 / 1321
Step 45 / 1321
Step 46 / 1321
Step 47 / 1321
Step 48 / 1321
Step 49 / 1321
Step 50 / 1321
Step 51 / 1321
Step 52 / 1321
Step 53 / 1321
Step 54 / 1321
Step 55 / 1321
Step 56 / 1321
Step 57 / 1321
Step 58 / 1321
Step 59 / 1321
Step 60 / 1321
Step 61 / 1321
Step 62 / 1321
Step 63 / 1321
Step 64 / 1321
Step 65 / 1321
Step 66 / 1321
Step 67 / 1321
Step