In [None]:
!pip install pyvi
from sklearn.base import BaseEstimator, TransformerMixin
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
import os
import pickle

In [None]:
def load_stop_words(fn):
    """Read a stop-word list from a UTF-8 text file, one entry per line.

    Surrounding whitespace is stripped from every line and blank lines are
    dropped, so a trailing newline at the end of the file no longer yields an
    empty-string entry.

    Args:
        fn: Path of the stop-word file.

    Returns:
        list[str]: The non-empty entries, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The `with` block closes the file on exit; no explicit close() is needed.
    with open(fn, mode='r', encoding='utf8') as f:
        return [word for word in (line.strip() for line in f) if word]

In [None]:
class FileReader:
    """Reads a UTF-8 text dataset that stores one sample per line.

    Labelled files are expected to hold ``text<TAB>label`` on every
    non-blank line.
    """

    def __init__(self, file_path):
        """Remember the path; the file is only opened by ``load_data``."""
        self.file_path = file_path

    def load_data(self, is_contain_labels=True):
        """Load the file at ``self.file_path``.

        Args:
            is_contain_labels: When true, split every non-blank line on tabs
                and return texts and labels separately. When false, return
                the raw lines untouched.

        Returns:
            ``(X, y)`` when ``is_contain_labels`` is true: ``X`` holds the
            first tab-separated field of each non-blank line and ``y`` the
            second one (any further fields are ignored). Otherwise the list
            produced by ``readlines()``, trailing newlines included.

        Raises:
            IndexError: If a non-blank labelled line has no tab-separated
                label. The exception type is kept for backward compatibility;
                the message names the file and the line number.
            OSError: If the file cannot be opened.
        """
        # The `with` block closes the file on exit; no explicit close() needed.
        with open(self.file_path, mode='r', encoding='utf8') as f:
            lines = f.readlines()

        if not is_contain_labels:
            return lines

        X, y = [], []
        for line_no, raw_line in enumerate(lines, start=1):
            line = raw_line.strip()
            if not line:
                continue
            fields = line.split('\t')
            if len(fields) < 2:
                # Fail with context instead of a bare "list index out of range".
                raise IndexError(
                    f"{self.file_path}, line {line_no}: expected "
                    f"'text<TAB>label' but found no label in {line!r}"
                )
            X.append(fields[0])
            y.append(fields[1])
        return X, y


In [None]:
class FeaturesTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that tokenizes raw text and filters out unwanted tokens.

    Each input string is passed through ``self.tokenizer.tokenize`` and the
    result is split on whitespace; tokens that are stop words or that match
    the special-character filter are dropped, and the survivors are re-joined
    with single spaces.
    """

    def __init__(self, stop_words_fn):
        """Load the stop-word list and create the tokenizer.

        Args:
            stop_words_fn: Path of the stop-word file, read with
                ``load_stop_words``.
        """
        # BaseEstimator.get_params() (used by clone() and repr()) reads every
        # __init__ argument back from an attribute of the same name; without
        # this assignment those calls raise AttributeError.
        self.stop_words_fn = stop_words_fn
        # NOTE(review): ViTokenizer is not imported in the visible cells —
        # presumably it comes from the pyvi package installed in the first
        # cell; confirm the import exists.
        self.tokenizer = ViTokenizer()
        self.SPECIAL_CHARACTER = '0123456789%@$.,=+-!;/()*"&^:#|\n\t\''
        # frozenset gives O(1) membership tests; the filter below runs once
        # per token, so scanning a list each time is needlessly slow.
        self.STOP_WORDS = frozenset(load_stop_words(stop_words_fn))

    def fit(self, *_):
        """No-op fit: nothing is learned from the data. Returns ``self``."""
        return self

    def remove_stop_words(self, text):
        """Drop stop words and special-character tokens from ``text``.

        Args:
            text: Whitespace-separated tokens.

        Returns:
            str: The kept tokens joined by single spaces.
        """
        # `token not in self.SPECIAL_CHARACTER` is a *substring* test on a
        # string: it removes the empty token, every single listed character,
        # and any multi-character token that occurs verbatim inside the
        # constant (e.g. '12' or '.,'). Kept as-is to preserve the features.
        return ' '.join([token for token in re.split('\\s+', text) if
                         token not in self.STOP_WORDS and token not in self.SPECIAL_CHARACTER])

    def transform(self, X, y=None, **fit_params):
        """Tokenize and filter every string in ``X``.

        Args:
            X: Iterable of raw text strings.
            y: Ignored.
            **fit_params: Ignored.

        Returns:
            list[str]: One cleaned, space-joined string per input item.
        """
        return [self.remove_stop_words(self.tokenizer.tokenize(x)) for x in X]

In [None]:
class NaiveBayesModel(object):
    """Wraps the text-classification pipeline used by this notebook.

    The pipeline is exposed as ``self.clf`` and chains, in order:
    ``FeaturesTransformer`` -> ``CountVectorizer`` -> ``TfidfTransformer``
    -> ``MultinomialNB``.
    """

    # Stop-word file used when the caller does not supply one.
    DEFAULT_STOP_WORDS_FN = 'vietnamese-stopwords/vietnamese-stopwords-dash.txt'

    def __init__(self, stop_words_fn=DEFAULT_STOP_WORDS_FN):
        """Build the pipeline.

        Args:
            stop_words_fn: Path of the stop-word file handed to
                ``FeaturesTransformer``. Defaults to the path that was
                previously hard-coded.
        """
        self.clf = self._init_pipeline(stop_words_fn)

    @staticmethod
    def _init_pipeline(stop_words_fn=DEFAULT_STOP_WORDS_FN):
        """Create the unfitted pipeline.

        Args:
            stop_words_fn: Path of the stop-word file handed to
                ``FeaturesTransformer``.

        Returns:
            Pipeline: The four-step pipeline described on the class.
        """
        pipeline = Pipeline([
            ("features_transformer", FeaturesTransformer(stop_words_fn)),
            ('bow', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', MultinomialNB())
        ])
        return pipeline

In [None]:
def main(folder='/dataset', model_dir='models', test_size=0.3, random_state=None):
    """Train, evaluate and pickle one Naive Bayes model per dataset folder.

    For every entry in ``folder`` that contains a ``data.txt`` file, the data
    is loaded with ``FileReader``, split into train/test sets, fitted with a
    fresh ``NaiveBayesModel``, scored with ``accuracy_score`` and pickled to
    ``<model_dir>/<entry>_nb.pickle``. A timestamp is printed before and
    after each entry.

    Args:
        folder: Directory whose sub-directories each hold a ``data.txt``.
        model_dir: Directory the pickled models are written to; created if
            it does not exist.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``; pass an integer for
            a reproducible split. ``None`` keeps the previous unseeded
            behaviour.
    """
    # open(..., 'wb') does not create missing parent directories.
    os.makedirs(model_dir, exist_ok=True)

    for directory in os.listdir(folder):
        data_fn = os.path.join(folder, directory, 'data.txt')
        # os.listdir also returns plain files and folders without data.txt.
        if not os.path.isfile(data_fn):
            print(f'Skipping {directory}: {data_fn} not found')
            continue

        print(time.asctime())
        X, y = FileReader(data_fn).load_data()
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)

        # Fit model
        model = NaiveBayesModel()
        model.clf.fit(X_train, y_train)

        # Test
        y_pred = model.clf.predict(X_test)
        # accuracy_score returns a fraction in [0, 1]; scale it so the value
        # matches the '%' sign in the message.
        print("Accuracy of title is %.2f %%" % (100 * accuracy_score(y_test, y_pred)))

        # Saving model. The pickle stores the classes by reference, so they
        # must be defined again wherever the file is loaded.
        with open(os.path.join(model_dir, f'{directory}_nb.pickle'), mode='wb') as f:
            pickle.dump(model, f)

        print(time.asctime())

In [None]:
# Run the per-folder train / evaluate / save loop implemented by main().
main()