In [35]:
import json
import os
from random import randint

try:
    import cPickle as pickle  # Python 2: C-accelerated implementation
except ImportError:
    import pickle  # Python 3: the C implementation is used automatically

from gensim import corpora, matutils
from pyvi.pyvi import ViTokenizer
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

import settings

class NLP(object):
    """Pre-process one piece of text: word segmentation, token cleanup, stop-word removal.

    Args:
        text: the raw text to process. May be ``None``, in which case the
            word-level methods return an empty list.

    Attributes:
        text: the text passed to the constructor.
        stopwords: the set returned by ``FileReader(settings.STOP_WORDS).read_stopwords()``.
            The same set object is shared by every instance created for the
            same stop-word path, so treat it as read-only.
    """

    # Stop-word sets that were already read from disk, keyed by file path.
    # One NLP instance is created per document, so without this cache the
    # stop-word file would be re-read once per document.
    _stopwords_cache = {}

    def __init__(self, text = None):
        self.text = text
        self.__set_stopwords()

    def __set_stopwords(self):
        """Attach the stop-word set, reading the file only on first use of its path."""
        path = settings.STOP_WORDS
        cache = self._stopwords_cache
        if path not in cache:
            cache[path] = FileReader(path).read_stopwords()
        self.stopwords = cache[path]

    def segmentation(self):
        """Return ``self.text`` run through ``ViTokenizer.tokenize``."""
        return ViTokenizer.tokenize(self.text)

    def split_words(self):
        """Return the lower-cased tokens of the segmented text.

        Each whitespace-separated token has the characters in
        ``settings.SPECIAL_CHARACTER`` stripped from both ends.

        Returns:
            A list of tokens; ``[]`` when there is no text or when token
            cleanup raises ``TypeError``.
        """
        # The constructor allows text=None; there is nothing to tokenize then.
        if self.text is None:
            return []
        segmented = self.segmentation()
        try:
            return [token.strip(settings.SPECIAL_CHARACTER).lower() for token in segmented.split()]
        except TypeError:
            return []

    def get_words_feature(self):
        """Return the tokens of ``split_words()`` that are not stop words.

        NOTE(review): tokens are compared as UTF-8 encoded byte strings, which
        presumably matches the stop-word entries only when the stop-word file
        is read as byte strings (Python 2 text mode) - confirm before porting.
        """
        tokens = self.split_words()
        return [word for word in tokens if word.encode('utf-8') not in self.stopwords]

In [26]:
class FileReader(object):
    """Read a single file in one of several formats.

    Args:
        filePath: path of the file to read.
        encoder: name of the text encoding used by ``content()``;
            defaults to ``'utf-16le'`` when omitted or ``None``.
    """

    def __init__(self, filePath, encoder = None):
        self.filePath = filePath
        self.encoder = encoder if encoder is not None else 'utf-16le'

    def read(self):
        """Return the whole file as returned by a default (text-mode) ``open``."""
        with open(self.filePath) as f:
            return f.read()

    def content(self):
        """Return the file decoded with ``self.encoder``.

        The file is opened in binary mode so that the decoder always sees the
        exact bytes on disk: text mode may translate newlines (which corrupts
        multi-byte encodings such as UTF-16) or may already return decoded
        text, which cannot be decoded a second time.

        Raises:
            UnicodeDecodeError: if the bytes are not valid in ``self.encoder``.
        """
        with open(self.filePath, 'rb') as f:
            raw = f.read()
        return raw.decode(self.encoder)

    def read_json(self):
        """Parse the file as JSON and return the resulting object."""
        with open(self.filePath) as f:
            return json.load(f)

    def read_stopwords(self):
        """Return the set of stop words stored one per line.

        Surrounding whitespace is removed from each line and inner spaces are
        replaced by underscores.
        """
        with open(self.filePath, 'r') as f:
            # Iterate the file lazily instead of materialising readlines().
            return set(line.strip().replace(' ', '_') for line in f)

    def load_dictionary(self):
        """Load a gensim ``Dictionary`` previously saved in text format at ``filePath``."""
        return corpora.Dictionary.load_from_text(self.filePath)

In [27]:
class FileStore(object):
    """Write data to a single file in one of several formats.

    Args:
        filePath: destination path; overwritten by every store/save method.
        data: object written by ``store_json()``; unused by the other methods.
    """

    def __init__(self, filePath, data = None):
        self.filePath = filePath
        self.data = data

    def store_json(self):
        """Serialize ``self.data`` as JSON into ``filePath``."""
        with open(self.filePath, 'w') as outfile:
            json.dump(self.data, outfile)

    def store_dictionary(self, dict_words):
        """Build a gensim ``Dictionary`` from tokenized documents and save it as text.

        Args:
            dict_words: an iterable of token lists, one list per document.
        """
        dictionary = corpora.Dictionary(dict_words)
        # Drop tokens that occur in fewer than 20 documents or in more than
        # 30% of all documents before saving.
        dictionary.filter_extremes(no_below=20, no_above=0.3)
        dictionary.save_as_text(self.filePath)

    def save_pickle(self,  obj):
        """Pickle ``obj`` into ``filePath`` using the highest protocol.

        The pickler runs in "fast" mode, which turns off its memo table;
        ``obj`` must therefore not contain self-referential structures.
        """
        # `with` guarantees the handle is closed even when dump() raises.
        with open(self.filePath, 'wb') as outfile:
            fast_pickler = pickle.Pickler(outfile, pickle.HIGHEST_PROTOCOL)
            fast_pickler.fast = 1
            fast_pickler.dump(obj)


In [28]:
class DataLoader(object):
    """Collect labelled documents from a directory that has one sub-folder per class.

    Args:
        dataPath: root directory; the name of each sub-directory is used as
            the class label of the files it contains. A trailing path
            separator is optional.
    """

    def __init__(self, dataPath):
        self.dataPath = dataPath

    def __get_files(self):
        """Populate ``self.files`` as ``{class_name: [file paths]}``.

        Entries of ``dataPath`` that are not directories, and entries of a
        class folder that are not regular files, are skipped. Names are
        sorted so the result does not depend on the listing order of the
        file system.
        """
        files = {}
        for title in sorted(os.listdir(self.dataPath)):
            folder = os.path.join(self.dataPath, title)
            if not os.path.isdir(folder):
                continue
            candidates = [os.path.join(folder, name) for name in sorted(os.listdir(folder))]
            files[title] = [path for path in candidates if os.path.isfile(path)]
        self.files = files

    def get_json(self):
        """Return a random-sized sample of documents per class.

        For every class at most ``randint(100, 150) + 1`` files are read,
        taken from the start of that class's file list.

        Returns:
            A list of ``{'category': class_name, 'content': decoded_text}``
            dicts, where the text comes from ``FileReader.content()``.
        """
        self.__get_files()
        data = []
        for topic in self.files:
            # Files at indexes 0..rand inclusive are taken, i.e. rand + 1 files.
            limit = randint(100, 150) + 1
            for path in self.files[topic][:limit]:
                data.append({
                    'category': topic,
                    'content': FileReader(filePath=path).content()
                })
        return data

In [29]:
class FeatureExtraction(object):
    """Turn labelled documents into dense bag-of-words vectors.

    Args:
        data: an iterable of dicts, each with a ``'content'`` (text) and a
            ``'category'`` (label) entry.

    Attributes:
        dictionary: the gensim dictionary used for vectorization; ``None``
            until it is first needed, then loaded once and reused.
    """

    def __init__(self, data):
        self.data = data
        self.dictionary = None

    def __build_dictionary(self):
        """Tokenize every document in ``self.data`` and store the dictionary on disk."""
        print ('Building dictionary')
        dict_words = [NLP(text = doc['content']).get_words_feature() for doc in self.data]
        FileStore(filePath=settings.DICTIONARY_PATH).store_dictionary(dict_words)

    def __load_dictionary(self):
        """Make sure ``self.dictionary`` is available.

        The dictionary file at ``settings.DICTIONARY_PATH`` is built from
        ``self.data`` when it does not exist yet. The file is read only once
        per instance: ``get_dense`` runs for every document, and re-parsing
        the dictionary each time dominated the running time.
        """
        if self.dictionary is not None:
            return
        if not os.path.exists(settings.DICTIONARY_PATH):
            self.__build_dictionary()
        self.dictionary = FileReader(settings.DICTIONARY_PATH).load_dictionary()

    def __build_dataset(self):
        """Fill ``self.features`` and ``self.labels`` from ``self.data``."""
        self.features = []
        self.labels = []
        for doc in self.data:
            self.features.append(self.get_dense(doc['content']))
            self.labels.append(doc['category'])

    def get_dense(self, text):
        """Return the dense bag-of-words vector of ``text``.

        Returns:
            A list with one count per dictionary term
            (``len(self.dictionary)`` entries).
        """
        self.__load_dictionary()
        words = NLP(text).get_words_feature()
        # Bag of words
        vec = self.dictionary.doc2bow(words)
        dense = list(matutils.corpus2dense([vec], num_terms=len(self.dictionary)).T[0])
        return dense

    def get_data_and_label(self):
        """Vectorize all documents and return ``(features, labels)`` as two lists."""
        self.__build_dataset()
        return self.features, self.labels

In [30]:
class Classifier(object):
    """Thin wrapper that trains an estimator, reports test metrics and saves the model.

    Args:
        features_train: training feature vectors passed to ``estimator.fit``.
        labels_train: training labels passed to ``estimator.fit``.
        features_test: feature vectors used for the evaluation report (optional).
        labels_test: true labels used for the evaluation report (optional).
        estimator: object exposing ``fit`` and ``predict``; when omitted a new
            ``LinearSVC(random_state=0)`` is created for this instance.
    """

    def __init__(self, features_train = None, labels_train = None, features_test = None, labels_test = None,  estimator = None):
        self.features_train = features_train
        self.features_test = features_test
        self.labels_train = labels_train
        self.labels_test = labels_test
        # Build the default here rather than in the signature: a default
        # argument is evaluated once, so every Classifier would otherwise
        # share (and re-fit) the very same estimator object.
        self.estimator = estimator if estimator is not None else LinearSVC(random_state=0)

    def training(self):
        """Fit the estimator on the training split, then print the test report."""
        self.estimator.fit(self.features_train, self.labels_train)
        self.__training_result()

    def save_model(self, filePath):
        """Pickle the trained estimator to ``filePath``.

        The estimator owned by this instance is saved; no module-level
        variable is consulted.
        """
        FileStore(filePath=filePath).save_pickle(obj=self.estimator)

    def __training_result(self):
        """Print a classification report for the test split, if one was provided."""
        if self.features_test is None or self.labels_test is None:
            return
        y_true, y_pred = self.labels_test, self.estimator.predict(self.features_test)
        print(classification_report(y_true, y_pred))

In [36]:
# One-off data preparation, left disabled: builds the train/test JSON files
# from the raw class folders (DataLoader) and writes them with FileStore.
# Re-enable these four lines when the JSON files need to be regenerated.
# json_train = DataLoader(dataPath=settings.DATA_TRAIN_PATH).get_json()
# FileStore(filePath=settings.DATA_TRAIN_JSON, data=json_train).store_json()
# json_test = DataLoader(dataPath=settings.DATA_TEST_PATH).get_json()
# FileStore(filePath=settings.DATA_TEST_JSON, data=json_test).store_json()

# Load the prepared train/test documents from their JSON files.
train_loader = FileReader(filePath=settings.DATA_TRAIN_JSON)
test_loader = FileReader(filePath=settings.DATA_TEST_JSON)
data_train = train_loader.read_json()
data_test = test_loader.read_json()

# Vectorize both splits. The dictionary file at settings.DICTIONARY_PATH is
# built from whichever data set is processed first if it does not exist yet,
# so the training data must be vectorized before the test data.
features_train, labels_train = FeatureExtraction(data=data_train).get_data_and_label()
features_test, labels_test = FeatureExtraction(data=data_test).get_data_and_label()

# Fit the default estimator, print the classification report for the test
# split, then pickle the result to disk.
est = Classifier(features_train=features_train, features_test=features_test, labels_train=labels_train, labels_test=labels_test)
est.training()
est.save_model(filePath='trained_model/linear_svc_model.pk')

Step 1 / 1265
Building dictionary
Step 1 / 1265
Step 2 / 1265
Step 3 / 1265
Step 4 / 1265
Step 5 / 1265
Step 6 / 1265
Step 7 / 1265
Step 8 / 1265
Step 9 / 1265
Step 10 / 1265
Step 11 / 1265
Step 12 / 1265
Step 13 / 1265
Step 14 / 1265
Step 15 / 1265
Step 16 / 1265
Step 17 / 1265
Step 18 / 1265
Step 19 / 1265
Step 20 / 1265
Step 21 / 1265
Step 22 / 1265
Step 23 / 1265
Step 24 / 1265
Step 25 / 1265
Step 26 / 1265
Step 27 / 1265
Step 28 / 1265
Step 29 / 1265
Step 30 / 1265
Step 31 / 1265
Step 32 / 1265
Step 33 / 1265
Step 34 / 1265
Step 35 / 1265
Step 36 / 1265
Step 37 / 1265
Step 38 / 1265
Step 39 / 1265
Step 40 / 1265
Step 41 / 1265
Step 42 / 1265
Step 43 / 1265
Step 44 / 1265
Step 45 / 1265
Step 46 / 1265
Step 47 / 1265
Step 48 / 1265
Step 49 / 1265
Step 50 / 1265
Step 51 / 1265
Step 52 / 1265
Step 53 / 1265
Step 54 / 1265
Step 55 / 1265
Step 56 / 1265
Step 57 / 1265
Step 58 / 1265
Step 59 / 1265
Step 60 / 1265
Step 61 / 1265
Step 62 / 1265
Step 63 / 1265
Step 64 / 1265
Step 65 / 1265


Step 519 / 1265
Step 520 / 1265
Step 521 / 1265
Step 522 / 1265
Step 523 / 1265
Step 524 / 1265
Step 525 / 1265
Step 526 / 1265
Step 527 / 1265
Step 528 / 1265
Step 529 / 1265
Step 530 / 1265
Step 531 / 1265
Step 532 / 1265
Step 533 / 1265
Step 534 / 1265
Step 535 / 1265
Step 536 / 1265
Step 537 / 1265
Step 538 / 1265
Step 539 / 1265
Step 540 / 1265
Step 541 / 1265
Step 542 / 1265
Step 543 / 1265
Step 544 / 1265
Step 545 / 1265
Step 546 / 1265
Step 547 / 1265
Step 548 / 1265
Step 549 / 1265
Step 550 / 1265
Step 551 / 1265
Step 552 / 1265
Step 553 / 1265
Step 554 / 1265
Step 555 / 1265
Step 556 / 1265
Step 557 / 1265
Step 558 / 1265
Step 559 / 1265
Step 560 / 1265
Step 561 / 1265
Step 562 / 1265
Step 563 / 1265
Step 564 / 1265
Step 565 / 1265
Step 566 / 1265
Step 567 / 1265
Step 568 / 1265
Step 569 / 1265
Step 570 / 1265
Step 571 / 1265
Step 572 / 1265
Step 573 / 1265
Step 574 / 1265
Step 575 / 1265
Step 576 / 1265
Step 577 / 1265
Step 578 / 1265
Step 579 / 1265
Step 580 / 1265
Step 581

Step 1041 / 1265
Step 1042 / 1265
Step 1043 / 1265
Step 1044 / 1265
Step 1045 / 1265
Step 1046 / 1265
Step 1047 / 1265
Step 1048 / 1265
Step 1049 / 1265
Step 1050 / 1265
Step 1051 / 1265
Step 1052 / 1265
Step 1053 / 1265
Step 1054 / 1265
Step 1055 / 1265
Step 1056 / 1265
Step 1057 / 1265
Step 1058 / 1265
Step 1059 / 1265
Step 1060 / 1265
Step 1061 / 1265
Step 1062 / 1265
Step 1063 / 1265
Step 1064 / 1265
Step 1065 / 1265
Step 1066 / 1265
Step 1067 / 1265
Step 1068 / 1265
Step 1069 / 1265
Step 1070 / 1265
Step 1071 / 1265
Step 1072 / 1265
Step 1073 / 1265
Step 1074 / 1265
Step 1075 / 1265
Step 1076 / 1265
Step 1077 / 1265
Step 1078 / 1265
Step 1079 / 1265
Step 1080 / 1265
Step 1081 / 1265
Step 1082 / 1265
Step 1083 / 1265
Step 1084 / 1265
Step 1085 / 1265
Step 1086 / 1265
Step 1087 / 1265
Step 1088 / 1265
Step 1089 / 1265
Step 1090 / 1265
Step 1091 / 1265
Step 1092 / 1265
Step 1093 / 1265
Step 1094 / 1265
Step 1095 / 1265
Step 1096 / 1265
Step 1097 / 1265
Step 1098 / 1265
Step 1099 / 12

Step 283 / 1265
Step 284 / 1265
Step 285 / 1265
Step 286 / 1265
Step 287 / 1265
Step 288 / 1265
Step 289 / 1265
Step 290 / 1265
Step 291 / 1265
Step 292 / 1265
Step 293 / 1265
Step 294 / 1265
Step 295 / 1265
Step 296 / 1265
Step 297 / 1265
Step 298 / 1265
Step 299 / 1265
Step 300 / 1265
Step 301 / 1265
Step 302 / 1265
Step 303 / 1265
Step 304 / 1265
Step 305 / 1265
Step 306 / 1265
Step 307 / 1265
Step 308 / 1265
Step 309 / 1265
Step 310 / 1265
Step 311 / 1265
Step 312 / 1265
Step 313 / 1265
Step 314 / 1265
Step 315 / 1265
Step 316 / 1265
Step 317 / 1265
Step 318 / 1265
Step 319 / 1265
Step 320 / 1265
Step 321 / 1265
Step 322 / 1265
Step 323 / 1265
Step 324 / 1265
Step 325 / 1265
Step 326 / 1265
Step 327 / 1265
Step 328 / 1265
Step 329 / 1265
Step 330 / 1265
Step 331 / 1265
Step 332 / 1265
Step 333 / 1265
Step 334 / 1265
Step 335 / 1265
Step 336 / 1265
Step 337 / 1265
Step 338 / 1265
Step 339 / 1265
Step 340 / 1265
Step 341 / 1265
Step 342 / 1265
Step 343 / 1265
Step 344 / 1265
Step 345

Step 799 / 1265
Step 800 / 1265
Step 801 / 1265
Step 802 / 1265
Step 803 / 1265
Step 804 / 1265
Step 805 / 1265
Step 806 / 1265
Step 807 / 1265
Step 808 / 1265
Step 809 / 1265
Step 810 / 1265
Step 811 / 1265
Step 812 / 1265
Step 813 / 1265
Step 814 / 1265
Step 815 / 1265
Step 816 / 1265
Step 817 / 1265
Step 818 / 1265
Step 819 / 1265
Step 820 / 1265
Step 821 / 1265
Step 822 / 1265
Step 823 / 1265
Step 824 / 1265
Step 825 / 1265
Step 826 / 1265
Step 827 / 1265
Step 828 / 1265
Step 829 / 1265
Step 830 / 1265
Step 831 / 1265
Step 832 / 1265
Step 833 / 1265
Step 834 / 1265
Step 835 / 1265
Step 836 / 1265
Step 837 / 1265
Step 838 / 1265
Step 839 / 1265
Step 840 / 1265
Step 841 / 1265
Step 842 / 1265
Step 843 / 1265
Step 844 / 1265
Step 845 / 1265
Step 846 / 1265
Step 847 / 1265
Step 848 / 1265
Step 849 / 1265
Step 850 / 1265
Step 851 / 1265
Step 852 / 1265
Step 853 / 1265
Step 854 / 1265
Step 855 / 1265
Step 856 / 1265
Step 857 / 1265
Step 858 / 1265
Step 859 / 1265
Step 860 / 1265
Step 861

Step 33 / 1396
Step 34 / 1396
Step 35 / 1396
Step 36 / 1396
Step 37 / 1396
Step 38 / 1396
Step 39 / 1396
Step 40 / 1396
Step 41 / 1396
Step 42 / 1396
Step 43 / 1396
Step 44 / 1396
Step 45 / 1396
Step 46 / 1396
Step 47 / 1396
Step 48 / 1396
Step 49 / 1396
Step 50 / 1396
Step 51 / 1396
Step 52 / 1396
Step 53 / 1396
Step 54 / 1396
Step 55 / 1396
Step 56 / 1396
Step 57 / 1396
Step 58 / 1396
Step 59 / 1396
Step 60 / 1396
Step 61 / 1396
Step 62 / 1396
Step 63 / 1396
Step 64 / 1396
Step 65 / 1396
Step 66 / 1396
Step 67 / 1396
Step 68 / 1396
Step 69 / 1396
Step 70 / 1396
Step 71 / 1396
Step 72 / 1396
Step 73 / 1396
Step 74 / 1396
Step 75 / 1396
Step 76 / 1396
Step 77 / 1396
Step 78 / 1396
Step 79 / 1396
Step 80 / 1396
Step 81 / 1396
Step 82 / 1396
Step 83 / 1396
Step 84 / 1396
Step 85 / 1396
Step 86 / 1396
Step 87 / 1396
Step 88 / 1396
Step 89 / 1396
Step 90 / 1396
Step 91 / 1396
Step 92 / 1396
Step 93 / 1396
Step 94 / 1396
Step 95 / 1396
Step 96 / 1396
Step 97 / 1396
Step 98 / 1396
Step 99 / 

KeyboardInterrupt: 

In [31]:
# Sanity check: run word segmentation on a sample Vietnamese sentence and
# print the tokenized string.
temp = u"Chào các bạn tôi là Phạm Văn Toàn đến từ blog Tự học Machine Learning"
print (NLP(text=temp).segmentation())

Chào các bạn tôi là Phạm_Văn_Toàn đến từ blog Tự học Machine_Learning
