In [35]:
from pyvi.pyvi import ViTokenizer
import settings
from sklearn.svm import LinearSVC

from gensim import corpora, matutils
from sklearn.metrics import classification_report

class NLP(object):
    """Text pre-processing helper: word segmentation, token clean-up and stop-word removal.

    Relies on ``ViTokenizer`` for segmentation, on ``settings.STOP_WORDS`` and
    ``settings.SPECIAL_CHARACTER`` for configuration, and on ``FileReader`` to
    load the stop-word file.
    """

    # Stop-word sets that were already loaded, keyed by file path. One NLP
    # instance is created per document, so reading the file in every
    # constructor would repeat the same disk I/O thousands of times.
    _stopwords_cache = {}

    def __init__(self, text = None):
        """
        :param text: raw text to process. May be None, in which case
            ``split_words`` and ``get_words_feature`` return an empty list.
        """
        self.text = text
        self.__set_stopwords()

    def __set_stopwords(self):
        """Attach the stop-word set, reading it from disk only the first time."""
        path = settings.STOP_WORDS
        cache = NLP._stopwords_cache
        if path not in cache:
            cache[path] = FileReader(path).read_stopwords()
        # The set object is shared by every instance using the same path;
        # treat it as read-only.
        self.stopwords = cache[path]

    def segmentation(self):
        """Return ``self.text`` after word segmentation by ``ViTokenizer.tokenize``."""
        return ViTokenizer.tokenize(self.text)

    def split_words(self):
        """Return the segmented text as a list of lower-cased tokens.

        Each whitespace-separated token has the characters listed in
        ``settings.SPECIAL_CHARACTER`` stripped from both ends.

        :return: list of tokens; ``[]`` when there is no text or when
            stripping raises ``TypeError``.
        """
        if self.text is None:
            # Nothing to segment; the constructor explicitly allows text=None.
            return []
        text = self.segmentation()
        try:
            return [x.strip(settings.SPECIAL_CHARACTER).lower() for x in text.split()]
        except TypeError:
            return []

    def get_words_feature(self):
        """Return the tokens of ``split_words`` with stop words removed.

        A token is dropped when either its text form or its UTF-8 byte form is
        in the stop-word set. Checking both keeps the filter working whether
        the stop-word file was read as byte strings (Python 2) or as text
        (Python 3).
        """
        stopwords = self.stopwords
        return [
            word for word in self.split_words()
            if word not in stopwords and word.encode('utf-8') not in stopwords
        ]

In [26]:
class FileReader(object):
    """Reads one file from disk in one of several formats.

    :param filePath: path of the file to read.
    :param encoder: name of the text encoding used by ``content``;
        defaults to ``'utf-16le'`` when omitted or None.
    """

    def __init__(self, filePath, encoder = None):
        self.filePath = filePath
        self.encoder = encoder if encoder is not None else 'utf-16le'

    def read(self):
        """Return the whole file as read by ``open`` in its default (text) mode."""
        with open(self.filePath) as f:
            s = f.read()
        return s

    def content(self):
        """Return the file decoded with ``self.encoder``.

        The file is opened in binary mode so the raw bytes reach ``decode``
        untouched: a text-mode read yields ``str`` on Python 3 (which has no
        ``decode``) and applies newline translation that can corrupt
        multi-byte encodings such as UTF-16.
        """
        with open(self.filePath, 'rb') as f:
            raw = f.read()
        return raw.decode(self.encoder)

    def read_json(self):
        """Parse the file as JSON and return the resulting object."""
        with open(self.filePath) as f:
            s = json.load(f)
        return s

    def read_stopwords(self):
        """Return the stop words in the file as a set, one entry per line.

        Each line is stripped of surrounding whitespace and inner spaces are
        replaced by ``'_'`` (presumably to match the underscore-joined tokens
        produced by the word segmenter).
        """
        with open(self.filePath, 'r') as f:
            stopwords = set(w.strip().replace(' ', '_') for w in f)
        return stopwords

    def load_dictionary(self):
        """Load a gensim ``corpora.Dictionary`` from its text form at ``filePath``."""
        return corpora.Dictionary.load_from_text(self.filePath)

In [27]:
class FileStore(object):
    """Writes data to one file on disk in one of several formats.

    :param filePath: destination path.
    :param data: object written by ``store_json``; unused by the other methods.
    """

    def __init__(self, filePath, data = None):
        self.filePath = filePath
        self.data = data

    def store_json(self):
        """Serialise ``self.data`` as JSON into ``filePath``."""
        with open(self.filePath, 'w') as outfile:
            json.dump(self.data, outfile)

    def store_dictionary(self, dict_words, no_below=20, no_above=0.3):
        """Build a gensim dictionary from tokenised documents and save it as text.

        :param dict_words: list of documents, each a list of tokens.
        :param no_below: passed to ``Dictionary.filter_extremes``.
        :param no_above: passed to ``Dictionary.filter_extremes``.
        """
        dictionary = corpora.Dictionary(dict_words)
        dictionary.filter_extremes(no_below=no_below, no_above=no_above)
        dictionary.save_as_text(self.filePath)

    def save_pickle(self,  obj):
        """Pickle ``obj`` into ``filePath`` using the highest available protocol.

        The file is opened in a ``with`` block so the handle is closed even
        when pickling raises.
        """
        with open(self.filePath, 'wb') as outfile:
            fastPickler = cPickle.Pickler(outfile, cPickle.HIGHEST_PROTOCOL)
            # Fast mode turns off the pickler's memo; per the pickle docs it
            # must not be used with self-referential objects.
            fastPickler.fast = 1
            fastPickler.dump(obj)


In [28]:
class DataLoader(object):
    """Loads a corpus laid out as ``<dataPath>/<category>/<document file>``.

    :param dataPath: root folder; each sub-folder name is used as a category.
    """

    def __init__(self, dataPath):
        self.dataPath = dataPath

    def __get_files(self):
        """Populate ``self.files`` as ``{category: [document paths]}``.

        Paths are built with ``os.path.join`` so ``dataPath`` works with or
        without a trailing separator. Entries of ``dataPath`` that are not
        directories are skipped.
        """
        files = {}
        # A single listing keeps each category name paired with its own folder.
        for title in os.listdir(self.dataPath):
            folder = os.path.join(self.dataPath, title)
            if not os.path.isdir(folder):
                continue
            files[title] = [os.path.join(folder, name) for name in os.listdir(folder)]
        self.files = files

    def get_json(self):
        """Return the corpus as a list of ``{'category', 'content'}`` dicts.

        For every category a random cap is drawn with ``randint(100, 150)``;
        documents are read through ``FileReader.content`` in listing order and
        reading stops after the document at index ``cap``, i.e. at most
        ``cap + 1`` documents per category.
        """
        self.__get_files()
        data = []
        for topic in self.files:
            cap = randint(100, 150)
            for index, path in enumerate(self.files[topic]):
                data.append({
                    'category': topic,
                    'content': FileReader(filePath=path).content()
                })
                if index == cap:
                    break
        return data

In [29]:
class FeatureExtraction(object):
    """Turns documents into dense bag-of-words vectors and labels.

    :param data: iterable of dicts with a ``'content'`` (text) and a
        ``'category'`` (label) key.
    """

    def __init__(self, data):
        self.data = data
        # Filled lazily by __load_dictionary and then reused for every document.
        self.dictionary = None

    def __build_dictionary(self):
        """Tokenise every document and store the dictionary at ``settings.DICTIONARY_PATH``."""
        print('Building dictionary')
        dict_words = [
            NLP(text = text['content']).get_words_feature() for text in self.data
        ]
        FileStore(filePath=settings.DICTIONARY_PATH).store_dictionary(dict_words)

    def __load_dictionary(self):
        """Make sure ``self.dictionary`` is loaded, building the file first if it is missing.

        The dictionary is read from disk once per instance; ``get_dense`` runs
        for every document, so reloading on each call would repeat the same
        file parse for the whole data set.
        """
        if getattr(self, 'dictionary', None) is not None:
            return
        if not os.path.exists(settings.DICTIONARY_PATH):
            self.__build_dictionary()
        self.dictionary = FileReader(settings.DICTIONARY_PATH).load_dictionary()

    def __build_dataset(self):
        """Fill ``self.features`` and ``self.labels`` from ``self.data``, in order."""
        self.features = []
        self.labels = []
        for d in self.data:
            self.features.append(self.get_dense(d['content']))
            self.labels.append(d['category'])

    def get_dense(self, text):
        """Return the dense bag-of-words vector of ``text`` as a list.

        The vector has one entry per dictionary term
        (``num_terms=len(self.dictionary)``).
        """
        self.__load_dictionary()
        words = NLP(text).get_words_feature()
        # Bag of words
        vec = self.dictionary.doc2bow(words)
        # corpus2dense is given a one-document corpus; .T[0] selects that document.
        dense = list(matutils.corpus2dense([vec], num_terms=len(self.dictionary)).T[0])
        return dense

    def get_data_and_label(self):
        """Build the data set and return ``(features, labels)``."""
        self.__build_dataset()
        return self.features, self.labels

In [30]:
class Classifier(object):
    """Wraps an estimator together with its train/test features and labels.

    :param features_train: training feature vectors passed to ``estimator.fit``.
    :param labels_train: training labels passed to ``estimator.fit``.
    :param features_test: feature vectors predicted for the report.
    :param labels_test: true labels used in the report.
    :param estimator: object exposing ``fit`` and ``predict``. When omitted or
        None a new ``LinearSVC(random_state=0)`` is created for this instance.
    """

    def __init__(self, features_train = None, labels_train = None, features_test = None, labels_test = None,  estimator = None):
        self.features_train = features_train
        self.features_test = features_test
        self.labels_train = labels_train
        self.labels_test = labels_test
        # Build the default here rather than in the signature: a default
        # argument is evaluated once, so every Classifier would otherwise
        # share (and re-fit) the same estimator object.
        self.estimator = estimator if estimator is not None else LinearSVC(random_state=0)

    def training(self):
        """Fit the estimator on the training data, then print the test report."""
        self.estimator.fit(self.features_train, self.labels_train)
        self.__training_result()

    def save_model(self, filePath):
        """Pickle this classifier to ``filePath`` through ``FileStore.save_pickle``.

        The instance itself is saved rather than a module-level variable, so
        the method works whatever name the caller gave the object.
        NOTE(review): this persists the whole wrapper, including the train/test
        data it holds. Saving ``self.estimator`` alone would give a smaller
        file, but confirm what the code loading the file expects first.
        """
        FileStore(filePath=filePath).save_pickle(obj=self)

    def __training_result(self):
        """Predict on the test features and print ``classification_report``."""
        y_true, y_pred = self.labels_test, self.estimator.predict(self.features_test)
        print(classification_report(y_true, y_pred))

In [None]:
# One-off preparation step, left disabled: it walks the raw train/test folders
# with DataLoader and stores the result as JSON at settings.DATA_TRAIN_JSON and
# settings.DATA_TEST_JSON. Re-enable these four lines to regenerate the JSON
# files that are read just below.
# json_train = DataLoader(dataPath=settings.DATA_TRAIN_PATH).get_json()
# FileStore(filePath=settings.DATA_TRAIN_JSON, data=json_train).store_json()
# json_test = DataLoader(dataPath=settings.DATA_TEST_PATH).get_json()
# FileStore(filePath=settings.DATA_TEST_JSON, data=json_test).store_json()
# Load the previously stored train and test documents from their JSON files.
train_loader = FileReader(filePath=settings.DATA_TRAIN_JSON)
test_loader = FileReader(filePath=settings.DATA_TEST_JSON)
data_train = train_loader.read_json()
data_test = test_loader.read_json()

# Convert each document to a dense bag-of-words vector plus its category label.
# Both calls go through settings.DICTIONARY_PATH; the dictionary file is built
# from whichever data set is processed first if it does not exist yet.
features_train, labels_train = FeatureExtraction(data=data_train).get_data_and_label()
features_test, labels_test = FeatureExtraction(data=data_test).get_data_and_label()

# Fit the default estimator, print the classification report for the test set,
# then pickle the result to disk.
est = Classifier(features_train=features_train, features_test=features_test, labels_train=labels_train, labels_test=labels_test)
est.training()
est.save_model(filePath='trained_model/linear_svc_model.pk')

Step 1 / 1265
Building dictionary
Step 1 / 1265
Step 2 / 1265
Step 3 / 1265
Step 4 / 1265
Step 5 / 1265
Step 6 / 1265
Step 7 / 1265
Step 8 / 1265
Step 9 / 1265
Step 10 / 1265
Step 11 / 1265
Step 12 / 1265
Step 13 / 1265
Step 14 / 1265
Step 15 / 1265
Step 16 / 1265
Step 17 / 1265
Step 18 / 1265
Step 19 / 1265
Step 20 / 1265
Step 21 / 1265
Step 22 / 1265
Step 23 / 1265
Step 24 / 1265
Step 25 / 1265
Step 26 / 1265
Step 27 / 1265
Step 28 / 1265
Step 29 / 1265
Step 30 / 1265
Step 31 / 1265
Step 32 / 1265
Step 33 / 1265
Step 34 / 1265
Step 35 / 1265
Step 36 / 1265
Step 37 / 1265
Step 38 / 1265
Step 39 / 1265
Step 40 / 1265
Step 41 / 1265
Step 42 / 1265
Step 43 / 1265
Step 44 / 1265
Step 45 / 1265
Step 46 / 1265
Step 47 / 1265
Step 48 / 1265
Step 49 / 1265
Step 50 / 1265
Step 51 / 1265
Step 52 / 1265
Step 53 / 1265
Step 54 / 1265
Step 55 / 1265
Step 56 / 1265
Step 57 / 1265
Step 58 / 1265
Step 59 / 1265
Step 60 / 1265
Step 61 / 1265
Step 62 / 1265
Step 63 / 1265
Step 64 / 1265
Step 65 / 1265


Step 519 / 1265
Step 520 / 1265
Step 521 / 1265
Step 522 / 1265
Step 523 / 1265
Step 524 / 1265
Step 525 / 1265
Step 526 / 1265
Step 527 / 1265
Step 528 / 1265
Step 529 / 1265
Step 530 / 1265
Step 531 / 1265
Step 532 / 1265
Step 533 / 1265
Step 534 / 1265
Step 535 / 1265
Step 536 / 1265
Step 537 / 1265
Step 538 / 1265
Step 539 / 1265
Step 540 / 1265
Step 541 / 1265
Step 542 / 1265
Step 543 / 1265
Step 544 / 1265
Step 545 / 1265
Step 546 / 1265
Step 547 / 1265
Step 548 / 1265
Step 549 / 1265
Step 550 / 1265
Step 551 / 1265
Step 552 / 1265
Step 553 / 1265
Step 554 / 1265
Step 555 / 1265
Step 556 / 1265
Step 557 / 1265
Step 558 / 1265
Step 559 / 1265
Step 560 / 1265
Step 561 / 1265
Step 562 / 1265
Step 563 / 1265
Step 564 / 1265
Step 565 / 1265
Step 566 / 1265
Step 567 / 1265
Step 568 / 1265
Step 569 / 1265
Step 570 / 1265
Step 571 / 1265
Step 572 / 1265
Step 573 / 1265
Step 574 / 1265
Step 575 / 1265
Step 576 / 1265
Step 577 / 1265
Step 578 / 1265
Step 579 / 1265
Step 580 / 1265
Step 581

In [31]:
# Quick check of the word segmenter on a sample sentence. In the output shown
# below, multi-syllable words come back joined with underscores.
temp = u"Chào các bạn tôi là Phạm Văn Toàn đến từ blog Tự học Machine Learning"
print (NLP(text=temp).segmentation())

Chào các bạn tôi là Phạm_Văn_Toàn đến từ blog Tự học Machine_Learning
