In [1]:
import os 
from random import randint
from datetime import datetime
from pyvi import ViTokenizer
from gensim import corpora, matutils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import islice
import pickle
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, metrics, svm
from sklearn.model_selection import GridSearchCV, KFold
import gensim
import gensim.downloader as gensim_api
from gensim.models import Word2Vec

from file_loader import FileStore, FileReader, DataLoader
from preprocessing import *
import const
from summarizer import Summarizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the summarizer once; the same instance is reused for every article further down.
text_summarizer = Summarizer()

In [9]:
# Quick smoke test of the summarizer on a short Vietnamese sentence.
# NOTE(review): the recorded output is unrelated to the input text, so very
# short inputs may not be summarized meaningfully — confirm before relying on it.
text_summarizer.summary("Chào bạn tôi là Quang")

' Bạn chào mình đến với chương trình ẩm thực đường phố và xin chúc mừng bạn Nguyễn Ngọc Long, còn bạn là ai mời vào đây. Thân mời Bạn vui lòng chờ chút gọi tên.'

In [4]:
# Load the article records from the location configured in const.DATA_PATH.
# Each record is later indexed by the keys 'content' and 'category'.
dataLoader = DataLoader(const.DATA_PATH)
data = dataLoader.get_json()

# Read the stop-word list that is later passed to the text preprocessing step.
fileReader = FileReader('./data/vietnamese-stopwords.txt')
stopwordLst = fileReader.read_stopwords()

In [5]:
# Walk the records exactly once, pairing each article body with its category
# name, then split the pairs into the two parallel lists used downstream.
_content_label_pairs = [(record['content'], record['category']) for record in data]
features = [content for content, _ in _content_label_pairs]
labels = [category for _, category in _content_label_pairs]

In [6]:
# Map each category name to an integer id, numbered in order of first
# appearance in `labels`. dict.fromkeys() de-duplicates while preserving
# insertion order, so the list is scanned once instead of calling
# labels.index() for every unique label.
label_dict = {
    label: index
    for index, label in enumerate(dict.fromkeys(labels))
}
label_dict

{'Van hoa': 0,
 'The gioi': 1,
 'Khoa hoc': 2,
 'Suc khoe': 3,
 'Chinh tri Xa hoi': 4,
 'Vi tinh': 5,
 'Kinh doanh': 6,
 'The thao': 7,
 'Phap luat': 8,
 'Doi song': 9}

In [7]:
# Encode every article's category name as its integer id from label_dict.
label_indices = [label_dict[label] for label in labels]

In [None]:
# Summarize every raw article; the summarizer is called once per article.
summary_features = [text_summarizer.summary(article) for article in features]

In [None]:
# Clean each summary with processing(), passing the stop-word list.
# NOTE(review): this rebinds `features` from the raw articles to the processed
# summaries, so re-running the summarization cell afterwards would feed it
# already-processed text — restart the kernel before a re-run.
features = [processing(article, stopwordLst) for article in summary_features]

In [None]:
def get_TFIDF_feature_extractor(max_feature_dim=5000, ngram_range=(1,2)):
    '''
    Build an unfitted TF-IDF vectorizer.

    Parameters:
        max_feature_dim: upper bound on the vocabulary size, forwarded to the
            vectorizer as `max_features`.
        ngram_range: (min_n, max_n) range of n-gram sizes to extract.

    Returns:
        A TfidfVectorizer configured with the given limits.
    '''
    # Forward the caller's limit; a literal here would silently ignore the argument.
    return feature_extraction.text.TfidfVectorizer(
        max_features=max_feature_dim,
        ngram_range=ngram_range,
    )

In [None]:
# Build a unigram+bigram TF-IDF extractor and fit it on all processed articles.
# NOTE(review): the vectorizer is fitted before the train/test split further
# down, so test documents also contribute to the vocabulary and IDF weights —
# confirm this is acceptable, or fit on the training portion only.
tfidf_extractor_2gram = get_TFIDF_feature_extractor(max_feature_dim=5000, ngram_range=(1,2))
tfidf_feature_train_2gram = tfidf_extractor_2gram.fit_transform(features)

In [None]:
# Hold out a fraction of the samples for testing, stratified on the labels so
# every category keeps the same proportion in both parts.
test_ratio = 0.2
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    tfidf_feature_train_2gram, label_indices,
    test_size=test_ratio,
    stratify=label_indices,
    random_state=42,  # fixed seed so a fresh run reproduces the same split
)


In [None]:
def optimize_model_parameter(model, X_train, Y_train):
    '''
    Fit a hyper-parameter search object and report its best score.

    `model` must expose `best_score_` once fitted (for example a GridSearchCV
    instance). The object returned by `model.fit` is handed back to the caller.
    '''
    fitted_search = model.fit(X_train, Y_train)

    # best_score_ is scaled by 100 and reported as a percentage
    score_percent = fitted_search.best_score_ *100
    report_line = "Accuracy for our training dataset with tuning is : {:.2f}%".format(score_percent)
    print(report_line)

    return fitted_search

def evaluate_model(model, X_test, Y_test):
    '''
    Score a fitted model on held-out data.

    Prints the classification report followed by the accuracy line, and
    returns the accuracy as a percentage.
    '''
    predictions = model.predict(X_test)

    # Accuracy is computed first, then the per-class report is printed ahead of it
    accuracy_percent = metrics.accuracy_score(Y_test, predictions)*100
    report = metrics.classification_report(Y_test, predictions)

    print(report)
    print("Accuracy for our testing dataset with tuning is : {:.2f}%".format(accuracy_percent) )
    return accuracy_percent

In [None]:
def get_SVM_model():
    '''
    Build an unfitted grid search over SVM classifiers.

    The search covers every combination of C in (50, 100, 200) and kernel in
    ('linear', 'rbf') with 5-fold cross-validation, and is configured with
    refit=True, verbose=3 and n_jobs=-1.
    '''
    # Candidate hyper-parameters explored by the search
    search_space = {
        'C': [50, 100, 200],
        'kernel': ['linear', 'rbf'],
    }

    # Base classifier, created with probability estimates switched on
    classifier = svm.SVC(probability=True)

    return GridSearchCV(
        estimator=classifier,
        param_grid=search_space,
        refit=True,
        verbose=3,
        cv=5,
        n_jobs=-1,
    )

In [None]:
# Initialize SVM with Grid Search model
svm_grid_model = get_SVM_model()

# Hyper parameter tuning
optimized_model = optimize_model_parameter(svm_grid_model, tfidf_extractor_2gram, y_train)
# Classifier pipeline
model = pipeline.Pipeline([("vectorizer", tfidf_extractor_2gram),  
                           ("classifier", optimized_model)])

# Predict topic indices in x_test
y_test_hat = model.predict(x_test)

# Evaluate on test set
evaluate_model(model, x_test, y_test)