In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from py4j.java_gateway import java_import
from pyspark.mllib.common import _to_java_object_rdd

In [2]:
with open('W2V_model.pickle', 'rb') as handle:
    model = pickle.load(handle)

In [3]:
# Import vnTokenizer from Java
java_import(sc._gateway.jvm, "vn.vitk.tok.Tokenizer")
Tokenizer = sc._jvm.vn.vitk.tok.Tokenizer
dataFolder = os.getcwd() + '/dat/tok'
token = Tokenizer(sc._jsc, dataFolder + "/lexicon.xml", dataFolder + "/regexp.txt")

In [3]:
df = pd.read_csv('export_labeling.csv')

In [4]:
df['mix'] = 'xxx ' + df.content

In [5]:
all_text = ' '.join(list(df['mix']))
all_text = BeautifulSoup(all_text, "lxml").text

tokened = token.tokenizeOneLine(all_text)

NameError: name 'token' is not defined

In [6]:
df['content_toked'] = pd.Series(tokened.split('xxx')[1:])

NameError: name 'tokened' is not defined

In [6]:
def vectorize(text):
    sum_vec = np.zeros(100)
    for token in text.split():
        try:
            vec = model.wv.get_vector(token.lower())
            sum_vec += vec
        except KeyError:
            pass
    if np.count_nonzero(sum_vec)==0:
        return None
    else:
        return sum_vec


In [None]:
df['vec'] = df.content_toked.apply(vectorize)

df = df.dropna()

In [8]:
df[df.label!=0]

Unnamed: 0,id,content,content_length,label,article_link,last_update,mix
0,24929249,ác quỷ,6,-1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689860207,xxx ác quỷ
2,25097281,Cạn lời,7,-1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689851739,xxx Cạn lời
4,25098148,Vô phúc,7,-1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689851711,xxx Vô phúc
5,25105892,cạn lời,7,-1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689851752,xxx cạn lời
6,24926224,Mừng quá,8,1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689864096,xxx Mừng quá
7,25049905,Ngụy biện,9,-1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689851604,xxx Ngụy biện
8,24926213,Quá độc ác,10,-1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689864015,xxx Quá độc ác
10,25072944,Tởm lợm quá,11,-1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689853638,xxx Tởm lợm quá
13,24927439,Thật dã man.,12,-1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689859995,xxx Thật dã man.
14,24927706,Tội ác man rợ,13,-1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689860187,xxx Tội ác man rợ


In [10]:
df_copy = df.copy()

In [11]:
df = df_copy.copy()

X_train, X_test, y_train, y_test = train_test_split(df['vec'], df['label'], test_size=0.33)

clf = svm.SVC()
clf.fit(list(X_train), y_train)

y_pred = clf.predict(list(X_test))
y_true = y_test
confusion_matrix(y_true, y_pred)

array([[107,   2,   0],
       [ 65,   5,   0],
       [ 67,   0,   0]])

# no neutral

In [12]:
df = df_copy.copy()

df = df[df['label']!=0]

In [13]:
df = df[df['content_length']<100]

In [14]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import neighbors
from sklearn.neural_network import MLPClassifier

# tuning

In [15]:
# SVM
parameters = {'C': np.arange(0.1, 2.0, 0.1), 'kernel':('linear', 'rbf', 'sigmoid'), }
tmp_model = svm.SVC()
svc = GridSearchCV(tmp_model, parameters)
svc.fit(list(df['vec']), df['label'])
svc_best_params_ = svc.best_params_
svc_best_score_ = svc.best_score_
print('SVM', svc_best_params_, svc_best_score_)


# MLPClassifier
parameters = {'activation':('identity', 'logistic', 'tanh', 'relu'),
              'hidden_layer_sizes': [(u, l) for u in np.arange(100, 200, 20) for l in np.arange(1,3)],
              'solver': ('lbfgs', )}
tmp_model = MLPClassifier()
mlp_model = GridSearchCV(tmp_model, parameters)
mlp_model.fit(list(df['vec']), df['label'])
mlp_best_params_ = mlp_model.best_params_
mlp_best_score_ = mlp_model.best_score_
print('MLPClassifier', mlp_best_params_, mlp_best_score_)


# Decision Tree
parameters = {'criterion': ('gini', 'entropy')}
tmp_model = DecisionTreeClassifier()
dt_model = GridSearchCV(tmp_model, parameters)
dt_model.fit(list(df['vec']), df['label'])
dt_best_params_ = dt_model.best_params_
dt_best_score_ = dt_model.best_score_
print('Decision Tree', dt_best_params_, dt_best_score_)


# Random Forest
parameters = {'criterion': ('gini', 'entropy'),
             'n_estimators': range(5,20)}
tmp_model = RandomForestClassifier()
rf_model = GridSearchCV(tmp_model, parameters)
rf_model.fit(list(X_train), y_train)
rf_best_score_ = rf_model.best_score_
rf_best_params_ = rf_model.best_params_
print('Random forest', rf_best_params_, rf_best_score_)


# KNN
parameters = {'metric': ('euclidean', 'cosine', 'minkowski'),
             'n_neighbors': range(5, 100, 5)}
tmp_model = neighbors.KNeighborsClassifier()
knn_model = GridSearchCV(tmp_model, parameters)
knn_model.fit(list(X_train), y_train)
knn_best_params_ = knn_model.best_params_
knn_best_score_ = knn_model.best_score_
print('KNN', knn_best_params_, knn_best_score_)

SVM {'C': 0.2, 'kernel': 'linear'} 0.698170731707317
MLPClassifier {'activation': 'tanh', 'hidden_layer_sizes': (120, 1), 'solver': 'lbfgs'} 0.7286585365853658
Decision Tree {'criterion': 'entropy'} 0.6646341463414634
Random forest {'criterion': 'gini', 'n_estimators': 13} 0.5410821643286573
KNN {'metric': 'cosine', 'n_neighbors': 10} 0.5671342685370742


# 70 - 30 train test

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df['vec'], df['label'], test_size=0.33)

# SVM
svm_model = svm.SVC(**svc_best_params_)
svm_model.fit(list(X_train), y_train)

y_pred = svm_model.predict(list(X_test))
y_true = y_test
print('SVM')
print(confusion_matrix(y_true, y_pred))
print(accuracy_score(y_true, y_pred))


# MLPClassifier
mlp_model = MLPClassifier(**mlp_best_params_)
mlp_model.fit(list(X_train), y_train)

y_pred = mlp_model.predict(list(X_test))
y_true = y_test
print('MLPClassifier')
print(confusion_matrix(y_true, y_pred))
print(accuracy_score(y_true, y_pred))


# DT
dt_model = DecisionTreeClassifier(**dt_best_params_)
dt_model.fit(list(X_train), y_train)

y_pred = dt_model.predict(list(X_test))
y_true = y_test
print('Decision Tree')
print(confusion_matrix(y_true, y_pred))
print(accuracy_score(y_true, y_pred))

# Random Forest
rf_model = RandomForestClassifier(**rf_best_params_)
rf_model.fit(list(X_train), y_train)

y_pred = rf_model.predict(list(X_test))
y_true = y_test
print('Random forest')
print(confusion_matrix(y_true, y_pred))
print(accuracy_score(y_true, y_pred))

# KNN
knn_model = neighbors.KNeighborsClassifier(**knn_best_params_)
knn_model.fit(list(X_train), y_train)

y_pred = knn_model.predict(list(X_test))
y_true = y_test
print('KNN')
print(confusion_matrix(y_true, y_pred))
print(accuracy_score(y_true, y_pred))

SVM
[[48 13]
 [18 30]]
0.7155963302752294
MLPClassifier
[[52  9]
 [18 30]]
0.7522935779816514
Decision Tree
[[39 22]
 [19 29]]
0.6238532110091743
Random forest
[[54  7]
 [27 21]]
0.6880733944954128
KNN
[[59  2]
 [28 20]]
0.7247706422018348


# 10 fold cross validation

In [17]:
kf = KFold(n_splits=10)
X = df['vec'].tolist()
y = df['label'].tolist()

svm_result = []
mlp_result = []
dt_result = []
rf_result = []
knn_result = []

for train_index, test_index in kf.split(X):
    X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
    
    
    # SVM
    svm_model = svm.SVC(**svc_best_params_)
    svm_model.fit(list(X_train), y_train)

    y_pred = svm_model.predict(list(X_test))
    y_true = y_test
    svm_result.append(accuracy_score(y_true, y_pred))


    # MLPClassifier
    mlp_model = MLPClassifier(**mlp_best_params_)
    mlp_model.fit(list(X_train), y_train)

    y_pred = mlp_model.predict(list(X_test))
    y_true = y_test
    
    mlp_result.append(accuracy_score(y_true, y_pred))

    # DT
    dt_model = DecisionTreeClassifier(**dt_best_params_)
    dt_model.fit(list(X_train), y_train)

    y_pred = dt_model.predict(list(X_test))
    y_true = y_test
    
    dt_result.append(accuracy_score(y_true, y_pred))
    
    # Random Forest
    rf_model = RandomForestClassifier(**rf_best_params_)
    rf_model.fit(list(X_train), y_train)

    y_pred = rf_model.predict(list(X_test))
    y_true = y_test
    
    rf_result.append(accuracy_score(y_true, y_pred))

    # KNN
    knn_model = neighbors.KNeighborsClassifier(**knn_best_params_)
    knn_model.fit(list(X_train), y_train)

    y_pred = knn_model.predict(list(X_test))
    y_true = y_test
    
    knn_result.append(accuracy_score(y_true, y_pred))

In [18]:
print('SVM')
print(np.mean(svm_result))

print('MLPClassifier')
print(np.mean(mlp_result))

print('Decision Tree')
print(np.mean(dt_result))

print('Random forest')
print(np.mean(rf_result))

print('KNN')
print(np.mean(knn_result))

SVM
0.6890151515151516
MLPClassifier
0.6981060606060606
Decision Tree
0.5943181818181819
Random forest
0.6891098484848485
KNN
0.7253787878787878


# Some test case

In [20]:
svm_model = svm.SVC(**svc_best_params_)
svm_model.fit(df['vec'].tolist(), df['label'])

SVC(C=0.2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
with open('SVM_model.pickle', 'wb') as handle:
    pickle.dump(svm_model, handle)

In [2]:
with open('SVM_model.pickle', 'rb') as handle:
    svm_model = pickle.load(handle)

In [22]:
input_text = 'Toà chính xác'
input_text_tokened = token.tokenizeOneLine(input_text)
print(input_text_tokened)

input_vec = vectorize(input_text_tokened)
print(input_vec)

svm_model.predict([input_vec])

Toà chính_xác
[ 1.04086304 -1.92496395 -2.2725938  -3.36352062  0.65497446 -1.0380941
 -1.40365517 -1.23997307 -2.34971216 -4.96927071  5.78147244 -0.37644911
  1.36498615 -0.45285167  3.99076641  0.37398171  0.29981311  4.04796124
  2.52687651  1.0552505  -0.709086    2.65707654  0.50446248  4.54734731
  1.70417474 -3.06812149  1.31861359  1.1428055  -0.88251288  0.46178843
 -0.88175142 -1.4287641   0.22639611  4.46257627  3.52507997 -0.01099262
  2.08479345  1.71934464 -4.09721231  1.71379673 -0.91186213  0.42443816
 -2.60398644  0.96475387  1.04659924 -0.68887708  1.74847394 -0.86631691
 -2.46199211  5.4675169   1.93725976 -2.42202592  0.71491751  0.10406733
  2.26181099 -0.93562829 -3.57759911 -1.3361131  -0.45010722  0.45893703
 -3.74703455 -1.9961049   0.0588237   0.65915304  2.71249878 -2.35377049
 -1.28461584 -4.05415544 -2.00214553  2.3623054  -0.99852804 -1.18645543
 -0.89682198 -2.93126106 -0.2420294  -1.87509593 -1.97272527 -2.02930868
  6.22473693 -2.78950357  1.13638377  

array([1])

In [23]:
input_text = 'phóng thích'
print(input_text)

input_vec = vectorize(input_text)
print(input_vec)

svm_model.predict([input_vec])

phóng thích
[ 0.88519126  1.24548894 -4.21519673  3.58301377  4.40064788  2.14076394
  0.18499291 -0.82497645 -0.65972912  2.16263014 -3.83733255 -0.64181045
  1.24848461  1.96826684 -1.44579339 -1.29251672 -3.83400929  1.10671449
 -0.8674835  -1.33810878  3.0971595   1.14340162  1.47882374  5.28486645
 -1.3371141  -0.20613241  1.60149565  5.28643835 -2.47314632  0.95038332
 -1.51831907 -0.05362344  0.1968857  -2.03402279 -3.31298792 -0.37254223
 -3.56349117 -1.58798486 -2.48578686  2.55279529 -2.5375724  -1.46365303
 -1.05453461  2.5540086   2.35941175 -2.76987461  0.23198116 -2.37788355
 -3.34298253 -1.99145055  3.31337368 -3.50806566 -1.35975727  1.70879889
  2.96593702  1.46172008  2.31203239  0.04384136 -2.46919858  2.8782922
 -1.26179178  3.98629177 -2.63742781 -0.57672548 -0.83039224  0.95744503
 -2.19989383 -0.34680033 -1.97662377  4.44865543 -4.67635846  2.94090502
  3.24305439 -4.80531585  1.51361454 -0.36265254  4.29663233  2.9986608
  2.19400528 -0.57978415 -2.74514258 -3.2

array([-1])

In [12]:
input_text = 'Toà chuẩn_xác'
print(input_text)

input_vec = vectorize(input_text)
print(input_vec)

svm_model.predict([input_vec])

Toà chuẩn_xác
[ 3.12395886 -1.71248525 -1.07526982 -1.74302465 -1.09150317 -1.35445732
  0.33230796  1.2434752  -1.97452354 -4.08011109  3.65230809  0.31954074
 -0.18640713 -0.77087851  2.3574025   2.09721816  0.24956012  2.82662047
  0.51012072  0.16586852 -1.04089701  0.95165598 -0.2123821   1.87117118
  2.10885926 -0.44988908 -0.61934026  0.94063101 -0.59436838  0.52972583
  0.1209131   0.51506531 -0.38584746  1.66878107  2.71174699  0.14360809
  2.70437938  1.6373197  -3.87863113 -0.12493852  1.19740418 -0.42455344
 -1.08581465 -0.90078083  0.15878126  0.09728709  2.0748312   0.12572613
 -0.46732447  3.61276358  0.64415652 -1.32756603  1.06424006 -2.5794235
  0.84034064 -1.5954741  -2.72441925 -1.91838862 -1.53266558  0.42106051
 -2.70605463 -1.04928905 -1.15855955 -0.39200042  1.28156802 -1.59996426
 -0.59072813 -4.51184477 -0.99117306  1.54005408 -0.33815856 -0.12425156
 -2.52384387 -2.25602791  0.62670586 -1.38237315 -1.70013021 -1.55540407
  2.98339766 -1.56328279 -0.69881999  

array([1])

In [15]:
model.wv.most_similar('toà')

[('tòa', 0.7758179903030396),
 ('phiên_toà', 0.6296157836914062),
 ('hđxx', 0.553084671497345),
 ('toà_án', 0.5176868438720703),
 ('phúc_thẩm', 0.5169740915298462),
 ('tòa_trọng_tài', 0.49995505809783936),
 ('toà_trọng_tài', 0.49825742840766907),
 ('giám_đốc_thẩm', 0.4915924668312073),
 ('tài_phiệt', 0.48819756507873535),
 ('anthem_inc', 0.47171491384506226)]

In [14]:
model.wv.most_similar('tòa')

[('toà', 0.77581787109375),
 ('nếu_tòa', 0.6361499428749084),
 ('tu_chính', 0.6249154806137085),
 ('lập_pháp', 0.5303572416305542),
 ('giám_đốc_thẩm', 0.5277676582336426),
 ('sputnik_tòa', 0.5262762308120728),
 ('toà_án', 0.526037871837616),
 ('cao_tầng', 0.5111631155014038),
 ('căn', 0.4936997890472412),
 ('xử', 0.4921284317970276)]

In [32]:
model.wv.most_similar('trắc_nghiệm')

[('thi_trắc_nghiệm', 0.5748363733291626),
 ('hình_học', 0.5364856719970703),
 ('đáp_án', 0.5322859287261963),
 ('chính_tả', 0.5321452617645264),
 ('mk', 0.5043061971664429),
 ('-53-65', 0.4993148744106293),
 ('quiz', 0.48665499687194824),
 ('vựng', 0.48498111963272095),
 ('thuyết_tương_đối', 0.4842168390750885),
 ('giải_nghĩa', 0.4841591715812683)]

In [173]:
df[df.content.str.contains('chính')]

Unnamed: 0,id,content,content_length,label,article_link,last_update,mix,content_toked,vec
27,25082608,Toà chính xác!,14,1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689855946,xxx Toà chính xác!,Toà chính_xác !,"[-2.32006072998, -1.82127258927, -2.2872030613..."
505,25074608,"nghiện chứ ca sĩ gi? ai công nhận đâu, dùng từ...",96,1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689851981,"xxx nghiện chứ ca sĩ gi? ai công nhận đâu, dùn...","nghiện chứ ca_sĩ gi ? ai công_nhận đâu , dùng...","[-15.8777293116, 3.46057801694, -10.4677679464..."


In [169]:
df[df.label==1]

Unnamed: 0,id,content,content_length,label,article_link,last_update,mix,content_toked,vec
6,24926224,Mừng quá,8,1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689864096,xxx Mừng quá,Mừng quá,"[0.350542515516, 0.597414374352, 0.22443556785..."
27,25082608,Toà chính xác!,14,1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689855946,xxx Toà chính xác!,Toà chính_xác !,"[-2.32006072998, -1.82127258927, -2.2872030613..."
30,25070886,Hay quá bạn :)),15,1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689853709,xxx Hay quá bạn :)),Hay quá bạn : )),"[-6.59535965323, 2.57714438438, -2.30702804029..."
32,25080605,Tôi ủng hộ GRAB,15,1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689855923,xxx Tôi ủng hộ GRAB,Tôi ủng_hộ GRAB,"[-5.38799283653, 1.40645685606, -1.87748279423..."
33,24926164,Sớm đền tội thôi,16,1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689864079,xxx Sớm đền tội thôi,Sớm đền_tội thôi,"[-1.64083902282, -2.92613292485, -0.1233732998..."
34,24926211,Tin vui đầu năm.,16,1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689864066,xxx Tin vui đầu năm.,Tin_vui đầu năm .,"[2.01233011484, 2.00089620019, 3.68940339983, ..."
39,24926521,công an Quá tài!,18,1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689865624,xxx công an Quá tài!,công_an Qua ́ ta ̀ i !,"[-8.23035168648, 3.88569267839, 4.20050807018,..."
40,24926562,CA vn thật là hay.,18,1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689865702,xxx CA vn thật là hay.,CA vn thật là hay .,"[-3.61269539595, -2.6696883142, -6.16903573275..."
44,24926119,Tin này vui hơn Tết,19,1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689864047,xxx Tin này vui hơn Tết,Tin này vui hơn Tết,"[-5.31304593384, 1.04110825062, 5.38951149583,..."
62,24927596,tội mấy&nbsp;đứa nhỏ,20,1,https://vnexpress.net/tin-tuc/phap-luat/xxx-37...,1520689860170,xxx tội mấy&nbsp;đứa nhỏ,tội mấy đứa nhỏ,"[4.44041395187, 1.15158653259, 1.22274839878, ..."
