## STEP 1. Load Train-texts and Targets

In [81]:
import pickle

train_data_file = "data_train_new.pkl"

try:
    train_data_df = pickle.load( open(train_data_file, "rb") )
except IOError:
    print("PLEASE CHANGE CURRENT DIR TO ~/JVN_work/ml_adv\n")
    raise

In [82]:
train_data_df.head(5)

Unnamed: 0,target,text,tokens
0,1,công_ty\ncp\nđầu_tư\nthế_giới\ndi_động\n(\nmwg...,"[công_ty, cp, đầu_tư, thế_giới, di_động, mwg, ..."
1,1,triển_lãm\nvietnam\nhi-end\nshow\n2017\n(\nvhs...,"[triển_lãm, vietnam, hiend, show, vhs, khai_mạ..."
2,0,tuấn_anh\ntranh_thủ\nthời_gian\ngặp_gỡ\nvà\nch...,"[tuấn_anh, tranh_thủ, gặp_gỡ, chụp, ảnh, cđv, ..."
3,0,tổng_cục\nthống_kê\n(\nbộ\nkh\n&\nđt\n)\nvừa\n...,"[tổng_cục, thống_kê, kh, đt, công_bố, báo_cáo,..."
4,1,"theo\nphonearena\n,\nbà\nwang\ncho\nrằng\nkhi\...","[phonearena, wang, nhu_cầu, điện_toán, hình_dá..."


In [83]:
train_texts = train_data_df.tokens
train_targets = train_data_df.target

## STEP 2. Selecting Best Features

### STEP 2.1 Build the whole vocabulary

In [84]:
#Construct the term frequency matrix
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(preprocessor=lambda word: word,
                                   tokenizer=lambda word: word,
                                   min_df=0.001)

tf_matrix = count_vectorizer.fit_transform(train_texts)

In [85]:
from scipy.sparse import csr_matrix
tf_matrix.toarray() #Each row represents a document, each column represents a token

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [86]:
# get all the features (distinct tokens) in all the document set 
vocabulary=count_vectorizer.get_feature_names()
vocabulary

['a_tt',
 'aag',
 'abbank',
 'abc',
 'ac',
 'acb',
 'acer',
 'acid',
 'active',
 'add',
 'adidas',
 'adn',
 'adobe',
 'adreno',
 'ads',
 'af',
 'afc',
 'aff',
 'afghanistan',
 'afp',
 'agribank',
 'ai_cập',
 'ai_ngờ',
 'aids',
 'air',
 'airbnb',
 'airpods',
 'akg',
 'album',
 'aleksandr_kogan',
 'alexa',
 'alexis_sanchez',
 'alibaba',
 'all',
 'alpha',
 'alphabet',
 'always_on_display',
 'alwayson',
 'am_hiểu',
 'amazon',
 'ambient_mode',
 'amd',
 'america',
 'amoled',
 'an',
 'an_bình',
 'an_cư',
 'an_giang',
 'an_lành',
 'an_ninh',
 'an_phú',
 'an_sinh',
 'an_thần',
 'an_toàn',
 'an_toàn_lao_động',
 'an_tâm',
 'an_ủi',
 'analog',
 'and',
 'android',
 'anfield',
 'angelina_jolie',
 'anh_chàng',
 'anh_chị',
 'anh_chị_em',
 'anh_em',
 'anh_hùng',
 'anh_quân',
 'anh_trai',
 'anh_tuấn',
 'animoji',
 'antoine_griezmann',
 'antonio_conte',
 'antutu',
 'ao',
 'ao_ước',
 'ap',
 'apec',
 'apg',
 'api',
 'app',
 'app_store',
 'apple',
 'apsc',
 'apt',
 'ar',
 'argentina',
 'arizona',
 'arkit',


In [87]:
len(vocabulary)

9704

### STEP 2.2 Compute the Information Gain for each word in the vocabulary

In [88]:
from sklearn.feature_selection import mutual_info_classif
mutual_info = list(mutual_info_classif(tf_matrix, train_targets, discrete_features=True))
mutual_info

[0.0003328874355885657,
 0.0017025610180447831,
 0.00039305877054351013,
 0.0005504534056724217,
 0.0002715773500649878,
 0.0014401482601977635,
 0.0007876129212159601,
 0.0006404407173222685,
 0.0011497121792804366,
 0.0007955835390593103,
 0.00047229654027197135,
 0.0007219335006863481,
 0.0015642918800393407,
 0.0018408735288767504,
 0.0015642886518650316,
 0.0006894943546309653,
 0.0016569183969134757,
 0.0022627859639933116,
 0.00029795482965447426,
 0.00037715825828606706,
 0.0009890040223123197,
 0.0005734738614969867,
 3.449853577623847e-05,
 0.0001649477020528033,
 0.0015063343071862524,
 0.0007165001749677594,
 0.002163758975013731,
 0.0007354961385478225,
 0.000576654905622037,
 0.0017486607266101396,
 0.0030412671854928167,
 0.0013225292456436545,
 0.001210635410106362,
 0.00030335156557932924,
 0.0003712754073223024,
 0.001778740065212074,
 0.0008275130325077935,
 0.000873528144331929,
 0.00014572705529058267,
 0.008248996160607524,
 0.0009195477005730682,
 0.0013083246480

In [89]:
features_IG=dict(zip(vocabulary,mutual_info))

In [90]:
sorted(features_IG.items(), key = lambda kv: kv[1], reverse=True)

[('smartphone', 0.11797477227693086),
 ('tính_năng', 0.1096796287960413),
 ('thiết_bị', 0.10647265252509944),
 ('màn_hình', 0.1054578961688194),
 ('công_nghệ', 0.10229498035221979),
 ('apple', 0.0925144794413319),
 ('iphone', 0.08187732032437463),
 ('hãng', 0.08123621060284475),
 ('ứng_dụng', 0.08065473368489233),
 ('phiên_bản', 0.07628334236632785),
 ('sản_phẩm', 0.0694779687810578),
 ('di_động', 0.06814416033803436),
 ('samsung', 0.06520897560342732),
 ('máy', 0.06436102823708055),
 ('android', 0.06300738706671963),
 ('camera', 0.06203036366867728),
 ('trang_bị', 0.06094410326480964),
 ('điện_thoại', 0.055989946163837984),
 ('pin', 0.05326122174124682),
 ('inch', 0.05243398108720563),
 ('gb', 0.051352658959131145),
 ('tích_hợp', 0.05091283079125718),
 ('cho_phép', 0.05089336313649201),
 ('google', 0.050301586118503154),
 ('thiết_kế', 0.04844703329090971),
 ('tp', 0.047987606792114994),
 ('mạng', 0.04785351236821108),
 ('án', 0.04691498711004529),
 ('dữ_liệu', 0.04570245357656286),
 (

In [91]:
sorted(features_IG.items(), key = lambda kv: kv[1], reverse=False)

[('brunei', 4.5524461724649745e-08),
 ('nhằn', 4.5524461724649745e-08),
 ('chín_muồi', 5.122190654179709e-08),
 ('dễ_thở', 5.122190654179709e-08),
 ('nối_liền', 5.122190654179709e-08),
 ('phi_mã', 5.122190654179709e-08),
 ('thứ_ba', 5.122190654179709e-08),
 ('vô_căn_cứ', 5.122190654179709e-08),
 ('vùng_sâu_vùng_xa', 5.122190654179709e-08),
 ('xoá_sổ', 5.692088479985341e-08),
 ('chát', 6.26213944995499e-08),
 ('tồn', 6.832343947072231e-08),
 ('hiển_nhiên', 7.402701832515818e-08),
 ('dân_dụng', 7.402701832602554e-08),
 ('bao_quát', 7.973213156549364e-08),
 ('liên_thông', 8.543878235044329e-08),
 ('huỷ_diệt', 9.114696830018337e-08),
 ('ẩn_chứa', 1.2542837931608583e-07),
 ('nguyên_vẹn', 1.3114733714725962e-07),
 ('vòng_tròn', 1.4831345907743293e-07),
 ('trên_tài', 1.9415772626327557e-07),
 ('giữ_chân', 5.059434145925773e-07),
 ('tương_quan', 6.23854630757599e-07),
 ('xác_suất', 6.294790305606986e-07),
 ('nhức_nhối', 7.040524994559802e-07),
 ('trả_đũa', 7.152997838853065e-07),
 ('khuynh_hướ

### STEP 2.3 Set threshold to select best features

In [92]:
final_features=[w for w in features_IG.keys() if features_IG[w]>0.004]
final_features

['amazon',
 'android',
 'app_store',
 'apple',
 'ar',
 'arsenal',
 'assistant',
 'asus',
 'ban',
 'bao_gồm',
 'barca',
 'bgr',
 'bit',
 'biên_bản',
 'biến',
 'biến_chứng',
 'biểu_diễn',
 'biểu_hiện',
 'biểu_tượng',
 'bkav',
 'blackberry',
 'bluetooth',
 'bo',
 'bs',
 'buồn',
 'bv',
 'bàn',
 'bàn_bạc',
 'bàn_giao',
 'bàn_phím',
 'bào_chữa',
 'bác_sĩ',
 'bán_kết',
 'bán_lẻ',
 'bé',
 'béo',
 'bình_dương',
 'bóng',
 'bóng_đá',
 'bản_quyền',
 'bản_án',
 'bảo',
 'bảo_anh',
 'bảo_hành',
 'bảo_lâm',
 'bảo_mật',
 'bấm',
 'bất_động_sản',
 'bầu',
 'bầu_cử',
 'bắc',
 'bắt',
 'bắt_giữ',
 'bằng_sáng_chế',
 'bệnh',
 'bệnh_lý',
 'bệnh_nhân',
 'bệnh_viện',
 'bị_can',
 'bị_cáo',
 'bố',
 'bố_mẹ',
 'bồi_thường',
 'bộ_nhớ',
 'bộ_nhớ_trong',
 'bộ_vi_xử_lý',
 'bộ_xử_lý',
 'bụng',
 'bữa',
 'ca',
 'ca_khúc',
 'ca_sĩ',
 'cambridge_analytica',
 'camera',
 'can',
 'cao_cấp',
 'ceo',
 'ces',
 'champions_league',
 'chelsea',
 'chi',
 'chi_cục',
 'chi_nhánh',
 'chi_tiết',
 'chia_tay',
 'chip',
 'chiếm_đoạt',
 'chiến

In [93]:
best_features=dict(zip(final_features,range(len(final_features))))
best_features #This will be put in the vectorizer again for text representation

{'amazon': 0,
 'android': 1,
 'app_store': 2,
 'apple': 3,
 'ar': 4,
 'arsenal': 5,
 'assistant': 6,
 'asus': 7,
 'ban': 8,
 'bao_gồm': 9,
 'barca': 10,
 'bgr': 11,
 'bit': 12,
 'biên_bản': 13,
 'biến': 14,
 'biến_chứng': 15,
 'biểu_diễn': 16,
 'biểu_hiện': 17,
 'biểu_tượng': 18,
 'bkav': 19,
 'blackberry': 20,
 'bluetooth': 21,
 'bo': 22,
 'bs': 23,
 'buồn': 24,
 'bv': 25,
 'bàn': 26,
 'bàn_bạc': 27,
 'bàn_giao': 28,
 'bàn_phím': 29,
 'bào_chữa': 30,
 'bác_sĩ': 31,
 'bán_kết': 32,
 'bán_lẻ': 33,
 'bé': 34,
 'béo': 35,
 'bình_dương': 36,
 'bóng': 37,
 'bóng_đá': 38,
 'bản_quyền': 39,
 'bản_án': 40,
 'bảo': 41,
 'bảo_anh': 42,
 'bảo_hành': 43,
 'bảo_lâm': 44,
 'bảo_mật': 45,
 'bấm': 46,
 'bất_động_sản': 47,
 'bầu': 48,
 'bầu_cử': 49,
 'bắc': 50,
 'bắt': 51,
 'bắt_giữ': 52,
 'bằng_sáng_chế': 53,
 'bệnh': 54,
 'bệnh_lý': 55,
 'bệnh_nhân': 56,
 'bệnh_viện': 57,
 'bị_can': 58,
 'bị_cáo': 59,
 'bố': 60,
 'bố_mẹ': 61,
 'bồi_thường': 62,
 'bộ_nhớ': 63,
 'bộ_nhớ_trong': 64,
 'bộ_vi_xử_lý': 65,


In [94]:
len(best_features)

992

In [61]:
import pickle
with open('best_features_992.pickle', 'wb') as datapkl:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(best_features, datapkl, pickle.HIGHEST_PROTOCOL)

## STEP 3. Build a tf-idf representation using best_features as dimension

In [95]:
#Construct the term frequency matrix, feed best features in the vocabulary
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(preprocessor=lambda word: word,
                                   tokenizer=lambda word: word,
                                   vocabulary=best_features)

tf_matrix = count_vectorizer.fit_transform(train_texts)

In [96]:
#The tf-idf representation
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer(sublinear_tf=True)
train_texts_tfidf = tfidf_transformer.fit_transform(tf_matrix)

In [97]:
# Preview Train-Texts in this TF-IDF vector-space
import pandas as pd

def to_df(tfidf_texts, feature_names):
    doc_names = ['Doc{:d}'.format(idx) for idx, _ in enumerate(tfidf_texts)]

    df = pd.DataFrame(data=tfidf_texts.toarray(), index=doc_names, columns=feature_names)

    return df

In [98]:
to_df(train_texts_tfidf[:5], count_vectorizer.get_feature_names())

Unnamed: 0,amazon,android,app_store,apple,ar,arsenal,assistant,asus,ban,bao_gồm,...,động_thái,động_viên,đức,ưu_điểm,ảnh,ảo,ống_kính,ổ_cứng,ứng_dụng,…
Doc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.131231
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.136196,0.0,0.0,0.0,0.0,0.0
Doc3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc4,0.0,0.213986,0.0,0.111136,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
train_texts_tfidf.shape

(14891, 992)

## STEP 4. Tuning SVM parameters on a random subset

In [100]:
train_data_subset = train_data_df.sample(n=2000, random_state=107)

subset_texts = train_data_subset.tokens
subset_targets = train_data_subset.target

In [106]:
#Now construct a Pipeline with CountVectorizer, TfidfTransformer & LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

rbf_svm_pipe = Pipeline([
                    ('vectorizer', CountVectorizer(preprocessor=lambda word: word, 
                                                   tokenizer=lambda word: word,
                                                   min_df=0.001, 
                                                   max_df=0.75)),
                    ('tfidf', TfidfTransformer(sublinear_tf=True)),
                    ('clf', SVC(kernel='rbf'))
                 ])

In [109]:
#Now, set a "parameter-distribution" for C and do RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distributions = {"clf__C": uniform(1, 10), "clf__gamma": reciprocal(0.001, 0.1)}
rnd_search_cv = RandomizedSearchCV(rbf_svm_pipe, param_distributions, cv=5, n_iter=20, verbose=2)
rnd_search_cv.fit(subset_texts, subset_targets)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] clf__C=2.2060428048871454, clf__gamma=0.0011053739897619146 .....
[CV]  clf__C=2.2060428048871454, clf__gamma=0.0011053739897619146, total=   4.7s
[CV] clf__C=2.2060428048871454, clf__gamma=0.0011053739897619146 .....


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.1s remaining:    0.0s


[CV]  clf__C=2.2060428048871454, clf__gamma=0.0011053739897619146, total=   4.7s
[CV] clf__C=2.2060428048871454, clf__gamma=0.0011053739897619146 .....
[CV]  clf__C=2.2060428048871454, clf__gamma=0.0011053739897619146, total=   4.1s
[CV] clf__C=2.2060428048871454, clf__gamma=0.0011053739897619146 .....
[CV]  clf__C=2.2060428048871454, clf__gamma=0.0011053739897619146, total=   4.6s
[CV] clf__C=2.2060428048871454, clf__gamma=0.0011053739897619146 .....
[CV]  clf__C=2.2060428048871454, clf__gamma=0.0011053739897619146, total=   4.1s
[CV] clf__C=4.932589502577642, clf__gamma=0.010669985889857417 .......
[CV]  clf__C=4.932589502577642, clf__gamma=0.010669985889857417, total=   3.1s
[CV] clf__C=4.932589502577642, clf__gamma=0.010669985889857417 .......
[CV]  clf__C=4.932589502577642, clf__gamma=0.010669985889857417, total=   3.0s
[CV] clf__C=4.932589502577642, clf__gamma=0.010669985889857417 .......
[CV]  clf__C=4.932589502577642, clf__gamma=0.010669985889857417, total=   3.1s
[CV] clf__C=4

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  9.9min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=0.001,
        ngram_range=(1, 1),
        preprocessor=<function <lambda...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
          fit_params=None, iid=True, n_iter=20, n_jobs=1,
          param_distributions={'clf__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002063A2A8EB8>, 'clf__gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002063A2B7400>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [110]:
print(rnd_search_cv.best_params_)
print(rnd_search_cv.best_score_)

{'clf__C': 7.482025704572655, 'clf__gamma': 0.08809368102181363}
0.9675


## STEP 5. Using the best-parameter to train on full train_data

In [111]:
rbf_svm_clf = SVC(C=7.482025704572655, gamma=0.08809368102181363)
rbf_svm_clf.fit(train_texts_tfidf, train_targets)

SVC(C=7.482025704572655, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.08809368102181363,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [112]:
#Now, let's check the accuracy on Training Data
from sklearn.metrics import accuracy_score

train_preds = rbf_svm_clf.predict(train_texts_tfidf)
accuracy_score(train_targets, train_preds)

0.9819353972197972

## STEP 6. Test

In [113]:
test_data_file = "data_test_new.pkl"

try:
    test_data_df = pickle.load( open(test_data_file, "rb") )
except IOError:
    print("PLEASE CHANGE CURRENT DIR TO ~/JVN_work/ml_adv\n")
    raise

test_texts = test_data_df.tokens
test_targets = test_data_df.target

In [114]:
test_tf_matrix = count_vectorizer.transform(test_texts)
test_texts_tfidf = tfidf_transformer.transform(test_tf_matrix)

In [115]:
to_df(test_texts_tfidf[:5], count_vectorizer.get_feature_names())

Unnamed: 0,amazon,android,app_store,apple,ar,arsenal,assistant,asus,ban,bao_gồm,...,động_thái,động_viên,đức,ưu_điểm,ảnh,ảo,ống_kính,ổ_cứng,ứng_dụng,…
Doc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.081468,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc1,0.0,0.0,0.0,0.303702,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc2,0.26672,0.0,0.0,0.097589,0.0,0.0,0.0,0.0,0.0,0.094898,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091172,0.0
Doc3,0.0,0.098245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.135901,0.0,0.0,0.0,0.0,0.0
Doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119619


In [116]:
test_texts_tfidf.shape

(3723, 992)

In [117]:
test_preds = rbf_svm_clf.predict(test_texts_tfidf)
accuracy_score(test_targets, test_preds)

0.97313994090787