### **STEP 1**. Load Train-texts and Targets

In [2]:
import pickle

train_data_file = "data_train_new.pkl"

try:
    train_data_df = pickle.load( open(train_data_file, "rb") )
except IOError:
    print("PLEASE CHANGE CURRENT DIR TO ~/JVN_work/ml_adv\n")
    raise

In [3]:
train_texts = train_data_df.tokens
train_targets = train_data_df.target

In [7]:
train_texts[2]

['hoàng_nam',
 'đánh',
 'cặp',
 'quốc_khánh',
 'đôi',
 'tranh',
 'tài',
 'vòng',
 'xuất',
 'đặc_cách',
 'chủ',
 'hai',
 'tay_vợt',
 'việt_nam',
 'thắng',
 'liền',
 'trận',
 'tiến',
 'chung_kết',
 'bán_kết',
 'hoàng_nam',
 'quốc_khánh',
 'đối_đầu',
 'kaliyanda_poonacha',
 'shanmugam',
 'set',
 'đấu',
 'diễn',
 'căng_thẳng',
 'giằng_co',
 'điểm_số',
 'hai',
 'tay_vợt',
 'việt_nam',
 'giành',
 'break',
 'game',
 'đối_phương',
 'bẻ',
 'giao',
 'thành_công',
 'cổ_vũ',
 'khán_giả',
 'hoàng_nam',
 'quốc_khánh',
 'thi_đấu',
 'nỗ_lực',
 'ăn_ý',
 'giành',
 'break',
 'game',
 'thứ_bảy',
 'đi',
 'thắng_lợi',
 'set',
 'đấu',
 'hai',
 'quốc_khánh',
 'toả',
 'tay_vợt',
 'đánh_đôi',
 'việt_nam',
 'pha',
 'đánh',
 'kỹ_thuật',
 'khắc',
 'chế',
 'đôi',
 'đối_thủ',
 'sở_hữu',
 'cú',
 'phát_bóng',
 'uy_lực',
 'hai',
 'tay_vợt',
 'việt_nam',
 'giành',
 'chiến_thắng',
 'dễ_dàng',
 'lọt',
 'chung_kết',
 'trận',
 'đấu',
 'giải',
 'hoàng_nam',
 'quốc_khánh',
 'đụng_độ',
 'đôi',
 'hạt_giống',
 'số_một',
 'morita

### **STEP 2.1**. Build a Term-frequency Matrix (Word-Thresholding)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(preprocessor=lambda word: word,
                                   tokenizer=lambda word: word,
                                   min_df=6, max_df=0.75)

tf_matrix = count_vectorizer.fit_transform(train_texts)
tf_matrix.shape

(14891, 16419)

In [78]:
count_vectorizer.get_feature_names()[:10]

['_kiều',
 '_son',
 'a-series',
 'a-xít',
 'aaa',
 'aag',
 'aami',
 'abass',
 'abbank',
 'abbott']

In [79]:
count_vectorizer.get_feature_names()[-10:]

['ứng_phó', 'ứng_tuyển', 'ứng_viên', 'ứng_xử', 'ửng', 'ỷ', '‘', '’', '•', '…']

### **STEP 2.2. Construct a TF-IDF vector-space**

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer(sublinear_tf=True)
train_texts_tfidf = tfidf_transformer.fit_transform(tf_matrix)

Preview Train-Texts in this TF-IDF vector-space

In [81]:
import pandas as pd

def to_df(tfidf_texts, feature_names):
    doc_names = ['Doc{:d}'.format(idx) for idx, _ in enumerate(tfidf_texts)]

    df = pd.DataFrame(data=tfidf_texts.toarray(), index=doc_names, columns=feature_names)

    return df

In [82]:
to_df(train_texts_tfidf[:5], count_vectorizer.get_feature_names())

Unnamed: 0,_kiều,_son,a-series,a-xít,aaa,aag,aami,abass,abbank,abbott,...,ứng_phó,ứng_tuyển,ứng_viên,ứng_xử,ửng,ỷ,‘,’,•,…
Doc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048369
Doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
train_texts_tfidf.shape

(14891, 16419)

### **STEP 3**. Tuning SVM parameters on a random subset

In [17]:
train_data_subset = train_data_df.sample(n=2000, random_state=107)

subset_texts = train_data_subset.tokens
subset_targets = train_data_subset.target

Now construct a Pipeline with **CountVectorizer**, **TfidfTransformer** & **LinearSVC**

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

linear_svm_pipe = Pipeline([
                    ('vectorizer', CountVectorizer(preprocessor=lambda word: word, 
                                                   tokenizer=lambda word: word,
                                                   min_df=6, 
                                                   max_df=0.75)),
                    ('tfidf', TfidfTransformer(sublinear_tf=True)),
                    ('clf', LinearSVC(loss='hinge'))
                 ])

Now, set a "parameter-distribution" for **C** and do **RandomizedSearchCV**

In [19]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_distributions = {"clf__C": uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(linear_svm_pipe, param_distributions, cv=5, n_iter=60, verbose=2)
rnd_search_cv.fit(subset_texts, subset_targets)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV] clf__C=7.987090065165496 ........................................
[CV] ......................... clf__C=7.987090065165496, total=   0.3s
[CV] clf__C=7.987090065165496 ........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] ......................... clf__C=7.987090065165496, total=   0.2s
[CV] clf__C=7.987090065165496 ........................................
[CV] ......................... clf__C=7.987090065165496, total=   0.2s
[CV] clf__C=7.987090065165496 ........................................
[CV] ......................... clf__C=7.987090065165496, total=   0.3s
[CV] clf__C=7.987090065165496 ........................................
[CV] ......................... clf__C=7.987090065165496, total=   0.3s
[CV] clf__C=4.099430609533856 ........................................
[CV] ......................... clf__C=4.099430609533856, total=   0.3s
[CV] clf__C=4.099430609533856 ........................................
[CV] ......................... clf__C=4.099430609533856, total=   0.2s
[CV] clf__C=4.099430609533856 ........................................
[CV] ......................... clf__C=4.099430609533856, total=   0.2s
[CV] clf__C=4.099430609533856 ........................................
[CV] .

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  2.0min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=6,
        ngram_range=(1, 1),
        preprocessor=<function <lambda> at...e', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0))]),
          fit_params=None, iid=True, n_iter=60, n_jobs=1,
          param_distributions={'clf__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe2dc820d30>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

Now, find info of the best-parameter for C

In [20]:
print(rnd_search_cv.best_params_)
print(rnd_search_cv.best_score_)

{'clf__C': 1.105793149251442}
0.9525


### **STEP 4. Using the best-parameter to train on full train_data**

In [21]:
from time import time

t0 = time()

final_clf = LinearSVC(C=1.2, loss='hinge')
final_clf.fit(train_texts_tfidf, train_targets)

round(time() - t0, 2) 

0.37

Now, let's check the accuracy on Training Data

In [22]:
from sklearn.metrics import accuracy_score

train_preds = final_clf.predict(train_texts_tfidf)
accuracy_score(train_targets, train_preds)

0.9940903901685582

### **STEP 5. Let's TEST!!!**

In [10]:
test_data_file = "data_test_new.pkl"

try:
    test_data_df = pickle.load( open(test_data_file, "rb") )
except IOError:
    print("PLEASE CHANGE CURRENT DIR TO ~/JVN_work/ml_adv\n")
    raise

test_texts = test_data_df.tokens
test_targets = test_data_df.target

Now, let's put the test-set into our "balanced" TF-IDF vector-space

In [11]:
test_tf_matrix = count_vectorizer.transform(test_texts)
test_texts_tfidf = tfidf_transformer.transform(test_tf_matrix)

In [92]:
to_df(test_texts_tfidf[:5], count_vectorizer.get_feature_names())

Unnamed: 0,_kiều,_son,a-series,a-xít,aaa,aag,aami,abass,abbank,abbott,...,ứng_phó,ứng_tuyển,ứng_viên,ứng_xử,ửng,ỷ,‘,’,•,…
Doc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
test_texts_tfidf.shape

(3723, 16419)

Final step, prediction. Yayyyyyyyy!

In [15]:
t0 = time()

test_preds = final_clf.predict(test_texts_tfidf)
accuracy_score(test_targets, test_preds)

time() - t0

0.006906747817993164