# Support Vector Machine

In [1]:
import logging, os, warnings
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.utils import shuffle
from data_helper import XY_from_df
from sklearn.model_selection import StratifiedKFold
from bincls import BinaryClassificationAverageReport
from sklearn.metrics import classification_report, confusion_matrix

# sklearn ignore warnings
warnings.filterwarnings('ignore')

## Data Preparing

In [2]:
SEED = 5
FOLD = 10
TRAIN_PATH = "../dataset/train.csv"
TARGET_NAMES = ["bad", "good"]

np.random.seed(SEED)
df_train = pd.read_csv(TRAIN_PATH)
X, Y = XY_from_df(df_train)

stratified_folder = StratifiedKFold(n_splits=FOLD, random_state=SEED, shuffle=False)

### SVC 重要參數 （主要調節的參數有：C、kernel、degree、gamma、coef0）
 
- C(懲罰參數)：默認值是1.0，越高 train 時 bias 變低但 variance 變高，越低反之
- kernel：默認是rbf ，可以是'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' 
    - 0 linear：u'v
    - 1 polynomial：(gamma*u'*v + coef0)^degree
    - 2 ridial bias function：exp(-gamma|uv|^2)
    - 3 sigmoid：tanh(gamma*u'*v + coef0)
- degree：多項式poly 函數的維度，默認是3 ，選擇其他核函數時會被忽略。
- gamma  ： rbf,poly,sigmoid kernel的參數。默認是'auto'，則會選擇1/n_features gamma是选择RBF函数作为kernel后，该函数自带的一个参数。隐含地决定了数据映射到新的特征空间后的分布，gamma越大，支持向量越少，gamma值越小，支持向量越多。支持向量的个数影响训练与预测的速度。
- coef0  ： 對於'poly'和'sigmoid'有用
- class_weight：類別的權重，字典形式傳遞。設置第幾類的參數C 為weight*C(C-SVC 中的C) 
- random_state：seed

In [3]:
def cross_valid_process(stratified_folder, X, Y, model, report, mode="report", up=False):
    
    def upsampling(X,Y,train_index):
        pos = []
        for i, t in enumerate(Y[train_index]):
            if t == 0: pos.append(i)
        X_new = np.append(X[train_index], X[pos], axis=0)
        Y_new = np.append(Y[train_index], Y[pos], axis=0)
        idxs = [i for i in range(len(Y_new))]
        idxs = shuffle(idxs, random_state=3)
        return X_new[idxs], Y_new[idxs]
    
    for train_index, valid_index in stratified_folder.split(X, Y):
        if up:
            X_train ,Y_train = upsampling(X,Y,train_index)
        else:
            X_train ,Y_train = X[train_index], Y[train_index]
        if mode == "report": print(".", end=" ")
        m = model
        m.fit(X_train, Y_train)
        Y_valid_pred = m.predict(X[valid_index])
        cm = confusion_matrix(Y[valid_index], Y_valid_pred)
        report.cm_append(cm)
    if mode == "report":
        report.avg_cm_report()
        return None
    if mode == "obj":
        return report.object_score()

## Arguments Combinations

In [4]:
max_obj_score = 0
candidates = []

C = [0.5,1.0,1.5]
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
gamma = [0.01, 0.02, 0.001]

for c in C:
    for k in kernel:
        for g in gamma:
            for up in [True, False]:
                print(c,k,g,up)
                report = BinaryClassificationAverageReport(TARGET_NAMES)
                svc = svm.SVC(C=c, kernel=k, random_state=SEED, class_weight="balanced")
                obj_score = cross_valid_process(stratified_folder, X, Y, svc, report, mode="obj", up=up)
                if obj_score >= max_obj_score:
                    candidates.append((c, k, g, up, obj_score))
                    max_obj_score = obj_score

0.5 linear 0.01 True
0.5 linear 0.01 False
0.5 linear 0.02 True
0.5 linear 0.02 False
0.5 linear 0.001 True
0.5 linear 0.001 False
0.5 poly 0.01 True
0.5 poly 0.01 False
0.5 poly 0.02 True
0.5 poly 0.02 False
0.5 poly 0.001 True
0.5 poly 0.001 False
0.5 rbf 0.01 True
0.5 rbf 0.01 False
0.5 rbf 0.02 True
0.5 rbf 0.02 False
0.5 rbf 0.001 True
0.5 rbf 0.001 False
0.5 sigmoid 0.01 True
0.5 sigmoid 0.01 False
0.5 sigmoid 0.02 True
0.5 sigmoid 0.02 False
0.5 sigmoid 0.001 True
0.5 sigmoid 0.001 False
1.0 linear 0.01 True
1.0 linear 0.01 False
1.0 linear 0.02 True
1.0 linear 0.02 False
1.0 linear 0.001 True
1.0 linear 0.001 False
1.0 poly 0.01 True
1.0 poly 0.01 False
1.0 poly 0.02 True
1.0 poly 0.02 False
1.0 poly 0.001 True
1.0 poly 0.001 False
1.0 rbf 0.01 True
1.0 rbf 0.01 False
1.0 rbf 0.02 True
1.0 rbf 0.02 False
1.0 rbf 0.001 True
1.0 rbf 0.001 False
1.0 sigmoid 0.01 True
1.0 sigmoid 0.01 False
1.0 sigmoid 0.02 True
1.0 sigmoid 0.02 False
1.0 sigmoid 0.001 True
1.0 sigmoid 0.001 False


In [5]:
candidates

[(0.5, 'linear', 0.01, True, 0.6708387852530966),
 (0.5, 'linear', 0.01, False, 0.6763903282633272),
 (0.5, 'linear', 0.02, False, 0.6763903282633272),
 (0.5, 'linear', 0.001, False, 0.6763903282633272),
 (0.5, 'poly', 0.01, True, 0.7244543289230052),
 (0.5, 'poly', 0.02, True, 0.7244543289230052),
 (0.5, 'poly', 0.001, True, 0.7244543289230052),
 (1.5, 'rbf', 0.01, True, 0.7327025135768004),
 (1.5, 'rbf', 0.02, True, 0.7327025135768004),
 (1.5, 'rbf', 0.001, True, 0.7327025135768004)]

## Robustness and Performance Good

In [6]:
c = 1.5
k = 'rbf'
g = 0.001

report = BinaryClassificationAverageReport(TARGET_NAMES)

svc = svm.SVC(C=c, kernel=k, gamma=g, random_state=SEED, class_weight="balanced")

cross_valid_process(stratified_folder, X, Y, svc, report, "report", True)

. . . . . . . . . . 

Below number are the average of 10 fold.

bad
             precision:    49.87%
                recall:    63.33%
                    F1:    55.61%
good
             precision:    82.01%
                recall:    72.00%
                    F1:    76.56%
---------------------------------
             weight_F1:    70.28%
                   acc:    69.40%



## TRAIN & VALIDATION

In [51]:
svc = svm.SVC(C=1.0, kernel='linear', degree=5, random_state=SEED, class_weight={0:7.,1:1.})
svc = svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_valid)
print(confusion_matrix(Y_valid, Y_pred))
print(classification_report(Y_valid, Y_pred, target_names=["bad", "good"]))

[[55  9]
 [82 54]]
              precision    recall  f1-score   support

         bad       0.40      0.86      0.55        64
        good       0.86      0.40      0.54       136

    accuracy                           0.55       200
   macro avg       0.63      0.63      0.54       200
weighted avg       0.71      0.55      0.54       200



In [52]:
Y_test_pred = svc.predict(X_test)

print(confusion_matrix(Y_test, Y_test_pred))
print(classification_report(Y_test, Y_test_pred, target_names=["bad", "good"]))

[[55  4]
 [86 55]]
              precision    recall  f1-score   support

         bad       0.39      0.93      0.55        59
        good       0.93      0.39      0.55       141

    accuracy                           0.55       200
   macro avg       0.66      0.66      0.55       200
weighted avg       0.77      0.55      0.55       200



In [53]:
def model_score(bad_recall, w_F1):
    score = 0.6* bad_recall + 0.4 * w_F1
    print("model score:",score)
    
model_score(0.93, 0.55)

model score: 0.778


## SVM Report

### baseline

- c=1.0, kernel='linear', degree=4

```

[validation]

[[ 14  45]
 [ 22 119]]
              precision    recall  f1-score   support

         bad       0.39      0.24      0.29        59
        good       0.73      0.84      0.78       141

    accuracy                           0.67       200
   macro avg       0.56      0.54      0.54       200
weighted avg       0.63      0.67      0.64       200

[test]

[[ 17  42]
 [ 15 126]]
              precision    recall  f1-score   support

         bad       0.53      0.29      0.37        59
        good       0.75      0.89      0.82       141

    accuracy                           0.71       200
   macro avg       0.64      0.59      0.59       200
weighted avg       0.69      0.71      0.69       200

model score: 0.43


```

### class_weight

- c=1.0, kernel='linear' degree=5, class_weight={0:7.,1:1.}

```
[validation]

[[55  9]
 [82 54]]
              precision    recall  f1-score   support

         bad       0.40      0.86      0.55        64
        good       0.86      0.40      0.54       136

    accuracy                           0.55       200
   macro avg       0.63      0.63      0.54       200
weighted avg       0.71      0.55      0.54       200




[test]

[[55  4]
 [86 55]]
              precision    recall  f1-score   support

         bad       0.39      0.93      0.55        59
        good       0.93      0.39      0.55       141

    accuracy                           0.55       200
   macro avg       0.66      0.66      0.55       200
weighted avg       0.77      0.55      0.55       200

model score: 0.778

```

### upsampling

```
[validation]

[[27 32]
 [42 99]]
              precision    recall  f1-score   support

         bad       0.39      0.46      0.42        59
        good       0.76      0.70      0.73       141

    accuracy                           0.63       200
   macro avg       0.57      0.58      0.57       200
weighted avg       0.65      0.63      0.64       200

[test]

[[ 32  27]
 [ 29 112]]
              precision    recall  f1-score   support

         bad       0.52      0.54      0.53        59
        good       0.81      0.79      0.80       141

    accuracy                           0.72       200
   macro avg       0.67      0.67      0.67       200
weighted avg       0.72      0.72      0.72       200


model score: 0.612
```

## class_weight -> model score: 0.778