# **실제 분류 데이터를 활용한 SVM 모델 학습**

In [4]:
#! pip install tqdm



In [1]:
%matplotlib inline

from sklearn.datasets import make_blobs
import mglearn
from mpl_toolkits.mplot3d import Axes3D, axes3d
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.preprocessing import StandardScaler,normalize
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from collections import Counter
import matplotlib.cm as cm
from tqdm import tqdm

import warnings
warnings.simplefilter("ignore", UserWarning)

**wdbc - classification 예제**
- Diagnosis 
    - M = malignant(악성)
    - B = benign(양성)

- 실제 분류 예제 데이터(wdbc.csv) 를 사용하여 인공신경망 모델과 SVM모델을 비교
- 568 row x 31 col으로 구성된 분류데이터 (입력 변수 30개 + Diagnosis 라벨 1개)
- Class의 비율은 약 4:6의 비율을 가짐
- 데이터를 정규화한 뒤, 학습과 모델 평가를 위해 7:3의 비율로 학습데이터와 평가데이터로 나누어 보겠습니다.

In [38]:
# Train an SVM on a real-world classification dataset (WDBC).
# The CSV is read and converted to a NumPy array in one step so that `wdbc`
# is never a DataFrame in one cell and an ndarray in the next.
# NOTE(review): read_csv treats the first line of the file as a header row by
# default; if wdbc.csv has no header line, the first sample is silently
# consumed and header=None is required -- confirm against the file.
wdbc = np.array(pd.read_csv('../data/svr/wdbc.csv', sep=','))
wdbc

array([[13.08, 15.71, 85.63, ..., 0.3184, 0.08183, 'M'],
       [9.504, 12.44, 60.34, ..., 0.245, 0.07773, 'M'],
       [13.03, 18.42, 82.61, ..., 0.1987, 0.06169, 'M'],
       ...,
       [20.13, 28.25, 131.2, ..., 0.2572, 0.06637, 'B'],
       [16.6, 28.08, 108.3, ..., 0.2218, 0.0782, 'B'],
       [20.6, 29.33, 140.1, ..., 0.4087, 0.124, 'B']], dtype=object)

- Target 전처리

In [8]:
# map() is lazy in Python 3: this cell displays only the iterator object,
# not the encoded labels themselves.
map(lambda label: int(label == 'M'), wdbc[:, 30])

<map at 0xffff54cdc820>

In [9]:
# Materialise the encoding ('M' -> 1, anything else -> 0) and peek at the first ten values.
[int(label == 'M') for label in wdbc[:, 30]][:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [10]:
# Encode the diagnosis column (index 30) as integers: 1 for 'M', 0 otherwise.
wdbc_y = np.array([int(label == 'M') for label in wdbc[:, 30]])
wdbc_y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [11]:
# Fraction of samples encoded as 1; the labels are 0/1, so the mean equals sum / count.
class_ratio = wdbc_y.mean()
print(f'Label Balance : {class_ratio:.4f}')

Label Balance : 0.6268


In [12]:
# Build the feature matrix by dropping column 30, which holds the diagnosis label.
print('Before:', wdbc, '\n')
wdbc_x = np.delete(arr=wdbc, obj=30, axis=1)
print('After:', wdbc_x)

Before: [[13.08 15.71 85.63 ... 0.3184 0.08183 'M']
 [9.504 12.44 60.34 ... 0.245 0.07773 'M']
 [13.03 18.42 82.61 ... 0.1987 0.06169 'M']
 ...
 [20.13 28.25 131.2 ... 0.2572 0.06637 'B']
 [16.6 28.08 108.3 ... 0.2218 0.0782 'B']
 [20.6 29.33 140.1 ... 0.4087 0.124 'B']] 

After: [[13.08 15.71 85.63 ... 0.07283 0.3184 0.08183]
 [9.504 12.44 60.34 ... 0.06227 0.245 0.07773]
 [13.03 18.42 82.61 ... 0.05013 0.1987 0.06169]
 ...
 [20.13 28.25 131.2 ... 0.1628 0.2572 0.06637]
 [16.6 28.08 108.3 ... 0.1418 0.2218 0.0782]
 [20.6 29.33 140.1 ... 0.265 0.4087 0.124]]


- 데이터 정규화

In [13]:
scaler = StandardScaler()
scaler

StandardScaler()

In [14]:
# fit() learns the scaling statistics from wdbc_x and returns the fitted scaler,
# which is what gets displayed below.
transformer = scaler.fit(wdbc_x)
transformer

StandardScaler()

In [15]:
# Standardise every feature column. fit_transform() fits the scaler again before
# transforming, so this step does not depend on the separate fit() call above.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further down, so test-set statistics leak into preprocessing -- consider
# splitting first and fitting the scaler on the training portion only.
wdbc_x = scaler.fit_transform(wdbc_x)
wdbc_x

array([[-0.29748492, -0.83526195, -0.26121114, ..., -0.63519647,
         0.45804738, -0.11829484],
       [-1.31225167, -1.59642621, -1.30202713, ..., -0.79585459,
        -0.72837556, -0.34537913],
       [-0.31167349, -0.20444999, -0.38549996, ..., -0.98055057,
        -1.47675952, -1.23377717],
       ...,
       [ 1.7031039 ,  2.08369821,  1.61423309, ...,  0.73359244,
        -0.5311772 , -0.97456876],
       [ 0.70139063,  2.04412698,  0.67177812, ...,  0.41410186,
        -1.10337573, -0.31934752],
       [ 1.83647648,  2.3350919 ,  1.98051471, ...,  2.28844657,
         1.91763856,  2.21735012]])

---

- Class count

In [16]:
# Tally how many samples carry each encoded label (1 = 'M', 0 = otherwise).
count =Counter(wdbc_y)
count

Counter({1: 356, 0: 212})

In [17]:
# Class ratio: share of label 1 among the samples labelled 0 or 1.
count[1]/(count[1]+count[0])

0.6267605633802817

In [18]:
# Plain random 70/30 split (no stratification); the share of class 1 is printed
# for each part so it can be compared with the overall class ratio.
wdbc_train_x, wdbc_test_x, wdbc_train_y, wdbc_test_y = train_test_split(
    wdbc_x, wdbc_y, test_size=0.3, random_state=0
)
for split_name, split_labels in (('Train', wdbc_train_y), ('Test', wdbc_test_y)):
    tally = Counter(split_labels)
    print(f'{split_name}:', tally[1] / (tally[1] + tally[0]))

Train: 0.6423173803526449
Test: 0.5906432748538012


- stratify 반영하면 class 비율을 유지시키면서 sampling 할 수 있음

In [19]:
# Same 70/30 split, but stratified on the labels; the share of class 1 is
# printed for each part. This split replaces the unstratified one above.
wdbc_train_x, wdbc_test_x, wdbc_train_y, wdbc_test_y = train_test_split(
    wdbc_x, wdbc_y, stratify=wdbc_y, test_size=0.3, random_state=0
)
for split_name, split_labels in (('Train', wdbc_train_y), ('Test', wdbc_test_y)):
    tally = Counter(split_labels)
    print(f'{split_name}:', tally[1] / (tally[1] + tally[0]))

Train: 0.6272040302267002
Test: 0.6257309941520468


| - | - | pred | pred |
|---|---|:-------------:|-----:|
| - | -  |             0 |     1|
| actual | 0 | 59 | 5 |
| actual | 1 | 0 | 107 |

In [20]:
# Classifier performance metrics
def evaluation(y_true, y_pred):
    """Compute binary-classification metrics from 0/1 labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Predicted labels with the same encoding and length as ``y_true``.

    Returns
    -------
    list
        ``[cm, ACC, Precision, TPR, F1, BCR]`` where ``cm`` is the 2x2
        confusion matrix (rows = actual, columns = predicted), ``TPR`` is the
        recall, and ``BCR`` is the geometric mean of TPR and TNR.
        A ratio whose denominator is zero is reported as 0.0.
    """
    # Pin the label order so the matrix is always 2x2, even when y_true and
    # y_pred happen to contain a single class; without this the matrix
    # collapses to 1x1 and the positional indexing below fails.
    cm = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    def _ratio(numerator, denominator):
        # Guard against 0/0 (e.g. no positive predictions) instead of emitting NaN.
        return numerator / denominator if denominator else 0.0

    # True positive rate (recall)
    TPR = _ratio(tp, tp + fn)
    # True negative rate
    TNR = _ratio(tn, tn + fp)
    # Simple accuracy
    ACC = _ratio(tp + tn, cm.sum())
    # Balanced correction rate
    BCR = np.sqrt(TPR * TNR)
    # Precision and F1-measure
    Precision = _ratio(tp, tp + fp)
    F1 = _ratio(2 * TPR * Precision, TPR + Precision)
    return [cm, ACC, Precision, TPR, F1, BCR]

# Names of the scalar metrics, in the order evaluation() returns them after cm.
acc_metric_names = ['ACC', 'PRE', 'REC', 'F1', 'BCR']

-  Cross Validation을 통한 파라미터 튜닝 SVM 학습 모델 정의
    - SVM Classification의 경우, 주요 하이퍼파라미터로 'kernel'과 'C'(cost) 그리고 'gamma'(Kernel coefficient) 등이 존재합니다.
    - 'C'는 penalty cost를 의미하며, 'gamma'의 경우 Support Vector의 영향 area를 결정하게됩니다.
    - 이전 Neural Network 학습과 동일하게 N-fold Cross-Validation을 통해서 'C'와 'Gamma'를 결정하여 최적화된 SVM모델을 정의하도록 하겠습니다.


In [21]:
### Tune the 'C' and 'gamma' hyperparameters with cross-validation
def CV_SVM_classifier(train_x, train_y, CV_n, parameters=None):
    """Grid-search an RBF-kernel SVC and return the best model fitted on all training data.

    Parameters
    ----------
    train_x : array-like
        Training features.
    train_y : array-like
        Training labels (binary; candidates are ranked by F1 score).
    CV_n : int or cross-validation splitter
        Passed to ``GridSearchCV`` as ``cv``.
    parameters : dict, optional
        Parameter grid. It must contain the keys 'C' and 'gamma'.
        Defaults to the grid used throughout this notebook.

    Returns
    -------
    SVC
        The best estimator, refitted on the whole training set.
    """
    if parameters is None:
        parameters = {'C': [0.01, 0.05, 0.1, 1, 5, 10, 100],
                      'gamma': [0.01, 0.02, 0.06, 0.08, 0.1, 1, 2, 10, 100]}
    print("매개변수 그리드:\n{}".format(parameters))

    # refit=True makes the search retrain the winning configuration on the full
    # training set, so no second, hand-built SVC has to be fitted afterwards.
    grid_clf = GridSearchCV(SVC(kernel='rbf'), parameters, scoring='f1', cv=CV_n, refit=True)
    grid_clf.fit(train_x, train_y)

    print("The optimal 'C' : {}\nThe optimal 'Gamma' : {}".format(grid_clf.best_params_['C'], grid_clf.best_params_['gamma']))

    # Final SVM classifier: same kernel, C and gamma as the best grid point.
    return grid_clf.best_estimator_

### SVM 파라미터 Gridsearch를 통한 SVM Classifier 모델 학습

5-fold cross-validation을 통해서 주어진 'wdbc' 데이터에 맞는 SVM Classifier 모델을 학습해보도록 하겠습니다.

parameter grid는 다음과 같습니다. 

Q1. Classifier

In [22]:
# Tune C/gamma with 5-fold cross-validation on the training split and keep the fitted SVM.
WD_SVM = CV_SVM_classifier(wdbc_train_x, wdbc_train_y, CV_n=5)

매개변수 그리드:
{'C': [0.01, 0.05, 0.1, 1, 5, 10, 100], 'gamma': [0.01, 0.02, 0.06, 0.08, 0.1, 1, 2, 10, 100]}
The optimal 'C' : 5
The optimal 'Gamma' : 0.01


 Q2. prediction

In [23]:
# Predict the labels of the held-out test split with the tuned SVM.
WD_SVM_pred = WD_SVM.predict(X=wdbc_test_x)
WD_SVM_pred

array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1])

In [24]:
### Test-set performance of the cross-validated SVM
# evaluation() returns the confusion matrix first, then the five scalar scores.
WD_SVM_cfm, *svm_scores = evaluation(y_true=wdbc_test_y, y_pred=WD_SVM_pred)
WD_SVM_acc, WD_SVM_pre, WD_SVM_rec, WD_SVM_f1, WD_SVM_bcr = svm_scores
print('CM: \n', WD_SVM_cfm)

print('==== SVM - Classifier Performance ====')
# Label each score with its metric name (a Series is its own transpose, so no .T is needed).
SVM_result_tb = pd.Series(svm_scores, index=acc_metric_names)
SVM_result_tb

CM: 
 [[ 59   5]
 [  0 107]]
==== SVM - Classifier Performance ====


ACC    0.970760
PRE    0.955357
REC    1.000000
F1     0.977169
BCR    0.960143
dtype: float64



### SVM 분류 모형과 Neural Network 비교를 위해 Neural Network model 정의

- 앞서 학습한 SVM은 정확도 약 97%의 비교적 안정적인 성능을 보여줍니다.

In [25]:
### Neural-network baseline: previously tuned settings (one hidden layer of 60 units,
### at most 500 training iterations).
# MLP training is stochastic, so random_state is pinned (0, like the train/test
# split) to make this cell produce the same model on every Restart & Run All.
WD_NN = MLPClassifier(hidden_layer_sizes=(60,), max_iter=500, random_state=0)
WD_NN.fit(wdbc_train_x, wdbc_train_y)
WD_NN_pred = WD_NN.predict(wdbc_test_x)

In [26]:
### Test-set performance of the neural network
# evaluation() returns the confusion matrix first, then the five scalar scores.
WD_NN_cfm, *nn_scores = evaluation(y_true=wdbc_test_y, y_pred=WD_NN_pred)
WD_NN_acc, WD_NN_pre, WD_NN_rec, WD_NN_f1, WD_NN_bcr = nn_scores
print('CM: \n', WD_NN_cfm)

print('==== Neural Network - Classifier Performance ====')
# Label each score with its metric name (a Series is its own transpose, so no .T is needed).
NN_result_tb = pd.Series(nn_scores, index=acc_metric_names)
NN_result_tb

CM: 
 [[ 60   4]
 [  2 105]]
==== Neural Network - Classifier Performance ====


ACC    0.964912
PRE    0.963303
REC    0.981308
F1     0.972222
BCR    0.959154
dtype: float64

---



### 반복 수행을 통한 Neural Network와 SVM의 학습 결과 비교

학습 파라미터를 고정시킨 상태로 인공신경망과 SVM을 5회 반복수행하여, 그 결과를 확인해보도록 하겠습니다.


In [27]:
# Best SVM hyperparameters found by the grid search
opt_gamma = WD_SVM.gamma
opt_C = WD_SVM.C
# One empty score list per metric for each model; the repeated runs below fill them.
saver_dict = {model_name: {metric: [] for metric in acc_metric_names}
              for model_name in ('SVM', 'NN')}

In [28]:
# Start from empty score lists so that re-running this cell does not keep
# appending to the results of earlier executions.
saver_dict['SVM'] = {metric: [] for metric in acc_metric_names}

for i in tqdm(range(5), desc='SVM'):

    # SVM classifier refitted with the fixed, tuned hyperparameters
    WD_SVM = SVC(C=opt_C, gamma=opt_gamma).fit(wdbc_train_x, wdbc_train_y)

    # prediction
    WD_SVM_pred = WD_SVM.predict(wdbc_test_x)

    # eval
    WD_SVM_cfm, WD_SVM_acc, WD_SVM_pre, WD_SVM_rec, WD_SVM_f1, WD_SVM_bcr = evaluation(y_true=wdbc_test_y,
                                                                                       y_pred=WD_SVM_pred)

    # acc_metric_names lists the metrics in the same order as the scores below.
    run_scores = (WD_SVM_acc, WD_SVM_pre, WD_SVM_rec, WD_SVM_f1, WD_SVM_bcr)
    for metric, score in zip(acc_metric_names, run_scores):
        saver_dict['SVM'][metric].append(score)
    


SVM: 100%|█████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 247.67it/s]


In [29]:
saver_dict['SVM']

{'ACC': [0.9707602339181286,
  0.9707602339181286,
  0.9707602339181286,
  0.9707602339181286,
  0.9707602339181286],
 'PRE': [0.9553571428571429,
  0.9553571428571429,
  0.9553571428571429,
  0.9553571428571429,
  0.9553571428571429],
 'REC': [1.0, 1.0, 1.0, 1.0, 1.0],
 'F1': [0.9771689497716896,
  0.9771689497716896,
  0.9771689497716896,
  0.9771689497716896,
  0.9771689497716896],
 'BCR': [0.960143218483576,
  0.960143218483576,
  0.960143218483576,
  0.960143218483576,
  0.960143218483576]}

In [30]:
# Start from empty score lists so that re-running this cell does not keep
# appending to the results of earlier executions.
saver_dict['NN'] = {metric: [] for metric in acc_metric_names}

for i in tqdm(range(5), desc='MLP'):

    # model: hyperparameters stay fixed; only the random seed changes per run.
    # Using the run index as the seed keeps the run-to-run variation this
    # experiment demonstrates while making the whole cell reproducible.
    WD_NN = MLPClassifier(hidden_layer_sizes=(60,), max_iter=500, random_state=i)
    WD_NN.fit(wdbc_train_x, wdbc_train_y)

    # prediction
    WD_NN_pred = WD_NN.predict(wdbc_test_x)

    # eval
    WD_NN_cfm, WD_NN_acc, WD_NN_pre, WD_NN_rec, WD_NN_f1, WD_NN_bcr = evaluation(y_true=wdbc_test_y,
                                                                                 y_pred=WD_NN_pred)

    # acc_metric_names lists the metrics in the same order as the scores below.
    run_scores = (WD_NN_acc, WD_NN_pre, WD_NN_rec, WD_NN_f1, WD_NN_bcr)
    for metric, score in zip(acc_metric_names, run_scores):
        saver_dict['NN'][metric].append(score)
    

MLP: 100%|██████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.01it/s]


In [31]:
saver_dict['NN']

{'ACC': [0.9532163742690059,
  0.9590643274853801,
  0.9649122807017544,
  0.9707602339181286,
  0.9532163742690059],
 'PRE': [0.9626168224299065,
  0.9629629629629629,
  0.9719626168224299,
  0.9811320754716981,
  0.9626168224299065],
 'REC': [0.9626168224299065,
  0.9719626168224299,
  0.9719626168224299,
  0.9719626168224299,
  0.9626168224299065],
 'F1': [0.9626168224299065,
  0.9674418604651163,
  0.9719626168224299,
  0.9765258215962441,
  0.9626168224299065],
 'BCR': [0.949975405485867,
  0.9545757975514716,
  0.962497724235688,
  0.9703549788849073,
  0.949975405485867]}

### 반복 수행을 통한 Neural Network와 SVM의 학습 결과 비교

5번의 반복 수행 결과, Parameter가 고정되어 있음에도 불구하고 매번 다른 학습결과가 나오는 인공신경망과 달리
SVM의 경우 5번 모두 동일한 결과(동일한 최적해)를 산출함을 확인할 수 있습니다.


In [32]:
# SVM case: print the per-run accuracies between two divider lines, then
# collect every metric of every run into a table.
svm_divider = '--------------------------------------------------------------'
svm_acc_message = "SVM을 통한 5회 반복 분류 model 정확도:\n{}".format(saver_dict['SVM']['ACC'])
print(svm_divider, svm_acc_message, svm_divider, '', sep='\n')
SVM_result = pd.DataFrame(saver_dict['SVM'])
SVM_result

--------------------------------------------------------------
SVM을 통한 5회 반복 분류 model 정확도:
[0.9707602339181286, 0.9707602339181286, 0.9707602339181286, 0.9707602339181286, 0.9707602339181286]
--------------------------------------------------------------



Unnamed: 0,ACC,PRE,REC,F1,BCR
0,0.97076,0.955357,1.0,0.977169,0.960143
1,0.97076,0.955357,1.0,0.977169,0.960143
2,0.97076,0.955357,1.0,0.977169,0.960143
3,0.97076,0.955357,1.0,0.977169,0.960143
4,0.97076,0.955357,1.0,0.977169,0.960143


In [33]:
# Neural Network case: print the per-run accuracies, then collect every metric
# of every run into a table.
nn_acc_runs = saver_dict['NN']['ACC']
print('--------------------------------------------------------------')
# The run count is taken from the collected results instead of being hard-coded;
# the previous message claimed 10 runs although the loop performs 5.
print("Neural Network 를 통한 {}회 반복 분류 model 정확도:\n{}".format(len(nn_acc_runs), nn_acc_runs))
print('--------------------------------------------------------------')
print('')
NN_result=pd.DataFrame(saver_dict['NN'])
NN_result

--------------------------------------------------------------
Neural Network 를 통한 10개 반복 분류 model 정확도:
[0.9532163742690059, 0.9590643274853801, 0.9649122807017544, 0.9707602339181286, 0.9532163742690059]
--------------------------------------------------------------



Unnamed: 0,ACC,PRE,REC,F1,BCR
0,0.953216,0.962617,0.962617,0.962617,0.949975
1,0.959064,0.962963,0.971963,0.967442,0.954576
2,0.964912,0.971963,0.971963,0.971963,0.962498
3,0.97076,0.981132,0.971963,0.976526,0.970355
4,0.953216,0.962617,0.962617,0.962617,0.949975


- **Summary**

In [34]:
# average 
SVM_result.mean(axis=0)

ACC    0.970760
PRE    0.955357
REC    1.000000
F1     0.977169
BCR    0.960143
dtype: float64

In [35]:
# average 
NN_result.mean(axis=0)

ACC    0.960234
PRE    0.968258
REC    0.968224
F1     0.968233
BCR    0.957476
dtype: float64

In [36]:
# Side-by-side mean scores: one row per model, one column per metric.
# `keys` names the two columns before transposing, so no separate index assignment is needed.
df = pd.concat(
    [SVM_result.mean(axis=0), NN_result.mean(axis=0)],
    axis=1,
    keys=['SVM', 'NN'],
).T

In [37]:
df

Unnamed: 0,ACC,PRE,REC,F1,BCR
SVM,0.97076,0.955357,1.0,0.977169,0.960143
NN,0.960234,0.968258,0.968224,0.968233,0.957476
