# Multiclass SVM 구현

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the iris dataset through seaborn
iris =  sns.load_dataset('iris') 
X= iris.iloc[:,:4] # features used for training: the first four columns
y = iris.iloc[:,-1] # target: the last column (species)
print(y)

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object


In [14]:
# Distinct class labels present in the target
y.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [12]:
# Inspect the feature frame
X

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [2]:
# Hold out 20% of the samples as the test set; random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

In [3]:
def standardization(train, test):
    """Standardize two feature sets using statistics learned from `train` only.

    Args:
        train: Training features; the scaler is fitted on these.
        test: Test features; transformed with the scaler fitted on `train`,
            so no information from the test split enters the fit.

    Returns:
        Tuple ``(train_scaled, test_scaled)`` as returned by the scaler's
        ``transform``.
    """
    fitted_scaler = StandardScaler().fit(train)
    return fitted_scaler.transform(train), fitted_scaler.transform(test)

# Replace the raw splits with their standardized versions (the scaler is fitted on the training split only)
X_train, X_test = standardization(X_train, X_test)

In [4]:
# Standardized training features
X_train

array([[ 0.78522493,  0.32015325,  0.77221097,  1.04726529],
       [-0.26563371, -1.29989934,  0.0982814 , -0.11996537],
       [ 0.43493872,  0.78302542,  0.94069336,  1.43634218],
       [-0.84944407,  0.78302542, -1.24957775, -1.28719604],
       [-0.38239578, -1.7627715 ,  0.15444219,  0.13941922],
       [ 0.55170079, -0.374155  ,  1.05301496,  0.7878807 ],
       [ 0.31817664, -0.14271892,  0.65988937,  0.7878807 ],
       [ 0.20141457, -0.374155  ,  0.43524618,  0.39880381],
       [-1.66677857, -0.14271892, -1.36189934, -1.28719604],
       [-0.14887164, -0.60559109,  0.21060299,  0.13941922],
       [-0.14887164, -1.06846325, -0.12636179, -0.24965767],
       [ 0.31817664, -0.60559109,  0.15444219,  0.13941922],
       [ 0.66846286, -0.83702717,  0.88453256,  0.91757299],
       [ 0.0846525 , -0.14271892,  0.77221097,  0.7878807 ],
       [-0.49915786, -0.14271892,  0.43524618,  0.39880381],
       [-0.26563371, -0.60559109,  0.65988937,  1.04726529],
       [ 2.18636979,  1.

In [5]:
# Standardized test features
X_test

array([[-0.14887164, -0.374155  ,  0.26676379,  0.13941922],
       [ 0.31817664, -0.60559109,  0.54756778,  0.00972692],
       [ 0.31817664, -1.06846325,  1.05301496,  0.26911151],
       [-1.5500165 , -1.7627715 , -1.36189934, -1.15750374],
       [ 0.0846525 ,  0.32015325,  0.60372857,  0.7878807 ],
       [ 0.78522493, -0.14271892,  0.99685416,  0.7878807 ],
       [-0.84944407,  1.70876975, -1.24957775, -1.15750374],
       [ 0.20141457, -0.14271892,  0.60372857,  0.7878807 ],
       [-0.38239578,  2.63451409, -1.30573855, -1.28719604],
       [-0.38239578, -1.29989934,  0.15444219,  0.13941922],
       [ 0.66846286,  0.08871717,  0.99685416,  0.7878807 ],
       [-0.38239578,  1.0144615 , -1.36189934, -1.28719604],
       [-0.49915786,  0.78302542, -1.13725615, -1.28719604],
       [ 0.43493872, -0.60559109,  0.60372857,  0.7878807 ],
       [ 0.55170079, -1.7627715 ,  0.37908538,  0.13941922],
       [ 0.55170079,  0.55158933,  0.54756778,  0.52849611],
       [-1.19973028,  0.

[참고) sklearn.multiclass.OneVsRestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html)

## One-hot encoding

In [10]:
# Wrap the standardized arrays in DataFrames. No column names are assigned, so
# both frames keep pandas' default integer column labels, which keeps the
# train and test frames consistent with each other when passed to the SVMs.
X_train_s = pd.DataFrame(X_train)
X_test_s = pd.DataFrame(X_test)

In [11]:
# First rows of the test-feature DataFrame
X_test_s.head()

Unnamed: 0,0,1,2,3
0,-0.148872,-0.374155,0.266764,0.139419
1,0.318177,-0.605591,0.547568,0.009727
2,0.318177,-1.068463,1.053015,0.269112
3,-1.550016,-1.762772,-1.361899,-1.157504
4,0.084653,0.320153,0.603729,0.787881


In [16]:
# Training labels (species names) before one-hot encoding
y_train

110     virginica
69     versicolor
148     virginica
39         setosa
53     versicolor
          ...    
64     versicolor
91     versicolor
81     versicolor
51     versicolor
0          setosa
Name: species, Length: 120, dtype: object

In [30]:
# One-hot encode the labels: one 0/1 column per class, so each column can serve
# as the target of a separate binary ("this class vs. the rest") SVM.
# dtype is pinned because pandas >= 2.0 returns boolean columns by default.
y_train_s = pd.get_dummies(y_train, dtype='uint8')
# Align the test frame to the training columns so both frames list the same
# classes in the same order even if a class is absent from the test split.
y_test_s = pd.get_dummies(y_test, dtype='uint8').reindex(columns=y_train_s.columns, fill_value=0)

y_train_s

Unnamed: 0,setosa,versicolor,virginica
110,0,0,1
69,0,1,0
148,0,0,1
39,1,0,0
53,0,1,0
...,...,...,...
64,0,1,0
91,0,1,0
81,0,1,0
51,0,1,0


In [38]:
# Binary target for the "setosa vs. rest" model
y_train_s.loc[:, 'setosa']

110    0
69     0
148    0
39     1
53     0
      ..
64     0
91     0
81     0
51     0
0      1
Name: setosa, Length: 120, dtype: uint8

## Training each Binary SVM

In [54]:
from sklearn import metrics # model-evaluation helpers

svc1 = SVC(kernel='rbf', C = 100)
svc1.fit(X_train_s,y_train_s.loc[:, 'setosa']) # train the binary "setosa vs. rest" model

y_pred1 = svc1.predict(X_test_s) # predict on the test set with the trained model

# print('Accuracy Score:') 
# print(metrics.accuracy_score(y_test_s.loc[:, 'setosa'], y_pred1)) # binary accuracy

y_pred1

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0], dtype=uint8)

In [55]:
svc2 = SVC(kernel='rbf', C = 100)
svc2.fit(X_train_s,y_train_s.loc[:, 'versicolor']) # train the binary "versicolor vs. rest" model

y_pred2 = svc2.predict(X_test_s) # predict on the test set with the trained model

# print('Accuracy Score:') 
# print(metrics.accuracy_score(y_test_s.loc[:, 'versicolor'], y_pred2)) # binary accuracy

y_pred2

array([1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1], dtype=uint8)

In [56]:
svc3 = SVC(kernel='rbf', C = 100)
svc3.fit(X_train_s,y_train_s.loc[:, 'virginica']) # train the binary "virginica vs. rest" model

y_pred3 = svc3.predict(X_test_s) # predict on the test set with the trained model

# print('Accuracy Score:') 
# print(metrics.accuracy_score(y_test_s.loc[:, 'virginica'], y_pred3)) # binary accuracy

y_pred3

array([0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0], dtype=uint8)

## Multiclass SVM

In [125]:
# Combine the three one-vs-rest outputs into one class index per test sample.
# Rows are test samples; columns are classes (0: setosa, 1: versicolor, 2: virginica).
votes = np.column_stack([y_pred1, y_pred2, y_pred3]).astype(int)

# Share of test samples that each binary SVM labelled positive; used as the tie-breaker.
class_prob = list(votes.mean(axis=0))

y_preds = np.zeros(len(y_test), dtype=int)
for i in range(len(y_test)):
    y_pred = votes[i]
    n_positive = y_pred.sum()
    if n_positive == 0 or n_positive == len(y_pred):
        # No model (or every model) claimed the sample: fall back to the most frequent class.
        y_preds[i] = np.argmax(class_prob)
    else:
        # Choose the most frequent class among those that voted 1. Masking the
        # other classes (rather than removing them from the list) keeps every
        # position equal to its class index, so argmax returns a real class id.
        masked = np.where(y_pred == 1, class_prob, -np.inf)
        y_preds[i] = np.argmax(masked)

y_preds

array([1, 1, 2, 1, 2, 2, 0, 2, 0, 1, 2, 0, 0, 2, 1, 1, 0, 1, 1, 2, 0, 2,
       2, 1, 2, 0, 0, 2, 2, 1])

In [127]:
# Map class indices back to species names.
# Note: this rebinds y_preds from an integer array to a one-column DataFrame.
y_preds = pd.DataFrame(y_preds).replace({0:'setosa', 1:'versicolor', 2:'virginica'})

y_preds

Unnamed: 0,0
0,versicolor
1,versicolor
2,virginica
3,versicolor
4,virginica
5,virginica
6,setosa
7,virginica
8,setosa
9,versicolor


In [129]:
# Multiclass accuracy of the combined predictions against the true test labels
print('Accuracy Score:') 
print(metrics.accuracy_score(y_test, y_preds)) # accuracy score

Accuracy Score:
0.9


# Multiclass SVM 일반화

In [246]:
class MyOneVsRestClassifier:
    """One-vs-rest multiclass classifier built from one RBF-kernel SVC per class.

    Each binary model votes 0 or 1 for its own class. The final class of a
    sample is the voted class with the highest positive-prediction rate; when
    no class or every class is voted, all classes compete on that rate.
    """

    def __init__(self, n_classes):
        """Store the number of classes (columns of the one-hot target)."""
        self.n_classes = n_classes
        # Binary predictions from the most recent predict() call, one list per class.
        self.y_pred_list = []

    def predict(self, X_train, X_test, y_train, y_test, gamma = 5, C = 10):
        """Fit one binary SVC per class and return a class index per test row.

        Args:
            X_train: Training features.
            X_test: Test features to classify.
            y_train: One-hot DataFrame; column ``k`` is the 0/1 target of class ``k``.
            y_test: Not used by the computation; kept so existing calls still work.
            gamma: Passed to every ``SVC``.
            C: Passed to every ``SVC``.

        Returns:
            Integer NumPy array with one class index per row of ``X_test``.
        """
        # Start from a clean list so repeated calls do not mix in old predictions.
        self.y_pred_list = []
        for k in range(self.n_classes):
            svc = SVC(kernel='rbf', gamma = gamma, C = C)
            svc.fit(X_train, y_train.iloc[:, k])
            self.y_pred_list.append(list(svc.predict(X_test)))

        # votes[i, k] == 1 when the class-k model claimed sample i.
        votes = np.array(self.y_pred_list, dtype=int).T
        if votes.shape[0] == 0:
            return np.zeros(0, dtype=int)

        # Share of test samples each binary model labelled positive (tie-breaker).
        class_prob = votes.mean(axis=0)
        pred_sum = votes.sum(axis=1)

        # A sample with no votes or with every vote lets all classes compete.
        undecided = (pred_sum == 0) | (pred_sum == self.n_classes)
        candidates = (votes == 1) | undecided[:, np.newaxis]

        # Mask non-candidates instead of removing them so that column positions
        # stay equal to class indices.
        scores = np.where(candidates, class_prob, -np.inf)
        return scores.argmax(axis=1)

In [288]:
class MyOneVsRestClassifier:
    """One-vs-rest multiclass classifier built from one RBF-kernel SVC per class.

    Each binary model votes 0 or 1 for its own class. Votes are resolved per
    sample: a single vote wins outright; otherwise the positive-prediction
    rate of each class on the test set breaks the tie.
    """

    def __init__(self, n_classes):
        """Store the number of classes (columns of the one-hot target)."""
        self.n_classes = n_classes
        # Binary predictions from the most recent predict() call, one array per class.
        self.y_pred_list = []

    def predict(self, X_train, X_test, y_train, y_test, gamma = 5, C = 10):
        """Fit one binary SVC per class and return a class index per test row.

        Args:
            X_train: Training features.
            X_test: Test features to classify.
            y_train: One-hot DataFrame; column ``k`` is the 0/1 target of class ``k``.
            y_test: Not used by the computation; kept so existing calls still work.
            gamma: Passed to every ``SVC``.
            C: Passed to every ``SVC``.

        Returns:
            Integer NumPy array with one class index per row of ``X_test``.
        """
        # Start from a clean list so repeated calls do not mix in old predictions.
        self.y_pred_list = []
        for class_idx in range(self.n_classes):
            svc = SVC(kernel='rbf', gamma = gamma, C = C)
            svc.fit(X_train, y_train.iloc[:, class_idx])
            self.y_pred_list.append(svc.predict(X_test))

        n_samples = len(X_test)
        y_preds = np.zeros(n_samples, dtype=int)
        if n_samples == 0:
            return y_preds

        # Share of test samples each binary model labelled positive (tie-breaker).
        class_prob = [
            sum(int(vote) for vote in preds) / n_samples
            for preds in self.y_pred_list
        ]

        for i in range(n_samples):
            y_pred = [int(self.y_pred_list[j][i]) for j in range(self.n_classes)]
            y_preds[i] = self._resolve_votes(y_pred, class_prob)

        return y_preds

    @staticmethod
    def _resolve_votes(y_pred, class_prob):
        """Return the class index chosen for one sample from its 0/1 votes."""
        voted = [j for j, vote in enumerate(y_pred) if vote == 1]
        if not voted or len(voted) == len(y_pred):
            # Nobody or everybody claimed the sample: take the most frequent class.
            return int(np.argmax(class_prob))
        # Compare only the voted classes, keeping their original class indices.
        # max() returns the first maximum, so ties go to the lowest index.
        return max(voted, key=lambda j: class_prob[j])

In [289]:
# 클래스 수
n_classes = 3
OVRClassifier = MyOneVsRestClassifier(n_classes)

# Trains one binary SVC per one-hot column of y_train_s and combines their outputs
y_pred = OVRClassifier.predict(X_train_s, X_test_s, y_train_s, y_test_s)
y_pred

array([1, 1, 1, 2, 2, 2, 0, 2, 0, 1, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 0, 2,
       1, 1, 2, 0, 0, 2, 2, 1])

In [292]:
# True test labels, for comparison with the predictions
y_test

96     versicolor
73     versicolor
134     virginica
41         setosa
70     versicolor
116     virginica
19         setosa
138     virginica
33         setosa
89     versicolor
137     virginica
36         setosa
20         setosa
126     virginica
87     versicolor
56     versicolor
11         setosa
62     versicolor
72     versicolor
120     virginica
8          setosa
147     virginica
77     versicolor
86     versicolor
129     virginica
4          setosa
31         setosa
136     virginica
132     virginica
88     versicolor
Name: species, dtype: object

In [290]:
# Map class indices back to species names; y_pred becomes a one-column DataFrame
y_pred = pd.DataFrame(y_pred).replace({0:'setosa', 1:'versicolor', 2:'virginica'})

In [291]:
# Multiclass accuracy of the class-based classifier against the true test labels
print('Accuracy Score:') 
print(metrics.accuracy_score(y_test, y_pred)) # accuracy score

Accuracy Score:
0.8333333333333334
