# Multiclass SVM 구현

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

#IRIS 데이터 로드
iris =  sns.load_dataset('iris') 
X= iris.iloc[:,:4] #학습할데이터
y = iris.iloc[:,-1] #타겟
print(y)

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object


In [2]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

In [3]:
def standardization(train, test):
    scaler = StandardScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)
    return train, test

X_train, X_test = standardization(X_train, X_test)

In [4]:
X_train

array([[ 0.78522493,  0.32015325,  0.77221097,  1.04726529],
       [-0.26563371, -1.29989934,  0.0982814 , -0.11996537],
       [ 0.43493872,  0.78302542,  0.94069336,  1.43634218],
       [-0.84944407,  0.78302542, -1.24957775, -1.28719604],
       [-0.38239578, -1.7627715 ,  0.15444219,  0.13941922],
       [ 0.55170079, -0.374155  ,  1.05301496,  0.7878807 ],
       [ 0.31817664, -0.14271892,  0.65988937,  0.7878807 ],
       [ 0.20141457, -0.374155  ,  0.43524618,  0.39880381],
       [-1.66677857, -0.14271892, -1.36189934, -1.28719604],
       [-0.14887164, -0.60559109,  0.21060299,  0.13941922],
       [-0.14887164, -1.06846325, -0.12636179, -0.24965767],
       [ 0.31817664, -0.60559109,  0.15444219,  0.13941922],
       [ 0.66846286, -0.83702717,  0.88453256,  0.91757299],
       [ 0.0846525 , -0.14271892,  0.77221097,  0.7878807 ],
       [-0.49915786, -0.14271892,  0.43524618,  0.39880381],
       [-0.26563371, -0.60559109,  0.65988937,  1.04726529],
       [ 2.18636979,  1.

In [5]:
X_test

array([[-0.14887164, -0.374155  ,  0.26676379,  0.13941922],
       [ 0.31817664, -0.60559109,  0.54756778,  0.00972692],
       [ 0.31817664, -1.06846325,  1.05301496,  0.26911151],
       [-1.5500165 , -1.7627715 , -1.36189934, -1.15750374],
       [ 0.0846525 ,  0.32015325,  0.60372857,  0.7878807 ],
       [ 0.78522493, -0.14271892,  0.99685416,  0.7878807 ],
       [-0.84944407,  1.70876975, -1.24957775, -1.15750374],
       [ 0.20141457, -0.14271892,  0.60372857,  0.7878807 ],
       [-0.38239578,  2.63451409, -1.30573855, -1.28719604],
       [-0.38239578, -1.29989934,  0.15444219,  0.13941922],
       [ 0.66846286,  0.08871717,  0.99685416,  0.7878807 ],
       [-0.38239578,  1.0144615 , -1.36189934, -1.28719604],
       [-0.49915786,  0.78302542, -1.13725615, -1.28719604],
       [ 0.43493872, -0.60559109,  0.60372857,  0.7878807 ],
       [ 0.55170079, -1.7627715 ,  0.37908538,  0.13941922],
       [ 0.55170079,  0.55158933,  0.54756778,  0.52849611],
       [-1.19973028,  0.

In [6]:
from sklearn.preprocessing import LabelEncoder
from itertools import combinations
from sklearn.metrics import classification_report

## 1. One vs One

In [7]:
# 타겟값 인코딩
from sklearn.preprocessing import LabelEncoder

# 라벨 인코더 초기화 및 학습 데이터에 맞춰 학습
encoder = LabelEncoder()
encoder.fit(y_train)
y_train, y_test = encoder.transform(y_train), encoder.transform(y_test)

from itertools import combinations

# 클래스 조합 생성
class_combinations = combinations(np.unique(y_train), 2)
results = pd.DataFrame()

# SVM 모델 초기화
svc = SVC()

# 조합 별 모델 학습 및 예측값 저장
for comb in class_combinations:
    # 현재 클래스 조합에 해당하는 인덱스 선택
    indices = [True if x in comb else False for x in y_train]
    train_x, train_y = X_train[indices], y_train[indices]

    # SVM 모델 학습
    svc.fit(train_x, train_y)

    # 테스트 데이터에 대한 예측 결과 저장
    result = svc.predict(X_test)
    results[str(comb)] = result


from collections import Counter

# 클래스별 샘플 개수 내림차순 정렬
sorted_counts = [item[0] for item in sorted(Counter(y_train).items(), key=lambda x: x[1], reverse=True)]

# 샘플 별 클래스 투표 함수 정의
def voting(row: pd.Series):
    counts = row.value_counts()
    win_class = int(counts.idxmax())

    # 동점의 경우, 가장 개수가 많은 클래스 사용
    if sum(counts == counts.max()) > 1:
        same_scores = counts[counts == counts.max].index
        selected_class = sorted([(sorted_counts.index(x), int(x)) for x in same_scores])[0][1]
        win_class = selected_class
    
    return win_class

# 예측값 계산
y_pred = results.apply(voting, axis=1)

In [8]:
# One vs One Multiclassification 성능 평가

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

'''accuracy = 0.93 / f1-score = 0.93'''

              precision    recall  f1-score   support

           0       1.00      0.89      0.94         9
           1       0.91      0.91      0.91        11
           2       0.91      1.00      0.95        10

    accuracy                           0.93        30
   macro avg       0.94      0.93      0.93        30
weighted avg       0.94      0.93      0.93        30



'accuracy = 0.93 / f1-score = 0.93'

## 2. One vs Rest

In [9]:
results = pd.DataFrame()

svc = SVC(probability=True)  # 확률 예측을 위해 probability=True로 설정

for cls in np.unique(y_train):
    svc.fit(X_train, (y_train == cls))  # 해당 클래스에 해당하는 샘플은 1, 그 외에는 0으로 레이블을 지정하여 훈련

    # 테스트 데이터에 대한 예측 확률을 저장
    result = svc.predict_proba(X_test)[:, 1]  # 양성 클래스(해당 클래스)에 대한 예측 확률만 저장
    results[str(cls)] = result

# 최종 예측은 각 클래스에 대한 예측 확률 중 가장 높은 확률을 가진 클래스로 선택
y_pred = results.idxmax(axis=1).astype(int)

In [10]:
# One vs Rest Multiclassification 성능 평가

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

''' accuracy = 0.97 / f1-score = 0.97 '''

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      0.82      0.90        11
           2       0.83      1.00      0.91        10

    accuracy                           0.93        30
   macro avg       0.94      0.94      0.94        30
weighted avg       0.94      0.93      0.93        30



' accuracy = 0.97 / f1-score = 0.97 '