##  SVM(Support Vector Machine)의 특징

### 장점
- 데이터마이닝 기법 중에서도 예측력이 우수한 알고리즘
- 샘플이 작을 때에도 모형의 적합도가 우수함
- 계산량이 적어 대용량 데이터에 대한 분석이 가능함
- 생물정보학, 문자 인식, 필기 인식, 얼굴 및 물체 인식 등에 우수함

### 단점
- 단위비용과 커널 선택에 따라 모형이 민감함.
- 예측 과정의 이해가 어렵고 비전문가에게 모델 설명이 난해함
- 다범주 분류의 경우 기하급수적으로 학습속도와 분류속도가 느려짐

#### 선형 분류

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [2]:
import pandas as pd

In [3]:
### Load data
# Read the iris CSV from the Rdatasets mirror into a DataFrame.
iris_url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv"
iris = pd.read_csv(iris_url)
# Bare expression so the notebook renders the DataFrame as the cell output.
iris

Unnamed: 0.1,Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,1,5.1,3.5,1.4,0.2,setosa
1,2,4.9,3.0,1.4,0.2,setosa
2,3,4.7,3.2,1.3,0.2,setosa
3,4,4.6,3.1,1.5,0.2,setosa
4,5,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,virginica
146,147,6.3,2.5,5.0,1.9,virginica
147,148,6.5,3.0,5.2,2.0,virginica
148,149,6.2,3.4,5.4,2.3,virginica


In [4]:
### Explanatory variables (x) and response variable (y)
# Keep only the rows from position 50 onward; the crosstab below shows just
# versicolor and virginica, so this is a two-class problem.
rows = slice(50, None)
# Columns 3-4 are the two petal measurements; column 5 is Species.
x = iris.iloc[rows, 3:5]
y = iris.iloc[rows, 5]

In [5]:
### Train & Test data
# Hold out 40% of the rows for testing. No random_state is passed, so the
# partition (and every metric below) changes from run to run.
splits = train_test_split(x, y, test_size=0.4)
x_train, x_test, y_train, y_test = splits

In [6]:
### SVM
# Support vector classifier with a linear kernel and C = 1.
svc = SVC(C=1, kernel="linear")
# Train on the training split; the fitted estimator is kept as `model`.
model = svc.fit(X=x_train, y=y_train)

In [7]:
### Prediction
# Predict class labels for the held-out test rows.
y_pred = model.predict(X=x_test)

In [8]:
### Cross-tabulation
# Rows are the actual labels (y_test); columns are the predicted labels.
pd.crosstab(index=y_test, columns=y_pred)

col_0,versicolor,virginica
Species,Unnamed: 1_level_1,Unnamed: 2_level_1
versicolor,19,2
virginica,1,18


In [9]:
### Classification report
from sklearn.metrics import classification_report

# Build the precision / recall / f1-score text report, then print it.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

  versicolor       0.95      0.90      0.93        21
   virginica       0.90      0.95      0.92        19

    accuracy                           0.93        40
   macro avg       0.93      0.93      0.92        40
weighted avg       0.93      0.93      0.93        40



https://vincentarelbundock.github.io/Rdatasets/csv/datasets/mtcars.csv

In [10]:
### Load data
# Read the mtcars CSV from the Rdatasets mirror into a DataFrame.
mtcars_url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/mtcars.csv"
mtcar = pd.read_csv(mtcars_url)
# Bare expression so the notebook renders the DataFrame as the cell output.
mtcar

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [11]:
### 설명변수(x)와 반응변수(y)
x = mtcar.loc[:,['mpg','am']]
x.std= StandardScaler().fit_transform(x)
y = mtcar.iloc[:,8]

In [12]:
### Train & Test data
x_train, x_test, y_train, y_test = train_test_split(x.std,y,test_size = 0.4)

In [13]:
### SVM
# Support vector classifier with a linear kernel and C = 1.
svc = SVC(C=1, kernel="linear")
# Train on the training split; the fitted estimator is kept as `model`.
model = svc.fit(X=x_train, y=y_train)

In [14]:
### Prediction
# Predict class labels for the held-out test rows.
y_pred = model.predict(X=x_test)

In [15]:
### Accuracy
# Score the fitted model on the test split; the notebook shows the returned value.
model.score(X=x_test, y=y_test)

0.6923076923076923

In [16]:
### Cross-tabulation
# Rows are the actual labels (y_test); columns are the predicted labels.
pd.crosstab(index=y_test, columns=y_pred)

col_0,0,1
vs,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,1
1,3,4


In [17]:
### Classification report
from sklearn.metrics import classification_report

# Build the precision / recall / f1-score text report, then print it.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.62      0.83      0.71         6
           1       0.80      0.57      0.67         7

    accuracy                           0.69        13
   macro avg       0.71      0.70      0.69        13
weighted avg       0.72      0.69      0.69        13



### 다중 클래스 분류
- 3개 이상의 클래스(범주)가 있는 경우

In [18]:
### Load data
# Read the house-price CSV into a DataFrame.
house_url = "http://youngho.iwinv.net/data/house_price_prediction.csv"
house = pd.read_csv(house_url)
# Show only the first five rows as the cell output.
house.head()

Unnamed: 0,date,price,price_cat,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 0:00,313000.0,Class3,3,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 0:00,342000.0,Class3,3,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
2,2014-05-02 0:00,420000.0,Class3,3,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
3,2014-05-02 0:00,550000.0,Class2,4,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA
4,2014-05-02 0:00,490000.0,Class2,2,1.0,880,6380,1.0,0,0,3,880,0,1938,1994,522 NE 88th St,Seattle,WA 98115,USA


In [21]:
### 입력변수(x)와 반응변수(y)
x = house.loc[:,['bedrooms','sqft_living','view','condition','yr_built']]
x_std= StandardScaler().fit_transform(x)
y = house.iloc[:,2]

In [22]:
### Train & Test data
x_train, x_test, y_train, y_test = train_test_split(x_std,y,test_size = 0.4)

In [23]:
### SVM
# Support vector classifier with a linear kernel and C = 1.
svc = SVC(C=1, kernel="linear")
# Train on the training split; the fitted estimator is kept as `model`.
model = svc.fit(X=x_train, y=y_train)

In [24]:
### Prediction
# Predict class labels for the held-out test rows.
y_pred = model.predict(X=x_test)

In [25]:
### Accuracy
# Score the fitted model on the test split; the notebook shows the returned value.
model.score(X=x_test, y=y_test)

0.46384222059897734

In [26]:
### Cross-tabulation
# Rows are the actual labels (y_test); columns are the predicted labels.
pd.crosstab(index=y_test, columns=y_pred)

col_0,Class2,Class3,Class4
price_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Class2,291,141,41
Class3,193,249,84
Class4,70,205,95


In [27]:
### Classification report
from sklearn.metrics import classification_report

# Build the precision / recall / f1-score text report, then print it.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

      Class2       0.53      0.62      0.57       473
      Class3       0.42      0.47      0.44       526
      Class4       0.43      0.26      0.32       370

    accuracy                           0.46      1369
   macro avg       0.46      0.45      0.44      1369
weighted avg       0.46      0.46      0.45      1369

