In [1]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [5]:
# Load the iris dataset into a feature frame (x) and a label frame (y).
iris = datasets.load_iris()
x = pd.DataFrame(iris['data'], columns=iris['feature_names'])
y = pd.DataFrame(iris['target'], columns=['target'])
print("target_names:" + str(iris['target_names']))

# Keep only the two features used below plus the label column.
wanted_columns = ['sepal length (cm)', 'petal length (cm)', 'target']
iris_data = pd.concat([x, y], axis=1)[wanted_columns]

# Restrict the rows to classes 0 and 1 so the task is binary classification.
is_class_0_or_1 = iris_data['target'].isin([0, 1])
iris_data = iris_data[is_class_0_or_1]
iris_data.head(3)

target_names:['setosa' 'versicolor' 'virginica']


Unnamed: 0,sepal length (cm),petal length (cm),target
0,5.1,1.4,0
1,4.9,1.4,0
2,4.7,1.3,0


In [6]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
feature_columns = ['sepal length (cm)', 'petal length (cm)']
x_train, x_test, y_train, y_test = train_test_split(
    iris_data[feature_columns],
    iris_data[['target']],
    test_size=0.3,
    random_state=0,
)

In [11]:
# Standardize the feature columns
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same scaling
# to both the training and the test features.
sc = StandardScaler().fit(x_train)
x_train_std, x_test_std = sc.transform(x_train), sc.transform(x_test)

In [13]:
# Import the SVM classifier
from sklearn.svm import SVC
# SVC is the support vector classification estimator (not a kernel); per the
# scikit-learn docs it is built on libsvm, i.e. compiled code rather than pure Python.

In [15]:
# Start with a linear kernel (a straight-line decision boundary); SVC also
# supports non-linear kernels that can draw curved boundaries.
# probability=True enables predict_proba so per-class probabilities are available.
# Those probability estimates are fitted with an internally shuffled
# cross-validation, so random_state is pinned to keep them reproducible across runs.
svm = SVC(kernel = 'linear', probability = True, random_state = 0)

In [16]:
# Train the model with fit(); the labels are passed as a 1-D array.
# (Fixed typo: `.valueues` -> `.values`; the misspelled attribute raised AttributeError.)
svm.fit(x_train_std, y_train['target'].values)

SVC(kernel='linear', probability=True)

In [17]:
# Predicted class labels for the test set
svm.predict(x_test_std)

array([0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1])

In [18]:
# Actual class labels for the test set
y_test['target'].values

array([0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1])

In [19]:
# Count how many test-set predictions disagree with the actual labels.
predicted = svm.predict(x_test_std)
actual = y_test['target'].values
error = int((predicted != actual).sum())
print("error:" + str(error))

error:0


error等於0，所有的預測都正確

In [20]:
# Inspect the predicted probability of each class for every test sample
svm.predict_proba(x_test_std)

array([[0.9500494 , 0.0499506 ],
       [0.00766008, 0.99233992],
       [0.9739689 , 0.0260311 ],
       [0.00660894, 0.99339106],
       [0.01700423, 0.98299577],
       [0.11851503, 0.88148497],
       [0.98441528, 0.01558472],
       [0.0049398 , 0.9950602 ],
       [0.00885689, 0.99114311],
       [0.01579397, 0.98420603],
       [0.02430465, 0.97569535],
       [0.03012165, 0.96987835],
       [0.00822769, 0.99177231],
       [0.98054903, 0.01945097],
       [0.9625743 , 0.0374257 ],
       [0.94233996, 0.05766004],
       [0.98835626, 0.01164374],
       [0.87626275, 0.12373725],
       [0.98059304, 0.01940696],
       [0.95670044, 0.04329956],
       [0.9500494 , 0.0499506 ],
       [0.03472622, 0.96527378],
       [0.95001236, 0.04998764],
       [0.0372759 , 0.9627241 ],
       [0.96761939, 0.03238061],
       [0.96991933, 0.03008067],
       [0.96254628, 0.03745372],
       [0.02106632, 0.97893368],
       [0.03997577, 0.96002423],
       [0.00615694, 0.99384306]])

第0筆資料:[0.9500494 , 0.0499506 ]，預測成類別0的機率是0.95所以判斷為類別0   
第1筆資料:[0.00766008, 0.99233992]，預測成類別0的機率約是0.008，預測為類別1的機率約是1-0.008=0.992，所以判斷為類別1

### 視覺化

In [None]:
from matplotlib.colors import ListedColormap