# Ch 3. 監督式學習：分類
## 3-4. 支援向量機
[3-4-1. 線性支援向量機](#sec3_4_1)  
[3-4-2. 加入核函數處理非線性分類](#sec3_4_2)

## 3-5. 樸素貝氏分類器
[3-5. 樸素貝氏分類器](#sec3_5)
***

<a id='sec3_4_1'></a>
## 3-4-1. 線性支援向量機

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Load the Pokemon table and flag each row that has a Type2 value (1) or not (0).
df = pd.read_csv('Pokemon_894_13.csv')
df['hasType2'] = df['Type2'].notna().astype(int)

# Features are the columns from 'HP' through 'Speed'; the target is the new flag.
X = df.loc[:, 'HP':'Speed']
y = df['hasType2']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits.
scale = StandardScaler()
scale.fit(X_train)
X_train_std, X_test_std = scale.transform(X_train), scale.transform(X_test)

# Build the linear SVM classifier; max_iter is raised to give the solver
# a better chance of converging.
svm = LinearSVC(max_iter=1500).fit(X_train_std, y_train)

from sklearn.metrics import classification_report
# Produce the classification report for the test split.
y_pred = svm.predict(X_test_std)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.55      0.54        95
           1       0.66      0.65      0.66       129

    accuracy                           0.61       224
   macro avg       0.60      0.60      0.60       224
weighted avg       0.61      0.61      0.61       224



<a id='sec3_4_2'></a>
## 3-4-2. 加入核函數處理非線性分類

In [2]:
from sklearn.svm import SVC

# RBF-kernel SVM with C=5 and gamma=0.01.
# probability=True enables predict_proba; per the scikit-learn documentation the
# probability estimates rely on an internal cross-validation that shuffles the
# data, so random_state is fixed to make those estimates repeatable across runs.
svm = SVC(kernel='rbf', C=5, gamma=.01, probability=True, random_state=0)
svm.fit(X_train_std, y_train)
y_pred = svm.predict(X_test_std)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.55      0.55        95
           1       0.67      0.67      0.67       129

    accuracy                           0.62       224
   macro avg       0.61      0.61      0.61       224
weighted avg       0.62      0.62      0.62       224



In [3]:
# Feature values of an unseen Pokemon (presumably ordered like the training columns).
new_poke = [[120, 50, 80, 100, 150, 90]]
# Standardize them with the scaler fitted earlier.
new_poke_std = scale.transform(new_poke)
# Predict whether it has a second type.
predicted_label = svm.predict(new_poke_std)
print(predicted_label)
# Inspect the predicted class probabilities.
svm.predict_proba(new_poke_std)

[1]


array([[0.43968598, 0.56031402]])

In [4]:
# Inspect the support vectors of the fitted SVM: their count, the first five
# indices, and the first two vectors themselves.
support_vectors = svm.support_vectors_
print('支援向量數目：', len(support_vectors))
print('支援向量的索引值：', svm.support_[:5])
support_vectors[:2]

支援向量數目： 592
支援向量的索引值： [ 3  4  5  6 10]


array([[-0.37405999, -0.47353542,  0.19932389, -0.2076741 ,  0.31555353,
        -0.32148304],
       [ 0.60433716, -0.17166222, -0.12505553,  0.03957894,  1.59202932,
        -0.01581563]])

In [5]:
# Cast the Legendary column to integers and store it back on the frame.
# Summing it then counts the legendary rows (assumes boolean-like values).
legend_flags = df['Legendary'].astype(int)
df['Legendary'] = legend_flags
n_legend = legend_flags.sum()
n_not_legend = len(df) - n_legend
print('數量比=> 神獸:非神獸 = {}:{}'.format(n_legend, n_not_legend))

數量比=> 神獸:非神獸 = 79:815


In [6]:
# 預測是否為神獸
X, y = df.loc[:, 'HP':'Speed'], df['Legendary']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

scale = StandardScaler().fit(X_train)
X_train_std = scale.transform(X_train)
X_test_std = scale.transform(X_test)

svm = SVC(kernel='rbf')
svm.fit(X_train_std, y_train)
y_pred = svm.predict(X_test_std)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95       197
           1       0.79      0.41      0.54        27

    accuracy                           0.92       224
   macro avg       0.85      0.70      0.74       224
weighted avg       0.91      0.92      0.90       224



In [7]:
# 加上平衡類別的考量
svm = SVC(kernel='rbf', class_weight='balanced')
svm.fit(X_train_std, y_train)
y_pred = svm.predict(X_test_std)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.92      0.95       197
           1       0.61      0.93      0.74        27

    accuracy                           0.92       224
   macro avg       0.80      0.92      0.84       224
weighted avg       0.94      0.92      0.93       224



<a id='sec3_5'></a>
## 3-5. 樸素貝氏分類器

In [8]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

# Gaussian naive Bayes fitted on the unscaled training features,
# evaluated with the F1 score on the test split.
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f1_score(y_test, y_pred))

0.6875


In [9]:
# Feature values of an unseen Pokemon.
new_poke = [[120, 50, 80, 100, 150, 90]]
# clf was fitted on the unscaled X_train, so predict on the raw values,
# not on the standardized array (new_poke_std) that was built for the SVM.
print(clf.predict(new_poke))

[0]


In [10]:
# Supply assumed class prior probabilities explicitly; when priors is omitted,
# the default is the class distribution of the training data.
clf = GaussianNB(priors=[0.4, 0.6]).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f1_score(y_test, y_pred))

0.5416666666666667


In [11]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes fitted on the whole table (no train/test split),
# with the Generation column as the target.
X = df.loc[:, 'HP':'Speed']
y = df['Generation']
clf = MultinomialNB()
clf.fit(X, y)
# Predict the generation of the unseen Pokemon from its raw feature values.
print(clf.predict(new_poke))

[2]
