In [20]:
import pandas as pd
import numpy as np
import sklearn as metrics
from sklearn import svm
from sklearn.svm import SVC
#使用SVC模型
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,classification_report,roc_curve,auc
#計算準確率及顯示accuracy結果
#auc roc用於評估Classifier輸出
from sklearn.model_selection import train_test_split,KFold,cross_val_score,GridSearchCV
#使用train_test_split分配訓練測試模型比例, KFold交叉驗證,GridSearchCV網格搜尋

# Column names for the WDBC file: an ID, the diagnosis label, then ten
# measurements repeated once per statistic (all *_mean, then *_se, then *_worst).
_wdbc_measurements = ('radius', 'texture', 'perimeter', 'area', 'smoothness',
                      'compactness', 'concavity', 'concave points', 'symmetry',
                      'fractal_dimension')
_wdbc_statistics = ('mean', 'se', 'worst')
breast_cols = ['ID', 'Diagnosis'] + [
    f'{measurement}_{statistic}'
    for statistic in _wdbc_statistics      # outer loop: statistic-major ordering
    for measurement in _wdbc_measurements
]
# Read the headerless WDBC file, attach the column names, and discard the ID
# column, which is not used as a model input.
wdbc_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
breast = pd.read_csv(wdbc_url, sep=',', names=breast_cols, header=None).drop(columns=['ID'])
# Encode the diagnosis letters as integers: 'M' -> 0, 'B' -> 1.
breast_convert = dict(M=0, B=1)
breast['Diagnosis'] = breast['Diagnosis'].map(breast_convert)
print(breast)
print()

# Model inputs are the 30 columns that follow 'Diagnosis'; the target is the
# integer-encoded diagnosis.
features = list(breast.columns[1:31])
x = breast[features]
y = breast["Diagnosis"]

# Linear-kernel SVC. It is intentionally NOT fitted here: cross_val_score clones
# the estimator for every fold, and the hold-out evaluation fits it on X_train
# only, so a fit on the full data set would be discarded work.
svc = svm.SVC(kernel='linear', C=1)
# 80/20 train/test split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)
# Shuffled, seeded 5-fold splitter used for cross-validation below.
kf = KFold(n_splits=5, shuffle=True, random_state=5)
print(kf)
print()

# Show the row indices that make up each fold.
for train_index, test_index in kf.split(x):
    print('train_index:%s \n test_index: %s \n' % (train_index, test_index))
print()

# Score with the same splitter whose folds were printed above. Passing cv=5
# would instead use scikit-learn's default unshuffled stratified folds for a
# classifier, so the printed folds would not be the ones being scored.
print("SVC score is %s" % cross_val_score(svc, x, y, cv=kf).mean())
print()

# Fit on the training split and predict the held-out test split.
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
# F1 under each averaging scheme; average=None prints one score per class.
for averaging in ("macro", "micro", "weighted", None):
    print(f1_score(y_test, y_pred, average=averaging))
print()
# Per-class precision / recall / F1 summary.
print(classification_report(y_test, y_pred))

     Diagnosis  radius_mean  ...  symmetry_worst  fractal_dimension_worst
0            0        17.99  ...          0.4601                  0.11890
1            0        20.57  ...          0.2750                  0.08902
2            0        19.69  ...          0.3613                  0.08758
3            0        11.42  ...          0.6638                  0.17300
4            0        20.29  ...          0.2364                  0.07678
..         ...          ...  ...             ...                      ...
564          0        21.56  ...          0.2060                  0.07115
565          0        20.13  ...          0.2572                  0.06637
566          0        16.60  ...          0.2218                  0.07820
567          0        20.60  ...          0.4087                  0.12400
568          1         7.76  ...          0.2871                  0.07039

[569 rows x 31 columns]

KFold(n_splits=5, random_state=5, shuffle=True)

train_index:[  0   1   2   4   5   6 

In [21]:
# Load the headerless iris.data file with explicit column names.
iris_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']
iris_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = pd.read_csv(iris_url, sep=',', names=iris_cols, header=None)
# Replace the species strings with integer class labels 0, 1, 2.
iris_convert = {
    species: label
    for label, species in enumerate(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'))
}
iris['Species'] = iris['Species'].map(iris_convert)
print(iris)
print()

# The first four columns are the model inputs; 'Species' is the target.
features = iris.columns[0:4].tolist()
x = iris.loc[:, features]
y = iris.loc[:, "Species"]
# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=5, test_size=0.2)

# Baseline: SVC with default settings (the default kernel is RBF, so it is not
# set explicitly). The estimator is bound to `rbf_svc` rather than `svm` so the
# `sklearn.svm` module imported under the name `svm` is not overwritten;
# otherwise re-running the earlier cell's `svm.SVC(...)` fails with AttributeError.
rbf_svc = SVC()
rbf_svc.fit(X_train, y_train)
y_pred = rbf_svc.predict(X_test)
print(confusion_matrix(y_test, y_pred))  # baseline confusion matrix
print()
print(classification_report(y_test, y_pred))  # baseline classification report

# Grid of C and gamma values to search for the RBF kernel.
param_grid = {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}
# Shuffled, seeded 10-fold splitter for the search.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)
# refit=True retrains the best parameter combination on all of X_train, so
# `grid.predict` below uses that best model.
grid = GridSearchCV(rbf_svc, param_grid, refit=True, verbose=3, cv=kfold)
grid_result = grid.fit(X_train, y_train)
pred_grid = grid.predict(X_test)

# Best mean cross-validation score and the parameters that achieved it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print()
print(confusion_matrix(y_test, pred_grid))  # confusion matrix after grid search
print()
print(classification_report(y_test, pred_grid))  # classification report after grid search

# Compare the baseline confusion matrix / classification report (y_pred) with
# the grid-search ones (pred_grid). The original note treats matching results as
# confirmation that the run is correct.
# NOTE(review): matching results only show that tuning did not change the
# held-out predictions; they do not by themselves validate the model.
# TODO: ROC / AUC evaluation is not implemented in this cell.

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  Species
0              5.1           3.5            1.4           0.2        0
1              4.9           3.0            1.4           0.2        0
2              4.7           3.2            1.3           0.2        0
3              4.6           3.1            1.5           0.2        0
4              5.0           3.6            1.4           0.2        0
..             ...           ...            ...           ...      ...
145            6.7           3.0            5.2           2.3        2
146            6.3           2.5            5.0           1.9        2
147            6.5           3.0            5.2           2.0        2
148            6.2           3.4            5.4           2.3        2
149            5.9           3.0            5.1           1.8        2

[150 rows x 5 columns]

[[ 8  0  0]
 [ 0 10  1]
 [ 0  0 11]]

              precision    recall  f1-score   support

           0       1.00      1

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.583, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.333, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.167, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.333, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=1.000, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=1.000, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=0.833, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] .

[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:    0.8s finished
