In [2]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

## Ref: https://happycoder.org/2017/10/14/python-data-science-and-machine-learning-scikit-learn-basic-tutorial/
# 1.明確定義問題 (Problem Definition)
# 2.獲取資料與探索性資料分析 (Get Data & Exploratory Data Analysis)
# 3.資料預處理與特徵工程 (Data Clean/Preprocessing & Feature Engineering)
# 4.訓練模型與校調 (Model Training)
# 5.模型驗證 (Model Predict & Testing)
# 6.模型優化 (Model Optimization)
# 7.上線運行 (Deploy Model)

# 引入 numpy、pd 和 sklearn(scikit-learn) 模組
import numpy as np
import pandas as pd
from sklearn import datasets
# 引入 train_test_split 分割方法，注意在 sklearn v0.18 後 train_test_split 從 sklearn.cross_validation 子模組搬到 sklearn.model_selection 中
from sklearn.model_selection import train_test_split
# 引入 KNeighbors 模型
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC


In [3]:

## 2.獲取資料與探索性資料分析 (Get Data & Exploratory Data Analysis)

# 引入 iris 資料集
raw_iris = datasets.load_iris()
# 探索性分析 Exploratory data analysis，了解資料集內容
# 先印出 key 值，列出有哪些值：['data', 'target', 'target_names', 'DESCR', 'feature_names']
print(raw_iris.keys())

# 印出 feature 值
print('### data ###')
print(raw_iris['data'])

# 印出目標值，分別對應的是三種花的類別：['setosa 山鳶尾' 'versicolor 變色鳶尾' 'virginica 維吉尼亞鳶尾']
print('### target ###')
print(raw_iris['target'])

# 印出目標標籤，三種花的類別：['setosa' 'versicolor' 'virginica']
print('### target_names ###')
print(raw_iris['target_names'])

# 印出資料集內容描述
#print(raw_iris['DESCR'])

# 印出屬性名稱，['sepal length 花萼長度 (cm)', 'sepal width 花萼寬度 (cm)', 'petal length 花蕊長度 (cm)', 'petal width 花蕊寬度 (cm)']
print('### feature_names ###')
print(raw_iris['feature_names'])

# 類別種類
print(np.unique(raw_iris.target))


dict_keys(['DESCR', 'data', 'target_names', 'feature_names', 'target'])
### data ###
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.1 1.5 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5

In [4]:


## 3.資料預處理與特徵工程 (Data Clean/Preprocessing & Feature Engineering)

# 將資料轉為 pandas DataFrame
# data 為觀察目標變數
df_X = pd.DataFrame(raw_iris.data)
# target 為預測變數
df_y = pd.DataFrame(raw_iris.target)
# 將資料切分為 training data 和 testing data，其中 random_state 若設為 0 或不設則即便實例不同但因種子相同產生同樣隨機編號，若設為 1 則每次隨機產生不同編號
# test_size 為切分 training data 和 testing data 的比例
print('### train_test_split ###')
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3)

# 印出所有資料集筆數
print(len(df_y))

# 印出切分 y_train 的數量為所有資料集的 70%，共 105 筆
print(y_train)
print(len(y_train))

# 印出切分的 y_test 資料為所有資料集的 30%，共 45 筆
print(y_test)
print(len(y_test))


# 初始化 LinearSVC 實例
lin_clf = LinearSVC()
# 使用 fit 來建置模型，其參數接收 training data matrix, testing data array，所以進行 y_train.values.ravel() Data Frame 轉換
lin_clf.fit(X_train, y_train.values.ravel())
print(lin_clf)

# 初始化 KNeighborsClassifier 實例
knn = KNeighborsClassifier()
# 使用 fit 來建置模型，其參數接收 training data matrix, testing data array，所以進行 y_train.values.ravel() 轉換
knn.fit(X_train, y_train.values.ravel())
print(knn)


### train_test_split ###
150
     0
118  2
88   1
105  2
47   0
106  2
45   0
98   1
127  2
50   1
90   1
97   1
142  2
77   1
149  2
84   1
25   0
119  2
15   0
87   1
75   1
36   0
138  2
16   0
111  2
60   1
144  2
44   0
42   0
145  2
9    0
..  ..
26   0
2    0
20   0
80   1
0    0
49   0
101  2
103  2
38   0
99   1
132  2
96   1
79   1
140  2
74   1
117  2
71   1
134  2
41   0
32   0
58   1
120  2
86   1
69   1
130  2
35   0
27   0
17   0
46   0
93   1

[105 rows x 1 columns]
105
     0
5    0
33   0
52   1
53   1
13   0
91   1
54   1
102  2
78   1
95   1
12   0
135  2
92   1
104  2
66   1
110  2
146  2
115  2
129  2
19   0
21   0
100  2
107  2
82   1
37   0
147  2
10   0
123  2
108  2
70   1
18   0
85   1
29   0
126  2
28   0
137  2
125  2
116  2
89   1
64   1
133  2
59   1
131  2
143  2
6    0
45
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=Non

In [5]:


## 4.模型驗證 (Model Predict & Testing)

# 使用 X_test 來預測結果
print(lin_clf.predict(X_test))
# 印出預測準確率
print(lin_clf.score(X_test, y_test))

# 使用 X_test 來預測結果
print(knn.predict(X_test))
# 印出 testing data 預測標籤機率
print(knn.predict_proba(X_test))
# 印出預測準確率
print(knn.score(X_test, y_test))

[0 0 1 1 0 1 1 2 1 1 0 2 1 2 2 2 2 2 1 0 0 2 2 1 0 2 0 2 2 2 0 2 0 2 0 2 2
 2 1 1 1 1 2 2 0]
0.8888888888888888
[0 0 1 1 0 1 1 2 1 1 0 2 1 2 1 2 2 2 2 0 0 2 2 1 0 2 0 2 2 2 0 1 0 1 0 2 2
 2 1 1 1 1 2 2 0]
[[1.  0.  0. ]
 [1.  0.  0. ]
 [0.  1.  0. ]
 [0.  1.  0. ]
 [1.  0.  0. ]
 [0.  1.  0. ]
 [0.  1.  0. ]
 [0.  0.  1. ]
 [0.  1.  0. ]
 [0.  1.  0. ]
 [1.  0.  0. ]
 [0.  0.  1. ]
 [0.  1.  0. ]
 [0.  0.  1. ]
 [0.  1.  0. ]
 [0.  0.2 0.8]
 [0.  0.4 0.6]
 [0.  0.  1. ]
 [0.  0.  1. ]
 [1.  0.  0. ]
 [1.  0.  0. ]
 [0.  0.  1. ]
 [0.  0.  1. ]
 [0.  1.  0. ]
 [1.  0.  0. ]
 [0.  0.2 0.8]
 [1.  0.  0. ]
 [0.  0.4 0.6]
 [0.  0.  1. ]
 [0.  0.4 0.6]
 [1.  0.  0. ]
 [0.  0.8 0.2]
 [1.  0.  0. ]
 [0.  0.6 0.4]
 [1.  0.  0. ]
 [0.  0.  1. ]
 [0.  0.  1. ]
 [0.  0.  1. ]
 [0.  1.  0. ]
 [0.  1.  0. ]
 [0.  0.6 0.4]
 [0.  1.  0. ]
 [0.  0.  1. ]
 [0.  0.  1. ]
 [1.  0.  0. ]]
0.9333333333333333
