## multi-class classification (使用scikit-learn)
       
       1.) 針對iris 品種分類('setosa' 'versicolor' 'virginica'), 進行logistic,pca,svm,knn,decision tree, random forest等演練
       2.) PCA 
     
       



In [1]:
%matplotlib inline
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os,sys
from utility import plot_confusion_matrix,plot_decision_regions,testcase_report

ModuleNotFoundError: No module named 'utility'

## 載入Iris DataSet

In [None]:
# Load the bundled Iris dataset and print its human-readable description.
iris = datasets.load_iris()
print(iris.DESCR)

# Feature matrix as a DataFrame, one named column per measurement.
x = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])
print(iris['data'].shape)
print(f"target_names: {iris['target_names']!s}")

# Integer class labels as a single-column DataFrame.
y = pd.DataFrame(data=iris['target'], columns=['target'])

# Only select two features (plus the label) for the rest of the notebook.
selected_columns = ['sepal length (cm)', 'petal length (cm)', 'target']
iris_data = pd.concat([x, y], axis='columns').loc[:, selected_columns]
iris_data.head(5)



## 切割資料集 (training set, test set)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the two feature columns from the label column, then hold out
# 30% of the rows as a test set (fixed seed for reproducibility).
feature_frame = iris_data.drop(columns=['target'])
label_series = iris_data['target']
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    label_series,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Report the shape of each split: train features, train labels,
# test features, test labels (in that order).
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

In [None]:
# Preview the first five rows of the held-out test features (unscaled).
X_test.head(5)

## Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply
# the very same statistics to the test split so both are treated alike.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
# Peek at the first five standardized training rows.
X_train_std[:5]

## Create LogisticRegression Classifier  (using X_train_std)

In [None]:
# First five standardized training rows: the input the classifier below is fitted on.
X_train_std[:5]

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the standardized training data.
# The `multi_class` argument is deprecated in recent scikit-learn releases;
# 'auto' was already the default, and with the lbfgs solver on this
# three-class target it resolves to a multinomial model, so it is dropped.
# (Releases older than 0.22 defaulted to one-vs-rest instead.)
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train_std, y_train)

# Predicted class labels for the standardized test set (displayed as cell output).
lr.predict(X_test_std)


In [None]:
# Learned coefficients of the fitted logistic-regression model.
print(lr.coef_)

In [None]:
# True test labels as a plain array, for comparison with the predictions.
print(y_test.values)

In [None]:
# Predict once, then collect the positional indices of the test samples
# whose predicted label differs from the true label.
test_predictions = lr.predict(X_test_std)
mismatch_mask = test_predictions != y_test.to_numpy()
error = mismatch_mask.nonzero()[0]
print('misclassified:', error)

# Accuracy = 1 - (number of mistakes / number of test samples).
print('score:', 1 - len(error) / len(X_test_std))

In [None]:
# Mean accuracy on the standardized test set via the estimator's own score();
# this should agree with the manually computed score printed earlier.
lr.score(X_test_std, y_test)

### 使用testcase_report() 回報分類結果

In [None]:
# Build a classification report for the logistic-regression model.
# testcase_report comes from the local `utility` module (not shown here);
# NOTE(review): its result is indexed as report[0] / report[1] below, so it
# is assumed to return a sequence; confirm against utility.py.
report=testcase_report(iris_data,lr,X_test,X_test_std,y_test)

In [None]:
# First element of the report; its meaning is defined by
# utility.testcase_report (presumably a summary; TODO confirm).
report[0]

In [None]:
# Second element of the report; it exposes .head(), so it is presumably
# DataFrame-like (TODO confirm against utility.testcase_report).
report[1].head()

In [None]:
# First rows of the unscaled test features, shown next to the report for reference.
X_test.head()

# 主成分分析PCA

###   fit_transform() v.s. transform()
    fit_transform() 會先在訓練資料（trainData）上進行 fit（計算均值、方差、最大最小值等整體指標），再用這些指標完成轉換；transform() 則直接沿用先前在訓練資料上 fit 得到的指標，對剩餘的資料（testData）進行轉換。train、test 的處理方式必須相同，因此測試資料只呼叫 transform(testData)，不可重新 fit。

In [None]:
# First five standardized training rows: the input to the PCA step below.
X_train_std[:5]

In [None]:
from sklearn.decomposition import PCA

# Principal component analysis (PCA).
# NOTE: only two features were kept earlier, so n_components=2 rotates the
# standardized data onto its principal axes rather than reducing dimension.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)  # fit on the training split only
X_test_pca = pca.transform(X_test_std)        # reuse the training projection

# Scatter the projected training points coloured by class. The legend gets
# one entry per class (instead of a single generic "PCA" entry) so that the
# colours can actually be read.
scatter = plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=y_train)
class_handles, _ = scatter.legend_elements()
class_names = [iris['target_names'][label] for label in np.unique(y_train)]
plt.legend(class_handles, class_names, title="PCA")
plt.show()

In [None]:
# Amount of variance captured by each principal component.
pca.explained_variance_

In [None]:
# Fraction of the total variance captured by each principal component.
pca.explained_variance_ratio_

In [None]:
# Running total of the explained-variance fractions across components.
pca.explained_variance_ratio_.cumsum()

In [None]:
# Sanity check: projected training features and training labels
# must have the same number of rows.
for train_part in (X_train_pca, y_train):
    print(train_part.shape)

## Create LogisticRegression Classifier (using X_train_pca)

In [None]:
from sklearn.linear_model import LogisticRegression

# Refit logistic regression, this time on the PCA-projected training data.
# The `multi_class` argument is deprecated in recent scikit-learn releases;
# 'auto' was already the default, so it is dropped here.
# (Releases older than 0.22 defaulted to one-vs-rest instead.)
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train_pca, y_train)

# Plot the fitted classifier over the training points
# (plot_decision_regions is provided by the local `utility` module).
plot_decision_regions(X_train_pca, y_train, classifier=lr)
plt.title('LogisticRegression Classifier (PCA=2) for training data', fontsize=16)
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

# Mean accuracy on the PCA-projected test set (displayed as cell output).
lr.score(X_test_pca, y_test)


In [None]:
# Report for the logistic-regression model fitted on PCA features.
# testcase_report is defined in the local `utility` module (not shown here);
# only its first element is displayed.
report=testcase_report(iris_data,lr,X_test,X_test_pca,y_test)
report[0]

In [None]:
# Same plot as for the training data, but over the PCA-projected test points.
# plot_decision_regions comes from the local `utility` module.
plot_decision_regions(X_test_pca,y_test, classifier=lr)
plt.title('LogisticRegression Classifier (PCA=2) for testing data',fontsize=16)
plt.legend(loc='upper left')
plt.tight_layout()  # adjust layout before rendering
plt.show()


## Create KNN Classifier

In [None]:
from sklearn import neighbors, datasets

# k-nearest-neighbours classifier (k=13) trained on the PCA-projected data.
knn = neighbors.KNeighborsClassifier(n_neighbors=13).fit(X_train_pca, y_train)

# Plot the fitted classifier over the training points.
plot_decision_regions(X_train_pca, y_train, classifier=knn)
plt.title('KNN Classifier (PCA=2) for training data', fontsize=16)
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

# Mean accuracy on the training split first, then on the test split.
for knn_X, knn_y in ((X_train_pca, y_train), (X_test_pca, y_test)):
    print(knn.score(knn_X, knn_y))

In [None]:
# Report for the KNN model on PCA features (helper from the local `utility`
# module, not shown here); only its first element is displayed.
report=testcase_report(iris_data,knn,X_test,X_test_pca,y_test)
report[0]

In [None]:
# Plot the KNN classifier over the PCA-projected test points.
# plot_decision_regions comes from the local `utility` module.
plot_decision_regions(X_test_pca,y_test, classifier=knn)
plt.title('KNN Classifier (PCA=2) for testing data',fontsize=16)
plt.legend(loc='upper left')
plt.tight_layout()  # adjust layout before rendering
plt.show()


## Create SVM  Classifier 

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier trained on the PCA-projected data.
svm = SVC(kernel='rbf', gamma='auto').fit(X_train_pca, y_train)

# Plot the fitted classifier over the training points.
plot_decision_regions(X_train_pca, y_train, classifier=svm)
plt.title('SVM Classifier (PCA=2) for training data', fontsize=16)
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

# Mean accuracy on the test split (displayed as cell output).
svm.score(X_test_pca, y_test)

In [None]:
# Report for the SVM model on PCA features (helper from the local `utility`
# module, not shown here); only its first element is displayed.
report=testcase_report(iris_data,svm,X_test,X_test_pca,y_test)
report[0]

In [None]:
# Plot the SVM classifier over the PCA-projected test points.
# plot_decision_regions comes from the local `utility` module.
plot_decision_regions(X_test_pca,y_test, classifier=svm)
plt.title('SVM Classifier (PCA=2) for testing data',fontsize=16)
plt.legend(loc='upper left')
plt.tight_layout()  # adjust layout before rendering
plt.show()


## Create DecisionTree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree, limited to depth 3, with a fixed seed.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    random_state=0,
).fit(X_train_pca, y_train)

# Plot the fitted classifier over the training points.
plot_decision_regions(X_train_pca, y_train, classifier=clf)
plt.title('Decision Tree Classifier (PCA=2) for training data', fontsize=16)
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

# Mean accuracy on the test split (displayed as cell output).
clf.score(X_test_pca, y_test)

In [None]:
# Report for the decision-tree model on PCA features (helper from the local
# `utility` module, not shown here); only its first element is displayed.
report=testcase_report(iris_data,clf,X_test,X_test_pca,y_test)
report[0]

In [None]:
# Plot the decision-tree classifier over the PCA-projected test points.
# plot_decision_regions comes from the local `utility` module.
plot_decision_regions(X_test_pca,y_test, classifier=clf)
plt.title('Decision Tree Classifier (PCA=2) for testing data',fontsize=16)
plt.legend(loc='upper left')
plt.tight_layout()  # adjust layout before rendering
plt.show()


## Create Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 entropy-based trees with a fixed seed.
# n_jobs=-1 uses however many cores the machine has instead of a
# hard-coded worker count; with random_state fixed the fitted model
# does not depend on the number of workers.
forest = RandomForestClassifier(
    criterion='entropy',
    n_estimators=100,
    random_state=3,
    n_jobs=-1,
)
forest.fit(X_train_pca, y_train)

# Plot the fitted classifier over the training points
# (plot_decision_regions is provided by the local `utility` module).
plot_decision_regions(X_train_pca, y_train, classifier=forest)
plt.title('Random Forest Classifier (PCA=2) for training data', fontsize=16)
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

# Mean accuracy on the test split (displayed as cell output).
forest.score(X_test_pca, y_test)

In [None]:
# Report for the random-forest model on PCA features (helper from the local
# `utility` module, not shown here); only its first element is displayed.
report=testcase_report(iris_data,forest,X_test,X_test_pca,y_test)
report[0]

In [None]:
# Plot the random-forest classifier over the PCA-projected test points.
# plot_decision_regions comes from the local `utility` module.
plot_decision_regions(X_test_pca,y_test, classifier=forest)
plt.title('Random Forest Classifier (PCA=2) for testing data',fontsize=16)
plt.legend(loc='upper left')
plt.tight_layout()  # adjust layout before rendering
plt.show()



In [None]:
# Rebuild the report with `clf`, i.e. the decision-tree model fitted earlier.
# NOTE(review): this follows the random-forest section but reports on the
# decision tree, not `forest`; confirm that is intended.
report=testcase_report(iris_data,clf,X_test,X_test_pca,y_test)

## confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, cohen_kappa_score

# Confusion matrix of the decision-tree model (`clf`) on the test split:
# rows are true classes, columns are predicted classes.
predicted = clf.predict(X_test_pca)
C = confusion_matrix(y_true=y_test, y_pred=predicted)
print(C)

In [None]:
# Class names of the loaded dataset, used as tick labels for the plots below.
iris['target_names']

In [None]:
# Plot non-normalized confusion matrix
plt.figure(figsize=(6,6))
plot_confusion_matrix(C, classes=iris['target_names'],
                      title='Confusion matrix, without normalization')

In [None]:
# Plot the confusion matrix again with normalize=True on a fresh 6x6-inch figure.
# plot_confusion_matrix comes from the local `utility` module; how it
# normalizes (presumably per true-class row) should be confirmed there.
plt.figure(figsize=(6,6))
plot_confusion_matrix(C, classes=iris['target_names'], normalize=True,
                      title='Normalized confusion matrix')