In [None]:
# ライブラリのインポート

from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydotplus
from IPython.display import Image
from sklearn import datasets
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Take a first look at the contents: for the feature matrix and for the
# label vector, print the container type followed by the values.
features, labels = iris.data, iris.target

print("Data")
print(type(features))
print(features)
print("Classes ", iris.target_names)
print(type(labels))
print(labels)

In [None]:
# Scatter-plot four pairs of features, coloured by class label.
# Each (x, y) entry is a pair of column indices into iris.data; the order
# matches the panels left-to-right, top-to-bottom.
feature_pairs = [(0, 1), (1, 2), (0, 3), (2, 3)]

# The panels show different feature pairs, so the axes are deliberately NOT
# shared: sharing would force unrelated features onto one common range and
# hide the tick labels of the inner panels.
f, axarr = plt.subplots(2, 2, figsize=(11, 10))

for ax, (x_idx, y_idx) in zip(axarr.ravel(), feature_pairs):
    x_name = iris.feature_names[x_idx]
    y_name = iris.feature_names[y_idx]
    ax.scatter(iris.data[:, x_idx], iris.data[:, y_idx], c=iris.target, alpha=0.6)
    ax.set_title(x_name + " vs " + y_name)
    # Label the axes so each panel can be read on its own.
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)

# Keep the axis labels from colliding with the neighbouring panels' titles.
f.tight_layout()
plt.show()

# A single selected column is a 1-D array with one entry per sample.
print(iris.data[:, 0].shape)
print(iris.data[:, 1].shape)

In [None]:
# To keep things simple, use only two features (columns 0 and 1).
X = iris.data[:, [0, 1]]
y = iris.target

# Create the classifier (an SVM with an RBF kernel) and train it on the
# full data set; fit() returns the estimator itself.
svm = SVC(kernel='rbf', probability=True).fit(X, y)

# Accuracy measured on the training data itself.
train_accuracy = svm.score(X, y)
print("Score = {0}".format(train_accuracy))

# Predict each training sample individually and show the result.
for x in X:
    predicted_label = svm.predict([x])
    print("Predict f(%s) = %s" % (x, predicted_label))

In [None]:
# A few more classifiers, each paired (by position) with the title used
# for its plot. There is exactly one title per classifier.
classifiers = [
    DecisionTreeClassifier(max_depth=2),
    KNeighborsClassifier(n_neighbors=1),
    SVC(kernel='rbf', probability=True),
]
titles = ['Decision Tree (depth=2)', 'KNN (k=1)', 'Kernel SVM']

# All of them expose the same fit/predict/score interface.
for classifier in classifiers:
    classifier.fit(X, y)

# Build a grid (step 0.05) covering the data plus a margin of 1 on every
# side; the classifiers are evaluated on each grid point to draw the
# decision regions.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.05), np.arange(y_min, y_max, 0.05))
# The grid is the same for every classifier, so flatten it only once.
grid_points = np.c_[xx.ravel(), yy.ravel()]

# 2x2 layout; with three classifiers the last panel stays empty.
f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(11, 10))

for ax, classifier, title in zip(axarr.ravel(), classifiers, titles):
    # Predicted class for every grid point, shaped back into the grid.
    predicted = classifier.predict(grid_points).reshape(xx.shape)

    ax.contourf(xx, yy, predicted, alpha=0.3)
    ax.scatter(X[:, 0], X[:, 1], c=y, alpha=0.6)
    # The number in the title is the accuracy on the training data.
    ax.set_title("%s (%f)" % (title, classifier.score(X, y)))

plt.show()

In [None]:
## Evaluate accuracy with 6-fold cross-validation.
# One title per entry of `classifiers`, in the same order.
cv_titles = ['Decision Tree (depth=2)', 'KNN (k=1)', 'Kernel SVM']

for classifier, title in zip(classifiers, cv_titles):
    # `scores` holds one accuracy value per fold.
    scores = cross_val_score(classifier, X, y, cv=6)
    print("Cross Validation Score of %s = mean(%s) = %s" % (title, scores, scores.mean()))

In [None]:
## Visualise the trained decision tree.
# classifiers[0] is the DecisionTreeClassifier from the classifier list.
trained_tree = classifiers[0]

# Export the tree as Graphviz DOT text (out_file=None returns a string),
# then parse it into a graph object.
dot_data = tree.export_graphviz(trained_tree, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)

# Render to PNG; as the last expression, the image is shown in the notebook.
png_bytes = graph.create_png()
Image(png_bytes)

In [None]:
## Pandas: wrap the feature matrix in a DataFrame with named columns.
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Last expression of the cell, so the notebook displays the frame.
iris_df

In [None]:
## Summary statistics of the data
iris_df.describe()

In [None]:
## Pairwise correlation coefficients between the feature columns
iris_df.corr()

In [None]:
## Features 1, 3 and 4 are highly correlated, so drop features 3 and 4.
correlated_columns = ["petal length (cm)", "petal width (cm)"]
iris_df_simple = iris_df.drop(columns=correlated_columns)
clf = DecisionTreeClassifier(max_depth=2)

print(iris_df_simple.shape)

# Mean accuracy over 6 cross-validation folds on the reduced feature set.
fold_scores = cross_val_score(clf, iris_df_simple, y, cv=6)
score = fold_scores.mean()
print("Cross Validation Score = %s" % score)

In [None]:
## Build a better pair of features.
# Three of the original columns are removed and replaced by one combined
# column: petal length + petal width + half of the sepal length.
combined_columns = ["petal width (cm)", "petal length (cm)", "sepal length (cm)"]
iris_df_new = iris_df.drop(columns=combined_columns)
iris_df_new['new_feature'] = (
    iris_df['petal length (cm)']
    + iris_df["petal width (cm)"]
    + iris_df['sepal length (cm)'] * 0.5
)
clf = DecisionTreeClassifier(max_depth=2)

print(iris_df_new.shape)

# Only score the features when the frame has the expected shape:
# 150 samples with exactly 2 feature columns.
if iris_df_new.shape == (150, 2):
    fold_scores = cross_val_score(clf, iris_df_new, y, cv=6)
    score = fold_scores.mean()
    print("Cross Validation Score = %s" % score)
else:
    print("The size of feature vector is different. It should be (150, 2)")

In [None]:
## Grid search over the decision-tree hyper-parameters.
# GridSearchCV is imported in the notebook's import cell.

# Candidate values; every combination is evaluated.
search_params = [{
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 3, 4, 5, 6, 7],
    # "sqrt" is what "auto" used to mean for DecisionTreeClassifier; the
    # "auto" spelling was deprecated in scikit-learn 1.1 and removed in 1.3,
    # where it is rejected as an invalid parameter value.
    'max_features': [None, "sqrt"],
}]

# random_state is fixed so the search gives the same result on every run;
# each candidate is scored with 6-fold cross-validation.
tuned_clf = GridSearchCV(DecisionTreeClassifier(random_state=1), search_params, cv=6)
tuned_clf.fit(iris_df_new, y)

print("Best Score %s " % tuned_clf.best_score_)
print("Best Params %s " % tuned_clf.best_params_)

In [None]:
## Compare the two approaches; both use exactly two features.

### Original features (columns 0 and 1 of iris.data) with the depth-2 decision tree
data_1 = X
first_decision_tree = classifiers[0]
score = cross_val_score(first_decision_tree, data_1, iris.target, cv=6).mean()
print("Cross Validation Score(first) = %s" % score)

### Engineered features with the grid-searched decision tree
data_2 = iris_df_new
# Use a separate name instead of re-binding `first_decision_tree`, so each
# variable keeps referring to the model its name describes.
# Note: tuned_clf is the GridSearchCV object itself, so cross_val_score
# re-runs the whole grid search on the training part of every outer fold.
tuned_decision_tree = tuned_clf

score = cross_val_score(tuned_decision_tree, data_2, iris.target, cv=6).mean()
print("Cross Validation Score(tuned) = %s" % score)
