In [1]:
# numpyの準備
import numpy as np

In [2]:
# あやめのデータを準備
from sklearn.datasets import load_iris

In [3]:
# Load the iris dataset and bind the returned object to `data`
data = load_iris()

In [4]:
X = data.data # the feature values are stored in .data

In [5]:
dir(data) # list the attribute names available on the dataset object

['DESCR', 'data', 'feature_names', 'filename', 'target', 'target_names']

In [6]:
X.shape # (number of samples, number of features)

(150, 4)

In [7]:
X[0] # the first sample (a 4-dimensional vector)

array([5.1, 3.5, 1.4, 0.2])

In [8]:
data.feature_names # what each feature means

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [9]:
y = data.target # the labels are stored in .target

In [10]:
y.shape # number of samples

(150,)

In [11]:
y # display the label of every sample

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [12]:
y[0] # label of the first sample

0

In [13]:
data.target_names # what each label means

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

[Iris flower data set @ Wikipedia](https://en.wikipedia.org/wiki/Iris_flower_data_set)


|class|image|license|
|------|------|--|
| Iris setosa |<img src="https://upload.wikimedia.org/wikipedia/commons/5/56/Kosaciec_szczecinkowaty_Iris_setosa.jpg" width=200> | CC BY-SA 3.0 by Radomil |
|Iris versicolor|<img src="https://upload.wikimedia.org/wikipedia/commons/4/41/Iris_versicolor_3.jpg" width=200>|CC BY-SA 3.0 by Danielle Langlois|
|Iris virginica|<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/9/9f/Iris_virginica.jpg/736px-Iris_virginica.jpg" width=200>|CC BY-SA 2.0|

In [14]:
print(data.DESCR) # detailed description of the dataset

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [15]:
# Bring in the linear models
from sklearn import linear_model

# Create the classifier (logistic regression with its default parameters)
clf = linear_model.LogisticRegression()

In [16]:
clf # display the classifier and its parameters

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
# Split sizes: half of the samples (rounded down) are used for training,
# and whatever remains is used for testing.
n_samples = len(X)            # total number of samples
n_train = n_samples // 2      # size of the training half
n_test = n_samples - n_train  # size of the test half

In [19]:
# Training indices 0, 1, ..., n_train-1: the first half of the data
train_index = range(n_train)

# Test indices n_train, n_train+1, ..., n_samples-1: the remaining half
test_index = range(n_train, n_samples)

In [20]:
np.array(train_index), np.array(test_index) # display both index sets to check them

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74]),
 array([ 75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,
         88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100,
        101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
        114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
        127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149]))

In [21]:
# Feature vectors of the training and test subsets
X_train = X[train_index]
X_test = X[test_index]

# Labels matching the training and test subsets
y_train = y[train_index]
y_test = y[test_index]

In [22]:
clf.fit(X_train, y_train); # train the classifier (trailing ';' hides the returned repr)



In [23]:
print(clf.score(X_train, y_train)) # accuracy on the training data

1.0


In [24]:
print(clf.score(X_test, y_test)) # accuracy on the test data

0.3333333333333333


In [25]:
clf.predict(X_test), y_test # predicted labels for the test data, next to the true labels

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2]))

In [26]:
# Walk through (predicted, true) label pairs, printing each pair and
# tallying the misclassified ones in `wrong`.
wrong = 0
for i, j in zip(clf.predict(X_test), y_test):
    if i != j:
        # Misclassified: flag the pair and count it
        print(i, j, " Wrong!")
        wrong += 1
        continue
    print(i, j)

1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!


In [27]:
# Print a one-line summary of the test result.
# NOTE(review): the first number printed is the count of WRONG predictions,
# while the value after '=' is the accuracy (1 - error rate), so the printed
# "a / b = c" does not hold arithmetically (e.g. "50 / 75 = 0.333...").
print("{0} / {1} = {2}".format(wrong,
                               n_test, 
                               1 - wrong / n_test ))

50 / 75 = 0.33333333333333337


# なぜこうなったのか？

In [None]:
y_train, y_test # take a look at the training labels and the test labels

In [None]:
# ランダムにシャッフルして，学習・テストに分割するモジュール
from sklearn.model_selection import ShuffleSplit

In [None]:
ss = ShuffleSplit(n_splits=1,      # generate a single split
                  train_size=0.5,  # half of the data for training
                  test_size =0.5,  # the other half for testing
                  random_state=0)  # random seed (for reproducibility)

In [None]:
# Build the index arrays for the training data and the test data
# (takes the first, and only, split produced by `ss`)
train_index, test_index = next(ss.split(X)) 

In [None]:
list(train_index), list(test_index) # display both index sets to check them

In [None]:
# Feature vectors of the training and test subsets
X_train = X[train_index]
X_test = X[test_index]

# Labels matching the training and test subsets
y_train = y[train_index]
y_test = y[test_index]

In [None]:
y_train, y_test # take a look at the training labels and the test labels

In [None]:
clf.fit(X_train, y_train); # train the classifier (trailing ';' hides the returned repr)

In [None]:
print(clf.score(X_train, y_train)) # accuracy on the training data

In [None]:
print(clf.score(X_test, y_test)) # accuracy on the test data

In [None]:
clf.predict(X_test), y_test # predicted labels for the test data, next to the true labels

In [None]:
# Walk through (predicted, true) label pairs, printing each pair and
# tallying the misclassified ones in `wrong`.
wrong = 0
for i, j in zip(clf.predict(X_test), y_test):
    if i != j:
        # Misclassified: flag the pair and count it
        print(i, j, " Wrong!")
        wrong += 1
        continue
    print(i, j)

In [None]:
# Summarise the test result as "correct / total = accuracy".
# The total is taken from y_test itself so it always matches the split being
# evaluated (rather than a size computed for an earlier split), and the
# numerator is the number of CORRECT predictions so the printed equation
# actually holds.
n_test = len(y_test)
correct = n_test - wrong
print("{0} / {1} = {2}".format(correct, n_test, correct / n_test))