In [1]:
# Set up numpy
import numpy as np

In [2]:
# Bring in the loader for the iris dataset
from sklearn.datasets import load_iris

In [3]:
# Load the dataset and keep it in `data`
data = load_iris()

In [4]:
# The feature values are stored in the .data attribute
X = data.data

In [5]:
# Dimensions of the feature array
X.shape

(150, 4)

In [6]:
# First sample (a 4-dimensional vector)
X[0]

array([5.1, 3.5, 1.4, 0.2])

In [7]:
# Meaning of each feature
data.feature_names 

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [8]:
# The labels are stored in the .target attribute
y = data.target

In [9]:
# Number of labels (one per sample)
y.shape

(150,)

In [10]:
# Show all labels
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [11]:
# Label of the first sample
y[0]

0

In [12]:
# Meaning of each label value
data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [13]:
# Detailed description of the dataset
print(data.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [14]:
# Set up the linear models module
from sklearn import linear_model

# Create the classifier.
# NOTE(review): no arguments are passed, so the library defaults apply; the
# repr recorded in this notebook shows solver='warn' and multi_class='warn'.
# Consider pinning both explicitly so results do not depend on the installed
# scikit-learn version -- TODO confirm against the target version.
clf = linear_model.LogisticRegression()

In [15]:
# Display the classifier and its parameters
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
# Total number of samples (rows of X)
n_samples = len(X)
# Half of the data, rounded down, is used for training
n_train = n_samples // 2
# Whatever is not used for training becomes test data
n_test = n_samples - n_train

In [17]:
# First half: indices 0, 1, ..., n_train-1
train_index = range(n_train)

# Remaining half: indices n_train, n_train+1, ..., n_samples-1
test_index = range(n_train, n_samples)

In [18]:
# Check the two index ranges by expanding them into arrays
np.array(train_index), np.array(test_index)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74]),
 array([ 75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,
         88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100,
        101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
        114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
        127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149]))

In [19]:
# Features: training rows, then test rows
X_train = X[train_index]
X_test = X[test_index]

# Labels, selected with the same indices so rows and labels stay aligned
y_train = y[train_index]
y_test = y[test_index]

In [20]:
# Train the classifier on the training split
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [21]:
# Accuracy on the training data
print(clf.score(X_train, y_train)) 

1.0


In [22]:
# Accuracy on the test data
print(clf.score(X_test, y_test))

0.3333333333333333


In [24]:
# Predicted labels for the test data, shown next to the true labels
clf.predict(X_test), y_test

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2]))

In [26]:
# Walk through (prediction, true label) pairs, print each pair, and count
# the pairs that disagree.
wrong = 0

for predicted, actual in zip(clf.predict(X_test), y_test):
    if predicted != actual:
        print(predicted, actual, " Wrong!")
        wrong += 1
    else:
        print(predicted, actual)

1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!
1 2  Wrong!


In [27]:
print("{0} / {1} = {2}".format(wrong,
                               n_test, 
                               1 - wrong / n_test ))

50 / 75 = 0.33333333333333337


In [28]:
# Inspect the training labels and the test labels
y_train, y_test

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2]))

In [29]:
# Module that shuffles the data randomly and splits it into training and test parts
from sklearn.model_selection import ShuffleSplit

In [31]:
# Splitter configuration: a single random 50/50 split with a fixed seed.
ss = ShuffleSplit(
    n_splits=1,       # generate one split
    train_size=0.5,   # half of the data for training
    test_size=0.5,    # the other half for testing
    random_state=0,   # random seed (for reproducibility)
)

In [32]:
# Create the index arrays for the training data and the test data
# (ss yields one split, so next() takes that single pair)
train_index, test_index = next(ss.split(X))

In [33]:
# Show the training and test indices as lists
list(train_index), list(test_index)

([3,
  149,
  98,
  6,
  68,
  109,
  96,
  12,
  102,
  120,
  104,
  128,
  46,
  11,
  110,
  124,
  41,
  148,
  1,
  113,
  139,
  42,
  4,
  129,
  17,
  38,
  5,
  53,
  143,
  105,
  0,
  34,
  28,
  55,
  75,
  35,
  23,
  74,
  31,
  118,
  57,
  131,
  65,
  32,
  138,
  14,
  122,
  19,
  29,
  130,
  49,
  136,
  99,
  82,
  79,
  115,
  145,
  72,
  77,
  25,
  81,
  140,
  142,
  39,
  58,
  88,
  70,
  87,
  36,
  21,
  9,
  103,
  67,
  117,
  47],
 [114,
  62,
  33,
  107,
  7,
  100,
  40,
  86,
  76,
  71,
  134,
  51,
  73,
  54,
  63,
  37,
  78,
  90,
  45,
  16,
  121,
  66,
  24,
  8,
  126,
  22,
  44,
  97,
  93,
  26,
  137,
  84,
  27,
  127,
  132,
  59,
  18,
  83,
  61,
  92,
  112,
  2,
  141,
  43,
  10,
  60,
  116,
  144,
  119,
  108,
  69,
  135,
  56,
  80,
  123,
  133,
  106,
  146,
  50,
  147,
  85,
  30,
  101,
  94,
  64,
  89,
  91,
  125,
  48,
  13,
  111,
  95,
  20,
  15,
  52])

In [34]:
# Features: training rows, then test rows
X_train = X[train_index]
X_test = X[test_index]

# Labels, selected with the same indices so rows and labels stay aligned
y_train = y[train_index]
y_test = y[test_index]

In [35]:
# Inspect the training labels and the test labels
y_train, y_test

(array([0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0,
        0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 1, 0, 0, 1, 0, 2, 1, 2, 1, 0,
        2, 0, 2, 0, 0, 2, 0, 2, 1, 1, 1, 2, 2, 1, 1, 0, 1, 2, 2, 0, 1, 1,
        1, 1, 0, 0, 0, 2, 1, 2, 0]),
 array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
        0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,
        0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1,
        1, 2, 0, 0, 2, 1, 0, 0, 1]))

In [36]:
# Train the classifier on the training split
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [37]:
# Accuracy on the training data
print(clf.score(X_train, y_train)) 

0.92


In [38]:
# Accuracy on the test data
print(clf.score(X_test, y_test))

0.84


In [39]:
# Predicted labels for the test data, shown next to the true labels
clf.predict(X_test), y_test

(array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 2, 0, 2, 2, 0, 0, 2, 2,
        0, 0, 2, 0, 0, 1, 1, 0, 2, 2, 0, 2, 2, 2, 0, 2, 2, 1, 2, 0, 2, 0,
        0, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 1, 1,
        2, 2, 0, 0, 2, 1, 0, 0, 1]),
 array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
        0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,
        0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1,
        1, 2, 0, 0, 2, 1, 0, 0, 1]))

In [40]:
# Walk through (prediction, true label) pairs, print each pair, and count
# the pairs that disagree.
wrong = 0

for predicted, actual in zip(clf.predict(X_test), y_test):
    if predicted != actual:
        print(predicted, actual, " Wrong!")
        wrong += 1
    else:
        print(predicted, actual)

2 2
1 1
0 0
2 2
0 0
2 2
0 0
1 1
1 1
1 1
2 2
1 1
1 1
1 1
2 1  Wrong!
0 0
2 1  Wrong!
2 1  Wrong!
0 0
0 0
2 2
2 1  Wrong!
0 0
0 0
2 2
0 0
0 0
1 1
1 1
0 0
2 2
2 1  Wrong!
0 0
2 2
2 2
2 1  Wrong!
0 0
2 1  Wrong!
2 1  Wrong!
1 1
2 2
0 0
2 2
0 0
0 0
1 1
2 2
2 2
2 2
2 2
1 1
2 2
2 1  Wrong!
1 1
2 2
2 2
2 2
2 2
1 1
2 2
2 1  Wrong!
0 0
2 2
2 1  Wrong!
1 1
1 1
2 1  Wrong!
2 2
0 0
0 0
2 2
1 1
0 0
0 0
1 1


In [41]:
print("{0} / {1} = {2}".format(wrong,
                               n_test, 
                               1 - wrong / n_test ))

12 / 75 = 0.84
