In [1]:
import numpy as np
x = np.array([[1, 2, 3], [4, 5, 6]])
print("x:\n{}".format(x))

x:
[[1 2 3]
 [4 5 6]]


In [3]:
from scipy import sparse
# 対角成分が1でそれ以外が0の、2次元NumPy配列を作る
eye = np.eye(5)
print("NumPy array:\n{}".format(eye))

NumPy array:
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]


In [4]:
# NumPy配列をSciPyのCSR形式の疎行列に変換する
# 非ゼロ要素だけが格納される
sparse_matrix = sparse.csr_matrix(eye)
print("\nSciPy sparse CSR matrix:\n{}".format(sparse_matrix))



SciPy sparse CSR matrix:
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0


In [8]:
data = np.ones(5)
row_indices = np.arange(5)
col_indices = np.arange(5)
eye_coo = sparse.coo_matrix((data, (row_indices, col_indices)))
print("COO representation:\n{}".format(eye_coo))

COO representation:
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0


In [15]:
%matplotlib notebook
import matplotlib.pyplot as plt
# -10から10までを100ステップに区切った列を配列として生成
x = np.linspace(-5, 5, 100)
# coサイン関数を用いて2つ目の配列を生成
y = np.cos(x)
# plot関数は、一方の配列に対して他方の配列をプロットする
plt.plot(x, y, marker="x")

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x26360f9ef40>]

In [16]:
import pandas as pd
# 人を表す簡単なデータセットを作る
data = {'Name': ["John", "Anna", "Peter", "Linda"],
'Location' : ["New York", "Paris", "Berlin", "London"],
'Age' : [24, 13, 53, 33]
}
data_pandas = pd.DataFrame(data)

In [17]:
# IPython.displayを用いるとDataFrameを
# Jupyter notebook上できれいに表示することができる。
display(data_pandas)

Unnamed: 0,Name,Location,Age
0,John,New York,24
1,Anna,Paris,13
2,Peter,Berlin,53
3,Linda,London,33


In [18]:
# ageカラムが30を超えるすべての行を取り出す
display(data_pandas[data_pandas.Age > 30])

Unnamed: 0,Name,Location,Age
2,Peter,Berlin,53
3,Linda,London,33


In [22]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from IPython.display import display

In [23]:
pip install mglearn

Note: you may need to restart the kernel to use updated packages.


In [24]:
import sys
print("Python version: {}".format(sys.version))
import pandas as pd
print("pandas version: {}".format(pd.__version__))
import matplotlib
print("matplotlib version: {}".format(matplotlib.__version__))
import numpy as np
print("NumPy version: {}".format(np.__version__))
import scipy as sp
print("SciPy version: {}".format(sp.__version__))
import IPython
print("IPython version: {}".format(IPython.__version__))
import sklearn
print("scikit-learn version: {}".format(sklearn.__version__))

Python version: 3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]
pandas version: 1.4.4
matplotlib version: 3.5.2
NumPy version: 1.21.5
SciPy version: 1.9.1
IPython version: 7.31.1
scikit-learn version: 1.0.2


In [25]:
from sklearn.datasets import load_iris
iris_dataset = load_iris()


In [26]:
print("Keys of iris_dataset: \n{}".format(iris_dataset.keys()))

Keys of iris_dataset: 
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [29]:
print(iris_dataset['DESCR'])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [35]:
print(iris_dataset['data_module'])

sklearn.datasets.data


In [36]:
print(iris_dataset['target_names'])

['setosa' 'versicolor' 'virginica']


In [98]:
iris_dataset['data']

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [38]:
type(iris_dataset['target_names'])

numpy.ndarray

In [40]:
type(iris_dataset['data_module'])

str

In [41]:
print("Type of data: {}".format(type(iris_dataset['data'])))

Type of data: <class 'numpy.ndarray'>


In [42]:
print("Shape of data: {}".format(iris_dataset['data'].shape))

Shape of data: (150, 4)


In [43]:
format(iris_dataset['data'].shape)

'(150, 4)'

In [44]:
type(format(iris_dataset['data'].shape))

str

In [45]:
print(iris_dataset['target'])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [47]:
type(iris_dataset['target'])

numpy.ndarray

In [48]:
print("Shape of target: {}".format(iris_dataset['target'].shape))

Shape of target: (150,)


In [49]:
iris_dataset['target'].shape

(150,)

In [51]:
format(iris_dataset['target'].shape)

'(150,)'

In [52]:
type(iris_dataset['target'].shape)

tuple

In [53]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
iris_dataset['data'], iris_dataset['target'], random_state=0)

In [54]:
##同じ関数を何度か呼び出した際に、確実に同じ結果が得られるよう、random_stateパラメータを
##用いて擬似乱数生成器に同じシードを渡している。これによって出力が決定的になり、常に同じ結
##果が得られるようになる。本書では、乱数を用いる際には常にこのようにrandom_stateパラメータ
##を固定して用いる。?

In [55]:
print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))

X_train shape: (112, 4)
y_train shape: (112,)


In [56]:
print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))

X_test shape: (38, 4)
y_test shape: (38,)


In [58]:
# X_trainのデータからDataFrameを作る、
# iris_dataset.feature_namesの文字列を使ってカラムに名前を付ける。
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
# データフレームからscatter matrixを作成し、y_trainに従って色を付ける。
grr = pd.plotting.scatter_matrix(iris_dataframe, c=y_train, figsize=(15, 15), marker='o',
hist_kwds={'bins': 20}, s=60, alpha=.8, cmap=mglearn.cm3)

<IPython.core.display.Javascript object>

In [89]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)


In [90]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [91]:
print(type(knn),
type(knn.fit(X_train, y_train)))

<class 'sklearn.neighbors._classification.KNeighborsClassifier'> <class 'sklearn.neighbors._classification.KNeighborsClassifier'>


In [92]:
X_new = np.array([[5, 2.9, 1, 0.2]])
print("X_new.shape: {}".format(X_new.shape))

X_new.shape: (1, 4)


In [93]:
prediction = knn.predict(X_new)
print("Prediction: {}".format(prediction))
print("Predicted target name: {}".format(
iris_dataset['target_names'][prediction]))

Prediction: [0]
Predicted target name: ['setosa']


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [94]:
iris_dataset['target_names'][1]

'versicolor'

In [95]:
y_pred = knn.predict(X_test)
print("Test set predictions:\n {}".format(y_pred))

Test set predictions:
 [2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [96]:
print("Test set score: {:.2f}".format(np.mean(y_pred == y_test)))
## 予測結果とテストデータが同じになるものを1 この期待値

Test set score: 0.97


In [97]:
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))

Test set score: 0.97


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [99]:
## 訓練と評価を行うために必要な最小の手順
X_train, X_test, y_train, y_test = train_test_split(
 iris_dataset['data'], iris_dataset['target'], random_state=0)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))

Test set score: 0.97


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
