In [1]:
import os
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import load_iris
# Load the iris dataset once; later cells read its fields by key
# (e.g. 'data', 'target', 'feature_names').
iris_dataset = load_iris()

load_iris返回的iris对象是一个Bunch对象。

In [3]:
# List the top-level fields available on the object returned by load_iris.
dataset_keys = iris_dataset.keys()
print("Keys of iris_dataset: \n{}".format(dataset_keys))

Keys of iris_dataset: 
dict_keys(['DESCR', 'feature_names', 'target', 'target_names', 'data'])


In [4]:
# The value stored under the 'DESCR' key is a short textual description of
# the dataset; only its first 193 characters are printed here.
description_head = iris_dataset['DESCR'][:193]
print(description_head + "\n")

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive att



In [5]:
# Species names that the integer class labels map to.
target_names = iris_dataset['target_names']
print("Target names:{}".format(target_names))

Target names:['setosa' 'versicolor' 'virginica']


In [6]:
# Human-readable name of each feature column.
feature_names = iris_dataset['feature_names']
print("Feature names:\n{}".format(feature_names))

Feature names:
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [7]:
# 数据存储在target和data字段中。data中是花萼长度、花萼宽度、花瓣长度、花瓣宽度的测量数据

In [8]:
# Check which container type holds the measurements.
data_type = type(iris_dataset['data'])
print("Type of data: {}".format(data_type))

Type of data: <class 'numpy.ndarray'>


In [9]:
# Shape is (number of samples, number of features).
data_shape = iris_dataset['data'].shape
print("Shape of data: {}".format(data_shape))

Shape of data: (150, 4)


In [10]:
# 数组中包含150条花朵测量样本。每一项属性叫做特征。数组的形状为样本数量乘以特征数量

In [11]:
# Peek at the first five samples rather than dumping the whole array.
first_rows = iris_dataset['data'][:5]
print("First five rows of data: \n {}".format(first_rows))

First five rows of data: 
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]


In [12]:
# Check which container type holds the class labels.
target_type = type(iris_dataset["target"])
print("Type of target: {}".format(target_type))

Type of target: <class 'numpy.ndarray'>


In [13]:
# target是一维数组，每一朵花对应一条记录

In [14]:
# The label array has one entry per sample.
target_shape = iris_dataset['target'].shape
print("Shape of target: {}".format(target_shape))

Shape of target: (150,)


In [15]:
# 品种被转换为了0、1、2的分类整数

In [16]:
from sklearn.model_selection import train_test_split
import mglearn

In [17]:
# Split the samples and labels into training and test sets.
# random_state=0 fixes the seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset.data,
    iris_dataset.target,
    random_state=0,
)

* 在对数据进行拆分之前，train_test_split函数会利用伪随机数生成器将数据集打乱。因为原始数据是按标签排序的，如果不打乱而直接取最后25%的数据作为测试集，其中所有样本的标签都将是2。
* 为了确保多次运行同一函数能够得到相同的输出，利用random_state选项指定了随机数生成器的种子。

In [18]:
print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))

X_train shape: (112, 4)
y_train shape: (112,)


In [19]:
print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))

X_test shape: (38, 4)
y_test shape: (38,)


要点一： 观察数据

可以检查数据的异常值和特殊值。

In [20]:
# Build a DataFrame from the training features in X_train, labelling the
# columns with the strings in iris_dataset.feature_names.
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)

# Draw a scatter-plot matrix of the DataFrame, colouring each point by its
# class label in y_train.
# scatter_matrix must be called through pandas.plotting: the top-level
# pd.scatter_matrix alias was deprecated and later removed, so calling it
# raises AttributeError on current pandas releases.
grr = pd.plotting.scatter_matrix(
    iris_dataframe, c=y_train, figsize=(15, 15), marker='o',
    hist_kwds={'bins': 20}, s=60, alpha=.8, cmap=mglearn.cm3)

k近邻算法（k-NN）

scikit-learn中所有的机器学习模型都是在各自的类中实现，这些类被称为Estimator。

k近邻算法是在neighbors模块的KNeighborsClassifier类中实现的。使用前需要先将该类实例化。

In [22]:
from sklearn.neighbors import KNeighborsClassifier
# Instantiate the classifier before use; n_neighbors=1 sets the number of
# neighbours consulted for a prediction to one.
knn = KNeighborsClassifier(n_neighbors=1)

knn对象对算法进行了封装，既包括了训练数据构建模型的算法，也包括对新数据点进行预测的算法。同时还包括算法从训练数据中提取的信息。

而对于KNeighborsClassifier来说，里面只存储了训练集

In [23]:
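# To build the model from the training set, call the fit method of the knn object with X_train and y_train: X_train is a 2-D NumPy array holding the training data, and y_train is a 1-D NumPy array holding the corresponding training labels.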

In [28]:
# Fit the classifier on the training data and labels. As the last expression
# in the cell, the call's return value (the estimator) is echoed as output.
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

fit方法会就地修改knn对象（原处修改），并返回该对象本身。

预测

In [29]:
# A single new sample, wrapped in an outer list so the array is
# two-dimensional: one row, four feature values.
new_sample = [5, 2.9, 1, 0.2]
X_new = np.array([new_sample])
new_shape = X_new.shape
print("X_new.shape: {}".format(new_shape))

X_new.shape: (1, 4)


In [30]:
# Predict the class label of the new sample, then look up the species name
# by indexing target_names with the predicted label array.
prediction = knn.predict(X_new)
predicted_name = iris_dataset['target_names'][prediction]
print("Prediction: {}".format(prediction))
print("Predicted target name: {}".format(predicted_name))

Prediction: [0]
Predicted target name: ['setosa']


预测结果为‘setosa’

最后是评估模型

In [31]:
# 通过计算精度来评估模型的优劣

In [32]:
# Predict a label for every sample in the test set.
y_pred = knn.predict(X_test)
predictions_report = "Test set predictions:\n {}".format(y_pred)
print(predictions_report)

Test set predictions:
 [2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]


In [33]:
# Accuracy = fraction of test samples whose predicted label equals the true label.
is_correct = y_pred == y_test
manual_accuracy = np.mean(is_correct)
print("Test set score: {:.2f}".format(manual_accuracy))

Test set score: 0.97


In [34]:
# The score method of the knn object can likewise be used to compute the
# accuracy on the test set.
test_accuracy = knn.score(X_test, y_test)
print("Test set score: {:.2f}".format(test_accuracy))

Test set score: 0.97


总结：fit，predict和score方法是scikit-learn监督学习模型中的核心接口

In [37]:
# Recap of the core code: split the data, fit a classifier with
# n_neighbors=1, and report its accuracy on the held-out test set.
features = iris_dataset['data']
labels = iris_dataset['target']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, random_state=0)

# fit returns the estimator itself, so construction and fitting can be chained.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

final_accuracy = knn.score(X_test, y_test)
print("Test set score : {:.2f}".format(final_accuracy))




Test set score : 0.97
