In [5]:
import tensorflow as tf

import pandas as pd

In [6]:
# Column names for the Iris CSV files: the four feature columns followed by
# the label column ('Species').
CSV_COLUMN_NAMES = [
    'SepalLength',
    'SepalWidth',
    'PetalLength',
    'PetalWidth',
    'Species',
]

# Species names; the position in this list is used as the class id when the
# predictions are reported.
SPECIES = [
    'Setosa',
    'Versicolor',
    'Virginica',
]

In [7]:
# Download the Iris training and test CSV files and load each one into a
# pandas DataFrame.
train_path = tf.keras.utils.get_file(
    fname="iris_training.csv",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/iris_training.csv",
)
test_path = tf.keras.utils.get_file(
    fname="iris_test.csv",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/iris_test.csv",
)

# header=0 marks the first row of the file as the header row; because `names`
# is passed as well, that row is replaced by CSV_COLUMN_NAMES rather than being
# read as data. For a file without a header row, header=None should be passed
# explicitly instead.
train = pd.read_csv(train_path, header=0, names=CSV_COLUMN_NAMES)
test = pd.read_csv(test_path, header=0, names=CSV_COLUMN_NAMES)

In [8]:
# Preview the first five rows of the training data.
train.head(n=5)

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
0,6.4,2.8,5.6,2.2,2
1,5.0,2.3,3.3,1.0,1
2,4.9,2.5,4.5,1.7,2
3,4.9,3.1,1.5,0.1,0
4,5.7,3.8,1.7,0.3,0


In [9]:
# Preview the first five rows of the test data.
test.head(n=5)

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
0,5.9,3.0,4.2,1.5,1
1,6.9,3.1,5.4,2.1,2
2,5.1,3.3,1.7,0.5,0
3,6.0,3.4,4.5,1.6,1
4,5.5,2.5,4.0,1.3,1


In [10]:
# Split the label column off the training and test data. DataFrame.pop removes
# the column in place, so `train` and `test` keep only the feature columns.
# NOTE: because of that in-place removal, re-running this cell without first
# reloading the data raises KeyError ('Species' is already gone).
train_y, test_y = train.pop('Species'), test.pop('Species')

# The label column is no longer part of the data.
train.head(n=5)

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth
0,6.4,2.8,5.6,2.2
1,5.0,2.3,3.3,1.0
2,4.9,2.5,4.5,1.7
3,4.9,3.1,1.5,0.1
4,5.7,3.8,1.7,0.3


### 格式化输入数据

In [11]:
# Wrap the data in a tf.data.Dataset so the model built below is fed input in
# the format it expects.

def input_fn(features, labels, training=True, batch_size=256,
             shuffle_buffer_size=1000):
    """Build a batched dataset of (features, labels) for training or evaluation.

    Args:
        features: Mapping-like object of column name -> values (for example a
            pandas DataFrame). It is converted with dict(), so each column
            becomes one named entry of the feature dict.
        labels: Labels aligned row by row with `features`.
        training: When True the examples are shuffled and repeated
            indefinitely; when False they are yielded once, in order.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Size of the shuffle buffer used in training mode.
            Defaults to 1000, the value that used to be hard-coded. A buffer
            smaller than the dataset only shuffles partially, so pass at
            least the number of rows for larger datasets.

    Returns:
        A tf.data.Dataset yielding (feature dict, labels) batches.
    """
    # Convert the inputs to a Dataset of per-row (feature dict, label) pairs.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # In training mode, shuffle and repeat the data. repeat() without a count
    # never ends, so the caller has to bound training (e.g. with `steps`).
    if training:
        dataset = dataset.shuffle(shuffle_buffer_size).repeat()

    return dataset.batch(batch_size)

### 定义特征列

In [12]:
# Feature columns describe how the model should interpret each entry of the
# input feature dict. Every column still present in `train` is declared as a
# numeric feature under its own name.
my_feature_columns = []
for key in train.columns:
    print(key)
    numeric_feature = tf.feature_column.numeric_column(key=key)
    my_feature_columns.append(numeric_feature)
print(my_feature_columns)

SepalLength
SepalWidth
PetalLength
PetalWidth
[NumericColumn(key='SepalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='SepalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='PetalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='PetalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]


### 构建Estimator

In [14]:
# Seed for TensorFlow's initializers. Without it every run starts from
# different random weights, so the accuracy and probabilities reported below
# cannot be reproduced from one run to the next.
RANDOM_SEED = 42

# Build a deep neural network classifier with two hidden layers.
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,

    # The two hidden layers have 30 and 10 units respectively.
    hidden_units=[30, 10],

    # The model must choose between three classes.
    n_classes=3,

    # Fix the random seed so reruns are consistent with each other.
    config=tf.estimator.RunConfig(tf_random_seed=RANDOM_SEED))

W0518 20:11:02.604632 140736614536128 estimator.py:1821] Using temporary folder as model directory: /var/folders/_b/964gk3pj7998h867ynfnfnzr0000gn/T/tmpy29xpt70


### 训练模型

In [15]:
# Train the model.
def train_input_fn():
    """Return the training dataset built by input_fn in training mode."""
    return input_fn(train, train_y, training=True)


# `steps` bounds the run: training stops after 5000 steps.
classifier.train(input_fn=train_input_fn, steps=5000)

W0518 20:13:38.552199 140736614536128 deprecation.py:506] From /anaconda3/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
W0518 20:13:38.556589 140736614536128 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow_core/python/training/training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
W0518 20:13:38.836825 140736614536128 base_layer.py:1814] Layer dnn is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x1829f93a58>

### 测试模型

In [16]:
# Evaluate the trained model on the held-out test data.
def eval_input_fn():
    """Return the test dataset built by input_fn with training disabled."""
    return input_fn(test, test_y, training=False)


eval_result = classifier.evaluate(input_fn=eval_input_fn)

# eval_result is a dict of metrics; only the accuracy is reported here.
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

W0518 20:15:44.068649 140736614536128 base_layer.py:1814] Layer dnn is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.




Test set accuracy: 0.967



In [17]:
# Generate predictions from the trained model for three unlabeled examples.
# `expected` lists the species each example is expected to be classified as,
# in the same order as the values in `predict_x`.
expected = ['Setosa', 'Versicolor', 'Virginica']
predict_x = {
    'SepalLength': [5.1, 5.9, 6.9],
    'SepalWidth': [3.3, 3.0, 3.1],
    'PetalLength': [1.7, 4.2, 5.4],
    'PetalWidth': [0.5, 1.5, 2.1],
}


def predict_input_fn(features, batch_size=256):
    """Build a batched, label-free dataset for prediction.

    Deliberately not named `input_fn`: that name already belongs to the
    training/evaluation input function, which takes a `labels` argument.
    Rebinding it here would make the training and evaluation cells fail with
    TypeError when they are re-run after this cell.

    Args:
        features: Mapping of feature name -> list of values, one value per
            example.
        batch_size: Number of examples per batch.

    Returns:
        A tf.data.Dataset yielding feature-dict batches without labels.
    """
    # Convert the inputs to a Dataset without labels.
    return tf.data.Dataset.from_tensor_slices(dict(features)).batch(batch_size)


# predict() returns a generator; nothing is computed until it is iterated.
predictions = classifier.predict(
    input_fn=lambda: predict_input_fn(predict_x))

In [20]:
# Display the object returned by classifier.predict(). It is a generator (see
# the output below), so showing it here does not consume any predictions, and
# it can be iterated only once.
predictions

<generator object Estimator.predict at 0x182abd2938>

In [21]:
# Report the predicted species and its probability next to the expected
# species for each example. Iterating `predictions` consumes the generator.
for prediction, expected_species in zip(predictions, expected):
    # 'class_ids' holds the predicted class id; it indexes both the
    # probability vector and the SPECIES name list.
    predicted_id = prediction['class_ids'][0]
    confidence_pct = 100 * prediction['probabilities'][predicted_id]

    message = 'Prediction is "{}" ({:.1f}%), expected "{}"'.format(
        SPECIES[predicted_id], confidence_pct, expected_species)
    print(message)

Prediction is "Setosa" (77.3%), expected "Setosa"
Prediction is "Versicolor" (58.4%), expected "Versicolor"
Prediction is "Virginica" (60.2%), expected "Virginica"
