In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn 
import pandas as pd
import os, gc, sys, time
import tensorflow as tf
from tensorflow import keras

import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn')
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

## **2.1数据集titanic**

In [2]:
# Locations of the Titanic training and evaluation CSV files.
train_file = './data/titanic/titanic_train.csv'
eval_file = './data/titanic/titanic_eval.csv'

# Read both splits in one pass over the two paths.
train_df, eval_df = (pd.read_csv(csv_path) for csv_path in (train_file, eval_file))

# Preview the first rows of the training split.
train_df.head(5)

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [3]:
# Detach the target column from each split. DataFrame.pop removes the column
# from the frame in place, so re-running this cell without reloading the data
# fails because 'survived' is no longer present.
y_train, y_eval = (split_df.pop('survived') for split_df in (train_df, eval_df))

# Preview the remaining feature columns.
train_df.head(10)

Unnamed: 0,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,female,35.0,1,0,53.1,First,C,Southampton,n
4,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
5,male,2.0,3,1,21.075,Third,unknown,Southampton,n
6,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
7,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
8,female,4.0,1,1,16.7,Third,G,Southampton,n
9,male,20.0,0,0,8.05,Third,unknown,Southampton,y


## **2.2 feature_column的使用**
**离散特征**: one-hot编码     
**连续特征**: 可以分桶(本例未分桶,直接使用 `numeric_column`)

In [4]:
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

# Categorical features become one-hot (indicator) columns; the vocabulary of
# each one is the set of distinct values found in the training split.
feature_columns = [
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            column_name, train_df[column_name].unique()))
    for column_name in categorical_columns
]

# Numeric features are passed through unchanged as float32 values.
feature_columns += [
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in numeric_columns
]

In [5]:
def make_dataset(data_df, label_df, epochs = 10, shuffle = True, batch_size = 32):
    """Build a TF1-style input pipeline and return its next-batch tensors.

    Args:
        data_df: Feature table; it is converted with ``dict()`` so that each
            column becomes a named feature.
        label_df: Labels aligned element-for-element with ``data_df``.
        epochs: Number of passes over the data (forwarded to ``repeat``).
        shuffle: When true, examples are shuffled with a buffer of 10000
            before the dataset is repeated and batched.
        batch_size: Number of examples per batch.

    Returns:
        The result of ``get_next()`` on a one-shot iterator over the batched
        dataset.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        # Dataset transformations return a new dataset instead of modifying
        # the receiver, so the result has to be reassigned; otherwise the
        # shuffle is silently dropped.
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset.make_one_shot_iterator().get_next()

## **2.3  tf自定义estimator**

In [7]:
# Directory where the estimator writes checkpoints and summaries.
output_dir = 'customized_easimator_tf1'
# exist_ok=True makes this cell safe to re-run and avoids the gap between a
# separate existence check and the directory creation.
os.makedirs(output_dir, exist_ok=True)
def model_fn(features, labels, mode, params):
    """Model function of a feed-forward classifier for tf.estimator.Estimator.

    Args:
        features: Input features, fed to ``tf.feature_column.input_layer``.
        labels: Class-id labels; not used when ``mode`` is PREDICT.
        mode: Run mode of the model, one of ``tf.estimator.ModeKeys``
            (TRAIN, EVAL or PREDICT).
        params: Dict with the keys ``'feature_columns'``, ``'hidden_units'``
            (an iterable of layer widths) and ``'n_classes'``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` built for the requested mode.

    Raises:
        ValueError: If ``mode`` is not TRAIN, EVAL or PREDICT.
    """
    # input_layer plays a role similar to Keras' DenseFeatures.
    net = tf.feature_column.input_layer(features, params['feature_columns'])
    for n_unit in params['hidden_units']:
        net = tf.layers.dense(net, units=n_unit, activation=tf.nn.relu)
    logits = tf.layers.dense(net, params['n_classes'], activation=None)
    predicted_classes = tf.argmax(logits, 1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            # Extra axis added so the class ids form a 2-D matrix.
            'class_ids' : predicted_classes[:, tf.newaxis],
            # NOTE(review): the key is misspelled; it is kept unchanged so
            # that existing consumers of the predictions keep working.
            'probablilities' : tf.nn.softmax(logits),
            'logits' : logits
        }
        # Every branch has to hand back an EstimatorSpec object.
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    if mode == tf.estimator.ModeKeys.EVAL:
        # Streaming metric: accumulates accuracy over all evaluated batches.
        # It is only consumed here, so it is built only in EVAL mode.
        accuracy = tf.metrics.accuracy(labels=labels,
                                       predictions=predicted_classes,
                                       name='acc_op')
        metrics = {'accuracy': accuracy}
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    if mode != tf.estimator.ModeKeys.TRAIN:
        # Fail loudly instead of implicitly returning None.
        raise ValueError('Unsupported mode: {!r}'.format(mode))

    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss,
                                  global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

# Hyper-parameters handed to model_fn through its `params` argument.
estimator_params = dict(
    feature_columns=feature_columns,
    hidden_units=[100, 100],
    n_classes=2,
)

# Checkpoints and summaries are written under output_dir.
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir=output_dir,
    params=estimator_params,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'customized_easimator_tf1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000276265D1278>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [8]:
def train_input_fn():
    """Input function for training: 100 epochs over the training split."""
    return make_dataset(train_df, y_train, epochs=100)

estimator.train(input_fn=train_input_fn)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being dep

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x276254990f0>

In [9]:
# Evaluate with a single, unshuffled pass so every evaluation example is
# visited exactly once; repeating the set 100 times only multiplies the work.
estimator.evaluate(input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-12-26T14:07:58Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from customized_easimator_tf1\model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-12-26-14:08:02
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.79924244, global_step = 1960, loss = 0.5362938
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1960: customized_easimator_tf1\model.ckpt-1960


{'accuracy': 0.79924244, 'loss': 0.5362938, 'global_step': 1960}