In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn 
import pandas as pd
import os, gc, sys, time
import tensorflow as tf
from tensorflow import keras

import warnings
# Suppress every Python warning so the notebook output stays readable.
warnings.filterwarnings('ignore')
# Select the matplotlib style sheet named 'seaborn' for later figures.
plt.style.use('seaborn')
# Set TF_CPP_MIN_LOG_LEVEL to "3", presumably to quiet TensorFlow's native logging.
# NOTE(review): this runs after `import tensorflow` above -- confirm it still takes effect.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

## **2.1 数据集 titanic**

In [2]:
# Locations of the Titanic training and evaluation CSV files.
train_file = './data/titanic/titanic_train.csv'
eval_file = './data/titanic/titanic_eval.csv'

# Read both splits into DataFrames, training file first.
train_df, eval_df = (pd.read_csv(csv_path) for csv_path in (train_file, eval_file))

# Preview the first rows of the training split.
train_df.head(5)

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [3]:
# Detach the 'survived' label column from each frame. `pop` removes the
# column from the frame itself, so this cell cannot be re-run without
# first reloading the CSV files.
y_train, y_eval = train_df.pop('survived'), eval_df.pop('survived')

# Show the feature columns that remain.
train_df.head(10)

Unnamed: 0,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,female,35.0,1,0,53.1,First,C,Southampton,n
4,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
5,male,2.0,3,1,21.075,Third,unknown,Southampton,n
6,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
7,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
8,female,4.0,1,1,16.7,Third,G,Southampton,n
9,male,20.0,0,0,8.05,Third,unknown,Southampton,y


## **2.2 feature_column的使用**
**离散特征**: one-hot编码     
**连续特征**: 可直接作为数值列（numeric_column）使用，也可以先分桶（bucketized_column）；本例直接使用数值列

In [4]:
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []

# Categorical features: build the vocabulary from the values seen in the
# training data, then one-hot encode it (tf.feature_column API).
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
        categorical_column, vocab)
    feature_columns.append(tf.feature_column.indicator_column(vocab_column))

# Numeric features: used as float32 numeric columns.
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(numeric_column, dtype=tf.float32))

In [5]:
def make_dataset(data_df, label_df, epochs = 10, shuffle = True, batch_size = 32):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        data_df: Feature table. It is converted with ``dict(data_df)``, so
            each column becomes one named feature.
        label_df: Labels paired with ``data_df`` in the same order.
        epochs: Number of times the data is repeated.
        shuffle: Whether to shuffle the examples before repeating.
        batch_size: Number of examples per batch.

    Returns:
        The dataset after optional shuffling, then repeating and batching.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        # Dataset transformations return a new dataset instead of modifying
        # the receiver, so the result has to be kept; otherwise the shuffle
        # is silently dropped. 10000 is the shuffle buffer size.
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset

## **2.3  tf自带estimator**

有两种训练的模式
- 直接调用fit的方法
- 将model转换为estimator，然后再进行训练

In [11]:
output_dir = 'linear_model'
# Create the model directory if it is missing. exist_ok=True replaces the
# separate exists-then-mkdir check, which could race with another process.
os.makedirs(output_dir, exist_ok=True)

# Canned linear classifier for the binary 'survived' label; it writes its
# files under output_dir.
linear_estimator = tf.estimator.LinearClassifier(
    model_dir=output_dir,
    n_classes=2,
    feature_columns=feature_columns)

# The input_fn is a zero-argument callable that builds the training dataset.
linear_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'linear_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000287B6924D30>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


To change all layers

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x287b69249b0>

In [12]:
# Evaluate the linear model on the evaluation split: one unshuffled pass.
linear_estimator.evaluate(
    input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False))

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-12-25T13:47:02Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from linear_model\model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-12-25-13:47:03
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.780303, accuracy_baseline = 0.625, auc = 0.8393022, auc_precision_recall = 0.78339857, average_loss = 0.4715886, global_step = 1960, label/mean = 0.375, loss = 0.45434365, precision = 0.7113402, prediction/mean = 0.3666899, recall = 0.6969697
INFO:tenso

{'accuracy': 0.780303,
 'accuracy_baseline': 0.625,
 'auc': 0.8393022,
 'auc_precision_recall': 0.78339857,
 'average_loss': 0.4715886,
 'label/mean': 0.375,
 'loss': 0.45434365,
 'precision': 0.7113402,
 'prediction/mean': 0.3666899,
 'recall': 0.6969697,
 'global_step': 1960}

In [16]:
output_dir = 'dnn_model'
# Create the model directory if it is missing. exist_ok=True replaces the
# separate exists-then-mkdir check, which could race with another process.
os.makedirs(output_dir, exist_ok=True)

# Canned DNN classifier for the binary 'survived' label: two hidden layers
# of 128 units with ReLU activations, optimised with Adam.
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir=output_dir,
    n_classes=2,
    hidden_units=[128, 128],
    activation_fn=tf.nn.relu,
    optimizer='Adam',
    feature_columns=feature_columns)

# The input_fn is a zero-argument callable that builds the training dataset.
dnn_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'dnn_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002891A1DD8D0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


To change all layers to

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x287b6b05400>

In [17]:
# Evaluate the DNN model on the evaluation split: one unshuffled pass.
dnn_estimator.evaluate(
    input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False))

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-12-25T13:50:46Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from dnn_model\model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-12-25-13:50:47
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.780303, accuracy_baseline = 0.625, auc = 0.83936334, auc_precision_recall = 0.81086457, average_loss = 0.528422, global_step = 1960, label/mean = 0.375, loss = 0.5109839, precision = 0.6722689, prediction/mean = 0.47925717, recall = 0.8080808
INFO:tensorfl

{'accuracy': 0.780303,
 'accuracy_baseline': 0.625,
 'auc': 0.83936334,
 'auc_precision_recall': 0.81086457,
 'average_loss': 0.528422,
 'label/mean': 0.375,
 'loss': 0.5109839,
 'precision': 0.6722689,
 'prediction/mean': 0.47925717,
 'recall': 0.8080808,
 'global_step': 1960}

estimator在model之外额外完成了很多操作，例如保存tensorboard日志和model检查点