# 使用预定义的estimator处理titanic问题

In [None]:
!pip install tensorflow-gpu==2.0.0a0

In [3]:
# 导入
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Record the TensorFlow and interpreter versions, followed by the name and version
# of every key library, so the saved outputs can be tied to a concrete environment.
print(tf.__version__, sys.version_info, sep='\n')
for module in (mpl, np, pd, sklearn, tf, keras):
    print(module.__name__, module.__version__)

2.0.0-alpha0
sys.version_info(major=3, minor=6, micro=4, releaselevel='final', serial=0)
matplotlib 2.2.3
numpy 1.16.2
pandas 0.22.0
sklearn 0.19.1
tensorflow 2.0.0-alpha0
tensorflow.python.keras.api._v2.keras 2.2.4-tf


In [5]:
def solve_cudnn_error():
    """Turn on memory growth for every physical GPU visible to TensorFlow.

    Does nothing when no GPU is listed. Otherwise memory growth is enabled on
    each GPU and the number of physical and logical GPUs is printed. A
    RuntimeError raised while doing so is printed instead of propagated.
    NOTE(review): the function name suggests this works around a cuDNN
    initialisation error -- confirm against the environment it was written for.
    """
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if not gpus:
        return
    try:
        # Memory growth currently has to be configured identically on all GPUs.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before the GPUs have been initialized.
        print(e)

solve_cudnn_error()

## 读取数据，构建数据集

In [6]:
# Relative paths of the Titanic training and evaluation CSV files.
train_file = 'data/titanic/train.csv'
eval_file = 'data/titanic/eval.csv'

# Load both splits into DataFrames.
train_df, eval_df = (pd.read_csv(csv_path) for csv_path in (train_file, eval_file))

# Preview the first rows of each split.
print(train_df.head())
print(eval_df.head())

   survived     sex   age  n_siblings_spouses  parch     fare  class     deck  \
0         0    male  22.0                   1      0   7.2500  Third  unknown   
1         1  female  38.0                   1      0  71.2833  First        C   
2         1  female  26.0                   0      0   7.9250  Third  unknown   
3         1  female  35.0                   1      0  53.1000  First        C   
4         0    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  
   survived     sex   age  n_siblings_spouses  parch     fare   class  \
0         0    male  35.0                   0      0   8.0500   Third   
1         0    male  54.0                   0      0  51.8625   First   
2         1  female  58.0                   0      0  26.5500   First   
3         1  female  55.0                   0      0  16.0000  Second   
4         

In [7]:
# Split the label off the features: pop() removes the 'survived' column from each
# frame in place and returns it. Because the column is removed, running this cell a
# second time without reloading the data raises KeyError.
y_train, y_eval = train_df.pop('survived'), eval_df.pop('survived')

# Preview the remaining feature columns and the extracted labels.
for preview in (train_df, eval_df, y_train, y_eval):
    print(preview.head())

      sex   age  n_siblings_spouses  parch     fare  class     deck  \
0    male  22.0                   1      0   7.2500  Third  unknown   
1  female  38.0                   1      0  71.2833  First        C   
2  female  26.0                   0      0   7.9250  Third  unknown   
3  female  35.0                   1      0  53.1000  First        C   
4    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  
      sex   age  n_siblings_spouses  parch     fare   class     deck  \
0    male  35.0                   0      0   8.0500   Third  unknown   
1    male  54.0                   0      0  51.8625   First        E   
2  female  58.0                   0      0  26.5500   First        C   
3  female  55.0                   0      0  16.0000  Second  unknown   
4    male  34.0                   0      0  13.0000  Second        D   

  

In [8]:
# Split the input columns into two groups: categorical (discrete) and numeric (continuous).
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []
for categorical_column in categorical_columns:
    # The vocabulary is every distinct value the column takes in the training set.
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    # Wrap the column with its vocabulary, then one-hot encode it via indicator_column.
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
        categorical_column, vocab)
    feature_columns.append(tf.feature_column.indicator_column(vocab_column))

# Numeric columns are used as float32 values.
feature_columns.extend(
    tf.feature_column.numeric_column(numeric_column, dtype=tf.float32)
    for numeric_column in numeric_columns)

sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']


In [9]:
# Build a tf.data dataset from a pandas DataFrame and its labels
def make_dataset(data_df, label_df, epochs = 10, shuffle = True, batch_size = 32):
    """Create a batched tf.data.Dataset of (features, label) pairs.

    Args:
        data_df: feature table; converted with dict() so every column becomes
            a named entry of the feature dictionary.
        label_df: labels, sliced in step with the features.
        epochs: how many times the data is repeated.
        shuffle: when true, shuffle with a buffer of 10000 elements before
            repeating.
        batch_size: number of examples per batch.

    Returns:
        The dataset after the optional shuffle, then repeat, then batch.
    """
    features = dict(data_df)
    pipeline = tf.data.Dataset.from_tensor_slices((features, label_df))
    if shuffle:
        pipeline = pipeline.shuffle(10000)
    repeated = pipeline.repeat(epochs)
    return repeated.batch(batch_size)

def make_input_fn(data_df, label_df, epochs = 10, shuffle = True, batch_size = 32):
    """Return a zero-argument input function for an estimator.

    The returned callable builds its dataset by calling make_dataset with the
    arguments captured here, so the pipeline (optional shuffle, repeat, batch)
    is defined in exactly one place instead of being duplicated.

    Args:
        data_df: feature table forwarded to make_dataset.
        label_df: labels forwarded to make_dataset.
        epochs: how many times the data is repeated.
        shuffle: whether the data is shuffled before repeating.
        batch_size: number of examples per batch.

    Returns:
        A function taking no arguments that returns a new dataset on each call.
    """
    def input_fn():
        # Delegate to the shared helper; a new dataset is built on every call.
        return make_dataset(data_df, label_df, epochs=epochs,
                            shuffle=shuffle, batch_size=batch_size)
    return input_fn

# Input functions for the two splits: training repeats the shuffled data 100 times,
# evaluation makes a single pass in the original order.
train_input_fn = make_input_fn(data_df=train_df, label_df=y_train, epochs=100)
eval_input_fn = make_input_fn(
    data_df=eval_df, label_df=y_eval, epochs=1, shuffle=False)

## 使用预定义的estimator

### Baseline Estimator

In [10]:
import shutil
# Directory the baseline model is saved to: remove any existing copy, then
# recreate it empty.
output_dir = 'baseline_model'
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.mkdir(output_dir)

# Binary classifier; it is constructed without any feature columns.
baseline_estimator = tf.estimator.BaselineClassifier(
    n_classes=2, model_dir=output_dir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'baseline_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0fd57bf9e8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [11]:
# Train the baseline model. train_input_fn was built earlier with
# make_input_fn(train_df, y_train, epochs=100), i.e. the same shuffled, 100-epoch
# pipeline an inline lambda around make_dataset would create.
baseline_estimator.train(input_fn=train_input_fn)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use `tf.cast` instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into baseline_model/model.ckpt.
INFO:tensorflow:loss = 0.6931472, step = 0
INFO:tensorflow:global_step/sec: 267.446
INFO:tensorflow:loss = 0.69781625, step = 100 (0.375 sec)
INFO:tensorflow:global_step/sec: 360.908
INFO:tensorflow:loss = 0.67727596, step = 200 (0.277 sec)
INFO:tensorflow:global_step/sec: 370.339
INFO:tensorflow:loss = 0.66348416, step = 300 (0.270 sec)
INFO:tensorflow:global_step/sec

<tensorflow_estimator.python.estimator.canned.baseline.BaselineClassifierV2 at 0x7f0fd57bf710>

In [12]:
# Evaluate the baseline on one unshuffled pass over the evaluation set.
# Note: this uses batch_size=20, whereas the other evaluations in this notebook
# use the default batch size of 32.
baseline_estimator.evaluate(
    input_fn=make_input_fn(
        eval_df, y_eval, epochs=1, shuffle=False, batch_size=20))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-02-07T15:50:40Z
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from baseline_model/model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-02-07-15:50:41
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.625, accuracy_baseline = 0.625, auc = 0.5, auc_precision_recall = 0.375, average_loss = 0.66182214, global_step = 1960, label/mean = 0.375, loss = 0.6585086, precision = 0.0, prediction/mean = 0.38605928, recall = 0.0
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1960: baseline_model/model.ckpt-1960


{'accuracy': 0.625,
 'accuracy_baseline': 0.625,
 'auc': 0.5,
 'auc_precision_recall': 0.375,
 'average_loss': 0.66182214,
 'global_step': 1960,
 'label/mean': 0.375,
 'loss': 0.6585086,
 'precision': 0.0,
 'prediction/mean': 0.38605928,
 'recall': 0.0}

### Linear Estimator

In [13]:
import shutil
# Directory the linear model is saved to: remove any existing copy, then
# recreate it empty.
linear_output_dir = 'linear_model'
if os.path.exists(linear_output_dir):
    shutil.rmtree(linear_output_dir)
os.mkdir(linear_output_dir)

# Binary linear classifier over the one-hot and numeric feature columns.
linear_estimator = tf.estimator.LinearClassifier(
    feature_columns=feature_columns,
    n_classes=2,
    model_dir=linear_output_dir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'linear_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0fc4212cc0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [14]:
# Train the linear model. train_input_fn was built earlier with
# make_input_fn(train_df, y_train, epochs=100), i.e. the same shuffled, 100-epoch
# pipeline an inline lambda around make_dataset would create.
linear_estimator.train(input_fn=train_input_fn)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into linear_model/model.ckpt.
INFO:tensorflow:loss = 0.6931472, step = 0
INFO:tensorflow:global_step/sec: 164.314
INFO:tensorflow:loss = 0.38683054, step = 100 (0.610 sec)
INFO:tensorflow:global_step/sec: 232.357
INFO:tensorflow:loss = 0.45852956, step = 200 (0.431 sec)
INFO:tensorflow:global_step/sec: 238.923
INFO:tensorflow:loss = 0.41034898, step = 300 (0.418 sec)
INFO:tensorflow:global_step/sec: 238.063
INFO:tensorfl

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x7f0fc4212e10>

In [15]:
# Evaluate the linear model. eval_input_fn was built earlier with
# make_input_fn(eval_df, y_eval, epochs=1, shuffle=False): a single, unshuffled
# pass over the evaluation set with the default batch size.
linear_estimator.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-02-07T15:51:10Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from linear_model/model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-02-07-15:51:11
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.77272725, accuracy_baseline = 0.625, auc = 0.83618003, auc_precision_recall = 0.7919357, average_loss = 0.49561542, global_step = 1960, label/mean = 0.375, loss = 0.4794259, precision = 0.67256635, prediction/mean = 0.4565581, recall = 0.7676768
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1960: linear_model/model.ckpt-1960


{'accuracy': 0.77272725,
 'accuracy_baseline': 0.625,
 'auc': 0.83618003,
 'auc_precision_recall': 0.7919357,
 'average_loss': 0.49561542,
 'global_step': 1960,
 'label/mean': 0.375,
 'loss': 0.4794259,
 'precision': 0.67256635,
 'prediction/mean': 0.4565581,
 'recall': 0.7676768}

### DNN Estimator

In [16]:
import shutil
# Directory the DNN model is saved to: remove any existing copy, then
# recreate it empty.
dnn_output_dir = 'dnn_model'
if os.path.exists(dnn_output_dir):
    shutil.rmtree(dnn_output_dir)
os.mkdir(dnn_output_dir)

# Binary classifier with two hidden layers of 128 ReLU units, optimised with Adam.
dnn_estimator = tf.estimator.DNNClassifier(
    hidden_units=[128,128],
    activation_fn=tf.nn.relu,
    optimizer='Adam',
    feature_columns=feature_columns,
    n_classes=2,
    model_dir=dnn_output_dir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'dnn_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f10462e4748>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [17]:
# Train the DNN model. train_input_fn was built earlier with
# make_input_fn(train_df, y_train, epochs=100), i.e. the same shuffled, 100-epoch
# pipeline an inline lambda around make_dataset would create.
dnn_estimator.train(input_fn=train_input_fn)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into dnn_model/model.ckpt.
INFO:tensorflow:loss = 2.5398145, step = 0
INFO:tensorflow:global_step/sec: 178.35
INFO:tensorflow:loss = 0.66123134, step = 100 (0.562 sec)
INFO:tensorflow:global_step/sec: 241.721
INFO:tensorflow:loss = 0.4623671, step = 200 (0.414 sec)
INFO:tensorflow:global_step/sec: 235.946
INFO:tensorflow:loss = 0.31149834, step = 300 (0.424 sec)
INFO:tensorflow:global_step/sec: 217.333
INFO:tensorflow:loss = 0.3449259, step = 400 (0.460 sec)
INFO:tensorflow:global_step/sec: 240.293
INFO:tensorflow:loss = 0.37727702, step = 500 (0.416 sec)
INFO:tensorflow:global_step/sec: 244.324
I

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7f10462e45c0>

In [18]:
# Evaluate the DNN model. eval_input_fn was built earlier with
# make_input_fn(eval_df, y_eval, epochs=1, shuffle=False): a single, unshuffled
# pass over the evaluation set with the default batch size.
dnn_estimator.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-02-07T15:51:42Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from dnn_model/model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-02-07-15:51:43
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.8030303, accuracy_baseline = 0.625, auc = 0.85561687, auc_precision_recall = 0.81294394, average_loss = 0.47788677, global_step = 1960, label/mean = 0.375, loss = 0.46174872, precision = 0.7196262, prediction/mean = 0.40425703, recall = 0.7777778
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1960: dnn_model/model.ckpt-1960


{'accuracy': 0.8030303,
 'accuracy_baseline': 0.625,
 'auc': 0.85561687,
 'auc_precision_recall': 0.81294394,
 'average_loss': 0.47788677,
 'global_step': 1960,
 'label/mean': 0.375,
 'loss': 0.46174872,
 'precision': 0.7196262,
 'prediction/mean': 0.40425703,
 'recall': 0.7777778}

## 交叉特征
cross feature：对两个离散特征做笛卡尔积

Eg：

age:[1,2,3,4,5], gender:[male, female]

age_x_gender:[(1,male),...,(5,male), (1,female),...,(5,female)]

交叉后的取值可能非常多，因此用哈希分桶来压缩：例如把 100000 种取值映射到 100 个桶（对应下面的 hash_bucket_size=100），即 100000:100 -> hash(100000 values) % 100

In [19]:
# tf.feature_column.crossed_column defines a crossed feature: 'age' and 'sex' are
# combined and hashed into 100 buckets, then one-hot encoded via indicator_column.
age_x_sex_column = tf.feature_column.indicator_column(
    tf.feature_column.crossed_column(['age', 'sex'], hash_bucket_size=100))

# feature_columns is a shared list that this cell mutates in place. Only append when
# no column with the same name is present, so re-running the cell does not add a
# duplicate of the crossed feature.
if all(column.name != age_x_sex_column.name for column in feature_columns):
    feature_columns.append(age_x_sex_column)

In [20]:
import shutil
# Directory the DNN model with the extra crossed feature is saved to: remove any
# existing copy, then recreate it empty.
dnn_output_dir = 'dnn_model_new_feature'
if os.path.exists(dnn_output_dir):
    shutil.rmtree(dnn_output_dir)
os.mkdir(dnn_output_dir)

# Same architecture as the earlier DNN (two hidden layers of 128 ReLU units, Adam),
# built from the current contents of feature_columns.
dnn_estimator_new = tf.estimator.DNNClassifier(
    hidden_units=[128,128],
    activation_fn=tf.nn.relu,
    optimizer='Adam',
    feature_columns=feature_columns,
    n_classes=2,
    model_dir=dnn_output_dir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'dnn_model_new_feature', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f1046100080>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [21]:
# Train the DNN that includes the crossed feature. train_input_fn was built earlier
# with make_input_fn(train_df, y_train, epochs=100), i.e. the same shuffled,
# 100-epoch pipeline an inline lambda around make_dataset would create.
dnn_estimator_new.train(input_fn=train_input_fn)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into dnn_model_new_feature/model.ckpt.
INFO:tensorflow:loss = 0.91685367, step = 0
INFO:tensorflow:global_step/sec: 171.815
INFO:tensorflow:loss = 0.5104016, step = 100 (0.583 sec)
INFO:tensorflow:global_step/sec: 234.375
INFO:tensorflow:loss = 0.37417728, step = 200 (0.427 sec)
INFO:tensorflow:global_step/sec: 229.51
INFO:tensorflow:loss = 0.45953017, step = 300 (0.437 sec)
INFO:tensorflow:global_step/sec: 233.944
INFO:tensorflow:loss = 0.43136257, step = 400 (0.426 sec)
INFO:tensorflow:global_step/sec: 217.871
INFO:tensorflow:loss = 0.44988418, step = 500 (0.459 sec)
INFO:tensorflow:global

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7f1046105e48>

In [22]:
# Evaluate the DNN that includes the crossed feature. eval_input_fn was built earlier
# with make_input_fn(eval_df, y_eval, epochs=1, shuffle=False): a single, unshuffled
# pass over the evaluation set with the default batch size.
dnn_estimator_new.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-02-07T15:52:19Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from dnn_model_new_feature/model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-02-07-15:52:20
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.8030303, accuracy_baseline = 0.625, auc = 0.8413529, auc_precision_recall = 0.78737986, average_loss = 0.65251297, global_step = 1960, label/mean = 0.375, loss = 0.610228, precision = 0.742268, prediction/mean = 0.38915005, recall = 0.72727275
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1960: dnn_model_new_feature/model.ckpt-1960


{'accuracy': 0.8030303,
 'accuracy_baseline': 0.625,
 'auc': 0.8413529,
 'auc_precision_recall': 0.78737986,
 'average_loss': 0.65251297,
 'global_step': 1960,
 'label/mean': 0.375,
 'loss': 0.610228,
 'precision': 0.742268,
 'prediction/mean': 0.38915005,
 'recall': 0.72727275}