# 使用预定义的estimator处理titanic问题

In [1]:
# 导入
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Report the TensorFlow and interpreter versions, then the version of every key library.
print(tf.__version__)
print(sys.version_info)
for module in (mpl, np, pd, sklearn, tf, keras):
    print(f"{module.__name__} {module.__version__}")

2.0.0
sys.version_info(major=3, minor=6, micro=10, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.18.1
pandas 0.25.3
sklearn 0.22.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


## 读取数据，构建数据集

In [2]:
# Paths of the training and evaluation CSV splits (relative to the working directory).
train_file = 'data/titanic/train.csv'
eval_file = 'data/titanic/eval.csv'

# Read both splits into DataFrames, training split first.
train_df, eval_df = (pd.read_csv(csv_path) for csv_path in (train_file, eval_file))

# Preview the first rows of each split, one preview per line group.
print(train_df.head(), eval_df.head(), sep='\n')

   survived     sex   age  n_siblings_spouses  parch     fare  class     deck  \
0         0    male  22.0                   1      0   7.2500  Third  unknown   
1         1  female  38.0                   1      0  71.2833  First        C   
2         1  female  26.0                   0      0   7.9250  Third  unknown   
3         1  female  35.0                   1      0  53.1000  First        C   
4         0    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  
   survived     sex   age  n_siblings_spouses  parch     fare   class  \
0         0    male  35.0                   0      0   8.0500   Third   
1         0    male  54.0                   0      0  51.8625   First   
2         1  female  58.0                   0      0  26.5500   First   
3         1  female  55.0                   0      0  16.0000  Second   
4         

In [3]:
# Take the 'survived' column out of each frame and keep it as the label.
# pop() removes the column in place, so re-running this cell without
# reloading the data raises KeyError.
y_train, y_eval = train_df.pop('survived'), eval_df.pop('survived')

# Show the remaining feature columns first, then the extracted labels.
print(train_df.head(), eval_df.head(), y_train.head(), y_eval.head(), sep='\n')

      sex   age  n_siblings_spouses  parch     fare  class     deck  \
0    male  22.0                   1      0   7.2500  Third  unknown   
1  female  38.0                   1      0  71.2833  First        C   
2  female  26.0                   0      0   7.9250  Third  unknown   
3  female  35.0                   1      0  53.1000  First        C   
4    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  
      sex   age  n_siblings_spouses  parch     fare   class     deck  \
0    male  35.0                   0      0   8.0500   Third  unknown   
1    male  54.0                   0      0  51.8625   First        E   
2  female  58.0                   0      0  26.5500   First        C   
3  female  55.0                   0      0  16.0000  Second  unknown   
4    male  34.0                   0      0  13.0000  Second        D   

  

In [4]:
# Split the input columns into two groups: categorical (discrete) and numeric (continuous).
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []
for categorical_column in categorical_columns:
    # Every distinct value seen in the training split becomes the vocabulary.
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    # Wrap the vocabulary as a categorical column, then one-hot encode it
    # with indicator_column before adding it to feature_columns.
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
        categorical_column, vocab)
    feature_columns.append(tf.feature_column.indicator_column(vocab_column))

# Numeric columns are used as-is, declared as float32.
feature_columns.extend(
    [tf.feature_column.numeric_column(numeric_column, dtype=tf.float32)
     for numeric_column in numeric_columns])

sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']


In [5]:
# Build tf.data datasets (and estimator input functions) from pandas DataFrames.
def make_dataset(data_df, label_df, epochs = 10, shuffle = True, batch_size = 32,
                 shuffle_buffer_size = 10000):
    """Build a batched dataset of ``(features, label)`` pairs.

    Args:
        data_df: Feature table. It is converted with ``dict(data_df)``, so for
            a DataFrame each column becomes one entry keyed by column name.
        label_df: Labels, sliced together with the features (presumably one
            label per row of ``data_df`` -- the caller must keep them aligned).
        epochs: Number of passes over the data, forwarded to ``repeat``.
        shuffle: When true, shuffle the examples before repeating.
        batch_size: Number of examples per batch, forwarded to ``batch``.
        shuffle_buffer_size: Buffer size forwarded to ``shuffle``; only used
            when ``shuffle`` is true.

    Returns:
        The dataset produced by ``from_tensor_slices`` after the optional
        ``shuffle`` and the ``repeat(epochs).batch(batch_size)`` steps.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size)
    return dataset.repeat(epochs).batch(batch_size)

def make_input_fn(data_df, label_df, epochs = 10, shuffle = True, batch_size = 32,
                  shuffle_buffer_size = 10000):
    """Return a zero-argument function that builds the dataset when called.

    The arguments have the same meaning as in ``make_dataset``; they are
    captured by the returned closure and the dataset is only constructed when
    the closure is invoked.

    Returns:
        A callable taking no arguments and returning
        ``make_dataset(data_df, label_df, ...)``.
    """
    def input_fn():
        # Delegate to make_dataset so both entry points share one pipeline definition.
        return make_dataset(data_df, label_df, epochs=epochs, shuffle=shuffle,
                            batch_size=batch_size,
                            shuffle_buffer_size=shuffle_buffer_size)
    return input_fn

# Input functions: 100 shuffled passes for training (shuffle defaults to True),
# a single unshuffled pass for evaluation.
train_input_fn = make_input_fn(data_df=train_df, label_df=y_train, epochs=100)
eval_input_fn = make_input_fn(data_df=eval_df, label_df=y_eval, shuffle=False, epochs=1)

## 使用预定义的estimator

### BaseLine Estimator

In [6]:
# import shutil
# # 定义文件夹，保存输出的模型
# output_dir = 'baseline_model'
# if not os.path.exists(output_dir):
#     os.mkdir(output_dir)
# else:
#     shutil.rmtree(output_dir)
#     os.mkdir(output_dir)
    
# baseline_estimator = tf.estimator.BaselineClassifier(
#     model_dir=output_dir, n_classes=2)
# baseline_estimator.train(
#     input_fn=lambda : make_dataset(train_df, y_train, epochs=100))

In [7]:
# baseline_estimator.evaluate(
#     input_fn=lambda : make_dataset(
#         eval_df, y_eval, epochs=1, shuffle=False, batch_size=20))

### Linear Estimator

In [8]:
# import shutil
# # 定义文件夹，保存输出的模型
# linear_output_dir = 'linear_model'
# if not os.path.exists(linear_output_dir):
#     os.mkdir(linear_output_dir)
# else:
#     shutil.rmtree(linear_output_dir)
#     os.mkdir(linear_output_dir)
    
# linear_estimator = tf.estimator.LinearClassifier(
#     model_dir=linear_output_dir, n_classes=2, feature_columns=feature_columns)
# linear_estimator.train(
#     input_fn=lambda : make_dataset(train_df, y_train, epochs=100))

# linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)
# linear_est.train(train_input_fn)
# result = linear_est.evaluate(eval_input_fn)

# clear_output()
# print(result)

In [9]:
# linear_estimator.evaluate(
#     input_fn=lambda : make_dataset(eval_df, y_eval, epochs=1, shuffle=False))

### DNN Estimator

In [10]:
# import shutil
# # 定义文件夹，保存输出的模型
# dnn_output_dir = 'dnn_model'
# if not os.path.exists(dnn_output_dir):
#     os.mkdir(dnn_output_dir)
# else:
#     shutil.rmtree(dnn_output_dir)
#     os.mkdir(dnn_output_dir)
    
# dnn_estimator = tf.estimator.DNNClassifier(
#     model_dir=dnn_output_dir,
#     n_classes=2,
#     feature_columns=feature_columns,
#     hidden_units=[128,128],
#     activation_fn=tf.nn.relu,
#     optimizer='Adam')

In [11]:
# dnn_estimator.train(
#     input_fn=lambda : make_dataset(train_df, y_train, epochs=100))

In [12]:
# dnn_estimator.evaluate(
#     input_fn=lambda : make_dataset(eval_df, y_eval, epochs=1, shuffle=False))

## 交叉特征
cross feature：对两个离散特征做笛卡尔积

Eg：

age:[1,2,3,4,5], gender:[male, female]

age_x_gender:[(1,male),...,(5,male), (1,female),...,(5,female)]

当笛卡尔积的取值过多时（例如 100000 种），用哈希分桶把它们压缩到固定数量的桶中（例如 hash_bucket_size=100）：100000:100 -> hash(100000 values) % 100

In [13]:
# tf.feature_column.crossed_column defines a cross feature: the ('age', 'sex')
# combination is hashed into 100 buckets, and indicator_column one-hot encodes
# the resulting bucket id.
age_x_sex_column = tf.feature_column.indicator_column(
    tf.feature_column.crossed_column(['age', 'sex'], hash_bucket_size=100))

# Append only once: feature_columns is shared notebook state, and re-running
# this cell would otherwise add a second column with the same name
# (TensorFlow rejects duplicate feature-column names when a model is built).
if all(column.name != age_x_sex_column.name for column in feature_columns):
    feature_columns.append(age_x_sex_column)

In [14]:
# import shutil
# # 定义文件夹，保存输出的模型
# dnn_output_dir = 'dnn_model_new_feature'
# if not os.path.exists(dnn_output_dir):
#     os.mkdir(dnn_output_dir)
# else:
#     shutil.rmtree(dnn_output_dir)
#     os.mkdir(dnn_output_dir)
    
# dnn_estimator = tf.estimator.DNNClassifier(
#     model_dir=dnn_output_dir,
#     n_classes=2,
#     feature_columns=feature_columns,
#     hidden_units=[128,128],
#     activation_fn=tf.nn.relu,
#     optimizer='Adam')

# dnn_estimator.train(
#     input_fn=lambda : make_dataset(train_df, y_train, epochs=100))

# dnn_estimator.evaluate(
#     input_fn=lambda : make_dataset(eval_df, y_eval, epochs=1, shuffle=False))