分为如下5步：
1. 数据导入
2. 构建特征列，将数据类型转化categorical->numeric
3. 构建input function，返回dataset（i. 从切片构建dataset. ii. 判断是否需要打乱. iii. 分划batch，重复epoch次.）
4. 使用特征列创建linear_estimator
5. 训练，评估，预测（train, evaluate, predict）

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output  # 清除jupyter notebook输出框中的内容

In [2]:
df_train = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
df_eval = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')
y_train = df_train.pop('survived')
y_eval = df_eval.pop('survived')

In [3]:
y_train  # 0表示死亡，1表示存活

0      0
1      1
2      1
3      1
4      0
      ..
622    0
623    0
624    1
625    0
626    0
Name: survived, Length: 627, dtype: int64

In [4]:
df_train.dtypes

sex                    object
age                   float64
n_siblings_spouses      int64
parch                   int64
fare                  float64
class                  object
deck                   object
embark_town            object
alone                  object
dtype: object

In [5]:
# 处理categorical数据，全部转为numeric
cate = ['sex', 'class', 'deck', 'embark_town', 'alone']
numeric = ['age', 'n_siblings_spouses', 'parch', 'fare']
feature = []  # 特征列
for col in cate:  # categorical
    word = df_train[col].unique()
    feature.append(tf.feature_column.categorical_column_with_vocabulary_list(col, word))
for col in numeric:  # numeric
    feature.append(tf.feature_column.numeric_column(col))

In [6]:
y_train

0      0
1      1
2      1
3      1
4      0
      ..
622    0
623    0
624    1
625    0
626    0
Name: survived, Length: 627, dtype: int64

In [7]:
# 构建Input function
def make_input_fn(x, y, epochs=10, shuffle=True, batch_size=32):
    def input_fn():
        ds = tf.data.Dataset.from_tensor_slices((dict(x), y))
        if shuffle:
            ds.shuffle(1000)
        ds = ds.batch(batch_size).repeat(epochs)
        return ds
    return input_fn

train_input_fn = make_input_fn(df_train, y_train)
eval_input_fn = make_input_fn(df_eval, y_eval, epochs=1, shuffle=False)

In [8]:
# 创建 linear estimator
linear = tf.estimator.LinearClassifier(feature)
clear_output()

In [9]:
linear.train(train_input_fn)  # 训练
result = linear.evaluate(eval_input_fn)  # 评估
clear_output()

In [10]:
result

{'accuracy': 0.7651515,
 'accuracy_baseline': 0.625,
 'auc': 0.83966935,
 'auc_precision_recall': 0.7873041,
 'average_loss': 0.47249624,
 'label/mean': 0.375,
 'loss': 0.45887482,
 'precision': 0.69473684,
 'prediction/mean': 0.35994655,
 'recall': 0.6666667,
 'global_step': 200}

In [11]:
predict = list(linear.predict(eval_input_fn))  # 预测
clear_output()

In [12]:
idx = 20  # 对第idx个人进行预测
print(df_eval.loc[idx])
print('真实存活情况: {}'.format(y_eval[idx]))
print('预测存活率: {}'.format(predict[idx]['probabilities'][1]))

sex                          male
age                          20.0
n_siblings_spouses              0
parch                           0
fare                       7.8542
class                       Third
deck                      unknown
embark_town           Southampton
alone                           y
Name: 20, dtype: object
真实存活情况: 0
预测存活率: 0.09286709874868393


In [13]:
# 自定义一个数据进行预测
df_pred = pd.DataFrame([{'sex': 'female', 'age': 22., 'n_siblings_spouses': 1, 'parch': 0, 'fare': 7., 'class': 'Third',
                        'deck': 'unknow', 'embark_town': 'unknow', 'alone': 'n'}])
y_pred = pd.DataFrame([1])
pred_input_fn = make_input_fn(df_pred, y_pred, epochs=1, shuffle=False)
predict = list(linear.predict(pred_input_fn))
clear_output()

In [14]:
predict[0]

{'logits': array([0.7727388], dtype=float32),
 'logistic': array([0.684113], dtype=float32),
 'probabilities': array([0.31588694, 0.6841131 ], dtype=float32),
 'class_ids': array([1], dtype=int64),
 'classes': array([b'1'], dtype=object),
 'all_class_ids': array([0, 1]),
 'all_classes': array([b'0', b'1'], dtype=object)}