In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
import os


In [2]:
# Display the installed TensorFlow release; the recorded run of this notebook
# used 2.0.0, which is the version the estimator / feature_column calls below
# were written against.
tf.__version__

'2.0.0'

In [3]:
# Titanic passenger CSVs hosted on Google Cloud Storage: one file used for
# training/validation and one kept aside as the test set.
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

# Download and parse both files into DataFrames (train first, then test).
train_tf, test_data = (
    pd.read_csv(csv_url) for csv_url in (TRAIN_DATA_URL, TEST_DATA_URL)
)

In [None]:
# (rows, columns) of the full training frame and of the test frame, one per line.
print(train_tf.shape, test_data.shape, sep='\n')

In [4]:
# The first 500 rows of the downloaded training file are used for fitting;
# every remaining row becomes the validation split.
train_data, valid_data = train_tf[:500], train_tf[500:]

In [5]:
# (rows, columns) of the training split and of the validation split, one per line.
print(train_data.shape, valid_data.shape, sep='\n')

(500, 10)
(127, 10)


In [6]:
# Detach the 'survived' label from each split. pop() removes the column from
# the feature frame in place and returns it, so this cell can only be run once
# per freshly loaded frame.
y_train, y_test, y_valid = (
    split.pop('survived') for split in (train_data, test_data, valid_data)
)



In [7]:
# Raw input columns, grouped by how they are encoded.
categorical_columns = ['sex','n_siblings_spouses','parch','class','deck','embark_town','alone']
numeric_columns = ['age', 'fare']

# feature_columns collects every feature definition handed to the estimators.
#
# Discrete features: the vocabulary of each column is the set of distinct
# values found in the training split (categorical_column_with_vocabulary_list),
# and indicator_column turns the resulting category into a one-hot vector.
feature_columns = [
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            name, train_data[name].unique()))
    for name in categorical_columns
]

# Continuous features are wrapped directly as float32 numeric columns.
feature_columns.extend(
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in numeric_columns
)

# Crossed feature: each (age, sex) combination is treated as one category,
# e.g. age in [1, 2, 3, 4, 5] and sex in [male, female] give
# (1, male), (2, male), (3, male), ... (5, female).
# The combination is hashed and reduced modulo hash_bucket_size, so however
# many distinct pairs exist they are folded into 100 buckets, which are then
# one-hot encoded.
feature_columns.append(
    tf.feature_column.indicator_column(
        tf.feature_column.crossed_column(['age', 'sex'], hash_bucket_size=100)))


In [None]:
def make_datasets(x, y, epochs = 10, shuffle = True, batch_size = 32 ):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        x: Feature table. It is converted with ``dict(x)`` before being passed
            to ``from_tensor_slices`` (the notebook passes pandas DataFrames).
        y: Labels, sliced alongside ``x``.
        epochs: Number of times the data is repeated.
        shuffle: When true, shuffle with a 1000-element buffer before the
            repeat step.
        batch_size: Number of examples per batch.

    Returns:
        The dataset after the optional shuffle, then repeat, then batch.
    """
    features = dict(x)
    pipeline = tf.data.Dataset.from_tensor_slices((features, y))
    pipeline = pipeline.shuffle(1000) if shuffle else pipeline
    pipeline = pipeline.repeat(epochs)
    return pipeline.batch(batch_size)

In [None]:
# Baseline classifier: it is constructed without any feature columns, so it
# serves as the reference score the feature-based models below should beat.
output_dir = 'data/baseline_model_new_feature'
# makedirs (unlike mkdir) also creates the missing parent 'data/' directory,
# and exist_ok=True makes re-running this cell a no-op.
os.makedirs(output_dir, exist_ok=True)
baseline_estimator = tf.estimator.BaselineClassifier(
    model_dir=output_dir,
    n_classes=2,
    optimizer='Ftrl')
# Train on the training split, repeated for 100 epochs.
baseline_estimator.train(input_fn=lambda: make_datasets(
    train_data, y_train, epochs=100))

In [None]:
# Score the baseline on the test set with a single, ordered pass. Repeating or
# shuffling the evaluation data only adds work and run-to-run variation in
# batch order; it does not add information to the averaged metrics.
baseline_estimator.evaluate(input_fn=lambda: make_datasets(
    test_data, y_test, epochs=1, shuffle=False))

In [None]:
# Linear classifier over the one-hot, numeric and crossed feature columns.
output_dir = 'data/linear_model_new_feature'
# makedirs (unlike mkdir) also creates the missing parent 'data/' directory,
# and exist_ok=True makes re-running this cell a no-op.
os.makedirs(output_dir, exist_ok=True)
linear_estimator = tf.estimator.LinearClassifier(
    model_dir=output_dir,
    n_classes=2,
    optimizer='Ftrl',
    feature_columns=feature_columns)
# Train on the training split, repeated for 100 epochs.
linear_estimator.train(input_fn=lambda: make_datasets(
    train_data, y_train, epochs=100))

In [None]:
# Score the linear model on the test set with a single, ordered pass. Repeating
# or shuffling the evaluation data only adds work and run-to-run variation in
# batch order; it does not add information to the averaged metrics.
linear_estimator.evaluate(input_fn=lambda: make_datasets(
    test_data, y_test, epochs=1, shuffle=False))

In [None]:
# The DNN needs its own checkpoint directory: pointing it at the linear model's
# directory would make the estimator pick up checkpoints written for a
# different graph.
dnn_output_dir = 'data/dnn_model_new_feature'
# makedirs (unlike mkdir) also creates the missing parent 'data/' directory,
# and exist_ok=True makes re-running this cell a no-op.
os.makedirs(dnn_output_dir, exist_ok=True)
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir=dnn_output_dir,
    n_classes=2,
    # Pass the activation as a callable, as the estimator API documents.
    activation_fn=tf.nn.relu,
    # hidden_units must be an iterable with one width per hidden layer; a bare
    # int is not accepted. Two layers of 128 units is a tunable choice.
    hidden_units=[128, 128],
    optimizer='Ftrl',
    feature_columns=feature_columns)

# Train on the training split, repeated for 100 epochs.
dnn_estimator.train(input_fn=lambda: make_datasets(
    train_data, y_train, epochs=100))

In [None]:
# Score the DNN on the test set with a single, ordered pass. Repeating or
# shuffling the evaluation data only adds work and run-to-run variation in
# batch order; it does not add information to the averaged metrics.
dnn_estimator.evaluate(input_fn=lambda: make_datasets(
    test_data, y_test, epochs=1, shuffle=False))