In [28]:
import tensorflow as tf
import pandas as pd
import numpy as np
import multiprocessing
from tensorflow.python.feature_column import feature_column

## 使用TF Estimator的基本步骤
1. 定义数据的元数据用于解析
2. 定义Estimator的输入函数，以完成从dataframe中读取数据并使用特征处理（ETL）
3. 基于元数据创建Estimator的特征列和扩展特征列（特征提取变换）
4. 根据特征列和超参数创建估计器实例
5. 使用数据训练估计器
6. 使用测试数据评估估计器
7. 使用估计其执行预测
8. 保存和部署估计器


In [29]:
MODEL_NAME ='reg-model-01'
TRAIN_FILE = 'data/train-data.csv'
VALID_FILE = 'data/valid-data.csv'
TEST_FILE = 'data/test-data.csv'

# 可选控制参数，不是通过设置属性实现，整体采用函数式编程
RESUME_TRAINING = False
PROCESS_FEATURES = True
MULTI_THREADING = False

### 1. 定义数据集的元信息
1. CSV文件的头及其默认值
2. 数值和类型特征列的名字
3. 目标特征的名字
4. 无用列名字


In [30]:
# 定义各种输入常量
HEADER= ['key','x','y','alpha','beta','target']

HEADER_DEFAULTS = [[0], [0.0], [0.0], ['NA'], ['NA'], [0.0]]  # 像spark一样设置一些data的元数据，如读取类型等,sparkRDD schema

NUMERIC_FEATURE_NAMES = ['x', 'y']  

CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY = {'alpha':['ax01', 'ax02'], 'beta':['bx01', 'bx02']} # 指定类别特征及其取值范围
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.keys())

FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

TARGET_NAME = 'target'

UNUSED_FEATURE_NAMES = list(set(HEADER) - set(FEATURE_NAMES) - {TARGET_NAME}) #无用列

### 2.定义输入函数
1. 输入文件的名字
2. 加载pandas DataFrame
3. 使用处理函数
4. 返回的函数，这个函数能够返回（特征，目标）张量

In [31]:
def process_dataframe(dataset_df):
    """这里用的是numpy的square而不是tf.square,有点不同"""
    dataset_df['x_2']=np.square(dataset_df['x']) 
    dataset_df["y_2"] = np.square(dataset_df['y'])
    dataset_df["xy"] = dataset_df['x'] * dataset_df['y']
    dataset_df['dist_xy'] =  np.sqrt(np.square(dataset_df['x']-dataset_df['y']))
    return dataset_df

def generate_pandas_input_fn(file_name,mode=tf.estimator.ModeKeys.EVAL,
                             skip_header_line=0,num_epochs=1,batch_size=100):
    """直接使用全局常量，而不是采用传递的方式或成员属性，常量的作用域在此模块内；
       函数签名参数采用传递方式进入，因为其经常变化待输入
    """
    df_dataset=pd.read_csv(file_name,names=HEADER)
    x=df_dataset[FEATURE_NAMES].copy()
    if PROCESS_FEATURES:
        x=process_dataframe(x)
    y=df_dataset[TARGET_NAME]
    shuffle=True if mode==tf.estimator.ModeKeys.TRAIN else False
    num_threads=1
    if MULTI_THREADING:
        num_threads=multiprocessing.cpu_count()
        num_epochs =  num_epochs = int(num_epochs/num_threads) if mode == tf.estimator.ModeKeys.TRAIN else num_epochs
    # 以上都是在制备pandas_input_fn的参数值
    pandas_input_fn=tf.estimator.inputs.pandas_input_fn(x=x,y=y,batch_size=batch_size,num_epochs= num_epochs,
                                                        shuffle=shuffle,target_column=TARGET_NAME)
    return pandas_input_fn

In [32]:
features, target = generate_pandas_input_fn(file_name=TRAIN_FILE)()  #调用generate_pandas_input_fn返回的函数
print("Feature read from DataFrame: {}".format(list(features.keys())))
print("Target read from DataFrame: {}".format(target))

Feature read from DataFrame: ['x', 'y', 'alpha', 'beta', 'x_2', 'y_2', 'xy', 'dist_xy']
Target read from DataFrame: Tensor("fifo_queue_DequeueUpTo_1:9", shape=(?,), dtype=float64)


### 3. 定义特征列
先假设数值输被正规化或者具有相同的尺度，否则，正规化函数及其参数应当被输入到数值特征列构造器中去.

In [33]:
def get_feature_columns():   
    """将各种特征的制备定义在函数内部，方便后面主要调用流程的组装"""
    all_numeric_feature_names = NUMERIC_FEATURE_NAMES
    CONSTRUCTED_NUMERIC_FEATURES_NAMES = ['x_2', 'y_2', 'xy', 'dist_xy']
    if PROCESS_FEATURES:
        all_numeric_feature_names += CONSTRUCTED_NUMERIC_FEATURES_NAMES

    numeric_columns = {feature_name: tf.feature_column.numeric_column(feature_name) for feature_name in all_numeric_feature_names}
    # 根据特征构造特征名称及其对象的字典
    categorical_column_with_vocabulary = \
        {item[0]: tf.feature_column.categorical_column_with_vocabulary_list(item[0], item[1])
         for item in CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.items()}
        
    feature_columns = {}
    if numeric_columns is not None:
        feature_columns.update(numeric_columns)
    if categorical_column_with_vocabulary is not None:
        feature_columns.update(categorical_column_with_vocabulary)
    # add extended features (crossing, bucektization, embedding)
    feature_columns['alpha_X_beta'] = tf.feature_column.crossed_column([feature_columns['alpha'], feature_columns['beta']], 4)
    return feature_columns

feature_columns = get_feature_columns()
print("Feature Columns: {}".format(feature_columns))

Feature Columns: {'x': _NumericColumn(key='x', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'y': _NumericColumn(key='y', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'x_2': _NumericColumn(key='x_2', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'y_2': _NumericColumn(key='y_2', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'xy': _NumericColumn(key='xy', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'dist_xy': _NumericColumn(key='dist_xy', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'alpha': _VocabularyListCategoricalColumn(key='alpha', vocabulary_list=('ax01', 'ax02'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'beta': _VocabularyListCategoricalColumn(key='beta', vocabulary_list=('bx01', 'bx02'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'alpha_X_beta': _CrossedColumn(keys=(_VocabularyListCategoricalColumn(key=

### 4.创建估计器
1. 定义一个估计器创建函数
2. 设置超参数及其运行参数
3. 初始化一个估计器实例  ----前面都是在定义

In [34]:
def create_estimator(run_config,hparams):
    feature_columns = list(get_feature_columns().values())  #产生特征列
    dense_columns = list(filter(lambda column: isinstance(column, feature_column._NumericColumn),feature_columns))
    # 筛查出验证各种特征
    categorical_columns = list(filter(lambda column: isinstance(column, feature_column._VocabularyListCategoricalColumn) |
                              isinstance(column, feature_column._BucketizedColumn),feature_columns))
    indicator_columns = list(map(lambda column: tf.feature_column.indicator_column(column),categorical_columns))    
    estimator_feature_columns = dense_columns + indicator_columns 
    # 准备好了各种特征列和参数制备估计器
    estimator = tf.estimator.DNNRegressor(feature_columns= estimator_feature_columns,hidden_units= hparams.hidden_units, 
                                          optimizer= tf.train.AdamOptimizer(),activation_fn= tf.nn.elu,
                                          dropout= hparams.dropout_prob,config= run_config)
    print("Estimator Type: {}".format(type(estimator)))
    return estimator

In [None]:
# 这与传统及其学习估计器不同，其值定义好并并指导网络结构的生成，最后固定下来形成一个对象。
hparams  = tf.contrib.training.HParams(num_epochs = 100,batch_size = 500,hidden_units=[8, 4],dropout_prob = 0.0)
model_dir = 'trained_models/{}'.format(MODEL_NAME)
run_config = tf.estimator.RunConfig().replace(model_dir=model_dir)
print("Model directory: {}".format(run_config.model_dir))
print("Hyper-parameters: {}".format(hparams))

estimator = create_estimator(run_config, hparams)

In [36]:
#########定义和声明过程结束，下面部分进行脚本调用################

### 5.训练估计器

In [None]:
# 训练估计器需要使用到输入数据
import shutil
import time
from datetime import datetime
train_input_fn=generate_pandas_input_fn(file_name= TRAIN_FILE, mode=tf.estimator.ModeKeys.TRAIN,
                                         num_epochs=hparams.num_epochs,batch_size=hparams.batch_size) 
if not RESUME_TRAINING:
    shutil.rmtree(model_dir, ignore_errors=True)

    tf.logging.set_verbosity(tf.logging.INFO)

time_start = datetime.utcnow() 
print("Estimator training started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................")

# 通常简单训练一下
estimator.train(input_fn = train_input_fn)

time_end = datetime.utcnow() 
print(".......................................")
print("Estimator training finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Estimator training elapsed time: {} seconds".format(time_elapsed.total_seconds()))

### 6.评估模型

In [50]:
import math

TEST_SIZE = 5000
test_input_fn = generate_pandas_input_fn(file_name=TEST_FILE,mode= tf.estimator.ModeKeys.EVAL,batch_size= TEST_SIZE)

results = estimator.evaluate(input_fn=test_input_fn)
print("")
print(results)
rmse = round(math.sqrt(results["average_loss"]),5)
print("")
print("RMSE: {}".format(rmse))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-11-22-06:45:50
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from trained_models/reg-model-01\model.ckpt-2400
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-11-22-06:45:51
INFO:tensorflow:Saving dict for global step 2400: average_loss = 121.92495, global_step = 2400, label/mean = 1.108437, loss = 609624.75, prediction/mean = 1.7788389
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2400: trained_models/reg-model-01\model.ckpt-2400

{'average_loss': 121.92495, 'label/mean': 1.108437, 'loss': 609624.75, 'prediction/mean': 1.7788389, 'global_step': 2400}

RMSE: 11.04196


### 7. 预测

In [53]:
import itertools

predict_input_fn = generate_pandas_input_fn(file_name=TEST_FILE,mode= tf.estimator.ModeKeys.PREDICT, batch_size= 5)

predictions = estimator.predict(input_fn=predict_input_fn)
values = list(map(lambda item: item["predictions"][0],list(itertools.islice(predictions, 5))))
print()
print("Predicted Values: {}".format(values))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from trained_models/reg-model-01\model.ckpt-2400
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

Predicted Values: [32.544582, 7.4533186, -0.5131136, 0.002710577, 3.5384486]


------------------

--------------------------------------

### 8. 保存 & 部署  Model
进入模型部署阶段

#### a.定义服务函数

In [60]:
def process_features(features):
    """采用tf.square等函数处理输入的特征"""
    features["x_2"] = tf.square(features['x'])
    features["y_2"] = tf.square(features['y'])
    features["xy"] = tf.multiply(features['x'], features['y'])
    features['dist_xy'] =  tf.sqrt(tf.squared_difference(features['x'],features['y']))
    return features

def csv_serving_input_fn():
    """服务输入函数与训练的不同"""
    SERVING_HEADER = ['x','y','alpha','beta']
    SERVING_HEADER_DEFAULTS = [[0.0], [0.0], ['NA'], ['NA']]
    
    # 数据来源是一个占位符
    rows_string_tensor = tf.placeholder(dtype=tf.string, shape=[None], name='csv_rows')
    
    receiver_tensor = {'csv_rows': rows_string_tensor} 

    row_columns = tf.expand_dims(rows_string_tensor, -1)
    # 将接受到的数据解析成特征字段的子弹
    columns = tf.decode_csv(row_columns, record_defaults=SERVING_HEADER_DEFAULTS)
    features = dict(zip(SERVING_HEADER, columns))
    
    if PROCESS_FEATURES:
        features = process_features(features)
    return tf.estimator.export.ServingInputReceiver(features, receiver_tensor)

#### b. 结合服务函数导出模型

In [None]:
export_dir = model_dir + "/export"

estimator.export_savedmodel(export_dir_base = export_dir,serving_input_receiver_fn = csv_serving_input_fn, as_text=True)

In [61]:
tf.estimator.Exporter.export()

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: ['serving_default', 'regression']
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Restoring parameters from trained_models/reg-model-01\model.ckpt-2400
Instructions for updating:
Pass your op to the equivalent parameter main_op instead.
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: trained_models/reg-model-01/export\temp-b'1542869564'\saved_model.pbtxt


b'trained_models/reg-model-01/export\\1542869564'

#### c.部署保存的模型

In [62]:
import os

saved_model_dir = export_dir + "/" + os.listdir(path=export_dir)[-1] 

print(saved_model_dir)

predictor_fn = tf.contrib.predictor.from_saved_model(export_dir = saved_model_dir, signature_def_key="predict")

output = predictor_fn({'csv_rows': ["0.5,1,ax01,bx02", "-0.5,-1,ax02,bx02"]})
print(output)

trained_models/reg-model-01/export/1542869564
INFO:tensorflow:Restoring parameters from trained_models/reg-model-01/export/1542869564\variables\variables
{'predictions': array([[ 55.15245],
       [-14.56848]], dtype=float32)}


In [63]:
saved_model_dir 

'trained_models/reg-model-01/export/1542869564'

In [None]:
tf.pre