In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import shutil
import math
import multiprocessing
from datetime import datetime
from tensorflow.python.feature_column import feature_column
from tensorflow.python.feature_column import feature_column_v2
# Show the installed TensorFlow version; the notebook relies on TF 1.x-only APIs
# such as tf.contrib and tf.estimator.inputs.
print(tf.__version__)

1.15.4


In [2]:
# Name of this run; used below to build the model directory path.
MODEL_NAME = 'reg-model-01'

# CSV files holding the training, validation and test splits.
TRAIN_DATA_FILE = 'data/train-data.csv'
VALID_DATA_FILE = 'data/valid-data.csv'
TEST_DATA_FILE = 'data/test-data.csv'

# RESUME_TRAINING: when False, the model directory is deleted before training starts.
# PROCESS_FEATURES: when True, the engineered features (x_2, y_2, xy, dist_xy) are
#   added by the input/serving functions and included in the feature columns.
# MULTI_THREADING: when True, lets generate_pandas_input_fn derive its thread count
#   from multiprocessing.cpu_count().
RESUME_TRAINING = False
PROCESS_FEATURES = True
MULTI_THREADING = False

In [3]:
# Column names applied to the header-less CSV files, in file order.
HEADER = ['key','x','y','alpha','beta','target']
# One default value per HEADER column (kept for reference; presumably for a
# tf.decode_csv based reader -- it is not used by the pandas loader below).
HEADER_DEFAULTS = [[0], [0.0], [0.0], ['NA'], ['NA'], [0.0]]

# Load the three splits with identical parsing options.
train_df, valid_df, test_df = (
    pd.read_csv(data_file, names=HEADER, skiprows=0)
    for data_file in (TRAIN_DATA_FILE, VALID_DATA_FILE, TEST_DATA_FILE)
)

In [4]:
# Peek at the first rows of the training split to sanity-check the parsed columns.
train_df.head()

Unnamed: 0,key,x,y,alpha,beta,target
0,17262,0.893902,0.040267,ax02,bx02,-12.314443
1,4554,0.148486,0.015758,ax01,bx01,3.269937
2,19003,0.156807,-0.953493,ax02,bx02,39.556516
3,15196,0.374318,0.30675,ax02,bx02,-1.011721
4,9502,0.827361,0.305439,ax02,bx01,6.051278


In [5]:
# Row counts of the (train, validation, test) splits.
len(train_df), len(valid_df), len(test_df)

(12000, 3000, 5000)

In [6]:
# Raw numeric input columns.
NUMERIC_FEATURE_NAMES = ['x', 'y']
# Categorical input columns mapped to their complete vocabularies.
CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY = {'alpha':['ax01', 'ax02'], 'beta': ['bx01', 'bx02']}
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.keys())

# All raw columns fed to the model (engineered features are added later).
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

TARGET_NAME = 'target'
# Columns that are neither features nor the target. Derived in HEADER order so the
# result is deterministic (set arithmetic would yield an arbitrary order).
UNUSED_FEATURE_NAMES = [
    column_name for column_name in HEADER
    if column_name not in FEATURE_NAMES and column_name != TARGET_NAME
]

print("Header: {}".format(HEADER))
print("Numeric Features: {}".format(NUMERIC_FEATURE_NAMES))
print("Categorical Features: {}".format(CATEGORICAL_FEATURE_NAMES))
print("Target: {}".format(TARGET_NAME))
print("Unused Features: {}".format(UNUSED_FEATURE_NAMES))

Header: ['key', 'x', 'y', 'alpha', 'beta', 'target']
Numeric Features: ['x', 'y']
Categorical Features: ['alpha', 'beta']
Target: target
Unused Features: ['key']


In [7]:
def process_dataframe(dataset_df):
    """Add the engineered numeric columns to ``dataset_df`` and return it.

    The frame is modified in place: ``x_2`` and ``y_2`` (squares of ``x`` and
    ``y``), ``xy`` (their product) and ``dist_xy`` (square root of the squared
    difference, i.e. the absolute difference) are appended in that order.

    Args:
        dataset_df: DataFrame containing numeric ``x`` and ``y`` columns.

    Returns:
        The same DataFrame object, now carrying the four extra columns.
    """
    x_values = dataset_df['x']
    y_values = dataset_df['y']

    # Built up front from the untouched inputs, then attached in a fixed order.
    engineered_columns = (
        ('x_2', np.square(x_values)),
        ('y_2', np.square(y_values)),
        ('xy', x_values * y_values),
        ('dist_xy', np.sqrt(np.square(x_values - y_values))),
    )
    for column_name, column_values in engineered_columns:
        dataset_df[column_name] = column_values
    return dataset_df

def generate_pandas_input_fn(file_name, mode=tf.estimator.ModeKeys.EVAL,
                            skip_header_lines=0,
                            num_epochs=1,
                            batch_size=100):
    """Build an Estimator input function that reads a CSV file through pandas.

    The file is parsed with the notebook's HEADER column names, the feature
    columns are selected (and extended with the engineered features when
    PROCESS_FEATURES is set), and the result is wrapped with
    ``tf.estimator.inputs.pandas_input_fn``. A summary of the configuration
    is printed.

    Args:
        file_name: Path of the CSV file to read.
        mode: A ``tf.estimator.ModeKeys`` value. Data is shuffled only in
            TRAIN mode.
        skip_header_lines: Number of leading rows to skip when reading.
        num_epochs: Number of passes over the data; ``None`` is forwarded
            unchanged.
        batch_size: Number of rows per batch.

    Returns:
        A callable that, when invoked inside a graph, yields
        ``(features, target)`` tensors.
    """
    df_dataset = pd.read_csv(file_name, names=HEADER, skiprows=skip_header_lines)

    x = df_dataset[FEATURE_NAMES].copy()
    if PROCESS_FEATURES:
        x = process_dataframe(x)
    y = df_dataset[TARGET_NAME]

    is_training = mode == tf.estimator.ModeKeys.TRAIN
    shuffle = is_training
    num_threads = 1

    # Extra reader threads are used for training only: evaluation and prediction
    # need a single thread to read every row exactly once in a repeatable order.
    if MULTI_THREADING and is_training:
        num_threads = multiprocessing.cpu_count()
        # The epoch budget is split across the reader threads (assumes each thread
        # runs num_epochs on its own -- TODO confirm against the TF queue runner).
        # Never drop below one epoch, and leave an unbounded (None) count untouched.
        if num_epochs is not None:
            num_epochs = max(1, num_epochs // num_threads)

    pandas_input_fn = tf.estimator.inputs.pandas_input_fn(
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        x=x,
        y=y,
        # Previously computed and reported but never handed to the reader.
        num_threads=num_threads,
        target_column=TARGET_NAME
    )
    print("")
    print("* data input_fn:")
    print("================")
    print("Input file: {}".format(file_name))
    print("Dataset size: {}".format(len(df_dataset)))
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("Mode: {}".format(mode))
    print("Thread Count: {}".format(num_threads))
    print("Shuffle: {}".format(shuffle))
    print("================")
    print("")

    return pandas_input_fn

In [8]:
# Smoke test: build the input function with its defaults (EVAL mode) and call it
# once to inspect the feature keys and the target tensor it produces.
features, target = generate_pandas_input_fn(file_name=TRAIN_DATA_FILE)()
print("Feature read from DataFrame: {}".format(list(features.keys())))
print("Target read from DataFrame: {}".format(target))


* data input_fn:
Input file: data/train-data.csv
Dataset size: 12000
Batch size: 100
Epoch Count: 1
Mode: eval
Thread Count: 1
Shuffle: False

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Feature read from DataFrame: ['x', 'y', 'alpha', 'beta', 'x_2', 'y_2', 'xy', 'dist_xy']
Target read from DataFrame: Tensor("fifo_queue_DequeueUpTo:9", shape=(?,), dtype=float64)


In [9]:
def get_feature_columns():
    """Build the feature columns used by the model, keyed by feature name.

    Returns:
        dict mapping feature name to a ``tf.feature_column`` object: numeric
        columns for the raw numeric features (plus the engineered ones when
        PROCESS_FEATURES is set), vocabulary-list categorical columns for
        ``alpha`` and ``beta``, and an ``alpha_X_beta`` crossed column with
        4 hash buckets.
    """
    CONSTRUCTED_NUMERIC_FEATURES_NAMES = ['x_2', 'y_2', 'xy', 'dist_xy']

    # Work on a copy: extending NUMERIC_FEATURE_NAMES itself would grow the
    # module-level list on every call and leak the engineered names into it
    # even after PROCESS_FEATURES is switched off.
    all_numeric_feature_names = list(NUMERIC_FEATURE_NAMES)
    if PROCESS_FEATURES:
        all_numeric_feature_names += CONSTRUCTED_NUMERIC_FEATURES_NAMES

    numeric_columns = {
        feature_name: tf.feature_column.numeric_column(feature_name)
        for feature_name in all_numeric_feature_names
    }
    categorical_column_with_vocabulary = {
        feature_name: tf.feature_column.categorical_column_with_vocabulary_list(
            feature_name, vocabulary)
        for feature_name, vocabulary in CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.items()
    }

    # Numeric columns first, then categorical ones, then the cross.
    feature_columns = {}
    feature_columns.update(numeric_columns)
    feature_columns.update(categorical_column_with_vocabulary)
    feature_columns['alpha_X_beta'] = tf.feature_column.crossed_column(
        [feature_columns['alpha'], feature_columns['beta']], 4
    )
    return feature_columns

# Build the feature columns once and print them for inspection.
feature_columns = get_feature_columns()
print('Feature Columns: {}'.format(feature_columns))

Feature Columns: {'x': NumericColumn(key='x', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'y': NumericColumn(key='y', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'x_2': NumericColumn(key='x_2', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'y_2': NumericColumn(key='y_2', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'xy': NumericColumn(key='xy', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'dist_xy': NumericColumn(key='dist_xy', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'alpha': VocabularyListCategoricalColumn(key='alpha', vocabulary_list=('ax01', 'ax02'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'beta': VocabularyListCategoricalColumn(key='beta', vocabulary_list=('bx01', 'bx02'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'alpha_X_beta': CrossedColumn(keys=(VocabularyListCategoricalColumn(key='alpha', v

In [10]:
def create_estimator(run_config, hparams):
    """Create the DNNRegressor for this notebook.

    Numeric columns are passed through as-is; vocabulary-list and bucketized
    categorical columns are wrapped in indicator (one-hot) columns. Columns of
    any other type returned by ``get_feature_columns`` (such as the crossed
    column) are not handed to the estimator.

    Args:
        run_config: ``tf.estimator.RunConfig`` for the estimator.
        hparams: Object exposing ``hidden_units`` and ``dropout_prob``.

    Returns:
        The configured ``tf.estimator.DNNRegressor``.
    """
    all_columns = list(get_feature_columns().values())
    categorical_types = (
        feature_column_v2.VocabularyListCategoricalColumn,
        feature_column_v2.BucketizedColumn,
    )

    dense_columns = [
        column for column in all_columns
        if isinstance(column, feature_column_v2.NumericColumn)
    ]
    indicator_columns = [
        tf.feature_column.indicator_column(column)
        for column in all_columns
        if isinstance(column, categorical_types)
    ]

    estimator_feature_columns = dense_columns + indicator_columns
    print(estimator_feature_columns)

    estimator = tf.estimator.DNNRegressor(
        feature_columns=estimator_feature_columns,
        hidden_units=hparams.hidden_units,
        optimizer=tf.train.AdamOptimizer(),
        activation_fn=tf.nn.relu,
        dropout=hparams.dropout_prob,
        config=run_config,
    )

    print("")
    print("Estimator Type: {}".format(type(estimator)))
    print("")
    return estimator

In [11]:
# Training hyper-parameters: num_epochs and batch_size feed the training input
# function; hidden_units and dropout_prob are read by create_estimator.
hparams = tf.contrib.training.HParams(
    num_epochs = 100,
    batch_size = 500,
    hidden_units = [8, 4],
    dropout_prob = 0.0
)
# Checkpoints and exports for this run live under trained_models/<MODEL_NAME>.
model_dir = 'trained_models/{}'.format(MODEL_NAME)
run_config = tf.estimator.RunConfig().replace(model_dir=model_dir)
print("Model directory: {}".format(run_config.model_dir))
print("Hyper-paramters: {}".format(hparams))

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Model directory: trained_models/reg-model-01
Hyper-paramters: num_epochs=100,batch_size=500,hidden_units=[8, 4],dropout_prob=0.0


In [13]:
# Instantiate the regressor with the run configuration and hyper-parameters above.
estimator = create_estimator(run_config, hparams)

[NumericColumn(key='x', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='y', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='x_2', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='y_2', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='xy', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='dist_xy', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alpha', vocabulary_list=('ax01', 'ax02'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='beta', vocabulary_list=('bx01', 'bx02'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]
INFO:tensorflow:Using config: {'_model_dir': 'trained_models/reg-model-01', '_tf_rand

In [14]:
# Training input function: TRAIN mode, epochs and batch size taken from hparams.
train_input_fn = generate_pandas_input_fn(file_name=TRAIN_DATA_FILE,
                                         mode=tf.estimator.ModeKeys.TRAIN,
                                         num_epochs=hparams.num_epochs,
                                         batch_size=hparams.batch_size)
# Start from an empty model directory unless training is being resumed.
if not RESUME_TRAINING:
    shutil.rmtree(model_dir, ignore_errors=True)
tf.logging.set_verbosity(tf.logging.INFO)

# Wall-clock timing of the training run; the timestamps are in UTC.
time_start = datetime.utcnow()
print("Estimator training started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................")

estimator.train(input_fn=train_input_fn)

time_end = datetime.utcnow() 
print(".......................................")
print("Estimator training finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Estimator training elapsed time: {} seconds".format(time_elapsed.total_seconds()))


* data input_fn:
Input file: data/train-data.csv
Dataset size: 12000
Batch size: 500
Epoch Count: 100
Mode: train
Thread Count: 1
Shuffle: True

Estimator training started at 02:38:25
.......................................
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use `tf.cast` instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
INFO:tensorflow:Graph was finalize

In [15]:
# Batch size for evaluation; matches the 5000-row test split reported earlier,
# so the whole test set is evaluated as one batch.
TEST_SIZE = 5000

test_input_fn = generate_pandas_input_fn(file_name=TEST_DATA_FILE, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TEST_SIZE)

results = estimator.evaluate(input_fn=test_input_fn)
print("")
print(results)
# RMSE is reported as the square root of average_loss, rounded to 5 decimals
# (assumes average_loss is the mean squared error -- confirm for the head in use).
rmse = round(math.sqrt(results["average_loss"]),5)
print("")
print("RMSE: {}".format(rmse))


* data input_fn:
Input file: data/test-data.csv
Dataset size: 5000
Batch size: 5000
Epoch Count: 1
Mode: eval
Thread Count: 1
Shuffle: False

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2022-01-23T10:39:26Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from trained_models/reg-model-01/model.ckpt-2400
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2022-01-23-10:39:26
INFO:tensorflow:Saving dict for global step 2400: average_loss = 121.21656, global_step = 2400, label/mean = 1.0653467, loss = 606082.8, prediction/mean = 0.9807678
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2400: trained_models/reg-model-01/model.ckpt-2400

{'average_loss': 121.21656, 'label/mean': 1.0653467, 'loss': 606082.8, 'prediction/mean': 0.9807678, 'global_step': 2400}

RMSE: 11.00984


In [16]:
import itertools

# Input function over the test file in PREDICT mode, five rows per batch.
predict_input_fn = generate_pandas_input_fn(
    file_name=TEST_DATA_FILE,
    mode=tf.estimator.ModeKeys.PREDICT,
    batch_size=5,
)

# estimator.predict yields one result dict per row; take only the first five
# and pull out the scalar prediction from each.
predictions = estimator.predict(input_fn=predict_input_fn)
values = [
    prediction["predictions"][0]
    for prediction in itertools.islice(predictions, 5)
]
print()
print("Predicted Values: {}".format(values))


* data input_fn:
Input file: data/test-data.csv
Dataset size: 5000
Batch size: 5
Epoch Count: 1
Mode: infer
Thread Count: 1
Shuffle: False

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from trained_models/reg-model-01/model.ckpt-2400
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

Predicted Values: [34.7328, -8.892282, 14.3126135, 5.1370325, -0.11072874]


In [17]:
def process_features(features):
    """Add the engineered tensors to a feature dict at serving time.

    Tensor counterpart of ``process_dataframe``: inserts ``x_2``, ``y_2``,
    ``xy`` and ``dist_xy`` computed from the ``x`` and ``y`` entries.

    Args:
        features: dict of tensors containing at least ``'x'`` and ``'y'``.

    Returns:
        The same dict, updated in place with the four extra entries.
    """
    x_tensor = features['x']
    y_tensor = features['y']

    features["x_2"] = tf.square(x_tensor)
    features["y_2"] = tf.square(y_tensor)
    features["xy"] = tf.multiply(x_tensor, y_tensor)
    # Square root of the squared difference, i.e. |x - y|.
    squared_gap = tf.squared_difference(x_tensor, y_tensor)
    features["dist_xy"] = tf.sqrt(squared_gap)
    return features

def csv_serving_input_fn():
    """Serving input receiver that accepts raw CSV rows.

    Each incoming string is expected to hold the four columns ``x,y,alpha,beta``.
    Rows are fed through a string placeholder named ``csv_rows``, parsed with
    ``tf.decode_csv`` and, when PROCESS_FEATURES is set, extended with the
    engineered features.

    Returns:
        A ``tf.estimator.export.ServingInputReceiver`` whose receiver tensor
        is keyed by ``'csv_rows'``.
    """
    SERVING_HEADER = ['x', 'y', 'alpha', 'beta']
    SERVING_HEADER_DEFAULTS = [[0.0], [0.0], ['NA'], ['NA']]

    csv_rows = tf.placeholder(dtype=tf.string, shape=[None], name='csv_rows')

    # A trailing axis is added before parsing, so every parsed column keeps it.
    parsed_columns = tf.decode_csv(
        tf.expand_dims(csv_rows, -1),
        record_defaults=SERVING_HEADER_DEFAULTS,
    )
    features = dict(zip(SERVING_HEADER, parsed_columns))
    if PROCESS_FEATURES:
        features = process_features(features)

    return tf.estimator.export.ServingInputReceiver(
        features, {'csv_rows': csv_rows}
    )

In [18]:
# Exports are written below the model directory, one sub-directory per export.
export_dir = model_dir + "/export"

# Export the trained model with the CSV serving receiver; as_text=True writes the
# SavedModel in text form (saved_model.pbtxt, as shown in the log output).
estimator.export_savedmodel(
    export_dir_base=export_dir,
    serving_input_receiver_fn=csv_serving_input_fn,
    as_text=True
)

Instructions for updating:
This function has been renamed, use `export_saved_model` instead.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: ['serving_default', 'regression']
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Restoring parameters from trained_models/reg-model-01/model.ckpt-2400
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: trained_models/reg-model-01/export/temp-b'1642905585'/saved_model.pbtxt


b'trained_models/reg-model-01/export/1642905585'

In [None]:
import os
import time

# Each export lives in a sub-directory named by a numeric timestamp. Pick the
# newest one explicitly: os.listdir returns entries in arbitrary order and may
# also contain leftover "temp-..." directories.
export_versions = [entry for entry in os.listdir(path=export_dir) if entry.isdigit()]
if not export_versions:
    raise RuntimeError("No exported SavedModel found under {}".format(export_dir))
saved_model_dir = export_dir + '/' + max(export_versions, key=int)
print(saved_model_dir)

predictor_fn = tf.contrib.predictor.from_saved_model(
    export_dir=saved_model_dir,
    signature_def_key='predict'
)
start = time.time()
# Rows follow the serving header x,y,alpha,beta. The first row previously passed
# 'ax02' as beta, which is outside the beta vocabulary ('bx01', 'bx02').
output = predictor_fn({'csv_rows': ["0.5,1,ax01,bx02", "-0.5,-1,ax02,bx02"]})
# time.time() measures seconds; convert so the value matches the "ms" label.
print("Elapse: {}ms".format((time.time() - start) * 1000))
print(output)