In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Prepare data

In [2]:
# import
# Read the raw pairs with explicit dtypes: item columns as strings, value columns as floats.
column_dtypes = {'item1': str, 'value1': float, 'item2': str, 'value2': float}
data = pd.read_csv('input.csv', dtype=column_dtypes)
print('data size: {}'.format(len(data)))

# Regression target: the difference value2 - value1 of each row.
data['shift'] = data['value2'] - data['value1']

# Features are the four raw columns; the label is the shift computed above.
feature_names = ['item1', 'value1', 'item2', 'value2']
x = data[feature_names]
y = data['shift']

data size: 270351


In [3]:
# split training and test sets
# 20% of the rows are held out for testing; random_state is fixed at 1.
split_parts = train_test_split(x, y, test_size=0.2, random_state=1)
x_train, x_test, y_train, y_test = split_parts

# Define columns

In [4]:
# count feature values
# Shared vocabulary: every distinct item found in either item column of the
# training split, in order of first appearance. pd.concat is used instead of
# Series.append, which pandas deprecated in 1.4 and removed in 2.0; the
# resulting item order is the same.
list_items = (
    pd.concat([x_train['item1'], x_train['item2']])
    .drop_duplicates()
    .tolist()
)
n_items = len(list_items)
print("number of unique items: {}".format(n_items))

# columns
# Both item columns are one-hot encoded against the same vocabulary list, so a
# given position in either encoding refers to the same item.
item1_col = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'item1', vocabulary_list=list_items))
value1_col = tf.feature_column.numeric_column('value1')
item2_col = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'item2', vocabulary_list=list_items))
value2_col = tf.feature_column.numeric_column('value2')

number of unique items: 704


# Configure model

In [5]:
# build input function
# Training feed: shuffled batches of 100 rows, five passes over the training split.
train_feed_options = dict(batch_size=100, num_epochs=5, shuffle=True)
input_fn_train = tf.estimator.inputs.pandas_input_fn(
    x=x_train, y=y_train, **train_feed_options)

In [6]:
# build model function
def model_fn(features, labels, mode, params):
    """Bias-free linear regression on the difference of two per-item encodings.

    Each (item, value) pair is encoded as the item's one-hot vector
    concatenated with that one-hot vector scaled by the value. The encoding of
    pair 2 is subtracted from the encoding of pair 1 and fed to a single dense
    unit without bias. With n the one-hot width, the prediction is therefore
    (k[i1] + k[n + i1] * value1) - (k[i2] + k[n + i2] * value2), where k is
    the dense kernel and i1 / i2 are the vocabulary positions of the items.

    Args:
        features: dict of input tensors keyed 'item1', 'value1', 'item2', 'value2'.
        labels: target tensor, forwarded to the regression head.
        mode: estimator mode key, forwarded to the regression head.
        params: dict with
            'feature_columns': the four feature columns, indexed in the order
                item1, value1, item2, value2;
            'step_size': learning rate given to the FTRL optimizer.

    Returns:
        The estimator spec built by the regression head.
    """
    columns = params['feature_columns']
    keys = ('item1', 'value1', 'item2', 'value2')
    # One dense tensor per feature, each built from its own feature column.
    input_layer = {
        key: tf.feature_column.input_layer(
            features={key: features[key]},
            feature_columns=columns[position])
        for position, key in enumerate(keys)
    }

    def _encode(item_onehot, value):
        """Return [one-hot, value * one-hot] concatenated along axis 1."""
        # The value column broadcasts across the one-hot axis, so the width of
        # the vocabulary comes from the tensor itself rather than from a
        # notebook-level global.
        return tf.concat(
            (item_onehot, tf.multiply(value, item_onehot)), axis=1)

    input_1 = _encode(input_layer['item1'], input_layer['value1'])
    input_2 = _encode(input_layer['item2'], input_layer['value2'])

    subtracted = tf.subtract(input_1, input_2)
    # Single linear unit, no bias: the kernel holds all trainable weights.
    out = tf.layers.dense(
        inputs=subtracted,
        units=1,
        use_bias=False,
        kernel_initializer=None)

    # define head
    my_head = tf.contrib.estimator.regression_head(
        label_dimension=1,
        loss_fn=None  # custom loss, default: mean_squared_error
    )

    return my_head.create_estimator_spec(
        features=features,
        mode=mode,
        labels=labels,
        optimizer=tf.train.FtrlOptimizer(params['step_size']),
        logits=out,
    )

In [7]:
# define dir to save model
model_dir = 'model_dir/'

# custom estimator
# The column list is ordered item1, value1, item2, value2 because model_fn
# picks the columns out of params['feature_columns'] by position.
estimator_params = {
    'feature_columns': [item1_col, value1_col, item2_col, value2_col],
    'step_size': 0.1,
}
regressor = tf.estimator.Estimator(
    model_fn=model_fn, model_dir=model_dir, params=estimator_params)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'model_dir/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f216ca66278>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


# Execute training

In [8]:
# train estimator
# No step limit is given (steps=None); the amount of training is governed by input_fn_train.
# NOTE: the recorded run restored model_dir/model.ckpt-12978 before training, i.e. it
# continued from an earlier checkpoint found in model_dir rather than starting fresh.
regressor.train(steps=None, input_fn=input_fn_train)

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from model_dir/model.ckpt-12978
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 12978 into model_dir/model.ckpt.
INFO:tensorflow:loss = 0.17732638, step = 12979
INFO:tensorflow:global_step/sec: 230.598
INFO:tensorflow:loss = 0.16399024, step = 13079 (0.434 sec)
INFO:tensorflow:global_step/sec: 270.256
INFO:tensorflow:loss = 0.17308079, step = 13179 (0.370 sec)
INFO:tensorflow:global_step/sec: 282.737
INFO:tensorflow:loss = 0.25082386, step = 13279 (0.354 sec)
INFO:tensorflow:global

INFO:tensorflow:global_step/sec: 273.442
INFO:tensorflow:loss = 0.17956023, step = 19779 (0.365 sec)
INFO:tensorflow:global_step/sec: 286.885
INFO:tensorflow:loss = 0.121622294, step = 19879 (0.349 sec)
INFO:tensorflow:global_step/sec: 285.156
INFO:tensorflow:loss = 0.15339659, step = 19979 (0.351 sec)
INFO:tensorflow:global_step/sec: 277.703
INFO:tensorflow:loss = 0.18206245, step = 20079 (0.360 sec)
INFO:tensorflow:global_step/sec: 290.167
INFO:tensorflow:loss = 0.12274916, step = 20179 (0.344 sec)
INFO:tensorflow:global_step/sec: 279.591
INFO:tensorflow:loss = 0.35775542, step = 20279 (0.358 sec)
INFO:tensorflow:global_step/sec: 295.765
INFO:tensorflow:loss = 0.14778133, step = 20379 (0.338 sec)
INFO:tensorflow:global_step/sec: 286.615
INFO:tensorflow:loss = 0.3274662, step = 20479 (0.349 sec)
INFO:tensorflow:global_step/sec: 266.382
INFO:tensorflow:loss = 0.2143168, step = 20579 (0.375 sec)
INFO:tensorflow:global_step/sec: 282.56
INFO:tensorflow:loss = 0.18783432, step = 20679 (0.3

<tensorflow.python.estimator.estimator.Estimator at 0x7f216c7f9f98>

# Evaluation

In [9]:
# evaluation input function
# One unshuffled pass over the held-out split, one row per batch.
eval_feed_options = dict(batch_size=1, num_epochs=1, shuffle=False)
input_fn_eval = tf.estimator.inputs.pandas_input_fn(
    x=x_test, y=y_test, **eval_feed_options)

# evaluate
eval_result = regressor.evaluate(input_fn=input_fn_eval)
print(eval_result)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-02-22-13:12:05
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from model_dir/model.ckpt-23792
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-02-22-13:13:03
INFO:tensorflow:Saving dict for global step 23792: average_loss = 0.18916954, global_step = 23792, label/mean = 0.022441076, loss = 0.18916954, prediction/mean = 0.01291999
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 23792: model_dir/model.ckpt-23792
{'average_loss': 0.18916954, 'label/mean': 0.022441076, 'loss': 0.18916954, 'prediction/mean': 0.01291999, 'global_step': 23792}


In [10]:
# Show the evaluation metrics again; eval_result is the dict returned by regressor.evaluate.
print(eval_result)

{'average_loss': 0.18916954, 'label/mean': 0.022441076, 'loss': 0.18916954, 'prediction/mean': 0.01291999, 'global_step': 23792}


# Export training result (trainable variables) as an (item, c, d) table

In [11]:
# construct and export coefficient table
checkpoint = tf.train.get_checkpoint_state(checkpoint_dir=model_dir)
if checkpoint is None:
    # get_checkpoint_state yields None when model_dir holds no checkpoint state;
    # fail with a clear message instead of an AttributeError further down.
    raise RuntimeError(
        'no checkpoint found in {!r}; train the model first'.format(model_dir))

# Import the saved graph into a private tf.Graph. Importing into the
# process-wide default graph would add a second copy of every variable each
# time this cell is re-run, so tf.trainable_variables() would return duplicates.
with tf.Graph().as_default(), tf.Session() as sess:
    saver = tf.train.import_meta_graph(checkpoint.model_checkpoint_path + '.meta')
    saver.restore(sess, checkpoint.model_checkpoint_path)
    # Values of all trainable variables of the restored model, as numpy arrays.
    tvs = sess.run(tf.trainable_variables())

INFO:tensorflow:Restoring parameters from model_dir/model.ckpt-23792


In [12]:
# Split the trained weights into two rows with one column per vocabulary item.
# The width comes from list_items rather than a hard-coded count, so the cell
# keeps working when the data (and hence the vocabulary size) changes.
# Assumes tvs holds exactly 2 * len(list_items) weights in total (the dense
# kernel of model_fn); np.reshape raises ValueError otherwise.
coeffs = np.reshape(tvs, [2, len(list_items)])

# For each item,
# v' = c + (1 + d) * v
# where v' is the corrected value and v is the given tag value
item_coeffs = pd.DataFrame({'item': list_items, 'c': coeffs[0, :], 'd': coeffs[1, :]})
item_coeffs.to_csv('item_coeffs.csv', sep=',', index=False)

In [13]:
# check that the tf prediction matches the correction formula
one_test = x_test[0:1] # subset only the first row in data frame format
print(one_test)

         item1  value1   item2  value2
21436  capital    25.0  action    24.6


In [14]:
# prediction input function
# No labels are supplied: the single-row frame is fed once, unshuffled.
pred_feed_options = dict(batch_size=1, num_epochs=1, shuffle=False)
input_fn_pred = tf.estimator.inputs.pandas_input_fn(x=one_test, **pred_feed_options)

# prediction by tf model
prediction_stream = regressor.predict(input_fn=input_fn_pred)
tf_pred = list(prediction_stream)
print(tf_pred)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from model_dir/model.ckpt-23792
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
[{'predictions': array([-0.25890255], dtype=float32)}]


In [15]:
# extract coefficients and values for one_test
sample = one_test.iloc[0]

# pair 1: coefficient row of item1 and the accompanying value
coeffs1 = item_coeffs.loc[item_coeffs['item'] == sample['item1']].iloc[0]
c1, d1 = coeffs1['c'], coeffs1['d']
v1 = sample['value1']

# pair 2: coefficient row of item2 and the accompanying value
coeffs2 = item_coeffs.loc[item_coeffs['item'] == sample['item2']].iloc[0]
c2, d2 = coeffs2['c'], coeffs2['d']
v2 = sample['value2']

# prediction by algebra
term1 = c1 + d1 * v1
term2 = c2 + d2 * v2
cf_pred = term1 - term2
print(cf_pred) # this value should be the same as tf_pred

-0.258902516961097
