In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load the paired measurements. Item identifiers are read as strings and the
# two measured values as floats.
column_types = {
    'item1': str,
    'value1': float,
    'item2': str,
    'value2': float,
}
data = pd.read_csv('input.csv', dtype=column_types)
print('data size: {}'.format(len(data)))

# Regression target: second value minus first value.
data['shift'] = data['value2'] - data['value1']

# Feature frame (x) and label series (y) used by the later cells.
feature_names = ['item1', 'value1', 'item2', 'value2']
x = data[feature_names]
y = data['shift']

data size: 270351


In [3]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(x, y, test_size=0.2, random_state=1)
x_train, x_test, y_train, y_test = split_parts

In [4]:
# Vocabulary: every distinct item that appears in either item column of the
# training set, in order of first appearance.
# pd.concat is used instead of Series.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
list_items = (
    pd.concat([x_train['item1'], x_train['item2']])
    .drop_duplicates()
    .tolist()
)
n_items = len(list_items)
print("number of unique items: {}".format(n_items))

# Feature columns. Both item columns are built from the same vocabulary list,
# so a given item occupies the same one-hot position whether it arrives as
# 'item1' or as 'item2'.
item1_col = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'item1', vocabulary_list=list_items))
value1_col = tf.feature_column.numeric_column('value1')
item2_col = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'item2', vocabulary_list=list_items))
value2_col = tf.feature_column.numeric_column('value2')

number of unique items: 704


In [5]:
# Training input function: shuffled batches of 100 rows, five passes over the
# training set.
input_fn_train = tf.estimator.inputs.pandas_input_fn(
    x=x_train, y=y_train, batch_size=100, num_epochs=5, shuffle=True)

In [6]:
# build model function
def model_fn(features, labels, mode, params):
    """Bias-free linear regression on the difference of two pair encodings.

    Each (item, value) pair is encoded as the concatenation of the item's
    indicator vector and that indicator vector scaled by the value. The
    encoding of pair 2 is subtracted from the encoding of pair 1 and the
    result is passed through a single dense unit without bias, so both pairs
    are scored with the same weight vector.

    Args:
        features: dict of input tensors with the keys 'item1', 'value1',
            'item2' and 'value2'.
        labels: regression targets, forwarded unchanged to the head.
        mode: the estimator mode, forwarded unchanged to the head.
        params: dict with
            'feature_columns': sequence whose first four entries are the
                feature columns for item1, value1, item2, value2 (in that
                order);
            'step_size': learning rate given to the FTRL optimizer.

    Returns:
        The EstimatorSpec produced by the regression head for `mode`.
    """
    columns = params['feature_columns']

    def encode_pair(item_key, item_column, value_key, value_column):
        """Return concat(indicator(item), value * indicator(item)) on axis 1."""
        item_indicator = tf.feature_column.input_layer(
            features={item_key: features[item_key]},
            feature_columns=[item_column])
        value = tf.feature_column.input_layer(
            features={value_key: features[value_key]},
            feature_columns=[value_column])
        # `value` has a single column, so tf.multiply broadcasts it across
        # the indicator columns. This is equivalent to tiling it once per
        # vocabulary entry, but needs no vocabulary size from outside the
        # function.
        scaled_indicator = tf.multiply(value, item_indicator)
        return tf.concat((item_indicator, scaled_indicator), axis=1)

    input_1 = encode_pair('item1', columns[0], 'value1', columns[1])
    input_2 = encode_pair('item2', columns[2], 'value2', columns[3])

    subtracted = tf.subtract(input_1, input_2)
    out = tf.layers.dense(
        inputs=subtracted,
        units=1,
        use_bias=False,
        kernel_initializer=None)

    # define head
    my_head = tf.contrib.estimator.regression_head(
        label_dimension=1,
        loss_fn=None  # custom loss, default: mean_squared_error
    )

    return my_head.create_estimator_spec(
        features=features,
        mode=mode,
        labels=labels,
        optimizer=tf.train.FtrlOptimizer(params['step_size']),
        logits=out,
    )

In [7]:
# Custom estimator built around model_fn. The feature columns are listed in
# the order model_fn indexes them: item1, value1, item2, value2.
model_params = {
    'feature_columns': [item1_col, value1_col, item2_col, value2_col],
    'step_size': 0.1,
}
regressor = tf.estimator.Estimator(model_fn=model_fn, params=model_params)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp7knrt6a1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff9ffec0ef0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [8]:
# Train on the training input function; steps=None sets no explicit step
# limit, so training stops when the input function runs out of data.
regressor.train(input_fn=input_fn_train, steps=None)

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmp7knrt6a1/model.ckpt.
INFO:tensorflow:loss = 1.7273831, step = 1
INFO:tensorflow:global_step/sec: 238.098
INFO:tensorflow:loss = 0.4019601, step = 101 (0.421 sec)
INFO:tensorflow:global_step/sec: 275.273
INFO:tensorflow:loss = 0.5103058, step = 201 (0.363 sec)
INFO:tensorflow:global_step/sec: 274.331
INFO:tensorflow:loss = 0.3949973, step = 301 (0.365 sec)
INFO:tensorflow:global_step/sec: 288.621
INFO:tensorflow:loss = 0.33797494, step = 401 (0.346 sec)
INF

INFO:tensorflow:loss = 0.3318516, step = 6901 (0.349 sec)
INFO:tensorflow:global_step/sec: 293.842
INFO:tensorflow:loss = 0.19725952, step = 7001 (0.340 sec)
INFO:tensorflow:global_step/sec: 296.359
INFO:tensorflow:loss = 0.17041412, step = 7101 (0.337 sec)
INFO:tensorflow:global_step/sec: 292.616
INFO:tensorflow:loss = 0.17393938, step = 7201 (0.342 sec)
INFO:tensorflow:global_step/sec: 277.779
INFO:tensorflow:loss = 0.15085949, step = 7301 (0.360 sec)
INFO:tensorflow:global_step/sec: 276.173
INFO:tensorflow:loss = 0.3333887, step = 7401 (0.362 sec)
INFO:tensorflow:global_step/sec: 275.234
INFO:tensorflow:loss = 0.274044, step = 7501 (0.363 sec)
INFO:tensorflow:global_step/sec: 272.39
INFO:tensorflow:loss = 0.16262011, step = 7601 (0.367 sec)
INFO:tensorflow:global_step/sec: 284.042
INFO:tensorflow:loss = 0.1619623, step = 7701 (0.352 sec)
INFO:tensorflow:global_step/sec: 284.562
INFO:tensorflow:loss = 0.3015152, step = 7801 (0.351 sec)
INFO:tensorflow:global_step/sec: 305.461
INFO:te

<tensorflow.python.estimator.estimator.Estimator at 0x7ff9ffec0e48>

In [9]:
# evaluation

# Evaluation input function: the held-out rows, one per batch, in their
# original order, for a single pass.
input_fn_eval = tf.estimator.inputs.pandas_input_fn(
    x=x_test, y=y_test, batch_size=1, num_epochs=1, shuffle=False)

# Compute the metrics on the test set and show them.
eval_result = regressor.evaluate(input_fn=input_fn_eval)
print(eval_result)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-02-08-16:24:59
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp7knrt6a1/model.ckpt-10814
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-02-08-16:25:56
INFO:tensorflow:Saving dict for global step 10814: average_loss = 0.23796499, global_step = 10814, label/mean = 0.022441076, loss = 0.23796499, prediction/mean = 0.012240659
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 10814: /tmp/tmp7knrt6a1/model.ckpt-10814
{'average_loss': 0.23796499, 'label/mean': 0.022441076, 'loss': 0.23796499, 'prediction/mean': 0.012240659, 'global_step': 10814}


In [10]:
# Show the evaluation metrics again, without the surrounding training log.
print(eval_result)

{'average_loss': 0.23796499, 'label/mean': 0.022441076, 'loss': 0.23796499, 'prediction/mean': 0.012240659, 'global_step': 10814}


In [11]:
# construct prediction table
# pd.concat is used throughout instead of DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.

# list existing items with value 0 (second pair left empty)
x_pred = pd.DataFrame({'item1': list_items})
x_pred['value1'] = 0
x_pred['item2'] = ''
x_pred['value2'] = 0

# list existing items with value 1
x_pred_ext = x_pred.copy()
x_pred_ext['value1'] = 1
x_pred = pd.concat([x_pred, x_pred_ext], ignore_index=True)

# swap 1 <--> 2: reorder the columns, then relabel them so that the item and
# value move to the second slot. The explicit copy keeps the relabelling from
# touching x_pred.
x_pred2 = x_pred[['item2', 'value2', 'item1', 'value1']].copy()
x_pred2.columns = ['item1', 'value1', 'item2', 'value2']
x_pred = pd.concat([x_pred, x_pred2], ignore_index=True).drop_duplicates()

In [12]:
# prediction: run the whole prediction table through the model as one
# unshuffled batch so the outputs line up with the rows of x_pred.
input_fn_pred = tf.estimator.inputs.pandas_input_fn(
    x=x_pred, batch_size=len(x_pred), num_epochs=1, shuffle=False)
predictions = list(regressor.predict(input_fn=input_fn_pred))

# output: take the single predicted value of each row, attach it as the
# 'shift' column and write the table to disk.
pred = [np.float64(row['predictions'][0]) for row in predictions]
x_pred['shift'] = pred
x_pred.to_csv('predictions.csv', sep=',', index=False)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp7knrt6a1/model.ckpt-10814
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
