In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [2]:
# Load the paired observations: the two item identifiers are read as strings
# and the two associated values as floats.
data = pd.read_csv(
    'input.csv',
    dtype=dict(item1=str, value1=float, item2=str, value2=float))
print('data size: {}'.format(data.shape[0]))

# Regression target: after correction on both sides, the difference is zero,
# so every row gets the constant label 0.
data['zero'] = 0

# Label is the all-zero column; features are the four raw input columns.
y = data['zero']
x = data[['item1', 'value1', 'item2', 'value2']]

data size: 270351


In [3]:
# Collect every distinct item identifier seen on either side of a pair, in
# order of first appearance. pd.concat is used instead of Series.append, which
# is deprecated since pandas 1.4 and removed in pandas 2.0.
list_items = pd.concat([x['item1'], x['item2']]).drop_duplicates().tolist()
n_items = len(list_items)
print("number of unique items: {}".format(n_items))

# Feature columns. Both item columns are built from the same vocabulary list,
# so a given item maps to the same indicator position on either side.
item1_col = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'item1', vocabulary_list=list_items))
value1_col = tf.feature_column.numeric_column('value1')
item2_col = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'item2', vocabulary_list=list_items))
value2_col = tf.feature_column.numeric_column('value2')
# Numeric column for the constant 'zero' label column; kept for any cell that
# may reference it, although none of the cells visible here do.
zero_col = tf.feature_column.numeric_column('zero')

number of unique items: 710


In [4]:
# Training input function: feeds the feature frame `x` and label series `y`
# in shuffled batches of 100 rows for a single epoch.
input_fn_train = tf.estimator.inputs.pandas_input_fn(
    x=x, y=y, batch_size=100, num_epochs=1, shuffle=True)

In [5]:
# build model function
def model_fn(features, labels, mode, params):
    """Estimator model function: bias-free linear regression on item identity.

    The dense representations of 'item1' and 'item2' (one-hot indicator
    columns in this notebook) are subtracted from each other and passed
    through a single dense unit without bias, so the model learns one weight
    per item position.

    Args:
        features: dict of input tensors; only 'item1' and 'item2' are read.
        labels: regression targets, forwarded unchanged to the regression head.
        mode: estimator mode, forwarded unchanged to the regression head.
        params: dict with
            'feature_columns': sequence whose entries 0 and 2 are the feature
                columns for 'item1' and 'item2' (entries 1 and 3 are accepted
                but not used);
            'step_size': learning rate passed to the FTRL optimizer.

    Returns:
        The estimator spec produced by the regression head.
    """
    columns = params['feature_columns']

    # Dense representation of each item column.
    item1_dense = tf.feature_column.input_layer(
        features={'item1': features['item1']},
        feature_columns=columns[0])
    item2_dense = tf.feature_column.input_layer(
        features={'item2': features['item2']},
        feature_columns=columns[2])

    # NOTE(review): 'value1' / 'value2' are currently not fed into the model.
    # Value-weighted inputs used to be assembled here via a reshape to
    # [1, n_items] followed by a matmul, but they were never consumed, relied
    # on a notebook-global `n_items`, and the reshape only matches when the
    # batch size equals n_items. If per-item value scaling is wanted, an
    # element-wise product such as `item1_dense * value1_dense` is presumably
    # what was intended -- TODO confirm before wiring it in.
    subtracted = tf.subtract(item1_dense, item2_dense)

    # Single linear unit, no bias; kernel_initializer=None keeps the layer's
    # default initializer.
    out = tf.layers.dense(
        inputs=subtracted,
        units=1,
        use_bias=False,
        kernel_initializer=None)

    # define head
    my_head = tf.contrib.estimator.regression_head(
        label_dimension=1,
        loss_fn=None  # custom loss, default: mean_squared_error
    )

    return my_head.create_estimator_spec(
        features=features,
        mode=mode,
        labels=labels,
        optimizer=tf.train.FtrlOptimizer(params['step_size']),
        logits=out,
    )

In [6]:
# Custom estimator wrapping model_fn. The feature columns are handed over as
# a list in the order item1, value1, item2, value2, together with the
# optimizer step size.
regressor = tf.estimator.Estimator(
    model_fn=model_fn,
    params=dict(
        feature_columns=[item1_col, value1_col, item2_col, value2_col],
        step_size=0.2,
    ),
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpwg4__zap', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7eff4df4de80>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [7]:
# Fit the estimator on the training input function; steps=None sets no
# explicit cap on the number of training steps.
regressor.train(input_fn=input_fn_train, steps=None)

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpwg4__zap/model.ckpt.
INFO:tensorflow:loss = 0.00428363, step = 1
INFO:tensorflow:global_step/sec: 304.048
INFO:tensorflow:loss = 7.285021e-08, step = 101 (0.330 sec)
INFO:tensorflow:global_step/sec: 347.694
INFO:tensorflow:loss = 2.9991124e-08, step = 201 (0.287 sec)
INFO:tensorflow:global_step/sec: 325.285
INFO:tensorflow:loss = 1.317915e-08, step = 301 (0.307 sec)
INFO:tensorflow:global_step/sec: 350.761
INFO:tensorflow:loss = 3.5386517e-08, step = 401 (

<tensorflow.python.estimator.estimator.Estimator at 0x7eff4df4dcf8>

In [8]:
# construct prediction table

# One row per known item on side 1 with value 0; side 2 is an empty item
# with value 0.
x_pred = pd.DataFrame({'item1': list_items})
x_pred['value1'] = 0
x_pred['item2'] = ''
x_pred['value2'] = 0

# The same rows again, this time with value 1 on side 1.
x_pred_ext = x_pred.copy()
x_pred_ext['value1'] = 1
# pd.concat replaces DataFrame.append, which is deprecated since pandas 1.4
# and removed in pandas 2.0.
x_pred = pd.concat([x_pred, x_pred_ext], ignore_index=True)

# swap 1 <--> 2: mirror every row so each item also appears on side 2.
# An explicit copy is taken so relabelling the columns never touches x_pred.
x_pred2 = x_pred[['item2', 'value2', 'item1', 'value1']].copy()
x_pred2.columns = ['item1', 'value1', 'item2', 'value2']
x_pred = pd.concat([x_pred, x_pred2], ignore_index=True).drop_duplicates()

In [9]:
# prediction: score the whole table as one unshuffled batch so the results
# come back in the row order of x_pred
predictions = list(regressor.predict(
    input_fn=tf.estimator.inputs.pandas_input_fn(
        x=x_pred, batch_size=len(x_pred), num_epochs=1, shuffle=False)))

# output: take the first (only) predicted value per row, attach it as the
# 'shift' column and write the table to disk
pred = [np.float64(row['predictions'][0]) for row in predictions]
x_pred['shift'] = pred
x_pred.to_csv('predictions.csv', sep=',', index=False)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpwg4__zap/model.ckpt-2704
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
