In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Load the pairwise table. Both item identifiers are forced to str so they are
# never parsed as numbers; 'dif' is parsed as float and used as the target.
data = pd.read_csv('input_full.csv',
                   dtype={'item1': str, 'item2': str, 'dif': float})
print('data size: {}'.format(data.shape[0]))

# Model inputs are the two item columns; the label is the 'dif' column.
x = data.loc[:, ['item1', 'item2']]
y = data.loc[:, 'dif']

data size: 2092535


In [3]:
# Build the item vocabulary: every distinct identifier seen in either item
# column, in order of first appearance.
# pd.concat is used instead of Series.append, which is deprecated since
# pandas 1.4 and removed in pandas 2.0; the resulting values are the same.
list_items = pd.concat([x['item1'], x['item2']]).drop_duplicates().tolist()
print("number of unique items: {}".format(len(list_items)))

# columns
# Both item features are one-hot (indicator) encoded against the SAME
# vocabulary list, so a given item occupies the same position in both
# encodings.
item1_col = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'item1', vocabulary_list=list_items))
item2_col = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'item2', vocabulary_list=list_items))
# Numeric column for the target; it is not referenced by the other cells
# visible in this notebook.
dif_col = tf.feature_column.numeric_column('dif')

number of unique items: 751


In [4]:
# Training input function: serves the item pairs (x) with their targets (y)
# straight from the pandas objects, shuffled, in batches of 100 rows, for a
# single pass over the data (num_epochs=1).
input_fn_train = tf.estimator.inputs.pandas_input_fn(
    x=x, y=y, batch_size=100, num_epochs=1, shuffle=True)

In [5]:
# build model function
def model_fn(features, labels, mode, params):
    """Estimator model function: one linear unit on the difference of two encoded items.

    Each of the two item features is turned into a dense tensor by its own
    feature column, the second encoding is subtracted from the first, and the
    result goes through a single bias-free dense unit whose output is used as
    the logits of a one-dimensional regression head.

    Args:
        features: mapping of feature tensors; the keys 'item1' and 'item2'
            are read.
        labels: regression targets, forwarded unchanged to the head.
        mode: the estimator mode, forwarded unchanged to the head.
        params: dict with
            'feature_columns': sequence whose element 0 encodes 'item1' and
                element 1 encodes 'item2';
            'step_size': learning rate handed to the FTRL optimizer.

    Returns:
        The estimator spec produced by the regression head.
    """
    columns = params['feature_columns']

    # Encode each item with its own column; every input_layer call only sees
    # the single feature that its column consumes.
    encoded_item1 = tf.feature_column.input_layer(
        {'item1': features['item1']}, columns[0])
    encoded_item2 = tf.feature_column.input_layer(
        {'item2': features['item2']}, columns[1])

    # item1 encoding minus item2 encoding, then a linear map to one value.
    # No bias term is used; kernel_initializer=None leaves the initializer
    # choice to the layer.
    difference = tf.subtract(encoded_item1, encoded_item2)
    logits = tf.layers.dense(
        difference,
        units=1,
        use_bias=False,
        kernel_initializer=None)

    # loss_fn=None keeps the head's default loss (mean squared error, per the
    # original author's note); supply a callable here for a custom loss.
    head = tf.contrib.estimator.regression_head(
        label_dimension=1,
        loss_fn=None)

    return head.create_estimator_spec(
        features=features,
        mode=mode,
        labels=labels,
        optimizer=tf.train.FtrlOptimizer(params['step_size']),
        logits=logits)

In [6]:
# Wrap model_fn in an Estimator. The params dict is what model_fn receives as
# `params`: the two item feature columns (item1 first, item2 second) and the
# FTRL step size.
regressor = tf.estimator.Estimator(
    model_fn=model_fn,
    params=dict(feature_columns=[item1_col, item2_col], step_size=0.2))

# Train with no explicit step limit (steps=None); the length of the run is
# therefore governed by input_fn_train. Left as the last expression so the
# cell still displays the returned estimator.
regressor.train(input_fn=input_fn_train, steps=None)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp8jexgyq2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f20a22e4dd8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, u

INFO:tensorflow:global_step/sec: 340.311
INFO:tensorflow:loss = 0.327683, step = 6101 (0.294 sec)
INFO:tensorflow:global_step/sec: 319.552
INFO:tensorflow:loss = 0.350402, step = 6201 (0.313 sec)
INFO:tensorflow:global_step/sec: 349.818
INFO:tensorflow:loss = 0.503439, step = 6301 (0.286 sec)
INFO:tensorflow:global_step/sec: 346.697
INFO:tensorflow:loss = 0.363038, step = 6401 (0.288 sec)
INFO:tensorflow:global_step/sec: 348.336
INFO:tensorflow:loss = 0.350803, step = 6501 (0.287 sec)
INFO:tensorflow:global_step/sec: 336.221
INFO:tensorflow:loss = 0.123463, step = 6601 (0.297 sec)
INFO:tensorflow:global_step/sec: 346.711
INFO:tensorflow:loss = 0.135619, step = 6701 (0.288 sec)
INFO:tensorflow:global_step/sec: 350.597
INFO:tensorflow:loss = 0.325959, step = 6801 (0.285 sec)
INFO:tensorflow:global_step/sec: 328.172
INFO:tensorflow:loss = 0.219574, step = 6901 (0.305 sec)
INFO:tensorflow:global_step/sec: 348.62
INFO:tensorflow:loss = 0.123288, step = 7001 (0.287 sec)
INFO:tensorflow:globa

INFO:tensorflow:loss = 0.392366, step = 14401 (0.304 sec)
INFO:tensorflow:global_step/sec: 386.245
INFO:tensorflow:loss = 0.216794, step = 14501 (0.259 sec)
INFO:tensorflow:global_step/sec: 343.709
INFO:tensorflow:loss = 0.190906, step = 14601 (0.291 sec)
INFO:tensorflow:global_step/sec: 353.697
INFO:tensorflow:loss = 0.201757, step = 14701 (0.283 sec)
INFO:tensorflow:global_step/sec: 359.862
INFO:tensorflow:loss = 0.151007, step = 14801 (0.278 sec)
INFO:tensorflow:global_step/sec: 343.602
INFO:tensorflow:loss = 0.0964957, step = 14901 (0.291 sec)
INFO:tensorflow:global_step/sec: 381.206
INFO:tensorflow:loss = 0.0165505, step = 15001 (0.262 sec)
INFO:tensorflow:global_step/sec: 368.94
INFO:tensorflow:loss = 0.0852365, step = 15101 (0.271 sec)
INFO:tensorflow:global_step/sec: 359.698
INFO:tensorflow:loss = 0.0275539, step = 15201 (0.278 sec)
INFO:tensorflow:global_step/sec: 347.883
INFO:tensorflow:loss = 0.139114, step = 15301 (0.288 sec)
INFO:tensorflow:global_step/sec: 340.89
INFO:ten

<tensorflow.python.estimator.estimator.Estimator at 0x7f20a22e4ba8>

In [7]:
# construct prediction table
# Start from every distinct (item1, item2) pair observed in the inputs.
x_unique = x.drop_duplicates()

# The same pairs with the two items swapped: the old 'item2' values become
# 'item1' and vice versa. rename() returns a new frame, so nothing is assigned
# onto a slice of `x`.
x_pred2 = x_unique.rename(
    columns={'item1': 'item2', 'item2': 'item1'})[['item1', 'item2']]

# Union of both orientations without repeats. pd.concat replaces
# DataFrame.append (removed in pandas 2.0). The index is rebuilt AFTER
# de-duplication so that it is contiguous (0..n-1).
x_pred = (
    pd.concat([x_unique, x_pred2], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Add one final row whose two items are empty strings.
# NOTE(review): presumably a baseline row for items outside the vocabulary -
# confirm the intent with the author.
# The previous `x_pred.loc[len(x_pred)] = ['', '']` ran on an index that still
# had gaps from drop_duplicates(); whenever the label len(x_pred) already
# existed it overwrote a real pair instead of adding a row. Concatenating
# always appends.
x_pred = pd.concat(
    [x_pred, pd.DataFrame([['', '']], columns=['item1', 'item2'])],
    ignore_index=True)

In [8]:
# Score the whole prediction table in one pass: a single batch as large as the
# table, one epoch, no shuffling, so the results come back in row order.
predictions = list(regressor.predict(
    input_fn=tf.estimator.inputs.pandas_input_fn(
        x=x_pred,
        batch_size=len(x_pred),
        num_epochs=1,
        shuffle=False)))

# output
# Each result is a dict; take the first entry of its 'predictions' value as a
# float64, store it as the 'shift' column and export the table without the
# index.
pred = [np.float64(row['predictions'][0]) for row in predictions]
x_pred['shift'] = pred
x_pred.to_csv('predictions.csv', sep=',', index=False)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp8jexgyq2/model.ckpt-20926
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
