In [None]:
import os
import pickle
from collections import defaultdict
from random import randint

import numpy as np
import tensorflow as tf
from sklearn.metrics import mean_squared_error

# Restrict TensorFlow's logger to the ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# Input/output locations, relative to the notebook's working directory.
# They are assembled with os.path.join so the separator matches the host OS:
# the previous raw strings hard-coded Windows backslashes, which on POSIX
# systems name a single file containing literal "\" characters.
DATA_DIR = os.path.join("..", "data")

data_path = os.path.join(DATA_DIR, "training_data.p")
labels_path = os.path.join(DATA_DIR, "training_labels_continuous.p")
validation_data_path = os.path.join(DATA_DIR, "validation_data.p")
validation_labels_path = os.path.join(DATA_DIR, "validation_labels_continuous.p")
validation_targets_path = os.path.join(DATA_DIR, "validation_targets.p")
# Directory handed to the estimator as its model_dir.
model_dir = os.path.join("..", "model")

# Feature columns read from the data dictionaries.
COLUMNS = [
    'weekday', 'hour', 'region', 'city', 'adexchange',
    'advertiser', 'os', 'browser', 'usertag',
]
# Presumably the name of the target column; not referenced by the code
# visible in this notebook section.
LABEL_COLUMN = "payprice"
# Every feature column is categorical, so this is an independent copy of COLUMNS.
CATEGORICAL_COLUMNS = list(COLUMNS)

# String identifiers, presumably the known values of the 'usertag' feature
# (TODO confirm). Not referenced by the code visible in this notebook section;
# build_estimator hashes 'usertag' into buckets instead of using this list.
USERTAGS = ['13776', '10133', '10146', '10052', '13800', '13678', '10077', '10057', '10048',
            '16753', '16706', '10120', '11278', '10140', '10127', '10684', '10138', '10148', '11092',
            '15398', '10067', '11632', '10117', '10114', '10145', '11576', '14273', '10059', '16617',
            '10083', '13403', '10126', '11944', '13874', '11724', '10076', '10131', '10093', '11423',
            '10110', '10123', '16751', '13496', '10149', '10111', '10031', '10142', '10118', '10074',
            '10024', '16593', '10006', '10116', '11680', '10130', '10147', '10102', '10063',
            '10075', '11512', '10129', '10079', '10125', '10115', '13042', '11379', '16661', '13866']

# Passed as dnn_hidden_units to the DNNLinearCombinedRegressor in build_estimator.
dnn_hidden_layers_param = [100,50]

In [None]:
def load_data(path):
    """Unpickle and return the object stored in the file at ``path``.

    Args:
        path: Location of a pickle file, opened in binary read mode.

    Returns:
        Whatever object the pickle file contains.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file, so
    # only use this on files from a trusted source.
    # The context manager closes the file handle even if unpickling fails;
    # previously the handle was opened inline and never closed.
    with open(path, "rb") as handle:
        return pickle.load(handle)

def build_estimator(model_dir,model_type=None):
    """Build the combined linear + DNN regressor used by this notebook.

    Every feature is a sparse categorical column. The raw sparse columns feed
    the linear ("wide") part; an embedding of each column feeds the DNN
    ("deep") part, whose hidden layer sizes come from the module-level
    ``dnn_hidden_layers_param``.

    Args:
        model_dir: Directory passed to the estimator as ``model_dir``.
        model_type: Accepted for backward compatibility; not used.

    Returns:
        A ``tf.contrib.learn.DNNLinearCombinedRegressor``.
    """
    layers = tf.contrib.layers

    # --- Sparse base columns -------------------------------------------------
    weekday = layers.sparse_column_with_keys(
        column_name="weekday",
        keys=[str(day) for day in range(7)],
        combiner="sqrtn")
    # The keys are generated rather than typed out: the hand-written list was
    # missing a comma after "7", so Python's implicit string concatenation
    # turned "7" "8" into the single key "78" and the vocabulary contained
    # neither "7" nor "8".
    hour = layers.sparse_column_with_keys(
        column_name="hour",
        keys=[str(h) for h in range(24)],
        combiner="sqrtn")
    region = layers.sparse_column_with_hash_bucket(
        "region", hash_bucket_size=100, combiner="sqrtn")
    city = layers.sparse_column_with_hash_bucket(
        "city", hash_bucket_size=1000, combiner="sqrtn")
    adexchange = layers.sparse_column_with_keys(
        column_name="adexchange",
        keys=["1","2","3","4","null"],
        combiner="sqrtn")
    advertiser = layers.sparse_column_with_hash_bucket(
        "advertiser", hash_bucket_size=20, combiner="sqrtn")
    # Named os_col (not os) so the local does not shadow the standard-library
    # module name inside this function.
    os_col = layers.sparse_column_with_hash_bucket(
        "os", hash_bucket_size=100, combiner="sqrtn")
    browser = layers.sparse_column_with_hash_bucket(
        "browser", hash_bucket_size=100, combiner="sqrtn")
    usertag = layers.sparse_column_with_hash_bucket(
        "usertag", hash_bucket_size=1000000, combiner="sqrtn")

    # --- Wide columns and deep columns ---------------------------------------
    wide_columns = [weekday, hour, region, city, adexchange,
                    advertiser, os_col, browser, usertag]

    # (column, embedding dimension) pairs, in the same order as wide_columns.
    embedding_specs = [
        (weekday, 4),
        (hour, 4),
        (region, 4),
        (city, 4),
        (adexchange, 4),
        (advertiser, 4),
        (os_col, 4),
        (browser, 4),
        (usertag, 8),
    ]
    deep_columns = [
        layers.embedding_column(column, dimension=dim, combiner="sqrtn")
        for column, dim in embedding_specs
    ]

    estimator = tf.contrib.learn.DNNLinearCombinedRegressor(
        # wide settings
        linear_feature_columns=wide_columns,
        linear_optimizer=tf.train.FtrlOptimizer(learning_rate=0.1,
                                                l1_regularization_strength=0.001,
                                                l2_regularization_strength=0.001),
        # deep settings
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=dnn_hidden_layers_param,
        dnn_optimizer=tf.train.ProximalAdagradOptimizer(learning_rate=0.1,
                                                        l1_regularization_strength=0.001,
                                                        l2_regularization_strength=0.001),
        # Checkpoint every 1800 seconds.
        config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1800),
        model_dir=model_dir
    )

    return estimator

def input_fn(data,labels, first=None, last=None):
    """Turn a slice of the data set into estimator inputs.

    Args:
        data: Mapping from each name in CATEGORICAL_COLUMNS to a sliceable
            sequence of values.
        labels: Sliceable sequence of labels.
        first: Start of the slice applied to every column and to the labels
            (None means from the beginning).
        last: End of the slice, exclusive (None means to the end).

    Returns:
        A ``(features, label)`` pair: ``features`` maps each categorical column
        name to a ``tf.SparseTensor`` holding one value per row of the slice,
        and ``label`` is a ``tf.constant`` built from the sliced labels.
    """
    features = {}
    for column in CATEGORICAL_COLUMNS:
        window = data[column][first:last]
        n_rows = len(window)
        # One entry per row, all in column 0 of an (n_rows, 1) sparse tensor.
        features[column] = tf.SparseTensor(
            indices=[[row, 0] for row in range(n_rows)],
            values=window,
            shape=[n_rows, 1])

    return features, tf.constant(labels[first:last])


def train(data,labels,model_dir, epochs=200, batch_size=10000):
    """Train a fresh estimator on randomly positioned windows of the data.

    Each iteration picks a random start index, takes the ``batch_size``
    consecutive rows from there, and runs a single ``partial_fit`` step on
    that window.

    Args:
        data: Mapping of column name to sequence of values; the length of
            ``data['weekday']`` is used as the number of rows.
        labels: Sequence of labels aligned with the rows of ``data``.
        model_dir: Directory passed on to ``build_estimator``.
        epochs: Number of single-step ``partial_fit`` calls to make (one
            window per call, not a full pass over the data).
        batch_size: Number of consecutive rows in each window. Defaults to
            10000, the value that used to be hard-coded.

    Returns:
        The trained estimator.
    """
    # Clamp at zero so a data set with fewer rows than one batch trains on
    # everything it has; without the clamp randint(0, negative) raises
    # ValueError.
    size_limit = max(0, len(data['weekday']) - batch_size)
    m = build_estimator(model_dir)

    for i in range(epochs):
        if i % 10 == 0:
            print("Training step: " + str(i))
        starting_index = randint(0,size_limit)
        last_index = starting_index + batch_size
        m.partial_fit(input_fn=lambda: input_fn(data,labels,starting_index,last_index), steps=1)

    return m

def predict(model,data,labels):
    """Run ``model.predict`` over the whole of ``data``.

    Args:
        model: Object exposing ``predict(input_fn=..., as_iterable=...)``.
        data: Feature mapping forwarded to ``input_fn``.
        labels: Labels forwarded to ``input_fn`` alongside ``data``.

    Returns:
        Whatever ``model.predict`` returns when called with
        ``as_iterable=False``.
    """
    def _full_dataset_inputs():
        # No first/last bounds, so input_fn covers every row.
        return input_fn(data, labels)

    return model.predict(input_fn=_full_dataset_inputs, as_iterable=False)

In [None]:
# Unpickle the training and validation sets, features first and labels second.
print("Loading data...")
training_data, training_labels = load_data(data_path), load_data(labels_path)
validation_data, validation_labels = (
    load_data(validation_data_path),
    load_data(validation_labels_path),
)
print("Done.")

In [None]:
# Fit the estimator with 200 single-step updates.
print("Training model")
model = train(
    data=training_data,
    labels=training_labels,
    model_dir=model_dir,
    epochs=200,
)
print("Done.")

In [None]:
# Predict on the validation set and prepare arrays for scoring.
print("Evaluating model")
results = predict(model,validation_data,validation_labels)
# Predictions are cast to int (which drops any fractional part) and then
# limited to the range [0, 200].
# NOTE(review): the 0-200 bounds are hard-coded; confirm they match the label range.
y_est = np.clip(np.array(results).astype(int), 0, 200)
y_true = np.array(validation_labels)
print("Done.")

In [None]:
# Mean squared error of the integer-cast, clipped predictions against the validation labels.
print(mean_squared_error(y_true,y_est))