In [1]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
train_file = '../data/train.csv'
test_file = '../data/test.csv'
NUMERIC_COLS = [
    "ps_reg_01", "ps_reg_02", "ps_reg_03", "ps_car_12", "ps_car_13",
    "ps_car_14", "ps_car_15"
]
IGNORE_COLS = [
    "id", "target", "ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
    "ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08", "ps_calc_09",
    "ps_calc_10", "ps_calc_11", "ps_calc_12", "ps_calc_13", "ps_calc_14",
    "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin", "ps_calc_18_bin",
    "ps_calc_19_bin", "ps_calc_20_bin"
]
dfTrain = pd.read_csv(train_file)
dfTest = pd.read_csv(test_file)

In [3]:
df = pd.concat([dfTrain, dfTest])
# 特征字典，key是每一列，即每个field，value是每个值对应的feature_id
feature_dict = {}
# 特征总数量
total_feature = 0
for col in df.columns:
    if col in IGNORE_COLS or col in NUMERIC_COLS:
        continue
    else:
        # 查看这一列有多少个unique的值
        unique_val = df[col].unique()
        feature_dict[col] = dict(
            zip(unique_val,
                range(total_feature,
                      len(unique_val) + total_feature)))
        total_feature += len(unique_val)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [4]:
train_y = dfTrain[['target']].values.tolist()
dfTrain.drop(['target', 'id'], axis=1, inplace=True)
# 获取类型为numeric列的值
train_feature_numeric_value = dfTrain[NUMERIC_COLS]
dfTrain.drop(NUMERIC_COLS,axis=1,inplace=True)

train_feature_index = dfTrain.copy()
train_feature_value = dfTrain.copy()
for col in train_feature_index.columns:
    if col in IGNORE_COLS:
        train_feature_index.drop(col, axis=1, inplace=True)
        train_feature_value.drop(col, axis=1, inplace=True)
        continue
    else:
        train_feature_index[col] = train_feature_index[col].map(feature_dict[col])
        train_feature_value[col] = 1
train_y = np.reshape(np.array(train_y), (-1, 1))

In [13]:
dcn_params = {
    "embedding_size": 8,
    "deep_layers": [32, 32],
    "dropout_deep": [0.5, 0.5, 0.5],
    "deep_layers_activation": tf.nn.relu,
    "epoch": 30,
    "batch_size": 1024,
    "learning_rate": 0.001,
    "optimizer_type": "adam",
    "batch_norm": 1,
    "batch_norm_decay": 0.995,
    "l2_reg": 0.01,
    "verbose": True,
    "random_seed": 0,
    "cross_layer_num":3,
    "numeric_feature_size": len(NUMERIC_COLS)
}
dcn_params['feature_size'] = total_feature
dcn_params['field_size'] = len(train_feature_index.columns)
print(total_feature)
print(dcn_params['field_size'])
print(len(NUMERIC_COLS))

247
30
7


In [6]:
weights = dict()
weights['feature_embeddings'] = tf.Variable(tf.random_normal(
    [dcn_params['feature_size'], dcn_params['embedding_size']],
    mean=0.0,
    stddev=0.01),
                                            name='feature_embeddings')
weights['feature_bias'] = tf.Variable(tf.random_normal(
    [dcn_params['feature_size'], 1], mean=0.0, stddev=0.01),
                                      name='feature_bias')
total_size = dcn_params['field_size'] * dcn_params[
    'embedding_size'] + dcn_params['numeric_feature_size']
num_layer = len(dcn_params['deep_layers'])
glorot = np.sqrt(2.0 / (total_size + dcn_params['deep_layers'][0]))

weights['layer_0'] = tf.Variable(
    tf.random_normal([total_size, dcn_params['deep_layers'][0]],
                     mean=0.0,
                     stddev=glorot))
weights['bias_0'] = tf.Variable(
    tf.random_normal([1, dcn_params['deep_layers'][0]],
                     mean=0.0,
                     stddev=glorot))
for i in range(1, num_layer):
    glorot = np.sqrt(
        2.0 /
        (dcn_params['deep_layers'][i - 1] + dcn_params['deep_layers'][i]))
    weights['layer_%d' % i] = tf.Variable(
        tf.random_normal(
            [dcn_params['deep_layers'][i - 1], dcn_params['deep_layers'][i]],
            mean=0.0,
            stddev=glorot))
    weights['bias_%d' % i] = tf.Variable(
        tf.random_normal([1, dcn_params['deep_layers'][i]],
                         mean=0.0,
                         stddev=glorot))
for i in range(dcn_params['cross_layer_num']):
    weights["cross_layer_%d" % i] = tf.Variable(
        tf.random_normal([total_size, 1], mean=0.0, stddev=glorot))
    weights["cross_bias_%d" % i] = tf.Variable(
        tf.random_normal([total_size, 1], mean=0.0, stddev=glorot))

input_size = total_size + dcn_params['deep_layers'][-1]

glorot = np.sqrt(2.0 / (input_size + 1))
weights['concat_projection'] = tf.Variable(
    tf.random_normal([input_size, 1], mean=0.0, stddev=glorot))
weights['concat_bias'] = tf.Variable(tf.constant(0.01), dtype=np.float32)

In [7]:
feat_index = tf.placeholder(tf.int32, shape=[None, dcn_params['field_size']], name='feat_index')
feat_value = tf.placeholder(tf.float32, shape=[None, dcn_params['field_size']], name='feat_value')
numeric_value = tf.placeholder(tf.float32,
                               shape=[None, dcn_params['numeric_feature_size']],
                               name='num_value')
label = tf.placeholder(tf.float32, shape=[None, 1], name='label')

In [8]:
embeddings = tf.nn.embedding_lookup(weights['feature_embeddings'], feat_index)
reshaped_feat_value = tf.reshape(feat_value,
                                 shape=[-1, dcn_params['field_size'], 1])
embeddings = tf.multiply(embeddings, reshaped_feat_value)

In [12]:
# numeric_size + field_size * embedding_size
x0 = tf.concat([
    numeric_value,
    tf.reshape(embeddings,
               [-1, dcn_params['field_size'] * dcn_params['embedding_size']])
],
               axis=1)
# deep part
y_deep = x0
for i in range(0, len(dcn_params['deep_layers'])):
    y_deep = tf.matmul(y_deep,
                       weights['layer_%d' % i]) + weights['bias_%d' % i]
    y_deep = dcn_params['deep_layers_activation'](y_deep)

# y_deep: (?, dcn_params['deep_layers'][-1])

# cross part

x0_reshape = tf.reshape(x0, [-1, total_size, 1])

# 这种实现方式占内存
# for l in range(dcn_params['cross_layer_num']):
#     # 这里存疑
#     x_l = tf.tensordot(tf.matmul(x0_reshape, x_l, transpose_b=True),
#                        weights["cross_layer_%d" % l],
#                        axes=1) + weights['cross_bias_%d' % i] + x_l
x_l = x0_reshape
for l in range(dcn_params['cross_layer_num']):
    xb = tf.tensordot(tf.reshape(x_l, [-1, 1, total_size]),
                      weights["cross_layer_%d" % l], 1)
    x_l = tf.multiply(x0_reshape, xb) + weights['cross_bias_%d' % i] + x_l
cross_network_out = tf.reshape(x_l, (-1, total_size))

# concat part
concat_input = tf.concat([cross_network_out, y_deep], axis=1)
print(cross_network_out.shape)
print(y_deep.shape)
print(x0.shape)
out = tf.matmul(concat_input,
                weights['concat_projection']) + weights['concat_bias']
out = tf.nn.sigmoid(out)
loss = tf.losses.log_loss(label, out)
optimizer = tf.train.AdamOptimizer(learning_rate=dcn_params['learning_rate'],
                                   beta1=0.9,
                                   beta2=0.999,
                                   epsilon=1e-8).minimize(loss)

(?, 247)
(?, 32)
(?, 247)


In [10]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_size = int(len(train_feature_index)/dcn_params['batch_size'])
    for i in range(dcn_params['epoch']):
        for j in range(batch_size):
            start = i * dcn_params['batch_size']
            end = (i+1) * dcn_params['batch_size']
            end = end if end<len(train_feature_index) else len(train_feature_index)
            feat_index_batch = train_feature_index[start:end]
            feat_value_batch = train_feature_value[start:end]
            feat_numeric_value_batch = train_feature_numeric_value[start:end]
            label_batch = train_y[start:end]
            feed_dict = {
                feat_index:feat_index_batch,
                feat_value:feat_value_batch,
                numeric_value:feat_numeric_value_batch,
                label:label_batch
            }
            l,o = sess.run([loss,optimizer], feed_dict)
            print(l)

1.1480167
1.0077935
0.88329786
0.77332556
0.676273
0.59070575
0.5155548
0.45002508
0.39344358
0.3488221
0.3094035
0.2769541
0.25069693
0.22984628
0.21364105
0.20135972
0.19233568
0.18596679
0.16188584
0.15811683
0.15567833
0.1542659
0.15362176
0.15353443
0.15382954
0.15436718
0.15503475
0.16017298
0.16103034
0.1617975
0.16244242
0.16294551
0.16329738
0.16349743
0.16355018
0.16346508
0.14118245
0.14098166
0.14070672
0.14037105
0.13998735
0.1395668
0.13912013
0.13865685
0.13818575
0.1713267
0.17063288
0.16988662
0.16910955
0.16832158
0.16753843
0.16677485
0.1660431
0.16535157
0.16931258
0.16873437
0.16819037
0.16768612
0.16722275
0.16680124
0.16642162
0.16608247
0.16577917
0.17687869
0.1766331
0.17639929
0.17617798
0.17596902
0.1757722
0.1755852
0.17540625
0.17523368
0.1909211
0.1907798
0.19063514
0.19048896
0.19034213
0.19019483
0.19004712
0.18990093
0.18975554
0.16511305
0.16510352
0.16505602
0.16497931
0.16487654
0.164748
0.16459806
0.16443194
0.16425292
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.