In [1]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
train_file = '../data/train.csv'
test_file = '../data/test.csv'
NUMERIC_COLS = [
    "ps_reg_01", "ps_reg_02", "ps_reg_03", "ps_car_12", "ps_car_13",
    "ps_car_14", "ps_car_15"
]
IGNORE_COLS = [
    "id", "target", "ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
    "ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08", "ps_calc_09",
    "ps_calc_10", "ps_calc_11", "ps_calc_12", "ps_calc_13", "ps_calc_14",
    "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin", "ps_calc_18_bin",
    "ps_calc_19_bin", "ps_calc_20_bin"
]
dfTrain = pd.read_csv(train_file)
dfTest = pd.read_csv(test_file)

In [3]:
df = pd.concat([dfTrain, dfTest])
# 特征字典，key是每一列，即每个field，value是每个值对应的feature_id
feature_dict = {}
# 特征总数量
total_feature = 0
for col in df.columns:
    if col in IGNORE_COLS:
        continue
    elif col in NUMERIC_COLS:
        # 数字类型列，作为一个特征
        feature_dict[col] = total_feature
        total_feature += 1
    else:
        # 查看这一列有多少个unique的值
        unique_val = df[col].unique()
        feature_dict[col] = dict(
            zip(unique_val,
                range(total_feature,
                      len(unique_val) + total_feature)))
        total_feature += len(unique_val)
print(feature_dict)

{'ps_car_01_cat': {10: 0, 11: 1, 7: 2, 6: 3, 9: 4, 5: 5, 4: 6, 8: 7, 3: 8, 0: 9, 2: 10, 1: 11, -1: 12}, 'ps_car_02_cat': {1: 13, 0: 14}, 'ps_car_03_cat': {-1: 15, 0: 16, 1: 17}, 'ps_car_04_cat': {0: 18, 1: 19, 8: 20, 9: 21, 2: 22, 6: 23, 3: 24, 7: 25, 4: 26, 5: 27}, 'ps_car_05_cat': {1: 28, -1: 29, 0: 30}, 'ps_car_06_cat': {4: 31, 11: 32, 14: 33, 13: 34, 6: 35, 15: 36, 3: 37, 0: 38, 1: 39, 10: 40, 12: 41, 9: 42, 17: 43, 7: 44, 8: 45, 5: 46, 2: 47, 16: 48}, 'ps_car_07_cat': {1: 49, -1: 50, 0: 51}, 'ps_car_08_cat': {0: 52, 1: 53}, 'ps_car_09_cat': {0: 54, 2: 55, 3: 56, 1: 57, -1: 58, 4: 59}, 'ps_car_10_cat': {1: 60, 0: 61, 2: 62}, 'ps_car_11': {2: 63, 3: 64, 1: 65, 0: 66}, 'ps_car_11_cat': {12: 67, 19: 68, 60: 69, 104: 70, 82: 71, 99: 72, 30: 73, 68: 74, 20: 75, 36: 76, 101: 77, 103: 78, 41: 79, 59: 80, 43: 81, 64: 82, 29: 83, 95: 84, 24: 85, 5: 86, 28: 87, 87: 88, 66: 89, 10: 90, 26: 91, 54: 92, 32: 93, 38: 94, 83: 95, 89: 96, 49: 97, 93: 98, 1: 99, 22: 100, 85: 101, 78: 102, 31: 103, 3

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [4]:
train_y = dfTrain[['target']].values.tolist()
dfTrain.drop(['target', 'id'], axis=1, inplace=True)
train_feature_index = dfTrain.copy()
train_feature_value = dfTrain.copy()
for col in train_feature_index.columns:
    if col in IGNORE_COLS:
        train_feature_index.drop(col, axis=1, inplace=True)
        train_feature_value.drop(col, axis=1, inplace=True)
        continue
    elif col in NUMERIC_COLS:
        train_feature_index[col] = feature_dict[col]
    else:
        train_feature_index[col] = train_feature_index[col].map(
            feature_dict[col])
        train_feature_value[col] = 1
train_y = np.reshape(np.array(train_y), (-1, 1))

In [5]:
"""模型参数"""
xdfm_params = {
    "embedding_size": 8,
    "dropout_fm": [1.0, 1.0],
    "deep_layers": [32, 32],
    "dropout_deep": [0.5, 0.5, 0.5],
    "cross_layers" : [80, 80, 40],
    "deep_layer_activation": tf.nn.relu,
    "epoch": 30,
    "batch_size": 1024,
    "learning_rate": 0.0001,
    "optimizer": "adam",
    "batch_norm": 1,
    "batch_norm_decay": 0.995,
    "l2_reg": 0.01,
    "verbose": True,
    "eval_metric": 'gini_norm',
    "random_seed": 3
}
xdfm_params['feature_size'] = total_feature
xdfm_params['field_size'] = len(train_feature_index.columns)
print(total_feature)
print(len(train_feature_index.columns))

254
37


In [6]:
# weight
weights = dict()
# linear params
weights['feature_embeddings'] = tf.Variable(tf.random_normal(
    [xdfm_params['feature_size'], xdfm_params['embedding_size']], 0.0, 0.01),
                                            name='feature_embeddings')
weights['feature_bias'] = tf.Variable(tf.random_normal(
    [xdfm_params['feature_size'], 1], 0.0, 0.01),
                                      name='feature_bias')
linear_size = xdfm_params['field_size'] 
weights['linear_weight'] = tf.Variable(tf.random_normal(shape=[linear_size, 1]), dtype=tf.float32)
weights['linear_bias'] = tf.Variable(tf.random_normal(shape=[1]),dtype=tf.float32)

# deep params
input_size = xdfm_params['field_size'] * xdfm_params['embedding_size']
glort = np.sqrt(2.0 / (input_size + xdfm_params['deep_layers'][0]))
weights['deeplayer_weight_0'] = tf.Variable(tf.random_normal(
    shape=[input_size, xdfm_params['deep_layers'][0]], mean=0, stddev=glort),
                                            dtype=tf.float32)
weights['deeplayer_bias_0'] = tf.Variable(
    tf.random_normal(shape=[1, xdfm_params['deep_layers'][0]],
                     mean=0,
                     stddev=glort))

for i in range(1, len(xdfm_params['deep_layers'])):
    glort = np.sqrt(
        2.0 /
        (xdfm_params['deep_layers'][i - 1] + xdfm_params['deep_layers'][i]))
    weights['deeplayer_weight_%d' % i] = tf.Variable(tf.random_normal(
        shape=[
            xdfm_params['deep_layers'][i - 1], xdfm_params['deep_layers'][i]
        ],
        mean=0.0,
        stddev=glort),
                                                     dtype=tf.float32)
    weights['deeplayer_bias_%d' % i] = tf.Variable(tf.random_normal(
        shape=[1, xdfm_params['deep_layers'][i]], mean=0.0, stddev=glort),
                                            dtype=tf.float32)

deep_size = xdfm_params['deep_layers'][-1]
weights['deep_weight'] = tf.Variable(tf.random_normal(shape=[deep_size, 1]), dtype=tf.float32)
weights['deep_bias'] = tf.Variable(tf.random_normal(shape=[1]),dtype=tf.float32)
    
# cin params
weights['cross_layer_0'] = tf.Variable(
        tf.random_normal(shape=[
            1, xdfm_params['field_size'] * xdfm_params['field_size'], xdfm_params['cross_layers'][0]
        ], mean=0.0, stddev=0.1))
for i in range(1, len(xdfm_params['cross_layers'])):
    weights['cross_layer_%d' % i] = tf.Variable(
        tf.random_normal(shape=[
            1, xdfm_params['field_size'] *
            xdfm_params['cross_layers'][i-1], xdfm_params['cross_layers'][i]
        ], mean=0.0, stddev=0.1))

cross_size = sum(xdfm_params['cross_layers'])
weights['cross_weight'] = tf.Variable(tf.random_normal(shape=[cross_size, 1]), dtype=tf.float32)
weights['cross_bias'] = tf.Variable(tf.random_normal(shape=[1]),dtype=tf.float32)


# final params 
weights['final_weight'] = tf.Variable(tf.random_normal(shape=[3, 1]), dtype=tf.float32)
weights['final_bias'] = tf.Variable(tf.random_normal(shape=[1]),dtype=tf.float32)

In [7]:
# 输入
feat_index = tf.placeholder(tf.int32, shape=[None, None], name='feat_index')
feat_value = tf.placeholder(tf.float32, shape=[None, None], name='feat_value')
label = tf.placeholder(tf.float32, shape=[None, 1], name='label')

In [8]:
# embedding
reshaped_feat_value = tf.reshape(feat_value,
                                 shape=[-1, xdfm_params['field_size'], 1])
embeddings = tf.nn.embedding_lookup(weights['feature_embeddings'], feat_index)
embeddings = tf.multiply(embeddings, reshaped_feat_value)

In [9]:
# linear part
weight = tf.nn.embedding_lookup(weights['feature_bias'], feat_index)
linear_out = tf.reduce_sum(tf.multiply(weight, reshaped_feat_value), 2)
linear_out = tf.matmul(linear_out , weights['linear_weight']) + weights['linear_bias']
print(linear_out.shape)

(?, 1)


In [23]:
# cin part
cross_layers = []
field_nums = []
final_result = []
# 对输入进行处理
reshape_cin_input = tf.reshape(
    embeddings,
    shape=[-1, xdfm_params['field_size'], xdfm_params['embedding_size']])
cross_layers.append(reshape_cin_input)
field_nums.append(int(xdfm_params['field_size']))
final_len = 0
split_tensor_0 = tf.split(cross_layers[0], xdfm_params['embedding_size']*[1], 2)
for idx, layer_size in enumerate(xdfm_params['cross_layers']):
    print(cross_layers[-1].shape)
    split_tensor = tf.split(cross_layers[-1], xdfm_params['embedding_size']*[1], 2)
    print(split_tensor[0].shape)
    # 计算外积
    dot_result_m = tf.matmul(split_tensor_0, split_tensor, transpose_b=True)
    print(dot_result_m.shape)
    dot_result_o = tf.reshape(dot_result_m,
                              shape=[
                                  xdfm_params['embedding_size'], -1,
                                  field_nums[0] * field_nums[-1]
                              ])
    print(dot_result_o.shape)
    dot_result = tf.transpose(dot_result_o, perm=[1, 0, 2])    
    print(dot_result.shape)
    filters = weights['cross_layer_%d' % idx]
    print(filters.shape)
    curr_out = tf.nn.conv1d(dot_result,
                            filters=filters,
                            stride=1,
                            padding="VALID")
    print(curr_out.shape)
    curr_out = tf.nn.relu(curr_out)
    curr_out = tf.transpose(curr_out, perm=[0, 2, 1])
    print(curr_out.shape)
    direct_connect = curr_out
    next_hidden = curr_out
    final_len += layer_size
    field_nums.append(int(layer_size))
    final_result.append(direct_connect)
    cross_layers.append(next_hidden)
result = tf.concat(final_result, axis=1)
print(result.shape)
cin_out = tf.reduce_sum(result, -1)
cin_out = tf.matmul(cin_out, weights['cross_weight']) + weights['cross_bias']

(?, 37, 8)
(?, 37, 1)
(8, ?, 37, 37)
(8, ?, 1369)
(?, 8, 1369)
(1, 1369, 80)
(?, 8, 80)
(?, 80, 8)
(?, 80, 8)
(?, 80, 1)
(8, ?, 37, 80)
(8, ?, 2960)
(?, 8, 2960)
(1, 2960, 80)
(?, 8, 80)
(?, 80, 8)
(?, 80, 8)
(?, 80, 1)
(8, ?, 37, 80)
(8, ?, 2960)
(?, 8, 2960)
(1, 2960, 40)
(?, 8, 40)
(?, 40, 8)
(?, 200, 8)


In [11]:
# dnn part
deep_out = tf.reshape(embeddings, [-1, xdfm_params['field_size'] * xdfm_params['embedding_size']])
for i in range(len(xdfm_params['deep_layers'])):
    deep_out = tf.matmul(deep_out, weights['deeplayer_weight_%d'%i]) + weights['deeplayer_bias_%d'%i]
    deep_out = tf.nn.relu(deep_out)
deep_out = tf.matmul(deep_out, weights['deep_weight']) + weights['deep_bias']
print(deep_out.shape)

(?, 1)


In [12]:
final_layer = tf.concat([linear_out, cin_out, deep_out], axis=1)
res = tf.matmul(final_layer,weights['final_weight'] ) + weights['final_bias']
logit = tf.nn.sigmoid(res)

In [13]:
loss = tf.losses.log_loss(label, logit)
optimizer = tf.train.AdamOptimizer(learning_rate=xdfm_params['learning_rate']).minimize(loss)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [14]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_size = int(len(train_feature_index)/xdfm_params['batch_size'])
    for i in range(xdfm_params['epoch']):
        for j in range(batch_size):
            start = i * xdfm_params['batch_size']
            end = (i+1) * xdfm_params['batch_size']
            end = end if end<len(train_feature_index) else len(train_feature_index)
            feat_index_batch = train_feature_index[start:end]
            feat_value_batch = train_feature_value[start:end]
            label_batch = train_y[start:end]
            feed_dict = {
                feat_index:feat_index_batch,
                feat_value:feat_value_batch,
                label:label_batch
            }
            l,o = sess.run([loss,optimizer], feed_dict)
            print(l)

2.4881177
2.476877
2.4656577
2.4544654
2.4433217
2.4322414
2.4212332
2.410305
2.3994555
2.3755052
2.3648849
2.3543231
2.3438103
2.3333373
2.3228934
2.3124666
2.3020468
2.2916288
2.298889
2.2883945
2.2778764
2.2673316
2.2567606
2.2461588
2.2355254
2.2248566
2.2141492
2.1999784
2.1892438
2.1784656
2.1676397
2.156764
2.145834
2.1348476
2.123803
2.1126952
2.1131983
2.101869
2.0904686
2.079003
2.0674734
2.0558946
2.044274
2.0326266
2.020966
1.9938593
1.9823277
1.9707787
1.9592164
1.9476277
1.9359907
1.9242959
1.9125276
1.9006746
1.887008
1.8749936
1.8628726
1.850639
1.8382951
1.8258338
1.8132558
1.8005549
1.7877265
1.7691326
1.756096
1.7429289
1.7296286
1.7161901
1.702614
1.688897
1.6750376
1.661035
1.642071
1.6279202
1.6136248
1.5991838
1.5845969
1.5698615
1.5549754
1.5399368
1.5247443
1.5214747
1.5057855
1.48992
1.4738764
1.4576578
1.4412652
1.4247016
1.4079703
1.391076
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
