In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
IGNORE_COLS = [
    "id", "target",
    "ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
    "ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08",
    "ps_calc_09", "ps_calc_10", "ps_calc_11", "ps_calc_12",
    "ps_calc_13", "ps_calc_14",
    "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
    "ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin"
]
NUMERIC_COLS = [
    "ps_reg_01", "ps_reg_02", "ps_reg_03", "ps_car_12", "ps_car_13",
    "ps_car_14", "ps_car_15"
]
dfTrain = pd.read_csv('../data/train.csv')
dfTest = pd.read_csv('../data/test.csv')

In [3]:
df = pd.concat([dfTrain, dfTest])
# 特征字典，key是每一列，即每个field，value是每个值对应的feature_id
feature_dict = {}
# 特征总数量
total_feature = 0
for col in df.columns:
    if col in IGNORE_COLS:
        continue
    elif col in NUMERIC_COLS:
        # 数字类型列，作为一个特征
        feature_dict[col] = total_feature
        total_feature += 1
    else:
        # 查看这一列有多少个unique的值
        unique_val = df[col].unique()
        feature_dict[col] = dict(
            zip(unique_val,
                range(total_feature,
                      len(unique_val) + total_feature)))
        total_feature += len(unique_val)
print(feature_dict)

{'ps_car_01_cat': {10: 0, 11: 1, 7: 2, 6: 3, 9: 4, 5: 5, 4: 6, 8: 7, 3: 8, 0: 9, 2: 10, 1: 11, -1: 12}, 'ps_car_02_cat': {1: 13, 0: 14}, 'ps_car_03_cat': {-1: 15, 0: 16, 1: 17}, 'ps_car_04_cat': {0: 18, 1: 19, 8: 20, 9: 21, 2: 22, 6: 23, 3: 24, 7: 25, 4: 26, 5: 27}, 'ps_car_05_cat': {1: 28, -1: 29, 0: 30}, 'ps_car_06_cat': {4: 31, 11: 32, 14: 33, 13: 34, 6: 35, 15: 36, 3: 37, 0: 38, 1: 39, 10: 40, 12: 41, 9: 42, 17: 43, 7: 44, 8: 45, 5: 46, 2: 47, 16: 48}, 'ps_car_07_cat': {1: 49, -1: 50, 0: 51}, 'ps_car_08_cat': {0: 52, 1: 53}, 'ps_car_09_cat': {0: 54, 2: 55, 3: 56, 1: 57, -1: 58, 4: 59}, 'ps_car_10_cat': {1: 60, 0: 61, 2: 62}, 'ps_car_11': {2: 63, 3: 64, 1: 65, 0: 66}, 'ps_car_11_cat': {12: 67, 19: 68, 60: 69, 104: 70, 82: 71, 99: 72, 30: 73, 68: 74, 20: 75, 36: 76, 101: 77, 103: 78, 41: 79, 59: 80, 43: 81, 64: 82, 29: 83, 95: 84, 24: 85, 5: 86, 28: 87, 87: 88, 66: 89, 10: 90, 26: 91, 54: 92, 32: 93, 38: 94, 83: 95, 89: 96, 49: 97, 93: 98, 1: 99, 22: 100, 85: 101, 78: 102, 31: 103, 3

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [4]:
train_y = dfTrain[['target']].values.tolist()
dfTrain.drop(['target', 'id'], axis=1, inplace=True)
train_feature_index = dfTrain.copy()
train_feature_value = dfTrain.copy()
for col in train_feature_index.columns:
    if col in IGNORE_COLS:
        train_feature_index.drop(col, axis=1, inplace=True)
        train_feature_value.drop(col, axis=1, inplace=True)
        continue
    elif col in NUMERIC_COLS:
        train_feature_index[col] = feature_dict[col]
    else:
        train_feature_index[col] = train_feature_index[col].map(
            feature_dict[col])
        train_feature_value[col] = 1
train_y = np.reshape(np.array(train_y), (-1, 1))

In [5]:
"""模型参数"""
dfm_params = {
    "embedding_size":8,
    "deep_layers":[32,32],
    "deep_layer_activation":tf.nn.relu,
    "epoch":30,
    "batch_size":1024,
    "learning_rate":0.001,
    "optimizer":"adam",
    "batch_norm":1,
    "batch_norm_decay":0.995,
    "verbose":True,
    "random_seed":0,
    "deep_init_size":50,
    "use_inner":False,
    "pairs": int(len(train_feature_index.columns) * (len(train_feature_index.columns)-1)/2)
}
dfm_params['feature_size'] = total_feature
dfm_params['field_size'] = len(train_feature_index.columns)
print(total_feature)
print(len(train_feature_index.columns))

254
37


In [6]:
feat_index = tf.placeholder(tf.int32, shape=[None, None], name='feat_index')
feat_value = tf.placeholder(tf.float32, shape=[None, None], name='feat_value')
label = tf.placeholder(tf.float32,shape=[None,1],name='label')

In [7]:
weights = dict()

weights['feature_embedding'] = tf.Variable(tf.random_normal(
    [dfm_params['feature_size'], dfm_params['embedding_size']], 0.0, 0.1),
                                           name='feature_embeddings')
weights['feature_bias'] = tf.Variable(tf.random_normal(
    [dfm_params['feature_size'], 1], 0.0, 0.1),
                                      name='feature_bias')
if dfm_params['use_inner']:
    pass
else:
    weights['product-quadratic-outer'] = tf.Variable(
        tf.random_normal([
            dfm_params['embedding_size'], dfm_params['pairs'],
            dfm_params['embedding_size']
        ], 0.0, 0.1))

weights['product-linear'] = tf.Variable(
    tf.random_normal([1,dfm_params['field_size']*dfm_params['embedding_size']], 
                     0.0, 0.01))

input_size = dfm_params['embedding_size'] * dfm_params['field_size']+ dfm_params['pairs']
glorot = np.sqrt(2.0 / (input_size + dfm_params['deep_layers'][0]))
weights['layer_0'] = tf.Variable(np.random.normal(
    loc=0, scale=glorot, size=(input_size, dfm_params['deep_layers'][0])),
                                 dtype=np.float32)
weights['bias_0'] = tf.Variable(np.random.normal(
    loc=0, scale=glorot, size=(1, dfm_params['deep_layers'][0])),
                                dtype=np.float32)
num_layer = len(dfm_params['deep_layers'])
for i in range(1, num_layer):
    glorot = np.sqrt(
        2.0 /
        (dfm_params['deep_layers'][i - 1] + dfm_params['deep_layers'][i]))
    weights["layer_%d" % i] = tf.Variable(
        np.random.normal(loc=0,
                         scale=glorot,
                         size=(dfm_params['deep_layers'][i - 1],
                               dfm_params['deep_layers'][i])),
        dtype=np.float32)  # layers[i-1] * layers[i]
    weights["bias_%d" % i] = tf.Variable(np.random.normal(
        loc=0, scale=glorot, size=(1, dfm_params['deep_layers'][i])),
                                         dtype=np.float32)  # 1 * layer[i]

glorot = np.sqrt(2.0 / (input_size + 1))
weights['output'] = tf.Variable(np.random.normal(
    loc=0, scale=glorot, size=(dfm_params['deep_layers'][-1], 1)),
                                dtype=np.float32)
weights['output_bias'] = tf.Variable(tf.constant(0.01), dtype=np.float32)

In [8]:
embeddings = tf.nn.embedding_lookup(weights['feature_embedding'], feat_index)
reshape_feat_value = tf.reshape(feat_value,
                                shape=[-1, dfm_params['field_size'], 1])
embeddings = tf.multiply(embeddings, reshape_feat_value)
# linear part
linear_output = []

reshape_embedding = tf.reshape(
    embeddings, [-1, dfm_params['field_size'] * dfm_params['embedding_size']])
lz = tf.multiply(reshape_embedding, weights['product-linear'])

# quadratic part
if dfm_params['use_inner']:
    row = []
    col = []

    for i in range(dfm_params['field_size'] - 1):
        for j in range(i + 1, dfm_params['field_size']):
            row.append(i)
            col.append(j)
    # tf.transpose(embeddings, [1, 0, 2]) -> (37, ?, 8)
    # tf.gather(...) -> (666, ?, 8)
    # tf.transpose(...) -> (?, 666, 8)
    p = tf.transpose(tf.gather(tf.transpose(embeddings, [1, 0, 2]), row),
                     [1, 0, 2])
    q = tf.transpose(tf.gather(tf.transpose(embeddings, [1, 0, 2]), col),
                     [1, 0, 2])
    p = tf.reshape(p, [-1, dfm_params['pairs'], dfm_params['embedding_size']])
    q = tf.reshape(q, [-1, dfm_params['pairs'], dfm_params['embedding_size']])
    lp = tf.reshape(tf.reduce_sum(p * q, [-1]), [-1, dfm_params['pairs']])
else:
    row = []
    col = []
    for i in range(dfm_params['field_size'] - 1):
        for j in range(i + 1, dfm_params['field_size']):
            row.append(i)
            col.append(j)
    p = tf.transpose(tf.gather(tf.transpose(embeddings, [1, 0, 2]), row),
                     [1, 0, 2])
    q = tf.transpose(tf.gather(tf.transpose(embeddings, [1, 0, 2]), col),
                     [1, 0, 2])
    # (?, 1, 666, 8)->batch * 1 * pair * k
    p = tf.expand_dims(p, axis=1)
    lp = tf.reduce_sum(
        tf.multiply(
            tf.transpose(
                # (batch, k, pair, k) -> (batch, k, pair)
                tf.reduce_sum(
                    # (batch * 1 * pair * k) (k * pair * k)=(batch, k, pair, k)
                    tf.multiply(p, weights['product-quadratic-outer']),
                    axis=-1),
                [0, 2, 1]),
            q),
        axis=-1)
    
l = tf.concat([lz, lp], axis=1)
# deep part

for i in range(len(dfm_params['deep_layers'])):
    y_deep = tf.add(tf.matmul(l, weights['layer_%d' % i]),weights['bias_%d' % i])
    l = dfm_params['deep_layer_activation'](y_deep)

out = tf.add(tf.matmul(l, weights['output']), weights['output_bias'])
out = tf.nn.sigmoid(out)
loss = tf.losses.log_loss(label, out)
optimizer = tf.train.AdamOptimizer(learning_rate=dfm_params['learning_rate'],
                                   beta1=0.9,
                                   beta2=0.999,
                                   epsilon=1e-8).minimize(loss)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [9]:
epoch = 10
batch_size = 256
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    total_batch = int(len(train_y) / batch_size)
    print(total_batch)
    for epoch in range(epoch):
        for i in range(total_batch):
            start = i * batch_size
            end = (i + 1) * batch_size
            end = end if end<len(train_y) else len(train_y)
            x_batch = train_feature_index[start:end]
            v_batch = train_feature_value[start:end]
            y_batch = train_y[start:end]
            feed_dict = {
                feat_index: x_batch,
                feat_value: v_batch,
                label: y_batch
            }
            l,opt = sess.run([loss,optimizer],feed_dict=feed_dict)
            print(l)

39
0.6971856
0.69426274
0.6913178
0.6882985
0.6856513
0.6826079
0.6800853
0.6768952
0.6739539
0.6709627
0.66829187
0.66392195
0.6615734
0.6586554
0.6539332
0.6517693
0.64514637
0.6435536
0.64030904
0.63553286
0.6293227
0.6305168
0.6271801
0.6249153
0.6195475
0.6133213
0.6067265
0.6022898
0.59576684
0.5905149
0.5855487
0.57908195
0.5741982
0.5571495
0.5546503
0.53436863
0.5317271
0.50207675
0.49737224
0.48402137
0.46149322
0.44165567
0.41190687
0.40520346
0.37473184
0.364684
0.32817173
0.30407536
0.27997422
0.26533112
0.21113986
0.21573842
0.20442523
0.15578339
0.18643521
0.11166666
0.15621513
0.16164088
0.12833434
0.053876217
0.19330655
0.19900447
0.2534563
0.22675635
0.18332174
0.15662938
0.17005171
0.16007625
0.18012278
0.21721712
0.23724025
0.2770925
0.16470027
0.2466008
0.14415082
0.23455313
0.10275097
0.16665998
0.19049218
0.16974297
0.15896156
0.10868257
0.18093446
0.15474084
0.19553483
0.16542952
0.1560275
0.15684395
0.17698734
0.12003453
0.1569738
0.16991627
0.1210828
0.1737891