In [1]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [4]:
train_file = '../data/train.csv'
test_file = '../data/test.csv'
NUMERIC_COLS = [
    "ps_reg_01", "ps_reg_02", "ps_reg_03", "ps_car_12", "ps_car_13",
    "ps_car_14", "ps_car_15"
]
IGNORE_COLS = [
    "id", "target", "ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
    "ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08", "ps_calc_09",
    "ps_calc_10", "ps_calc_11", "ps_calc_12", "ps_calc_13", "ps_calc_14",
    "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin", "ps_calc_18_bin",
    "ps_calc_19_bin", "ps_calc_20_bin"
]
dfTrain = pd.read_csv(train_file)
dfTest = pd.read_csv(test_file)
# dfTrain.head(10)
# dfTrain['ps_reg_01']

In [3]:
df = pd.concat([dfTrain, dfTest])
# 特征字典，key是每一列，即每个field，value是每个值对应的feature_id
feature_dict = {}
# 特征总数量
total_feature = 0
for col in df.columns:
    if col in IGNORE_COLS:
        continue
    elif col in NUMERIC_COLS:
        # 数字类型列，作为一个特征
        feature_dict[col] = total_feature
        total_feature += 1
    else:
        # 查看这一列有多少个unique的值
        unique_val = df[col].unique()
        feature_dict[col] = dict(
            zip(unique_val,
                range(total_feature,
                      len(unique_val) + total_feature)))
        total_feature += len(unique_val)
print(feature_dict)

{'ps_car_01_cat': {10: 0, 11: 1, 7: 2, 6: 3, 9: 4, 5: 5, 4: 6, 8: 7, 3: 8, 0: 9, 2: 10, 1: 11, -1: 12}, 'ps_car_02_cat': {1: 13, 0: 14}, 'ps_car_03_cat': {-1: 15, 0: 16, 1: 17}, 'ps_car_04_cat': {0: 18, 1: 19, 8: 20, 9: 21, 2: 22, 6: 23, 3: 24, 7: 25, 4: 26, 5: 27}, 'ps_car_05_cat': {1: 28, -1: 29, 0: 30}, 'ps_car_06_cat': {4: 31, 11: 32, 14: 33, 13: 34, 6: 35, 15: 36, 3: 37, 0: 38, 1: 39, 10: 40, 12: 41, 9: 42, 17: 43, 7: 44, 8: 45, 5: 46, 2: 47, 16: 48}, 'ps_car_07_cat': {1: 49, -1: 50, 0: 51}, 'ps_car_08_cat': {0: 52, 1: 53}, 'ps_car_09_cat': {0: 54, 2: 55, 3: 56, 1: 57, -1: 58, 4: 59}, 'ps_car_10_cat': {1: 60, 0: 61, 2: 62}, 'ps_car_11': {2: 63, 3: 64, 1: 65, 0: 66}, 'ps_car_11_cat': {12: 67, 19: 68, 60: 69, 104: 70, 82: 71, 99: 72, 30: 73, 68: 74, 20: 75, 36: 76, 101: 77, 103: 78, 41: 79, 59: 80, 43: 81, 64: 82, 29: 83, 95: 84, 24: 85, 5: 86, 28: 87, 87: 88, 66: 89, 10: 90, 26: 91, 54: 92, 32: 93, 38: 94, 83: 95, 89: 96, 49: 97, 93: 98, 1: 99, 22: 100, 85: 101, 78: 102, 31: 103, 3

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [4]:
train_y = dfTrain[['target']].values.tolist()
dfTrain.drop(['target', 'id'], axis=1, inplace=True)
train_feature_index = dfTrain.copy()
train_feature_value = dfTrain.copy()
for col in train_feature_index.columns:
    if col in IGNORE_COLS:
        train_feature_index.drop(col, axis=1, inplace=True)
        train_feature_value.drop(col, axis=1, inplace=True)
        continue
    elif col in NUMERIC_COLS:
        train_feature_index[col] = feature_dict[col]
    else:
        train_feature_index[col] = train_feature_index[col].map(
            feature_dict[col])
        train_feature_value[col] = 1
train_y = np.reshape(np.array(train_y), (-1, 1))

In [5]:
"""模型参数"""
dfm_params = {
    "use_fm": True,
    "use_deep": True,
    "embedding_size": 8,
    "dropout_fm": [1.0, 1.0],
    "deep_layers": [32, 32],
    "dropout_deep": [0.5, 0.5, 0.5],
    "deep_layer_activation": tf.nn.relu,
    "epoch": 30,
    "batch_size": 1024,
    "learning_rate": 0.001,
    "optimizer": "adam",
    "batch_norm": 1,
    "batch_norm_decay": 0.995,
    "l2_reg": 0.01,
    "verbose": True,
    "eval_metric": 'gini_norm',
    "random_seed": 3
}
dfm_params['feature_size'] = total_feature
dfm_params['field_size'] = len(train_feature_index.columns)
print(total_feature)
print(len(train_feature_index.columns))

254
37


In [6]:
# 输入
feat_index = tf.placeholder(tf.int32, shape=[None, None], name='feat_index')
feat_value = tf.placeholder(tf.float32, shape=[None, None], name='feat_value')
label = tf.placeholder(tf.float32, shape=[None, 1], name='label')
# weight
weights = dict()
# liearn weight
weights['feature_embeddings'] = tf.Variable(tf.random_normal(
    [dfm_params['feature_size'], dfm_params['embedding_size']], 0.0, 0.01),
                                            name='feature_embeddings')
weights['feature_bias'] = tf.Variable(tf.random_normal(
    [dfm_params['feature_size'], 1], 0.0, 0.01),
                                      name='feature_bias')
# deep weight
num_layer = len(dfm_params['deep_layers'])
# 这里重点注意，deep层输入个数
input_size = dfm_params['field_size'] * dfm_params['embedding_size']
glorot = np.sqrt(2.0 / (input_size + dfm_params['deep_layers'][0]))
weights['layer_0'] = tf.Variable(np.random.normal(
    loc=0, scale=glorot, size=(input_size, dfm_params['deep_layers'][0])),
                                 dtype=np.float32)
weights['bias_0'] = tf.Variable(np.random.normal(
    loc=0, scale=glorot, size=(1, dfm_params['deep_layers'][0])),
                                dtype=np.float32)
for i in range(1, num_layer):
    glorot = np.sqrt(
        2.0 /
        (dfm_params['deep_layers'][i - 1] + dfm_params['deep_layers'][i]))
    weights["layer_%d" % i] = tf.Variable(np.random.normal(
        loc=0,
        scale=glorot,
        size=(dfm_params['deep_layers'][i - 1], dfm_params['deep_layers'][i])),
                                          dtype=np.float32)
    weights["bias_%d" % i] = tf.Variable(np.random.normal(
        loc=0, scale=glorot, size=(1, dfm_params['deep_layers'][i])),
                                         dtype=np.float32)
input_size = dfm_params['field_size'] + dfm_params[
    'embedding_size'] + dfm_params['deep_layers'][-1]
glorot = np.sqrt(2.0 / (input_size + 1))
weights['concat_projection'] = tf.Variable(np.random.normal(loc=0,
                                                            scale=glorot,
                                                            size=(input_size,
                                                                  1)),
                                           dtype=np.float32)
weights['concat_bias'] = tf.Variable(tf.constant(0.01), dtype=np.float32)

In [15]:
# fm part
# 一次项部分
# reshaped_feat_value的形状是(?, 37, 1)
reshaped_feat_value = tf.reshape(feat_value,
                                 shape=[-1, dfm_params['field_size'], 1])
# 形状是(?,37,1)，就是w * x
fm_first_order = tf.nn.embedding_lookup(weights['feature_bias'], feat_index)
fm_first_order = tf.reduce_sum(
    tf.multiply(fm_first_order, reshaped_feat_value), 2)
print(fm_first_order.shape)

# 二次项部分
# embeddings的形状是(254，8)，一共254个特征，每个特征embedsize是8
embeddings = tf.nn.embedding_lookup(weights['feature_embeddings'], feat_index)
# 形状为(?,37,8)，这里计算的是v_i * x_i，fm公式里包含这一项
embeddings = tf.multiply(embeddings, reshaped_feat_value)
# 形状为(?, 8)
summed_features_emb = tf.reduce_sum(embeddings, 1)
summed_features_emb_square = tf.square(summed_features_emb)

squared_features_emb = tf.square(embeddings)
squared_sum_features_emb = tf.reduce_sum(squared_features_emb, 1)
# 形状为(?, 8)
fm_second_order = 0.5 * tf.subtract(summed_features_emb_square,
                                    squared_sum_features_emb)

(?, 37)


In [8]:
# deep part

# 形状(?, 296)
y_deep = tf.reshape(
    embeddings,
    shape=[-1, dfm_params['field_size'] * dfm_params['embedding_size']])
for i in range(0,len(dfm_params['deep_layers'])):
    y_deep = tf.add(tf.matmul(y_deep,weights["layer_%d" %i]), weights["bias_%d"%i])
    y_deep = tf.nn.relu(y_deep)

In [10]:
#(?, 77)，37 + 8 + 32，其中fm部分输出维度为field_size + embedding_size，deep部分是32
concat_input = tf.concat([fm_first_order, fm_second_order, y_deep], axis=1)
out = tf.nn.sigmoid(
    tf.add(tf.matmul(concat_input, weights['concat_projection']),
           weights['concat_bias']))


In [11]:
"""loss and optimizer"""
loss = tf.losses.log_loss(tf.reshape(label, (-1, 1)), out)
optimizer = tf.train.AdamOptimizer(learning_rate=dfm_params['learning_rate'],
                                   beta1=0.9,
                                   beta2=0.999,
                                   epsilon=1e-8).minimize(loss)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [13]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(100):
        epoch_loss,_ = sess.run([loss,optimizer],feed_dict={feat_index:train_feature_index,
                             feat_value:train_feature_value,
                             label:train_y})
        print("epoch %s,loss is %s" % (str(i),str(epoch_loss)))

epoch 0,loss is 0.7904554
epoch 1,loss is 0.7777381
epoch 2,loss is 0.76574767
epoch 3,loss is 0.7544462
epoch 4,loss is 0.7433578
epoch 5,loss is 0.732174
epoch 6,loss is 0.72087646
epoch 7,loss is 0.7096608
epoch 8,loss is 0.69849986
epoch 9,loss is 0.687696
epoch 10,loss is 0.6772066
epoch 11,loss is 0.66655195
epoch 12,loss is 0.65562814
epoch 13,loss is 0.64439714
epoch 14,loss is 0.6329011
epoch 15,loss is 0.6214559
epoch 16,loss is 0.61020094
epoch 17,loss is 0.5988858
epoch 18,loss is 0.5877118
epoch 19,loss is 0.576404
epoch 20,loss is 0.5647501
epoch 21,loss is 0.5533871
epoch 22,loss is 0.5419371
epoch 23,loss is 0.5301462
epoch 24,loss is 0.51809543
epoch 25,loss is 0.50573486
epoch 26,loss is 0.49304345
epoch 27,loss is 0.4799881
epoch 28,loss is 0.4665479
epoch 29,loss is 0.45272022
epoch 30,loss is 0.43854293
epoch 31,loss is 0.42414662
epoch 32,loss is 0.40954432
epoch 33,loss is 0.3947473
epoch 34,loss is 0.37980595
epoch 35,loss is 0.36478946
epoch 36,loss is 0.349775