In [1]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
train_file = '../data/train.csv'
test_file = '../data/test.csv'
NUMERIC_COLS = [
    "ps_reg_01", "ps_reg_02", "ps_reg_03", "ps_car_12", "ps_car_13",
    "ps_car_14", "ps_car_15"
]
IGNORE_COLS = [
    "id", "target", "ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
    "ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08", "ps_calc_09",
    "ps_calc_10", "ps_calc_11", "ps_calc_12", "ps_calc_13", "ps_calc_14",
    "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin", "ps_calc_18_bin",
    "ps_calc_19_bin", "ps_calc_20_bin"
]
dfTrain = pd.read_csv(train_file)
dfTest = pd.read_csv(test_file)

In [3]:
df = pd.concat([dfTrain, dfTest])
# 特征字典，key是每一列，即每个field，value是每个值对应的feature_id
feature_dict = {}
# 特征总数量
total_feature = 0
for col in df.columns:
    if col in IGNORE_COLS:
        continue
    elif col in NUMERIC_COLS:
        # 数字类型列，作为一个特征
        feature_dict[col] = total_feature
        total_feature += 1
    else:
        # 查看这一列有多少个unique的值
        unique_val = df[col].unique()
        feature_dict[col] = dict(
            zip(unique_val,
                range(total_feature,
                      len(unique_val) + total_feature)))
        total_feature += len(unique_val)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [4]:
train_y = dfTrain[['target']].values.tolist()
dfTrain.drop(['target', 'id'], axis=1, inplace=True)
train_feature_index = dfTrain.copy()
train_feature_value = dfTrain.copy()
for col in train_feature_index.columns:
    if col in IGNORE_COLS:
        train_feature_index.drop(col, axis=1, inplace=True)
        train_feature_value.drop(col, axis=1, inplace=True)
        continue
    elif col in NUMERIC_COLS:
        train_feature_index[col] = feature_dict[col]
    else:
        train_feature_index[col] = train_feature_index[col].map(
            feature_dict[col])
        train_feature_value[col] = 1
train_y = np.reshape(np.array(train_y), (-1, 1))

In [5]:
nfm_params = {
    "embedding_size":8,
    "deep_layers":[32,32],
    "dropout_deep":[0.5,0.5,0.5],
    "deep_layer_activation":tf.nn.relu,
    "epoch":30,
    "batch_size":1024,
    "learning_rate":0.001,
    "optimizer":"adam",
    "batch_norm":1,
    "batch_norm_decay":0.995,
    "verbose":True,
    "random_seed":0,
    "deep_init_size":50,
    "use_inner":False
}
nfm_params['feature_size'] = total_feature
nfm_params['field_size'] = len(train_feature_index.columns)

In [6]:
# weight初始化
weights = dict()
weights['feature_embeddings'] = tf.Variable(tf.random_normal(
    [nfm_params['feature_size'], nfm_params['embedding_size']],
    mean=0.0,
    stddev=0.01),
                                            name='feature_embeddings')
weights['feature_bias'] = tf.Variable(tf.random_normal(
    [nfm_params['feature_size'],1], mean=0.0, stddev=0.01),
                                      name='feature_bias')
num_layers = len(nfm_params['deep_layers'])
input_size = nfm_params['embedding_size']
glorot = np.sqrt(2.0 / (input_size + nfm_params['deep_layers'][0]))
weights['layer_0'] = tf.Variable(
    tf.random_normal([input_size, nfm_params['deep_layers'][0]],
                     mean=0.0,
                     stddev=glorot))
weights['bias_0'] = tf.Variable(
    tf.random_normal([1, nfm_params['deep_layers'][1]],
                     mean=0.0,
                     stddev=glorot))

for i in range(1, num_layers):
    glorot = np.sqrt(
        2.0 /
        (nfm_params['deep_layers'][i] + nfm_params['deep_layers'][i - 1]))
    weights['layer_%d' % i] = tf.Variable(
        tf.random_normal(
            [nfm_params['deep_layers'][i - 1], nfm_params['deep_layers'][i]],
            mean=0.0,
            stddev=glorot))
    weights['bias_%d' % i] = tf.Variable(
        tf.random_normal([1,nfm_params['deep_layers'][i]],
                         mean=0.0,
                         stddev=glorot))
weights['bias'] = tf.Variable(tf.constant(0.1),name='bias')

In [7]:
feat_index = tf.placeholder(tf.int32, [None, nfm_params['field_size']],
                            name='feat_index')
feat_value = tf.placeholder(tf.float32, [None, nfm_params['field_size']],
                            name='feat_value')
label = tf.placeholder(tf.float32, [None, 1], name='label')

In [8]:
embedding = tf.nn.embedding_lookup(weights['feature_embeddings'], feat_index)
# print(embedding.shape)
reshape_feat_value = tf.reshape(feat_value, [-1, nfm_params['field_size'], 1])
# print(reshape_feat_value.shape)
embedding = tf.multiply(embedding, reshape_feat_value)
# print(embedding.shape)

# first order part
first_order_weight = tf.nn.embedding_lookup(weights['feature_bias'],
                                            feat_index)
first_order = tf.reduce_sum(tf.multiply(first_order_weight, reshape_feat_value), 2)

# second order part

# sum square part
summed_feature_emb = tf.reduce_sum(embedding, 1)
summed_feature_emb_square = tf.square(summed_feature_emb)

# square sum part
squared_feature_emb = tf.square(embedding)
squared_sum_feature_emb = tf.reduce_sum(squared_feature_emb, 1)

second_order = 0.5 * (summed_feature_emb_square - squared_sum_feature_emb)

print(second_order.shape)

y_deep = second_order
for i in range(0, len(nfm_params['deep_layers'])):
    y_deep = tf.matmul(y_deep,
                       weights['layer_%d' % i]) + weights['bias_%d' % i]
    y_deep = nfm_params['deep_layer_activation'](y_deep)

y_bias = weights['bias'] * tf.ones_like(label)
out = tf.add_n([
    tf.reduce_sum(first_order, axis=1, keep_dims=True),
    tf.reduce_sum(y_deep, axis=1, keep_dims=True),y_bias
])
out = tf.nn.sigmoid(out)
loss = tf.losses.log_loss(label, out)
optimizer = tf.train.AdamOptimizer(learning_rate=nfm_params['learning_rate'],
                                   beta1=0.9,
                                   beta2=0.999,
                                   epsilon=1e-8).minimize(loss)

(?, 8)
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [16]:
epoch = 10
batch_size = 256
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    total_batch = int(len(train_y) / batch_size)
    print(total_batch)
    for epoch in range(epoch):
        for i in range(total_batch):
            start = i * batch_size
            end = (i + 1) * batch_size
            end = end if end<len(train_y) else len(train_y)
            x_batch = train_feature_index[start:end]
            v_batch = train_feature_value[start:end]
            y_batch = train_y[start:end]
            feed_dict = {
                feat_index: x_batch,
                feat_value: v_batch,
                label: y_batch
            }
            l,opt = sess.run([loss,optimizer],feed_dict=feed_dict)
            print(l)

39
2.1598244
2.095179
2.0136921
1.9714515
1.8545756
1.7952203
1.7039896
1.6477356
1.5841694
1.5193391
1.4449953
1.4093797
1.3379505
1.2755582
1.2379825
1.1718273
1.1430907
1.0807393
1.0330644
0.9965128
0.96844083
0.91296536
0.8752416
0.83867633
0.80963504
0.7716607
0.73857886
0.7134645
0.68185925
0.6549593
0.6293579
0.6044177
0.5862564
0.55785304
0.54223627
0.51145077
0.5021272
0.46016574
0.45547277
0.44494024
0.42508113
0.4149819
0.39013907
0.39625907
0.37646362
0.3790006
0.35941312
0.34570262
0.33719325
0.33458787
0.30655578
0.31536537
0.31664115
0.28745782
0.30694366
0.26704672
0.2847782
0.27921197
0.2572776
0.22128169
0.26993245
0.26642174
0.28316623
0.26976103
0.25102434
0.23401722
0.24198402
0.23181556
0.24242792
0.25380224
0.26005858
0.27287364
0.23202132
0.26803297
0.22539409
0.26003364
0.19706944
0.23121753
0.2432051
0.2286931
0.22845411
0.19338751
0.23377329
0.21684903
0.2390626
0.22152212
0.21359475
0.21345107
0.22411892
0.18848993
0.21141952
0.22233391
0.18475977
0.22216569