In [1]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
train_file = '../data/train.csv'
test_file = '../data/test.csv'
NUMERIC_COLS = [
    "ps_reg_01", "ps_reg_02", "ps_reg_03", "ps_car_12", "ps_car_13",
    "ps_car_14", "ps_car_15"
]
IGNORE_COLS = [
    "id", "target", "ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
    "ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08", "ps_calc_09",
    "ps_calc_10", "ps_calc_11", "ps_calc_12", "ps_calc_13", "ps_calc_14",
    "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin", "ps_calc_18_bin",
    "ps_calc_19_bin", "ps_calc_20_bin"
]
dfTrain = pd.read_csv(train_file)
dfTest = pd.read_csv(test_file)

In [3]:
df = pd.concat([dfTrain, dfTest])
# 特征字典，key是每一列，即每个field，value是每个值对应的feature_id
feature_dict = {}
# 特征总数量
total_feature = 0
for col in df.columns:
    if col in IGNORE_COLS:
        continue
    elif col in NUMERIC_COLS:
        # 数字类型列，作为一个特征
        feature_dict[col] = total_feature
        total_feature += 1
    else:
        # 查看这一列有多少个unique的值
        unique_val = df[col].unique()
        feature_dict[col] = dict(
            zip(unique_val,
                range(total_feature,
                      len(unique_val) + total_feature)))
        total_feature += len(unique_val)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [4]:
train_y = dfTrain[['target']].values.tolist()
dfTrain.drop(['target', 'id'], axis=1, inplace=True)
train_feature_index = dfTrain.copy()
train_feature_value = dfTrain.copy()
for col in train_feature_index.columns:
    if col in IGNORE_COLS:
        train_feature_index.drop(col, axis=1, inplace=True)
        train_feature_value.drop(col, axis=1, inplace=True)
        continue
    elif col in NUMERIC_COLS:
        train_feature_index[col] = feature_dict[col]
    else:
        train_feature_index[col] = train_feature_index[col].map(
            feature_dict[col])
        train_feature_value[col] = 1
train_y = np.reshape(np.array(train_y), (-1, 1))

In [5]:
ffm_params = {
    "embedding_size": 8,
    "deep_layer_activation": tf.nn.relu,
    "epoch": 30,
    "batch_size": 1024,
    "learning_rate": 0.001,
    "optimizer": "adam",
    "batch_norm": 1,
    "batch_norm_decay": 0.995,
    "verbose": True,
    "random_seed": 0,
    "deep_init_size": 50,
    "use_inner": False
}
ffm_params['feature_size'] = total_feature
ffm_params['field_size'] = len(train_feature_index.columns)

In [6]:
weights = dict()

feature_embeddings = tf.Variable(tf.random_normal([
    ffm_params['field_size'], ffm_params['feature_size'],
    ffm_params['embedding_size']
],
                                                  mean=0.0,
                                                  stddev=0.01),
                                 name='feature_embeddings')
weights['feature_bias'] = tf.Variable(tf.random_normal(
    [ffm_params['feature_size'], 1], mean=0.0, stddev=0.01),
                                      name='feature_bias')
weights['bias'] = tf.Variable(tf.constant(0.1), name='bias')

In [7]:
feat_index = tf.placeholder(tf.int32, shape=[None, ffm_params['field_size']], name='feat_index')
feat_value = tf.placeholder(tf.float32, shape=[None, ffm_params['field_size']], name='feat_value')
label = tf.placeholder(tf.float32, shape=[None, 1], name='label')

In [8]:
# linear part
linear_weight = tf.nn.embedding_lookup(weights['feature_bias'], feat_index)
reshaped_feat_val = tf.reshape(feat_value, [-1, ffm_params['field_size'], 1])
linear_output = tf.reduce_sum( tf.multiply(linear_weight, reshaped_feat_val), axis=1)
print(linear_output.shape)

(?, 1)


In [9]:
second_order_res = tf.ones_like(label)
for i in range(ffm_params['field_size']):
    for j in range(i + 1, ffm_params['field_size']):
        # 第i个特征与第j个field的隐向量
        v_dj_fi = tf.nn.embedding_lookup(feature_embeddings[j],
                                         feat_index[:, i])
        # 第j个特征与第i个field的隐向量
        v_di_fj = tf.nn.embedding_lookup(feature_embeddings[i],
                                         feat_index[:, j])
        second_order_res += tf.reduce_sum(tf.multiply(v_dj_fi, v_di_fj), 1, keepdims=True)

In [10]:
output = linear_output + second_order_res
output = tf.nn.sigmoid(second_order_res)
loss = tf.losses.log_loss(label, output)
optimizer = tf.train.AdamOptimizer(learning_rate=ffm_params['learning_rate'], 
                                   beta1=0.9,
                                   beta2=0.999,
                                   epsilon=1e-8).minimize(loss)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [11]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_size = int(len(train_feature_index)/ffm_params['batch_size'])
    for i in range(ffm_params['epoch']):
        for j in range(batch_size):
            start = i * ffm_params['batch_size']
            end = (i+1) * ffm_params['batch_size']
            end = end if end<len(train_feature_index) else len(train_feature_index)
            feat_index_batch = train_feature_index[start:end]
            feat_value_batch = train_feature_value[start:end]
            label_batch = train_y[start:end]
            feed_dict = {
                feat_index:feat_index_batch,
                feat_value:feat_value_batch,
                label:label_batch
            }
            l,o = sess.run([loss,optimizer], feed_dict)
            print(l)

1.2712694
1.225049
1.1792042
1.1328077
1.0851325
1.0356503
0.9840219
0.93009543
0.87391263
0.81571716
0.75659555
0.69652325
0.6362619
0.57670915
0.5188507
0.46370173
0.41223383
0.36529678
0.31694987
0.27926198
0.24718258
0.22062233
0.19925454
0.18257576
0.16997573
0.16080464
0.15442625
0.15671447
0.15477328
0.15405127
0.15418889
0.15489872
0.15595596
0.15718845
0.15846685
0.15969652
0.13473457
0.13552928
0.13620517
0.13674656
0.1371445
0.13739613
0.1375027
0.13746907
0.13730244
0.17302518
0.17281619
0.17236349
0.17169702
0.17084585
0.16983832
0.16870189
0.16746297
0.16614649
0.17011681
0.16893521
0.16768533
0.16639495
0.16508822
0.16378605
0.16250628
0.16126372
0.1600703
0.17583936
0.17485996
0.17387125
0.17289378
0.17194319
0.17103052
0.17016289
0.16934378
0.16857363
0.18555841
0.18509223
0.18462701
0.1841674
0.18371469
0.18326777
0.18282387
0.182379
0.18192863
0.16529718
0.16543846
0.16545688
0.16535772
0.16514878
0.16483964
0.1644414
0.16396575
0.1634247
0.0
0.0
0.0
0.0
0.0
0.0
0.0
