In [1]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
train_file = '../data/train.csv'
test_file = '../data/test.csv'
NUMERIC_COLS = [
    "ps_reg_01", "ps_reg_02", "ps_reg_03", "ps_car_12", "ps_car_13",
    "ps_car_14", "ps_car_15"
]
IGNORE_COLS = [
    "id", "target", "ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
    "ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08", "ps_calc_09",
    "ps_calc_10", "ps_calc_11", "ps_calc_12", "ps_calc_13", "ps_calc_14",
    "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin", "ps_calc_18_bin",
    "ps_calc_19_bin", "ps_calc_20_bin"
]
dfTrain = pd.read_csv(train_file)
dfTest = pd.read_csv(test_file)

In [3]:
df = pd.concat([dfTrain, dfTest])
# 特征字典，key是每一列，即每个field，value是每个值对应的feature_id
feature_dict = {}
# 特征总数量
total_feature = 0
for col in df.columns:
    if col in IGNORE_COLS:
        continue
    elif col in NUMERIC_COLS:
        # 数字类型列，作为一个特征
        feature_dict[col] = total_feature
        total_feature += 1
    else:
        # 查看这一列有多少个unique的值
        unique_val = df[col].unique()
        feature_dict[col] = dict(
            zip(unique_val,
                range(total_feature,
                      len(unique_val) + total_feature)))
        total_feature += len(unique_val)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [4]:
train_y = dfTrain[['target']].values.tolist()
dfTrain.drop(['target', 'id'], axis=1, inplace=True)
train_feature_index = dfTrain.copy()
train_feature_value = dfTrain.copy()
for col in train_feature_index.columns:
    if col in IGNORE_COLS:
        train_feature_index.drop(col, axis=1, inplace=True)
        train_feature_value.drop(col, axis=1, inplace=True)
        continue
    elif col in NUMERIC_COLS:
        train_feature_index[col] = feature_dict[col]
    else:
        train_feature_index[col] = train_feature_index[col].map(
            feature_dict[col])
        train_feature_value[col] = 1
train_y = np.reshape(np.array(train_y), (-1, 1))

In [5]:
"""模型参数"""
fm_params = {
    "embedding_size": 8,
    "deep_layer_activation": tf.nn.relu,
    "epoch": 30,
    "batch_size": 1024,
    "learning_rate": 0.001,
    "optimizer": "adam",
    "batch_norm": 1,
    "batch_norm_decay": 0.995,
    "l2_reg": 0.01,
    "verbose": True,
    "eval_metric": 'gini_norm',
    "random_seed": 3
}
fm_params['feature_size'] = total_feature
fm_params['field_size'] = len(train_feature_index.columns)
print(total_feature)
print(len(train_feature_index.columns))

254
37


In [6]:
# weight
weights = dict()
# liearn weight
weights['feature_embeddings'] = tf.Variable(tf.random_normal(
    [fm_params['feature_size'], fm_params['embedding_size']], 0.0, 0.01),
                                            name='feature_embeddings')
weights['feature_bias'] = tf.Variable(tf.random_normal(
    [fm_params['feature_size'], 1], 0.0, 0.01),
                                      name='feature_bias')

In [7]:
feat_index = tf.placeholder(tf.int32, [None, None], name='feat_index')
feat_value = tf.placeholder(tf.float32, [None, None], name='feat_value')
label = tf.placeholder(tf.float32, [None, None], name='label')

In [8]:
embeddings = tf.nn.embedding_lookup(weights['feature_embeddings'], feat_index)
reshape_feat_value = tf.reshape(feat_value, [-1, fm_params['field_size'], 1])
embeddings = tf.multiply(embeddings, reshape_feat_value)

In [9]:
# first order
first_order_weight = tf.nn.embedding_lookup(weights['feature_bias'],
                                            feat_index)
first_order_output = tf.reduce_sum(
    tf.multiply(first_order_weight, reshape_feat_value), 2)
print(first_order_output.shape)
first_order_output = tf.reduce_sum(first_order_output, 1, keepdims=True)
print(first_order_output.shape)

(?, 37)
(?, 1)


In [10]:
# second order

# sum square
sum_square = tf.square(tf.reduce_sum(embeddings, 1))
# square sum
square_sum = tf.reduce_sum(tf.square(embeddings), 1)
print(square_sum.shape)
second_order_output = 0.5 * tf.reduce_sum(sum_square - square_sum, 1, keepdims=True)
print(second_order_output.shape)

(?, 8)
(?, 1)


In [11]:
output = first_order_output + second_order_output
output = tf.nn.sigmoid(output)
loss = tf.losses.log_loss(label, output)
optimizer = tf.train.AdamOptimizer(learning_rate=fm_params['learning_rate'],
                                   beta1=0.9,
                                   beta2=0.999,
                                   epsilon=1e-8).minimize(loss)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [12]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_size = int(len(train_feature_index)/fm_params['batch_size'])
    for i in range(fm_params['epoch']):
        for j in range(batch_size):
            start = i * fm_params['batch_size']
            end = (i+1) * fm_params['batch_size']
            end = end if end<len(train_feature_index) else len(train_feature_index)
            feat_index_batch = train_feature_index[start:end]
            feat_value_batch = train_feature_value[start:end]
            label_batch = train_y[start:end]
            feed_dict = {
                feat_index:feat_index_batch,
                feat_value:feat_value_batch,
                label:label_batch
            }
            l,o = sess.run([loss,optimizer], feed_dict)
            print(l)

0.67144793
0.65197283
0.63483924
0.61894643
0.60371095
0.5887092
0.57379717
0.5590541
0.5446048
0.5330095
0.51953155
0.50644183
0.4937455
0.48143566
0.46949145
0.45788646
0.44660142
0.43563005
0.42028856
0.40966675
0.39935493
0.38936085
0.37968487
0.37032285
0.36126828
0.3525144
0.3440545
0.33864957
0.33085674
0.3233325
0.31607428
0.3090788
0.30234158
0.29585683
0.2896178
0.28361723
0.2679786
0.2622391
0.25669777
0.2513522
0.24619938
0.24123569
0.23645657
0.23185706
0.22743197
0.23836872
0.23459928
0.23099649
0.22755358
0.22426412
0.22112168
0.21812007
0.21525322
0.21251522
0.211855
0.20943972
0.20713228
0.2049284
0.20282388
0.20081447
0.19889608
0.19706455
0.19531587
0.2006785
0.19921757
0.19782117
0.19648683
0.19521207
0.19399454
0.19283184
0.19172141
0.19066069
0.20194356
0.20115584
0.20040363
0.1996844
0.19899568
0.19833511
0.19770035
0.19708924
0.19649959
0.17599693
0.17544329
0.17489585
0.17435555
0.17382315
0.17329913
0.17278379
0.17227712
0.17177898
0.0
0.0
0.0
0.0
0.0
0.0
0.0
