In [1]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
train_file = '../data/train.csv'
test_file = '../data/test.csv'
NUMERIC_COLS = [
    "ps_reg_01", "ps_reg_02", "ps_reg_03", "ps_car_12", "ps_car_13",
    "ps_car_14", "ps_car_15"
]
IGNORE_COLS = [
    "id", "target", "ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
    "ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08", "ps_calc_09",
    "ps_calc_10", "ps_calc_11", "ps_calc_12", "ps_calc_13", "ps_calc_14",
    "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin", "ps_calc_18_bin",
    "ps_calc_19_bin", "ps_calc_20_bin"
]
dfTrain = pd.read_csv(train_file)
dfTest = pd.read_csv(test_file)

In [3]:
df = pd.concat([dfTrain, dfTest])
# 特征字典，key是每一列，即每个field，value是每个值对应的feature_id
feature_dict = {}
# 特征总数量
total_feature = 0
for col in df.columns:
    if col in IGNORE_COLS:
        continue
    elif col in NUMERIC_COLS:
        # 数字类型列，作为一个特征
        feature_dict[col] = total_feature
        total_feature += 1
    else:
        # 查看这一列有多少个unique的值
        unique_val = df[col].unique()
        feature_dict[col] = dict(
            zip(unique_val,
                range(total_feature,
                      len(unique_val) + total_feature)))
        total_feature += len(unique_val)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [4]:
train_y = dfTrain[['target']].values.tolist()
dfTrain.drop(['target', 'id'], axis=1, inplace=True)
train_feature_index = dfTrain.copy()
train_feature_value = dfTrain.copy()
for col in train_feature_index.columns:
    if col in IGNORE_COLS:
        train_feature_index.drop(col, axis=1, inplace=True)
        train_feature_value.drop(col, axis=1, inplace=True)
        continue
    elif col in NUMERIC_COLS:
        train_feature_index[col] = feature_dict[col]
    else:
        train_feature_index[col] = train_feature_index[col].map(
            feature_dict[col])
        train_feature_value[col] = 1
train_y = np.reshape(np.array(train_y), (-1, 1))

In [5]:
"""模型参数"""
wnd_params = {
    "embedding_size": 8,
    "deep_layer_activation": tf.nn.relu,
    "epoch": 30,
    "batch_size": 1024,
    "learning_rate": 0.001,
    "optimizer": "adam",
    "batch_norm": 1,
    "batch_norm_decay": 0.995,
    "l2_reg": 0.01,
    "verbose": True,
    "eval_metric": 'gini_norm',
    "random_seed": 3,
    "deep_layers": [256, 128, 64],
}
wnd_params['feature_size'] = total_feature
wnd_params['field_size'] = len(train_feature_index.columns)
print(total_feature)
print(len(train_feature_index.columns))

254
37


In [6]:
weights = dict()
# embedding矩阵
weights['feature_embeddings'] = tf.Variable(tf.random_normal(
    [wnd_params['feature_size'], wnd_params['embedding_size']], 0.0, 0.1),
                                            name='feature_embeddings')
# LR部分的权重矩阵
weights['feature_bias'] = tf.Variable(tf.random_normal(
    [wnd_params['feature_size'], 1], mean=0.0, stddev=0.1),
                                      name='feature_embeddings')
# deep部分的权重
num_layer = len(wnd_params['deep_layers'])
input_size = wnd_params['field_size'] * wnd_params['embedding_size']
glorot = np.sqrt(2.0 / (input_size + wnd_params['deep_layers'][0]))
weights['layer_0'] = tf.Variable(tf.random_normal(
    [input_size, wnd_params['deep_layers'][0]], mean=0.0, stddev=glorot),
                                 dtype=np.float32)
weights['bias_0'] = tf.Variable(tf.random_normal(
    [1, wnd_params['deep_layers'][0]], mean=0.0, stddev=glorot),
                                dtype=np.float32)
for i in range(1, num_layer):
    glorot = np.sqrt(
        2.0 /
        (wnd_params['deep_layers'][i - 1] + wnd_params['deep_layers'][i]))
    weights['layer_%d' % i] = tf.Variable(tf.random_normal(
        [wnd_params['deep_layers'][i - 1], wnd_params['deep_layers'][i]],
        mean=0.0,
        stddev=glorot),
                                          dtype=np.float32)
    weights['bias_%d' % i] = tf.Variable(tf.random_normal(
        [1, wnd_params['deep_layers'][i]], mean=0.0, stddev=glorot),
                                         dtype=np.float32)

input_size = 1 + wnd_params['deep_layers'][-1]
glorot = np.sqrt(2.0 / (input_size + 1))
weights['concat_projection'] = tf.Variable(
    tf.random_normal([input_size, 1], mean=0.0, stddev=glorot))
weights['concat_bias'] = tf.Variable(tf.constant(0.01), dtype=np.float32)

In [7]:
feat_index = tf.placeholder(tf.int32,
                            shape=[None, wnd_params['field_size']],
                            name='feat_index')
feat_value = tf.placeholder(tf.float32,
                            shape=[None, wnd_params['field_size']],
                            name='feat_value')
label = tf.placeholder(tf.float32, shape=[None, 1], name='label')

In [8]:
# LR part
reshape_feat_value = tf.reshape(feat_value, [-1, wnd_params['field_size'], 1])
lr_part_weight = tf.nn.embedding_lookup(weights['feature_bias'],feat_index)
print(lr_part_weight.shape)
lr_output = tf.reduce_sum(tf.multiply(feat_value, reshape_feat_value),2)
print(lr_output.shape)
lr_output = tf.reduce_sum(lr_output, 1, keepdims=True)
print(lr_output.shape)

(?, 37, 1)
(?, 37)
(?, 1)


In [17]:
# deep part
embeddings = tf.nn.embedding_lookup(weights['feature_embeddings'], feat_index)
embeddings = tf.multiply(embeddings, reshape_feat_value)
reshaped_embeddings = tf.reshape(
    embeddings, [-1, wnd_params['field_size'] * wnd_params['embedding_size']])
print(embeddings.shape)
y_deep = reshaped_embeddings
for i in range(len(wnd_params['deep_layers'])):
    y_deep = tf.matmul(y_deep, weights['layer_%d'%i]) + weights['bias_%d'%i]
    y_deep = tf.nn.relu(y_deep)
print(y_deep.shape)

(?, 37, 8)
(?, 64)


In [18]:
concat_input = tf.concat([lr_output, y_deep], axis=1)
print(concat_input.shape)
res = tf.nn.sigmoid(
    tf.matmul(concat_input, weights['concat_projection']) +
    weights['concat_bias'])
print(res.shape)

(?, 65)
(?, 1)


In [19]:
loss = tf.losses.log_loss(tf.reshape(label, (-1, 1)), res)
optimizer = tf.train.AdamOptimizer(learning_rate=wnd_params['learning_rate'],
                                   beta1=0.9,
                                   beta2=0.999,
                                   epsilon=1e-8).minimize(loss)

In [20]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_size = int(len(train_feature_index)/wnd_params['batch_size'])
    for i in range(wnd_params['epoch']):
        for j in range(batch_size):
            start = i * wnd_params['batch_size']
            end = (i+1) * wnd_params['batch_size']
            end = end if end<len(train_feature_index) else len(train_feature_index)
            feat_index_batch = train_feature_index[start:end]
            feat_value_batch = train_feature_value[start:end]
            label_batch = train_y[start:end]
            feed_dict = {
                feat_index:feat_index_batch,
                feat_value:feat_value_batch,
                label:label_batch
            }
            l,o = sess.run([loss,optimizer], feed_dict)
            print(l)

InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument: Incompatible shapes: [1024,37,1] vs. [1024,37]
	 [[node Mul (defined at <ipython-input-8-ec1b47fc1dce>:5) ]]
	 [[log_loss_4/value/_13]]
  (1) Invalid argument: Incompatible shapes: [1024,37,1] vs. [1024,37]
	 [[node Mul (defined at <ipython-input-8-ec1b47fc1dce>:5) ]]
0 successful operations.
0 derived errors ignored.

Errors may have originated from an input operation.
Input Source operations connected to node Mul:
 Reshape (defined at <ipython-input-8-ec1b47fc1dce>:2)	
 feat_value (defined at <ipython-input-7-70cb632ecde3>:6)

Input Source operations connected to node Mul:
 Reshape (defined at <ipython-input-8-ec1b47fc1dce>:2)	
 feat_value (defined at <ipython-input-7-70cb632ecde3>:6)

Original stack trace for 'Mul':
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/asyncio/base_events.py", line 539, in run_forever
    self._run_once()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/asyncio/base_events.py", line 1775, in _run_once
    handle._run()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/asyncio/events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/ioloop.py", line 743, in _run_callback
    ret = callback()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 787, in inner
    self.run()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 378, in dispatch_queue
    yield self.process_one()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 225, in wrapper
    runner = Runner(result, future, yielded)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 714, in __init__
    self.run()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 365, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 272, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 542, in execute_request
    user_expressions, allow_stdin,
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2854, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2880, in _run_cell
    return runner(coro)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3057, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3248, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3325, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-ec1b47fc1dce>", line 5, in <module>
    lr_output = tf.reduce_sum(tf.multiply(feat_value, reshape_feat_value),2)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py", line 180, in wrapper
    return target(*args, **kwargs)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_ops.py", line 322, in multiply
    return gen_math_ops.mul(x, y, name)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/gen_math_ops.py", line 6490, in mul
    "Mul", x=x, y=y, name=name)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 3616, in create_op
    op_def=op_def)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 2005, in __init__
    self._traceback = tf_stack.extract_stack()
