In [1]:
import pandas as pd
from scipy.sparse import coo_matrix
import numpy as np
import tensorflow as tf

In [2]:
# 共有22个field, 每个field中属性取值的可枚举个数为:
FIELD_SIZES = [
    1037, 151, 59, 1603, 4, 333, 77890, 1857, 9, 8, 4, 7, 22, 3, 92, 56, 4,
    920, 38176, 240, 2697, 4
]
INPUT_DIM = sum(FIELD_SIZES)
# 记录每个field的开始和结束的下标
FIELD_OFFSETS = [sum(FIELD_SIZES[:i]) for i in range(len(FIELD_SIZES))]

In [3]:
# 制作sparse输入
def libsvm_2_coo(libsvm_data, shape):
    coo_rows = []
    coo_cols = []
    coo_data = []
    n = 0
    for x, d in libsvm_data:
        # n是行号，第几个数据就是第几行，有len(x)个取值就有len(x)个坐标，所以这里需要[n] * len(x)
        coo_rows.extend([n] * len(x))
        # 这里是列取值
        coo_cols.extend(x)
        coo_data.extend(d)
        n += 1
        break
    coo_rows = np.array(coo_rows)
    coo_cols = np.array(coo_cols)
    coo_data = np.array(coo_data)
    return coo_matrix((coo_data, (coo_rows, coo_cols)), shape=shape)

In [4]:
# 把fields:index:value转化成onehost的sparse输入
def read_data(file_name):
    X = []
    D = []
    y = []
    fin = pd.read_csv(file_name)
    fin = fin.as_matrix()
    for line in fin:
        X_i = []
        D_i = []
        line = line[0].strip().split()
        if str(line[0]) == '0':
            y_i = float(0)
        else:
            y_i = float(1)
        for x in line[1:]:
            X_i.append(float(x.split(':')[1]))
            D_i.append(float(x.split(':')[2]))
        y.append(y_i)
        X.append(X_i)
        D.append(D_i)
    y = np.reshape(np.array(y), [-1])
    X = libsvm_2_coo(zip(X, D), (len(X), INPUT_DIM)).tocsr()
    return X, y

In [5]:
def shuffle(data):
    X, y = data
    ind = np.arange(X.shape[0])
    for i in range(7):
        np.random.shuffle(ind)
    return X[ind], y[ind]

In [6]:
train_data = read_data('./train_ffm.csv')
train_data = shuffle(train_data)
test_data = read_data('./test_ffm.csv')

  import sys


In [7]:
def split_data(data, skip_empty=True):
    fields = []
    for i in range(len(FIELD_OFFSETS) - 1):
        start_ind = FIELD_OFFSETS[i]
        end_ind = FIELD_OFFSETS[i + 1]
        if skip_empty and start_ind == end_ind:
            continue
        field_i = data[0][:, start_ind:end_ind]
        fields.append(field_i)
    fields.append(data[0][:, FIELD_OFFSETS[-1]:])
    return fields, data[1]

In [8]:
train_data = split_data(train_data)
test_data = split_data(test_data)

In [9]:
fnn_params = {
        'field_sizes': FIELD_SIZES,
        'embed_size': 128,
        'layer_sizes': [500, 1],
        'layer_acts': ['relu', None],
        'drop_out': [0, 0],
        'opt_algo': 'gd',
        'learning_rate': 0.1,
        'embed_l2': 0,
        'layer_l2': [0, 0],
        'random_seed': 0
    }

In [10]:
num_inputs = len(fnn_params['field_sizes'])
x = [tf.sparse_placeholder(tf.float32) for i in range(num_inputs)]
y = tf.placeholder(tf.float32)

In [11]:
weights_dict = dict()
# fm part variables
feature_size = sum(fnn_params['field_sizes'])
weights_dict['feature_embeddings'] = tf.Variable(tf.random_normal(
    [feature_size, fnn_params['embed_size']], mean=0.0, stddev=0.01),
                                                 name='feature_embedding')
weights_dict['feature_w'] = tf.Variable(tf.random_normal([feature_size, 1],
                                                         mean=0.0,
                                                         stddev=0.01),
                                        name='feature_weight')
weights_dict['feature_b'] = tf.Variable(tf.random_normal([1],
                                                         mean=0.0,
                                                         stddev=0.01),
                                        name='feature_bias')

# deep part variables
input_size = num_inputs * (fnn_params['embed_size'] + 1)
glorot = np.sqrt(2.0 / (input_size + fnn_params['layer_sizes'][0]))
weights_dict['layer_0'] = tf.Variable(
    tf.random_uniform([input_size, fnn_params['layer_sizes'][0]],
                      minval=-glorot,
                      maxval=glorot,
                      dtype=tf.float32))

for i in range(len(fnn_params['layer_sizes'])):
    glorot = np.sqrt(
        2.0 /
        (fnn_params['layer_sizes'][i - 1] + fnn_params['layer_sizes'][i]))
    
    weights_dict['layer_%d' % i] = tf.Variable(tf.random_uniform(
        [input_size, fnn_params['layer_sizes'][0]], minval=-glorot, maxval= glorot),
                                               dtype=tf.float32)
    weights_dict['bias_%d' % i] = tf.Variable(tf.zeros([fnn_params['layer_sizes'][0]]),
                                     dtype=tf.float32)
    input_size = fnn_params['layer_sizes'][i]

In [30]:
init_w0 = tf.concat(
    [weights_dict['feature_w'], weights_dict['feature_embeddings']], 1)
lower, upper = 0, fnn_params['field_sizes'][0]
for i in range(num_inputs):
    if (i != 0):
        lower, upper = upper, upper + fnn_params['field_sizes'][i]
    weights_dict['embed_%d' % i] = init_w0[lower:upper]

w0 = [weights_dict['embed_%d' % i] for i in range(num_inputs)]

xw = tf.concat(
    [tf.sparse_tensor_dense_matmul(x[i], w0[i]) for i in range(num_inputs)], 1)

l = xw
for i in range(len(fnn_params['layer_sizes'])):
    wi = weights_dict['layer_%d' % i]
    bi = weights_dict['bias_%d' % i]
    l = tf.nn.relu(tf.matmul(l, wi) + bi)

l = tf.squeeze(l)
y_prob = tf.sigmoid(l)
loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(logits=l, labels=y))
optimizer = tf.train.AdamOptimizer(fnn_params['learning_rate']).minimize(loss)

In [31]:
def csr_2_input(csr_mat):
    if not isinstance(csr_mat, list):
        coo_mat = csr_mat.tocoo()
        indices = np.vstack((coo_mat.row, coo_mat.col)).transpose()
        values = csr_mat.data
        shape = csr_mat.shape
        return indices, values, shape
    else:
        inputs = []
        for csr_i in csr_mat:
            inputs.append(csr_2_input(csr_i))
        return inputs

In [32]:
def slice(csr_data, start=0, size=-1):
    if not isinstance(csr_data[0], list):
        if size == -1 or start + size >= csr_data[0].shape[0]:
            slc_data = csr_data[0][start:]
            slc_labels = csr_data[1][start:]
        else:
            slc_data = csr_data[0][start:start + size]
            slc_labels = csr_data[1][start:start + size]
    else:
        if size == -1 or start + size >= csr_data[0][0].shape[0]:
            slc_data = []
            for d_i in csr_data[0]:
                slc_data.append(d_i[start:])
            slc_labels = csr_data[1][start:]
        else:
            slc_data = []
            for d_i in csr_data[0]:
                slc_data.append(d_i[start:start + size])
            slc_labels = csr_data[1][start:start + size]
    return csr_2_input(slc_data), slc_labels

In [40]:
train_size = train_data[0][0].shape[0]
test_size = test_data[0][0].shape[0]
print(train_size)
print(test_size)
batch_size = 1024
iters = int((train_size + batch_size - 1) / batch_size)
print(iters)
with tf.Session() as sess:
    tf.global_variables_initializer()
    for j in range(iters):
        X_i, y_i = slice(train_data, j * batch_size, batch_size)
        fd = {}
        for i in range(len(X_i)):
            fd[x[i]] = X_i[i]
        fd[y] = y_i
        _, l = sess.run([optimizer, loss], feed_dict=fd)
#         ls.append(l)
        

250915
62728
246


FailedPreconditionError: 2 root error(s) found.
  (0) Failed precondition: Attempting to use uninitialized value Variable_1
	 [[node Variable_1/read (defined at <ipython-input-11-d445c4d294ce>:32) ]]
	 [[Mean_4/_143]]
  (1) Failed precondition: Attempting to use uninitialized value Variable_1
	 [[node Variable_1/read (defined at <ipython-input-11-d445c4d294ce>:32) ]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'Variable_1/read':
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/asyncio/base_events.py", line 539, in run_forever
    self._run_once()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/asyncio/base_events.py", line 1775, in _run_once
    handle._run()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/asyncio/events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/ioloop.py", line 743, in _run_callback
    ret = callback()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 787, in inner
    self.run()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 378, in dispatch_queue
    yield self.process_one()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 225, in wrapper
    runner = Runner(result, future, yielded)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 714, in __init__
    self.run()
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 365, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 272, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 542, in execute_request
    user_expressions, allow_stdin,
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2854, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2880, in _run_cell
    return runner(coro)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3057, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3248, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3325, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-d445c4d294ce>", line 32, in <module>
    dtype=tf.float32)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/variables.py", line 259, in __call__
    return cls._variable_v1_call(*args, **kwargs)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/variables.py", line 220, in _variable_v1_call
    shape=shape)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/variables.py", line 198, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/variable_scope.py", line 2511, in default_variable_creator
    shape=shape)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/variables.py", line 263, in __call__
    return super(VariableMetaclass, cls).__call__(*args, **kwargs)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/variables.py", line 1568, in __init__
    shape=shape)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/variables.py", line 1755, in _init_from_args
    self._snapshot = array_ops.identity(self._variable, name="read")
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py", line 180, in wrapper
    return target(*args, **kwargs)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/array_ops.py", line 86, in identity
    ret = gen_array_ops.identity(input, name=name)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 4253, in identity
    "Identity", input=input, name=name)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 3616, in create_op
    op_def=op_def)
  File "/home/huangyajian/repo/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 2005, in __init__
    self._traceback = tf_stack.extract_stack()
