In [1]:
import tensorflow as tf
from tensorflow.python.estimator.inputs import numpy_io
import numpy as np
import collections
from tensorflow.python.framework import errors
from tensorflow.python.platform import test
from tensorflow.python.training import coordinator
from tensorflow import feature_column

from tensorflow.python.feature_column.feature_column import _LazyBuilder

# `numeric_column`

```py
numeric_column(
    key,
    shape=(1,),
    default_value=None,
    dtype=tf.float32,
    normalizer_fn=None
)
```

- `key`: 特征的名字。也就是对应的列名称。
- `shape`: 该`key`所对应的特征的`shape`. 默认是`1`，但是比如`one-hot`类型的，`shape`就不是`1`，而是实际的维度。总之，这里是`key`所对应的维度，不一定是`1`.
- `default_value`: 当样本中缺少该特征时使用的默认值。
- `normalizer_fn`: 对该特征下的所有数据进行转换。如果需要进行`normalize`，那么就是使用`normalize`的函数.这里不仅仅局限于`normalize`，也可以是任何的转换方法，比如取对数，取指数，这仅仅是一种变换方法.

接下来对 `numeric_column` 测试的demo如下:

In [2]:
def test_numeric():
    """Demonstrate `numeric_column` with a custom `normalizer_fn`.

    A `price` numeric column is created whose normalizer adds 2 to every
    value. The transformed dense tensor is printed twice: first obtained
    through `_LazyBuilder`, then through `feature_column.input_layer`.
    """
    raw_features = {'price': [[1.], [2.], [3.], [4.]]}  # 4 sample rows

    # The "normalizer" here is just a shift by 2; it is applied to the
    # feature values when the column produces its dense tensor.
    shifted_price = feature_column.numeric_column(
        'price', normalizer_fn=lambda value: value + 2)

    # Path 1: ask the column directly for its dense tensor via the builder.
    lazy_builder = _LazyBuilder(raw_features)
    dense_from_builder = shifted_price._get_dense_tensor(lazy_builder)

    with tf.Session() as session:
        builder_result = session.run([dense_from_builder])
        print(builder_result)

    # Path 2: let input_layer assemble the dense tensor.
    dense_from_layer = feature_column.input_layer(
        raw_features, [shifted_price])

    with tf.Session() as session:
        print('use input_layer' + '_' * 40)
        layer_result = session.run([dense_from_layer])
        print(layer_result)

In [3]:
# Run the numeric_column demo; its printed output is recorded below.
test_numeric()

[array([[ 3.],
       [ 4.],
       [ 5.],
       [ 6.]], dtype=float32)]
use input_layer________________________________________
[array([[ 3.],
       [ 4.],
       [ 5.],
       [ 6.]], dtype=float32)]


从上面的结果可以看出，`transform_fn` 将所有的数值`+2`来处理了。使用`_LazyBuilder`和`input_layer`来分别进行了测试，效果是一样的.

# `bucketized_column`
```bucketized_column(source_column, boundaries)```

- `source_column`: 必须是`numeric_column`
- `boundaries`: 不同的桶。`boundaries=[0., 1., 2.]`,产生的`bucket`就是, `(-inf, 0.), [0., 1.), [1., 2.), and [2., +inf)`, 每一个区间分别表示`0`, `1`, `2`, `3`,所以相当于分桶分了`4`个.

In [4]:
def test_bucketized_column():
    """Demonstrate `bucketized_column` on a numeric `price` feature.

    The `price` column is bucketized with the boundaries
    ``[0, 10, 20, 30, 40]`` and the dense tensor produced by
    `feature_column.input_layer` is printed.
    """
    bucket_edges = [0, 10, 20, 30, 40]
    raw_features = {'price': [[5.], [15.], [25.], [35.]]}  # 4 sample rows

    # bucketized_column wraps a numeric source column.
    price_buckets = feature_column.bucketized_column(
        feature_column.numeric_column('price'), bucket_edges)

    bucket_tensor = feature_column.input_layer(raw_features, [price_buckets])

    with tf.Session() as session:
        bucket_values = session.run([bucket_tensor])
        print(bucket_values)


test_bucketized_column()

[array([[ 0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.]], dtype=float32)]


我们看到分桶之后，会直接转换成`one-hot`形式。这里 `5` 个边界产生了 `6` 个桶，所以每一行是一个 `6` 维的`one-hot`向量。

# `categorical_column_with_vocabulary_list`
```py
categorical_column_with_vocabulary_list(
    key,
    vocabulary_list,
    dtype=None,
    default_value=-1,
    num_oov_buckets=0
)
```
- `key`: `feature`名字
- `vocabulary_list`: 对于 `category` 来说，进行转换的`list`.也就是`category`列表.
- `dtype`: 仅仅`string` 和 `int` 被支持，其他的类型是无法进行这个操作的.
- `default_value`: 当不在`vocabulary_list`中的默认值，这时候`num_oov_buckets`必须是`0`.
- `num_oov_buckets`: 用来处理那些不在`vocabulary_list` 中的值，如果是`0`，那么使用`default_value`进行填充;如果大于`0`，则会根据输入值的哈希，在`[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)`这个左闭右开的区间上重新计算当前特征的值.
与前面 `numeric` 不同的是，这里返回的是稀疏`tensor`.

In [5]:
def test_categorical_column_with_vocabulary_list():
    """Demonstrate `categorical_column_with_vocabulary_list`.

    Prints the sparse id tensor that the column produces for a `color`
    feature, then the dense tensor obtained by wrapping the column in an
    `indicator_column` and feeding it to `feature_column.input_layer`.
    """
    vocabulary = ['R', 'G', 'B']
    # 4 sample rows with two colors each; 'A' is not in the vocabulary.
    color_data = {
        'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']],
    }

    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', vocabulary, dtype=tf.string, default_value=-1)

    sparse_pair = color_column._get_sparse_tensors(_LazyBuilder(color_data))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        sparse_ids = session.run([sparse_pair.id_tensor])
        print(sparse_ids)

    # Convert the sparse column to a dense one. With several values per
    # row the result is multi-hot rather than strictly one-hot.
    indicator = feature_column.indicator_column(color_column)
    dense_tensor = feature_column.input_layer(color_data, [indicator])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        dense_values = session.run([dense_tensor])
        print(dense_values)

In [6]:
# Run the vocabulary-list demo; its printed output is recorded below.
test_categorical_column_with_vocabulary_list()

[SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [1, 0],
       [1, 1],
       [2, 0],
       [2, 1],
       [3, 0],
       [3, 1]], dtype=int64), values=array([ 0,  0,  1,  0,  2,  1, -1, -1], dtype=int64), dense_shape=array([4, 2], dtype=int64))]
use input_layer________________________________________
[array([[ 2.,  0.,  0.],
       [ 1.,  1.,  0.],
       [ 0.,  1.,  1.],
       [ 0.,  0.,  0.]], dtype=float32)]


对于`categorical_column_with_vocabulary_list`来说返回的是`sparse tensor`，注意 `id_tensor` 这个是有效的，另外一个（`weight_tensor`）是`None`. 对于线性模型来说是可以直接使用`sparse tensor`的。然而，对于深度模型来说，需要将`sparse`转换成`dense`，所以也就有了`indicator_column`这个函数的出现。`indicator_column`的作用就是将`categorical column`产生的`sparse tensor`转换成`dense tensor`.

注意: 
* `input_layer`: 只接受 `dense` 类型的 column，所以 `categorical column` 需要先经过 `indicator_column` 或 `embedding_column` 转换。
* `tables_initializer`: 在`sparse`的时候使用的，如果不进行初始化会出现 `Table not initialized.`（`[Node: hash_table_Lookup = LookupTableFindV2 ...]`）这样的异常。

# `categorical_column_with_hash_bucket`
```py
categorical_column_with_hash_bucket(
    key,
    hash_bucket_size,
    dtype=tf.string
)
```

当`category`的数量很多，也就无法使用指定`category`的方法来处理了，那么，可以使用这种哈希分桶的方式来进行处理。比如，切词之后的句子，每一个词可以使用这种方式来处理. 使用 `categorical_column_with_vocabulary_file` 也是一种不错的选择，比如将词频高的拿出来。毕竟对于`hash_bucket`来说，对于`bucket_size`的选取是个问题。

In [7]:
def test_categorical_column_with_hash_bucket():
    """Demonstrate `categorical_column_with_hash_bucket`.

    A `color` feature is hashed with a bucket size of 7. The sparse id
    tensor is printed first, followed by the dense tensor obtained through
    `indicator_column` and `feature_column.input_layer`.
    """
    color_data = {'color': [['R'], ['G'], ['B'], ['A']]}  # 4 sample rows

    hashed_color = feature_column.categorical_column_with_hash_bucket(
        'color', 7)

    sparse_pair = hashed_color._get_sparse_tensors(_LazyBuilder(color_data))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        sparse_ids = session.run([sparse_pair.id_tensor])
        print(sparse_ids)

    # Convert the sparse column to a dense (indicator) representation so
    # that it can be passed to input_layer.
    indicator = feature_column.indicator_column(hashed_color)
    dense_tensor = feature_column.input_layer(color_data, [indicator])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        dense_values = session.run([dense_tensor])
        print(dense_values)

In [8]:
# Run the hash-bucket demo; its printed output is recorded below.
test_categorical_column_with_hash_bucket()

[SparseTensorValue(indices=array([[0, 0],
       [1, 0],
       [2, 0],
       [3, 0]], dtype=int64), values=array([5, 2, 6, 3], dtype=int64), dense_shape=array([4, 1], dtype=int64))]
use input_layer________________________________________
[array([[ 0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.]], dtype=float32)]


从上面看这种`hash`分桶的方法，在`hash_bucket_size`的选择上是很重要的。如果选择`3`，对于 R 和 B 来说会分桶到一个里面；对于 G 和 A 也会分桶到一个里面。而当 `hash_bucket_size=7`（即上面的代码）来测试时, R G B A 就都分到了不同的桶中，所以值越大，越不容易发生冲突，分桶也就越精确.

# `categorical_column_with_identity`
```py
categorical_column_with_identity(
    key,
    num_buckets,
    default_value=None
)
```

这是对连续的数字类的处理函数。比如 `id` 一共有`10000`个，那么可以使用这种方式。但是如果多数没有被使用，那么还不如使用 `categorical_column_with_hash_bucket` 进行重新处理。

# `embedding_column`
```py
embedding_column(
    categorical_column,
    dimension,
    combiner='mean',
    initializer=None,
    ckpt_to_load_from=None,
    tensor_name_in_ckpt=None,
    max_norm=None,
    trainable=True
)
```

- `categorical_column`: 使用`categorical_column_with_*`产生的`sparse column`
- `dimension`: 定义`embedding`的维数
- `combiner`: 当一行中有多个`entries`时的合并（归约）方式。支持`mean`、`sqrtn`和`sum`，默认是`mean`, 但是 `sqrtn` 在词袋模型中，通常有更好的准确度。
- `initializer`: 初始化方法，默认使用均值为`0`、标准差为`1/sqrt(dimension)`的截断正态分布（`truncated_normal_initializer`）来初始化。
- `tensor_name_in_ckpt`: 可以从 check point 中恢复
- `ckpt_to_load_from`: check point file，这是在 `tensor_name_in_ckpt` 不为空的情况下设置的.
- `max_norm`: 默认是`None`；如果不为`None`，会按`l2`范数把 embedding 的值归一化（裁剪）到该值。
- `trainable`: 是否可训练的，默认是`True`
将`sparse tensor`转换成`dense tensor`. 在 DNN 的输入中需要使用`dense tensor`. `embedding`如果共用，需要的是`name`一样.

In [10]:
def test_embedding():
    """Demonstrate `embedding_column` on a vocabulary-list `color` column.

    The categorical column is wrapped in an embedding column of dimension 8
    and the dense tensor produced by `feature_column.input_layer` is
    printed after the variables and tables have been initialized.
    """
    color_data = {'color': [['R'], ['G'], ['B'], ['A']]}  # 4 sample rows

    vocab_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)

    embedded_color = feature_column.embedding_column(vocab_column, 8)
    embedded_tensor = feature_column.input_layer(
        color_data, [embedded_color])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())

        print('embeding' + '_' * 40)
        embedded_values = session.run([embedded_tensor])
        print(embedded_values)


test_embedding()

embeding________________________________________
[array([[ 0.17713186, -0.66315776,  0.57591653,  0.28331786,  0.28766611,
        -0.44875047, -0.07845679,  0.0941939 ],
       [-0.56759846,  0.2312403 , -0.10596016, -0.00640535, -0.31522641,
        -0.05999918, -0.11839385,  0.26132873],
       [-0.09545483,  0.55210358, -0.30541521, -0.07820553,  0.35961756,
         0.0802813 ,  0.38947856, -0.17908967],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ]], dtype=float32)]


每一个都会转换成8个维度的数据，并且默认使用截断正态分布来进行初始化。因为 A 没有在`categorical_column`的`vocabulary_list`中出现（其 id 为`-1`），所以它对应的输出是全`0`的向量.

# `weighted_categorical_column`
```py
weighted_categorical_column(
    categorical_column,
    weight_feature_key,
    dtype=tf.float32
)
```
为`categorical_column`赋值权重。默认的`categorical_column`中，所有的权重都是一样的，但是有些时候，对于同样一组`categorical_column`，不同的`category`的权重不同。例如，如果使用`tag`来表示文本，那么每个`tag`的权重就可能不同。

In [11]:
def test_weighted_categorical_column():
    """Demonstrate `weighted_categorical_column`.

    A vocabulary-list `color` column is paired with the `weight` feature,
    and both sparse tensors returned by `_get_sparse_tensors` (ids and
    weights) are printed.
    """
    # 4 sample rows: a color and the weight attached to it.
    color_data = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]],
    }

    base_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)

    weighted_column = feature_column.weighted_categorical_column(
        base_column, 'weight')

    lazy_builder = _LazyBuilder(color_data)

    with tf.Session() as session:
        # The returned pair unpacks into the id tensor and the weight tensor.
        ids, weights = weighted_column._get_sparse_tensors(lazy_builder)

        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())

        print('weighted categorical' + '-' * 40)

        id_values = session.run([ids])
        print(id_values)
        print('-' * 40)
        weight_values = session.run([weights])
        print(weight_values)


test_weighted_categorical_column()

weighted categorical----------------------------------------
[SparseTensorValue(indices=array([[0, 0],
       [1, 0],
       [2, 0],
       [3, 0]], dtype=int64), values=array([ 0,  1,  2, -1], dtype=int64), dense_shape=array([4, 1], dtype=int64))]
----------------------------------------
[SparseTensorValue(indices=array([[0, 0],
       [1, 0],
       [2, 0],
       [3, 0]], dtype=int64), values=array([ 1.,  2.,  4.,  8.], dtype=float32), dense_shape=array([4, 1], dtype=int64))]


可以看到，`weight` 这个 tensor 也是存在的。对于前面其他 `categorical_column` 来说是不存在 `weight` 的。

# `linear_model`
```py
linear_model(
    features,
    feature_columns,
    units=1,
    sparse_combiner='sum',
    weight_collections=None,
    trainable=True
)
```
对所有特征进行线性加权操作.

In [12]:
def get_linear_model_bias():
    """Return the `bias_weights` variable from the `linear_model` scope.

    The scope is entered with ``reuse=True``, so the variable is looked up
    rather than created.
    """
    reuse_scope = tf.variable_scope('linear_model', reuse=True)
    with reuse_scope:
        bias_variable = tf.get_variable('bias_weights')
    return bias_variable


def get_linear_model_column_var(column):
    """Return the first global variable under `linear_model/<column.name>`.

    :param column: feature column whose ``name`` selects the variable scope.
    :return: first entry of the GLOBAL_VARIABLES collection filtered by
        that scope.
    """
    scope_filter = 'linear_model/' + column.name
    matching_vars = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope_filter)
    return matching_vars[0]


def test_linear_model():
    """Demonstrate `linear_model` on one numeric and one categorical feature.

    Builds a linear model over `price` (numeric) and `color`
    (vocabulary-list categorical), overwrites the bias and the per-column
    weights with fixed values, and prints the resulting prediction.

    :return: None
    """
    features = {
        'price': [[1.0], [5.0], [10.0]],
        'color': [['R'], ['G'], ['B']]
    }

    # Build the model in a fresh graph. The lookup helpers hard-code the
    # 'linear_model' scope name, so in the notebook's shared default graph
    # a re-run of this cell (or any earlier linear_model) could make them
    # return variables that belong to a different model than `prediction`,
    # and the assignments below would then have no effect on it.
    with tf.Graph().as_default():
        price_column = feature_column.numeric_column('price')
        color_column = feature_column.categorical_column_with_vocabulary_list(
            'color', ['R', 'G', 'B'])
        prediction = feature_column.linear_model(
            features, [price_column, color_column])

        bias = get_linear_model_bias()
        price_var = get_linear_model_column_var(price_column)
        color_var = get_linear_model_column_var(color_column)

        # NOTE(review): the recorded run of this cell failed with
        # "Blas GEMV launch failed" on a GPU device. That looks like an
        # environment problem (GPU / cuBLAS setup) rather than a logic
        # error in this function - confirm on the target machine.
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            sess.run(tf.tables_initializer())

            # Replace the initial weights with known values so the printed
            # prediction is deterministic.
            sess.run(bias.assign([7.0]))
            sess.run(price_var.assign([[10.0]]))
            sess.run(color_var.assign([[2.0], [2.0], [2.0]]))

            prediction_result = sess.run([prediction])

            print(prediction_result)


test_linear_model()

InternalError: Blas GEMV launch failed:  m=1, n=3
	 [[Node: linear_model/price/weighted_sum = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](linear_model/price/Reshape, linear_model/price/weights/read)]]
	 [[Node: linear_model/weighted_sum/_67 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_98_linear_model/weighted_sum", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'linear_model/price/weighted_sum', defined at:
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\ProgramData\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 486, in start
    self.io_loop.start()
  File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 112, in start
    self.asyncio_loop.run_forever()
  File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 421, in run_forever
    self._run_once()
  File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 1431, in _run_once
    handle._run()
  File "C:\ProgramData\Anaconda3\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 102, in _handle_events
    handler_func(fileobj, events)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2856, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-fce934b1a40e>", line 46, in <module>
    test_linear_model()
  File "<ipython-input-12-fce934b1a40e>", line 26, in test_linear_model
    featrues, [price_column, color_column])
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 433, in linear_model
    trainable=trainable)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 1710, in _create_weighted_sum
    trainable=trainable)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 1729, in _create_dense_column_weighted_sum
    return math_ops.matmul(tensor, weight, name='weighted_sum')
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 2108, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 4492, in mat_mul
    name=name)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3290, in create_op
    op_def=op_def)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1654, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InternalError (see above for traceback): Blas GEMV launch failed:  m=1, n=3
	 [[Node: linear_model/price/weighted_sum = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](linear_model/price/Reshape, linear_model/price/weights/read)]]
	 [[Node: linear_model/weighted_sum/_67 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_98_linear_model/weighted_sum", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


# `crossed_column`
组合特征，这仅仅适用于`sparse`特征.产生的依然是`sparse`特征.

In [None]:
def test_crossed_column():
    """Demonstrate `crossed_column` over two vocabulary-list columns.

    The `price` and `color` categorical columns are crossed, wrapped in an
    `indicator_column`, and the dense tensor produced by
    `feature_column.input_layer` is printed.

    :return: None
    """
    raw_features = {
        'price': [['A', 'A'], ['B', 'D'], ['C', 'A']],
        'color': [['R', 'R'], ['G', 'G'], ['B', 'B']]
    }

    price_column = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])

    # The second argument is presumably the hash bucket size of the
    # cross - confirm against the crossed_column signature.
    price_x_color = feature_column.crossed_column(
        [price_column, color_column], 16)

    crossed_indicator = feature_column.indicator_column(price_x_color)
    crossed_dense_tensor = feature_column.input_layer(
        raw_features, [crossed_indicator])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())

        print('use input_layer' + '_' * 40)
        crossed_values = session.run([crossed_dense_tensor])
        print(crossed_values)


test_crossed_column()