# Feature columns

## feature_column 分为三大类
![title](img/feature_column.png)
### Dense_Column
### Categorical_Column
### bucketized_column：继承上面两类，输出 one-hot

### Dense_Column 可分为
##### numeric_column: 数值特征（默认 dtype 为 float32）
##### indicator_column: one-hot
##### embedding_column: 稠密矢量
    
### Categorical_Column 可分为
##### categorical_column_with_identity: 数字分类后返回类别 index
##### categorical_column_with_vocabulary_list：字符串分类后返回类别 index 
##### categorical_column_with_vocabulary_file：字符串分类后返回类别 index ，分类词汇存在文件
##### categorical_column_with_hash_bucket：hash 后直接将字符串分成几类
##### crossed_column：组合特征构造成整型特征值

In [1]:
# 参考资料
# https://tensorflow.google.cn/guide/feature_columns
# https://www.jianshu.com/p/516e882699cf
# https://www.jianshu.com/p/fceb64c790f3
import tensorflow as tf

### numeric_column 

In [4]:
# numeric_column: feed a numeric feature into the model as a dense value.
# Signature, for reference:
#   tf.feature_column.numeric_column(
#       key,
#       shape=(1,),
#       default_value=None,
#       dtype=tf.float32,
#       normalizer_fn=None
#   )
test = {'test': [[0.], [1.], [2.], [3.], [4.], [5.]]}
column = tf.feature_column.numeric_column('test')
tensor = tf.feature_column.input_layer(features=test, feature_columns=[column])

# Evaluate the input layer; the values come back unchanged as float32.
with tf.Session() as sess:
    fetched = sess.run([tensor])
    print(fetched)

[array([[0.],
       [1.],
       [2.],
       [3.],
       [4.],
       [5.]], dtype=float32)]


###  bucketized_column 

In [8]:
# bucketized_column: split a numeric feature into ranges (buckets) and emit
# the bucket each value falls into as a one-hot vector.
# Signature, for reference:
#   tf.feature_column.bucketized_column(
#       source_column,
#       boundaries
#   )
year = {"year": [1958, 1978, 1981, 1999, 2005, 2010]}
year_numeric = tf.feature_column.numeric_column('year')

# Four boundaries give five buckets, hence five one-hot positions per row.
year_bucket = tf.feature_column.bucketized_column(
    source_column=year_numeric,
    boundaries=[1960, 1980, 2000, 2006])
year_tensor = tf.feature_column.input_layer(
    features=year,
    feature_columns=[year_bucket])

with tf.Session() as sess:
    fetched = sess.run([year_tensor])
    print(fetched)

[array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]], dtype=float32)]


### categorical_column_with_identity 

In [26]:
# categorical_column_with_identity: integer inputs are used directly as the
# category index.
# Signature, for reference:
#   tf.feature_column.categorical_column_with_identity(
#       key,
#       num_buckets,
#       default_value=None
#   )
test = {'cat': [1, 3, 2, 0, 3]}

# Printing the column shows:
#   _IdentityCategoricalColumn(key='cat', num_buckets=4, default_value=None)
# Unlike bucketized_column, which is handed to input_layer directly, this
# categorical column is wrapped in indicator_column to get one-hot rows.
column = tf.feature_column.categorical_column_with_identity('cat', num_buckets=4)
indicator = tf.feature_column.indicator_column(column)
tensor = tf.feature_column.input_layer(features=test, feature_columns=[indicator])

with tf.Session() as session:
    one_hot_rows = session.run([tensor])

# Printing does not need the session, so it happens after it is closed.
print(one_hot_rows)
print(column)

[array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]], dtype=float32)]
_IdentityCategoricalColumn(key='cat', num_buckets=4, default_value=None)


### categorical_column_with_vocabulary_list 

In [29]:
# categorical_column_with_vocabulary_list: map string inputs to the index of
# the matching vocabulary entry.
# Signature, for reference:
#   tf.feature_column.categorical_column_with_vocabulary_list(
#       key,
#       vocabulary_list,
#       dtype=None,
#       default_value=-1,
#       num_oov_buckets=0
#   )
# "sport" and "shirt" are not in the vocabulary below, so their one-hot rows
# come out as all zeros.
# NOTE(review): "sport" vs. "sports" may be an unintended typo - confirm.
test = {"test": ["kitchenware", "electronics", "sport", "shirt"]}

# Printing the column shows:
#   _VocabularyListCategoricalColumn(key='test',
#       vocabulary_list=('kitchenware', 'electronics', 'sports'),
#       dtype=tf.string, default_value=-1, num_oov_buckets=0)
vocabulary_feature_column = tf.feature_column.categorical_column_with_vocabulary_list(
    "test",
    ["kitchenware", "electronics", "sports"])

# indicator_column only accepts a categorical column as its input.
indicator = tf.feature_column.indicator_column(vocabulary_feature_column)
tensor = tf.feature_column.input_layer(features=test, feature_columns=[indicator])

with tf.Session() as sess:
    # The vocabulary lookup is table-backed, so tables are initialised too.
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    one_hot_rows = sess.run([tensor])
    print(one_hot_rows)
    print(vocabulary_feature_column)

[array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32)]
_VocabularyListCategoricalColumn(key='test', vocabulary_list=('kitchenware', 'electronics', 'sports'), dtype=tf.string, default_value=-1, num_oov_buckets=0)


### categorical_column_with_vocabulary_file 

In [32]:
# categorical_column_with_vocabulary_file: same purpose as
# categorical_column_with_vocabulary_list, but for vocabularies too long to
# write inline. The vocabulary lives in a file with one word per line.
#
# Signature, for reference:
#   tf.feature_column.categorical_column_with_vocabulary_file(
#       key,
#       vocabulary_file,
#       vocabulary_size=None,
#       dtype=tf.dtypes.string,
#       default_value=None,
#       num_oov_buckets=0
#   )
vocabulary_feature_column = tf.feature_column.categorical_column_with_vocabulary_file(
    "test",
    "product_class.txt",
    vocabulary_size=3,
)

###  categorical_column_with_hash_bucket

In [34]:
# categorical_column_with_hash_bucket: hash each string straight into one of
# a fixed number of categories.
# Similar to categorical_column_with_identity, but for strings instead of
# numbers.
#
# Signature, for reference:
#   tf.feature_column.categorical_column_with_hash_bucket(
#       key,
#       hash_bucket_size,
#       dtype=tf.dtypes.string
#   )
test = {'chars': ['a', 'c', 'b', 'd', 'e', 'f', 'g', 'b']}

# Printing the column shows:
#   _HashedCategoricalColumn(key='chars', hash_bucket_size=5, dtype=tf.string)
hash_bucket = tf.feature_column.categorical_column_with_hash_bucket(
    'chars',
    hash_bucket_size=5)

indicator = tf.feature_column.indicator_column(hash_bucket)
tensor = tf.feature_column.input_layer(features=test, feature_columns=[indicator])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    one_hot_rows = sess.run([tensor])
    print(one_hot_rows)
    print(hash_bucket)

[array([[0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.]], dtype=float32)]
_HashedCategoricalColumn(key='chars', hash_bucket_size=5, dtype=tf.string)


### crossed_column 

In [43]:
# crossed_column: combine several features into one hashed integer category.
# Signature, for reference:
#   tf.feature_column.crossed_column(
#       keys,
#       hash_bucket_size,
#       hash_key=None
#   )
# NOTE: 'longtitude' is a misspelling of "longitude". It is kept because the
# key string appears verbatim in the printed column repr.
featrues = {
    'longtitude': [19, 61, 30, 9, 45],
    'latitude': [45, 40, 72, 81, 24],
}

# Bucketize each coordinate first; both use the same two boundaries.
longtitude = tf.feature_column.numeric_column('longtitude')
latitude = tf.feature_column.numeric_column('latitude')
longtitude_bucket = tf.feature_column.bucketized_column(
    source_column=longtitude, boundaries=[33, 36])
latitude_bucket = tf.feature_column.bucketized_column(
    source_column=latitude, boundaries=[33, 36])

# Printing cross_column shows a _CrossedColumn whose keys are the two
# _BucketizedColumn objects above, with hash_bucket_size=12, hash_key=None.
cross_column = tf.feature_column.crossed_column(
    keys=[longtitude_bucket, latitude_bucket],
    hash_bucket_size=12)

indicator = tf.feature_column.indicator_column(cross_column)
tensor = tf.feature_column.input_layer(features=featrues, feature_columns=[indicator])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    crossed_rows = sess.run(tensor)
    print(crossed_rows)
    print(cross_column)

[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]
_CrossedColumn(keys=(_BucketizedColumn(source_column=_NumericColumn(key='longtitude', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(33, 36)), _BucketizedColumn(source_column=_NumericColumn(key='latitude', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(33, 36))), hash_bucket_size=12, hash_key=None)


### embedding_column

In [48]:
# embedding_column: the best embedding values have to be learned by training
# rather than computed directly. No training runs in this cell, so the values
# printed come from freshly initialised variables.
# Signature, for reference:
#   tf.feature_column.embedding_column(
#       categorical_column,
#       dimension,
#       combiner='mean',
#       initializer=None,
#       ckpt_to_load_from=None,
#       tensor_name_in_ckpt=None,
#       max_norm=None,
#       trainable=True
#   )
features = {'pets': ['dog', 'cat', 'rabbit', 'pig', 'mouse']}

pets_f_c = tf.feature_column.categorical_column_with_vocabulary_list(
    key='pets',
    vocabulary_list=['cat', 'dog', 'rabbit', 'pig', 'mouse'],
    dtype=tf.string,
    default_value=-1)

# Each pet is represented by a 3-element dense vector.
column = tf.feature_column.embedding_column(pets_f_c, dimension=3)
tensor = tf.feature_column.input_layer(features=features, feature_columns=[column])

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    embedded_rows = session.run([tensor])
    print(embedded_rows)

[array([[ 0.48584163,  0.40583292,  0.11255753],
       [-0.602625  ,  0.05574511, -0.08122382],
       [ 0.88433486, -0.22031519,  0.00225582],
       [-0.40975562,  0.37421274,  0.68637836],
       [-0.13163354,  0.20485495, -0.23570992]], dtype=float32)]


## tf.nn.embedding_lookup_sparse 

参考 https://www.jianshu.com/p/4a7525c018b2
当一个特征是多维离散值时，比如一个人喜欢的球员(肯定不止一个)，该如何处理呢？一种方式是每维用 one-hot 表示，加和取平均 one-hot 值。当然也可以用 Embedding 来表示，更能体现特征关联性。"embedding_lookup_sparse" 字面意思就是从 sparse 查找出对应的 embedding 表示。

In [9]:
# Three users, each with one multi-valued feature.
# Row format: "<user id>,<tag>|<tag>|...". The third row starts with an empty
# tag, which does not show up in the printed sparse tensor.
csv = [
    "1,harden|james|curry",
    "2,wrestbrook|harden|durant",
    "3,|paul|towns",
]

# Tag vocabulary handed to the lookup table as its mapping; the printed
# sparse values correspond to positions in this list.
TAG_SET = [
    "harden",
    "james",
    "curry",
    "durant",
    "paul",
    "towns",
    "wrestbrook",
]

def sparse_from_csv(csv):
    """Turn "id,tag|tag|..." CSV records into a SparseTensor of tag indices.

    Args:
        csv: CSV records passed to ``tf.decode_csv``. Each record has an
            integer id column followed by a "|"-separated tag string.

    Returns:
        A ``tf.SparseTensor`` whose indices and dense shape come from
        splitting the tag strings on "|", and whose values are the index of
        each tag in ``TAG_SET`` (-1 for tags not found there).
    """
    # The id column is parsed but not used.
    _ids, tag_strings = tf.decode_csv(csv, record_defaults=[[-1], [""]])
    tag_table = tf.contrib.lookup.index_table_from_tensor(
        mapping=TAG_SET,
        default_value=-1)
    tokens = tf.string_split(tag_strings, delimiter="|")
    # Replace each tag string with its index from the lookup table.
    tag_indices = tag_table.lookup(tokens.values)
    return tf.SparseTensor(
        indices=tokens.indices,
        values=tag_indices,
        dense_shape=tokens.dense_shape)

# One embedding row per tag in TAG_SET, TAG_EMBEDDING_DIM values per row,
# initialised from a truncated normal distribution.
TAG_EMBEDDING_DIM = 3
embedding_shape = [len(TAG_SET), TAG_EMBEDDING_DIM]
embedding_params = tf.Variable(tf.truncated_normal(embedding_shape))

# Sparse tag ids per user, then one combined embedding row per user.
tags = sparse_from_csv(csv)
embedding = tf.nn.embedding_lookup_sparse(
    embedding_params,
    sp_ids=tags,
    sp_weights=None)

with tf.Session() as sess:
    # Both the embedding variable and the tag lookup table need initialising.
    init_ops = [tf.global_variables_initializer(), tf.tables_initializer()]
    sess.run(init_ops)
    print(sess.run([embedding]))
    print(sess.run([tags]))

1.12.0
[array([[-0.20015812,  0.15428694, -0.38727465],
       [-0.7556567 , -0.00486706, -1.3744396 ],
       [ 0.1395809 ,  0.7680633 , -0.42304695]], dtype=float32)]
[SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [0, 2],
       [1, 0],
       [1, 1],
       [1, 2],
       [2, 0],
       [2, 1]], dtype=int64), values=array([0, 1, 2, 6, 0, 3, 4, 5], dtype=int64), dense_shape=array([3, 3], dtype=int64))]
