# Category feature embedding experiment

In [2]:
from collections import namedtuple
import numpy as np
import tensorflow as tf

In [3]:
# Per-variable metadata record:
#   name     - variable name
#   catalog  - variable kind code; this notebook uses 'N' and 'C'
#   dist_num - value stored for 'C' variables (None for 'N' variables here)
VarMeta = namedtuple('VarMeta', 'name catalog dist_num')

In [4]:
# Registry of the variables used in this experiment, keyed by variable name.
# Three variables are marked 'N' and two are marked 'C' with dist_num=10.
var_metas = {
    meta.name: meta
    for meta in (
        VarMeta('var1', 'N', None),
        VarMeta('var2', 'N', None),
        VarMeta('var3', 'C', 10),
        VarMeta('var4', 'N', None),
        VarMeta('var5', 'C', 10),
    )
}

In [5]:
# Variable names in sorted order; iterating a dict yields its keys.
features = sorted(var_metas)

In [6]:
class CategoryEmbeddingLayer(tf.keras.layers.Layer):
    """Split a flat feature vector into numerical and categorical columns,
    embed each categorical column, and concatenate everything.

    The input is a rank-2 tensor of shape ``(batch, len(features))`` whose
    columns follow the order of ``features``. Columns whose metadata has
    ``catalog == 'N'`` are passed through as float64; columns with
    ``catalog == 'C'`` are cast to int32 indices and looked up in a
    per-feature embedding table with ``dist_num`` rows and ``embed_dim``
    columns. The output has shape
    ``(batch, n_numerical + n_category * embed_dim)`` and dtype float64.

    Args:
        var_metas: mapping from feature name to a record exposing
            ``catalog`` and ``dist_num`` attributes.
        features: ordered list of feature names; defines the column order
            of the input tensor.
        embed_dim: width of every categorical embedding.
        verbose: when True (the default), print the shape traces while the
            layer is called.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, var_metas, features, embed_dim=10, verbose=True, **kwargs):
        super(CategoryEmbeddingLayer, self).__init__(**kwargs)
        self.var_metas = var_metas
        self.features = features
        self.embed_dim = embed_dim
        self.verbose = verbose

        self.numerical_features = [v for v in features if var_metas[v].catalog == 'N']
        self.category_features = [v for v in features if var_metas[v].catalog == 'C']

        # Create one embedding table per categorical feature up front.
        # Building them inside call() would produce brand-new, randomly
        # initialised tables on every forward pass, and their weights would
        # never be tracked or trained by Keras.
        self.embedding_layers = [
            tf.keras.layers.Embedding(input_dim=var_metas[v].dist_num,
                                      output_dim=embed_dim)
            for v in self.category_features
        ]

    def _log(self, message):
        """Print a debug trace line when ``verbose`` is enabled."""
        if self.verbose:
            print(message)

    def call(self, feature_vec):
        """Return numerical columns concatenated with categorical embeddings.

        Args:
            feature_vec: tensor of shape ``(batch, len(features))``.

        Returns:
            float64 tensor of shape
            ``(batch, n_numerical + n_category * embed_dim)``.
        """
        self._log(f'input feature_vec shape: {feature_vec.shape}')

        numerical_num = len(self.numerical_features)
        category_num = len(self.category_features)

        mask = [self.var_metas[v].catalog == 'N' for v in self.features]
        self._log(f'numerical mask: {mask}')
        numerical_vec = tf.boolean_mask(feature_vec, mask, axis=1)
        numerical_vec = tf.cast(numerical_vec, dtype=tf.float64)
        # boolean_mask loses the static column count; restore it.
        numerical_vec = tf.ensure_shape(numerical_vec, (None, numerical_num))
        self._log(f'input numerical_vec shape: {numerical_vec.shape}')

        mask = [self.var_metas[v].catalog == 'C' for v in self.features]
        self._log(f'cateogry mask: {mask}')
        category_idx = tf.boolean_mask(feature_vec, mask, axis=1)
        # Casting a float input to int32 truncates toward zero.
        category_idx = tf.cast(category_idx, dtype=tf.int32)
        category_idx = tf.ensure_shape(category_idx, (None, category_num))
        self._log(f'input category_idx shape: {category_idx.shape}')
        category_embeds = self.cate_embedding(category_idx)

        self._log(f'numberical vector shape: {numerical_vec.shape}, dtype: {numerical_vec.dtype}')
        self._log(f'category embedding shape: {category_embeds.shape}, dtype: {category_embeds.dtype}')
        combo_vec = tf.concat([numerical_vec, category_embeds], axis=1)
        self._log(f'concated feature vector, shape: {combo_vec.shape}')

        return combo_vec

    def cate_embedding(self, category_idx):
        """Embed every categorical column and concatenate the results.

        Args:
            category_idx: int32 tensor of shape ``(batch, n_category)``;
                column ``i`` holds the indices of ``category_features[i]``.

        Returns:
            float64 tensor of shape ``(batch, n_category * embed_dim)``
            (zero columns when there are no categorical features).

        Raises:
            tf.errors.InvalidArgumentError: if an index lies outside
                ``[0, dist_num)`` for its feature.
        """
        embeds = []
        for i, v in enumerate(self.category_features):
            dist_num = self.var_metas[v].dist_num
            cate_idx = category_idx[:, i]

            # Element-wise range check that works for any batch size.
            tf.debugging.assert_greater_equal(
                cate_idx, 0,
                message=f'negative category index for feature {v}')
            tf.debugging.assert_less(
                cate_idx, dist_num,
                message=f'category index for feature {v} must be < {dist_num}')

            embed = self.embedding_layers[i](cate_idx)
            self._log(f'orgin embed shape: {embed.shape}')
            embeds.append(embed)

        if not embeds:
            # tf.concat rejects an empty list; return a zero-width block so
            # the caller's concat with the numerical columns still works.
            return tf.zeros([tf.shape(category_idx)[0], 0], dtype=tf.float64)

        # Cast to float64 so the result can be concatenated with the
        # float64 numerical columns.
        category_embeds = tf.cast(tf.concat(embeds, axis=-1), dtype=tf.float64)
        return category_embeds

In [None]:

# Sequential pipeline: raw feature vector -> category embedding layer ->
# dense layer with 23 units, trained with RMSprop on mean squared error.
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(name='inputs', input_shape=(len(features),)),
    CategoryEmbeddingLayer(var_metas=var_metas, features=features),
    tf.keras.layers.Dense(23),
])
model.compile(tf.optimizers.RMSprop(0.001), loss='mse')

In [None]:
# Print the Keras summary of the model built in the previous cell.
model.summary()



In [7]:
# Build one random sample: Gaussian noise for the numerical features and a
# valid integer index in [0, dist_num) for each categorical feature.
# (Deriving the index from noise plus the column position could yield
# negative or otherwise out-of-range indices.)
feature_vec = np.random.randn(len(features))

for i, f in enumerate(features):
    if var_metas[f].catalog == 'C':
        feature_vec[i] = np.random.randint(var_metas[f].dist_num)

feature_vec = tf.convert_to_tensor(feature_vec, dtype=tf.float64)

layer = CategoryEmbeddingLayer(var_metas=var_metas, features=features)

# The layer masks along axis 1, so add a leading batch axis: shape (1, n).
ret = layer(tf.expand_dims(feature_vec, axis=0))

input feature_vec shape: (1, 5)
numerical mask: [True, True, False, True, False]
input numerical_vec shape: (1, 3)
cateogry mask: [False, False, True, False, True]
input category_idx shape: (1, 2)
orgin embed shape: (1, 10)
orgin embed shape: (1, 10)
numberical vector shape: (1, 3), dtype: <dtype: 'float64'>
category embedding shape: (1, 20), dtype: <dtype: 'float64'>
concated feature vector, shape: (1, 23)


2023-02-18 14:47:34.140757: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
