We advise you to read the understand_masks notebook before going through this one.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names, build_input_features
from deepctr.models.sequence.attentional_pooling import AttentionalPooling

In [2]:
def get_xy_fd(hash_flag=False):
    """Create the toy sequence dataset and the feature columns describing it.

    Args:
        hash_flag: passed as ``use_hash`` to the 'user' and 'gender' sparse columns.

    Returns:
        A 5-tuple ``(x, y, constant_feature_columns, behavior_feature_columns,
        behavior_sparse_indicator)``. ``x`` is a dict keyed by the names that
        ``get_feature_names`` yields for all columns; ``y`` is the label array
        holding one entry for each of the 5 samples.
    """
    # Columns holding one value (or one fixed-size vector) per sample.
    # The candidate 'item_id' / 'cate_id' columns are disabled in this toy example.
    constant_feature_columns = [
        SparseFeat('user', 5, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 3),
    ]

    # Behaviour-history columns: two id sequences (max length 4, sharing the
    # "seq_length" length feature) followed by two dense sequences of width 4.
    hist_item_column = VarLenSparseFeat(
        SparseFeat('hist_item_id', vocabulary_size=5 + 1, embedding_dim=8, embedding_name='item_id'),
        maxlen=4, length_name="seq_length")
    hist_cate_column = VarLenSparseFeat(
        SparseFeat('hist_cate_id', 5 + 1, embedding_dim=4, embedding_name='cate_id'),
        maxlen=4, length_name="seq_length")
    behavior_feature_columns = [
        hist_item_column,
        hist_cate_column,
        DenseFeat('hist_dense1', 4),
        DenseFeat('hist_dense2', 4),
    ]

    behavior_sparse_indicator = ["item_id", "cate_id"]

    # Raw values for the 5 samples; trailing zeros in the history arrays are padding.
    raw_features = {
        'user': np.array([0, 1, 2, 3, 4]),
        'gender': np.array([0, 0, 1, 1, 0]),
        'hist_item_id': np.array([[1, 2, 3, 0], [2, 2, 3, 0], [3, 2, 0, 0],
                                  [4, 5, 0, 0], [5, 1, 2, 0]]),
        'hist_cate_id': np.array([[1, 2, 2, 0], [2, 2, 2, 0], [3, 2, 0, 0],
                                  [4, 2, 0, 0], [5, 2, 2, 0]]),
        'pay_score': np.array([[0.1, 0.2, 0.3], [0.2, 0.2, 0.3], [0.3, 0.2, 0.3],
                               [0.4, 0.2, 0.3], [0.5, 0.2, 0.3]]),
        # Number of real (non-padded) positions in each history row.
        "seq_length": np.array([3, 3, 2, 2, 3]),
        'hist_dense1': np.array([[0.5, 0.1, 0.2, 0], [0.7, 0.6, 0.3, 0], [0.3, 0.2, 0, 0],
                                 [0.1, 0.1, 0, 0], [0.2, 0.1, 0.2, 0]]),
        'hist_dense2': np.array([[0.2, 0.2, 0.2, 0], [0.5, 0.1, 0.1, 0], [0.1, 0.2, 0, 0],
                                 [0.4, 0.2, 0, 0], [0.3, 0.1, 0.1, 0]]),
    }

    # Keep only the inputs the columns ask for, in the order get_feature_names reports them.
    wanted_names = get_feature_names(constant_feature_columns + behavior_feature_columns)
    x = {}
    for feature_name in wanted_names:
        x[feature_name] = raw_features[feature_name]

    y = np.array([1, 0, 1, 1, 0])
    return x, y, constant_feature_columns, behavior_feature_columns, behavior_sparse_indicator


def make_list(features):
    """Turn the string-encoded list columns of a CSV batch into numeric tensors.

    Columns whose name contains 'hist', plus 'pay_score', are treated as strings
    of the form '[a, b, ...]': the first and last characters are dropped, the
    remainder is split on ',' and converted to numbers. 'hist_dense1',
    'hist_dense2' and 'pay_score' use the default output type of
    ``tf.strings.to_number``; the other parsed columns become int32. All
    remaining columns are passed through unchanged and 'y' is removed from
    the feature dict.

    Args:
        features: mapping from column name to a batch tensor; must contain 'y'.

    Returns:
        ``(parsed_features, features['y'])``.
    """
    float_columns = ['hist_dense1', 'hist_dense2', 'pay_score']
    parsed = {}
    for key, column in features.items():
        if key in ['y']:
            continue

        is_list_column = (key.find('hist') >= 0) or (key == 'pay_score')
        if not is_list_column:
            parsed[key] = column
            continue

        # Cut the surrounding brackets: start at position 1, keep len - 2 characters.
        start = tf.ones_like(column, dtype=tf.int32)
        length = tf.strings.length(column) - 2
        inner = tf.strings.substr(column, start, length)
        # split() yields a ragged tensor; to_tensor() converts it to a dense one.
        tokens = tf.strings.split(inner, ',').to_tensor()

        if key in float_columns:
            parsed[key] = tf.strings.to_number(tokens)
        else:
            parsed[key] = tf.strings.to_number(tokens, out_type=tf.int32)
    return parsed, features['y']


def stack_constant_dense(features, label):
    """Concatenate the constant dense features of a batch along the last axis.

    The columns are taken from the notebook-level ``constant_dense_feature_columns``
    list, which must be defined before this function is called.

    Args:
        features: mapping from feature name to tensor.
        label: ignored; only the feature tensor is returned.

    Returns:
        A single tensor joining the selected features on axis -1.
    """
    selected = {}
    for column in constant_dense_feature_columns:
        selected[column.name] = features[column.name]
    return tf.concat([*selected.values()], -1)


def stack_sequence_dense(features, label):
    """Stack the dense sequence features of a batch on a new last axis.

    The columns are taken from the notebook-level ``varlen_dense_feature_columns``
    list, which must be defined before this function is called.

    Args:
        features: mapping from feature name to tensor.
        label: ignored; only the feature tensor is returned.

    Returns:
        A single tensor with the selected features stacked on axis -1.
    """
    selected = {}
    for column in varlen_dense_feature_columns:
        selected[column.name] = features[column.name]
    return tf.stack([*selected.values()], -1)

constant_feature_columns:
user (None,), gender(None,), pay_score(None, 3)

behavior_feature_columns(L=4):
hist_item_id(None, L), hist_cate_id(None, L)
hist_dense1(None, L), hist_dense2(None, L)

behavior_sparse_indicator: "item_id", "cate_id"
used to indicate sparse features that need to be embedded

x: dict, feature array
{user: (None, ), gender: (None, ), pay_score: (None, 3),
 hist_item_id: (None, L), hist_cate_id: (None, L),
 hist_dense1: (None, L), hist_dense2: (None, L),
 seq_length: (None,)}

L is the maximum sequence length over all samples; shorter samples
 are padded with 0, so seq_length is used to indicate the actual length of
 each sample.

y: label array (None, )

In [3]:
# Build the toy sequence data and the feature-column definitions.
x, y, constant_feature_columns, behavior_feature_columns, behavior_sparse_indicator = get_xy_fd()
# Disabled one-off export: writes every feature (as a list per row) plus the label
# to the CSV path that the dataset cells read. Uncomment to regenerate the file.
# df = pd.DataFrame()
# for name, value in x.items():
#     print(name)
#     df[name] = value.tolist()
# df['y'] = y
# df.to_csv('../data/sequence/toy_sequence.csv', index=None)

In [4]:
# All three datasets read the same toy CSV with the same batch size.
TOY_CSV_PATH = '../data/sequence/toy_sequence.csv'
BATCH_SIZE = 2

# Training dataset.
# During fit, if an epoch does not consume the whole dataset, the next epoch
# continues from what is left instead of starting over.
# shuffle=True ensures that once the dataset has been fully traversed, the next
# traversal uses a freshly shuffled order.
csv_train_ds = tf.data.experimental.make_csv_dataset(
    TOY_CSV_PATH,
    batch_size=BATCH_SIZE,
    shuffle=True,
    shuffle_seed=2,
    ignore_errors=True,
)

# Single unshuffled pass over the data.
csv_train_adapt_ds = tf.data.experimental.make_csv_dataset(
    TOY_CSV_PATH,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_epochs=1,
    ignore_errors=True,
)

# Validation dataset.
# During validation, even if an epoch does not consume the whole dataset, the
# next epoch starts again from the beginning.
# shuffle=False ensures that each restart sees the same order as before.
csv_val_ds = tf.data.experimental.make_csv_dataset(
    TOY_CSV_PATH,
    batch_size=BATCH_SIZE,
    shuffle=False,
    shuffle_seed=2,
    num_epochs=1,
    ignore_errors=True,
)

The dataset has only 5 samples and the batch size is 2, so one epoch yields only 3 batches.
Because the dataset is restricted to a single epoch, you get at most
those 3 batches even when you ask for 5 batches.

In [5]:
# Ask for up to 5 raw batches from the single-epoch dataset and print each with its index.
for i, batch in enumerate(csv_train_adapt_ds.take(5)):
    print(i)
    print(batch)

0
OrderedDict([('user', <tf.Tensor: shape=(2,), dtype=int32, numpy=array([0, 1])>), ('gender', <tf.Tensor: shape=(2,), dtype=int32, numpy=array([0, 0])>), ('pay_score', <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'[0.1, 0.2, 0.3]', b'[0.2, 0.2, 0.3]'], dtype=object)>), ('hist_item_id', <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'[1, 2, 3, 0]', b'[2, 2, 3, 0]'], dtype=object)>), ('seq_length', <tf.Tensor: shape=(2,), dtype=int32, numpy=array([3, 3])>), ('hist_cate_id', <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'[1, 2, 2, 0]', b'[2, 2, 2, 0]'], dtype=object)>), ('hist_dense1', <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'[0.5, 0.1, 0.2, 0.0]', b'[0.7, 0.6, 0.3, 0.0]'], dtype=object)>), ('hist_dense2', <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'[0.2, 0.2, 0.2, 0.0]', b'[0.5, 0.1, 0.1, 0.0]'], dtype=object)>), ('y', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([1., 0.], dtype=float32)>)])
1
OrderedDict([('user', <tf.Tensor: shape=(2,), dty

In [6]:
# Sequence features are stored as strings in the CSV, so map make_list over each
# dataset to parse them into arrays and to split off the label 'y'.
csv_train_ds_mapped = csv_train_ds.map(make_list)
csv_train_adapt_ds_mapped = csv_train_adapt_ds.map(make_list)
csv_val_ds_mapped = csv_val_ds.map(make_list)

In [7]:
# Print up to 3 parsed (features, label) batches from the validation dataset.
for batch, label in csv_val_ds_mapped.take(3):
    print(batch)
    print(label)

{'user': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([0, 1])>, 'gender': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([0, 0])>, 'pay_score': <tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0.1, 0.2, 0.3],
       [0.2, 0.2, 0.3]], dtype=float32)>, 'hist_item_id': <tf.Tensor: shape=(2, 4), dtype=int32, numpy=
array([[1, 2, 3, 0],
       [2, 2, 3, 0]])>, 'seq_length': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([3, 3])>, 'hist_cate_id': <tf.Tensor: shape=(2, 4), dtype=int32, numpy=
array([[1, 2, 2, 0],
       [2, 2, 2, 0]])>, 'hist_dense1': <tf.Tensor: shape=(2, 4), dtype=float32, numpy=
array([[0.5, 0.1, 0.2, 0. ],
       [0.7, 0.6, 0.3, 0. ]], dtype=float32)>, 'hist_dense2': <tf.Tensor: shape=(2, 4), dtype=float32, numpy=
array([[0.2, 0.2, 0.2, 0. ],
       [0.5, 0.1, 0.1, 0. ]], dtype=float32)>}
tf.Tensor([1. 0.], shape=(2,), dtype=float32)
{'user': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([2, 3])>, 'gender': <tf.Tensor: shape=(2,), dtype=int32, numpy=arra

In [8]:
# Collect the per-batch labels and user ids so they can be concatenated
# in the following cells.
labels = []
users = []
for feature, label in csv_val_ds_mapped.take(3):
    print('x', feature['user'])
    print('y', label)
    labels.append(label)
    users.append(feature['user'])

x tf.Tensor([0 1], shape=(2,), dtype=int32)
y tf.Tensor([1. 0.], shape=(2,), dtype=float32)
x tf.Tensor([2 3], shape=(2,), dtype=int32)
y tf.Tensor([1. 1.], shape=(2,), dtype=float32)
x tf.Tensor([4], shape=(1,), dtype=int32)
y tf.Tensor([0.], shape=(1,), dtype=float32)


In [9]:
# Join the per-batch label tensors into one tensor along the batch axis.
tf.concat(labels, axis=0)

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 0., 1., 1., 0.], dtype=float32)>

In [10]:
# Join the per-batch user-id tensors into one tensor along the batch axis.
tf.concat(users, axis=0)


<tf.Tensor: shape=(5,), dtype=int32, numpy=array([0, 1, 2, 3, 4])>

In [11]:
# Build the input features for every column (constant + behaviour).
features = build_input_features(constant_feature_columns + behavior_feature_columns)

# DenseFeat columns among the constant features; empty when no columns are given.
constant_dense_feature_columns = (
    [column for column in constant_feature_columns if isinstance(column, DenseFeat)]
    if constant_feature_columns else [])

# DenseFeat columns among the behaviour features; empty when no columns are given.
varlen_dense_feature_columns = (
    [column for column in behavior_feature_columns if isinstance(column, DenseFeat)]
    if behavior_feature_columns else [])

For large datasets, normalization of dense features might be done by a static op.

Constant dense features (pay_score: (None, 3)) are extracted as csv_constant_adapt,
and normalization is applied along the last axis.

Sequence dense features (hist_dense1: (None, L), hist_dense2: (None, L)) are stacked into csv_sequence_adapt
with shape (None, L, 2), and normalization is applied along the last axis.

In [12]:
# Reduce each (features, label) batch to a single dense tensor: the sequence dense
# features stacked on a new last axis, and the constant dense features concatenated.
csv_sequence_adapt = csv_train_adapt_ds_mapped.map(stack_sequence_dense)
csv_constant_adapt = csv_train_adapt_ds_mapped.map(stack_constant_dense)

In [13]:
# Print the first two batches of constant dense features.
for batch in csv_constant_adapt.take(2):
    print(batch)

tf.Tensor(
[[0.1 0.2 0.3]
 [0.2 0.2 0.3]], shape=(2, 3), dtype=float32)
tf.Tensor(
[[0.3 0.2 0.3]
 [0.4 0.2 0.3]], shape=(2, 3), dtype=float32)


In [14]:
# Normalization layer over the last axis; adapt() computes its statistics from
# the constant dense feature batches.
constant_normalizer = tf.keras.layers.experimental.preprocessing.Normalization(axis=-1)
constant_normalizer.adapt(csv_constant_adapt)

In [15]:
# Show the per-feature mean learned by adapt().
constant_normalizer.mean


<tf.Variable 'mean:0' shape=(3,) dtype=float32, numpy=array([0.3, 0.2, 0.3], dtype=float32)>

In [16]:
# Print the first two batches of stacked sequence dense features.
for batch in csv_sequence_adapt.take(2):
    print(batch)

tf.Tensor(
[[[0.5 0.2]
  [0.1 0.2]
  [0.2 0.2]
  [0.  0. ]]

 [[0.7 0.5]
  [0.6 0.1]
  [0.3 0.1]
  [0.  0. ]]], shape=(2, 4, 2), dtype=float32)
tf.Tensor(
[[[0.3 0.1]
  [0.2 0.2]
  [0.  0. ]
  [0.  0. ]]

 [[0.1 0.4]
  [0.1 0.2]
  [0.  0. ]
  [0.  0. ]]], shape=(2, 4, 2), dtype=float32)


In [17]:
# Normalization layer over the last axis; adapt() computes its statistics from
# the stacked sequence dense feature batches.
# NOTE(review): the padded zero positions are part of the adapted data, so they
# contribute to the statistics (e.g. the hist_dense1 mean is 3.6 / 20 = 0.18).
sequence_normalizer = tf.keras.layers.experimental.preprocessing.Normalization(axis=-1)
sequence_normalizer.adapt(csv_sequence_adapt)

In [18]:
# Show the per-feature mean learned by adapt().
sequence_normalizer.mean

<tf.Variable 'mean:0' shape=(2,) dtype=float32, numpy=array([0.18 , 0.135], dtype=float32)>

In [19]:
# Build the model from the feature columns, handing it the two adapted
# normalizers for the sequence and constant dense features.
model = AttentionalPooling(constant_feature_columns, behavior_feature_columns, behavior_sparse_indicator,
                           sequence_dense_normalizer=sequence_normalizer,
                           constant_dense_normalizer=constant_normalizer,
                           dnn_hidden_units=[4, 4], dnn_dropout=0.6)
# Adam optimizer with gradient value clipping at 1.0.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, clipvalue=1.0)
model.compile(optimizer, "binary_crossentropy")
# Uncomment to inspect the layer structure.
# print(model.summary())

In [20]:
# Train on the shuffled training dataset and validate on the unshuffled one.
# The training dataset is built without num_epochs, so it repeats indefinitely;
# steps_per_epoch therefore defines the epoch length. With 5 samples and a batch
# size of 2, 3 steps correspond to roughly one pass over the data.
history = model.fit(csv_train_ds_mapped,
                    steps_per_epoch=3,
                    epochs=5,
                    validation_data=csv_val_ds_mapped,
                    validation_steps=2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [21]:
# The validation dataset is finite (a single epoch of 3 batches), so let predict
# run until it is exhausted instead of requesting more steps than it can supply.
model.predict(csv_val_ds_mapped)




array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]], dtype=float32)