## YouTubeNet 召回模型

### MovieLens 数据集处理
数据处理方式与 MIND 算法相同，可参考 MIND 部分的数据处理结果。

In [1]:
import os

# Directory of the ml-1m data and the sub-directory holding the MIND-style
# processed files that this notebook reads.
input_dir = "../../../data/ml-1m/"
output_dir = "../../../data/ml-1m/mind/"

# Both split files sit side by side inside ``output_dir``.
train_path, test_path = (
    os.path.join(output_dir, split_file)
    for split_file in ("train.txt", "test.txt")
)

### 模型结构定义

In [5]:
import tensorflow as tf
from tensorflow.keras.layers import Lambda, Layer


class SequencePoolingLayer(Layer):
    """Pool a sequence of embeddings along the time axis (axis 1).

    The layer is called with a two-element list
    ``[sequence_embeddings, sequence_lengths]`` and reduces axis 1 of
    ``sequence_embeddings`` with ``mean``, ``max`` or ``sum`` while keeping
    that axis (``keepdims=True``).

    Args:
        mode: One of ``"mean"``, ``"max"`` or ``"sum"``.
        support_mask: When true, positions at or beyond each row's length are
            excluded from the reduction via ``tf.sequence_mask``. When false,
            every position is reduced and ``mean`` divides by
            ``sequence_mask_length``.
        sequence_mask_length: Length of the time axis; used as ``maxlen`` of
            the mask and as the divisor of the unmasked mean.
        **kwargs: Forwarded to ``Layer.__init__``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """

    def __init__(self, mode="mean", support_mask=True, sequence_mask_length=50, **kwargs):

        if mode not in ["mean", "max", "sum"]:
            raise ValueError("mode must be `mean`, `max` or `sum` !")

        # Initialise the base layer first so attribute assignment below goes
        # through a fully constructed Keras layer.
        super(SequencePoolingLayer, self).__init__(**kwargs)

        self.mode = mode
        # Small constant added to divisors to avoid division by zero.
        self.eps = tf.constant(1e-8, tf.float32)
        self.support_mask = support_mask
        self.sequence_mask_length = sequence_mask_length

    def build(self, input_shape):
        # The layer has no trainable weights; only mark it as built.
        super(SequencePoolingLayer, self).build(input_shape)

    def call(self, input_hist_seq_list, **kwargs):
        """Reduce the sequence embeddings over axis 1.

        Args:
            input_hist_seq_list: ``[embeddings, lengths]``. The transpose
                below uses a rank-3 permutation on the mask, so ``lengths`` is
                assumed to be ``(batch, 1)`` and ``embeddings``
                ``(batch, sequence_mask_length, dim)`` -- TODO confirm against
                callers.

        Returns:
            The pooled tensor with axis 1 reduced to size 1.
        """
        hist_user_embedding_list, hist_user_behavior_length = input_hist_seq_list

        if not self.support_mask:
            return self._pool_without_mask(hist_user_embedding_list)

        return self._pool_with_mask(hist_user_embedding_list, hist_user_behavior_length)

    def _pool_without_mask(self, hist_user_embedding_list):
        # Reduce over every time step, padded or not.
        if self.mode == "max":
            return tf.reduce_max(hist_user_embedding_list, 1, keepdims=True)

        mode_sum = tf.reduce_sum(hist_user_embedding_list, 1, keepdims=True)

        if self.mode == "sum":
            return mode_sum

        # mean: divide by the configured (fixed) sequence length.
        return tf.divide(mode_sum, self.sequence_mask_length + self.eps)

    def _pool_with_mask(self, hist_user_embedding_list, hist_user_behavior_length):
        # Reduce only over the first ``length`` time steps of every row.

        # 1.0 for valid positions, 0.0 for positions past the row's length.
        mask_list = tf.sequence_mask(hist_user_behavior_length, self.sequence_mask_length, dtype=tf.float32)

        # Move the time axis of the mask to axis 1 so it lines up with the
        # time axis of the embeddings.
        mask_transpose_list = tf.transpose(mask_list, (0, 2, 1))
        embedding_dim = hist_user_embedding_list.shape[-1]

        # Repeat the mask across the embedding dimension.
        mask_tile_list = tf.tile(mask_transpose_list, [1, 1, embedding_dim])

        if self.mode == "max":
            # Push masked positions to a very negative value so they never win.
            hist = hist_user_embedding_list - (1 - mask_tile_list) * 1e9
            return tf.reduce_max(hist, 1, keepdims=True)

        mode_sum = tf.reduce_sum(hist_user_embedding_list * mask_tile_list, 1, keepdims=True)

        if self.mode == "sum":
            return mode_sum

        # mean: divide by the number of valid positions in each row.
        valid_count = tf.reduce_sum(mask_list, axis=-1, keepdims=True)
        return tf.divide(mode_sum, tf.cast(valid_count, tf.float32) + self.eps)

    def compute_output_shape(self, input_shape):
        # input_shape[0] is the shape of the embeddings; the time axis becomes 1.
        return (None, 1, input_shape[0][-1])

    def get_config(self):
        """Return the constructor arguments so Keras can serialize the layer."""
        config = {"mode": self.mode, "support_mask": self.support_mask,
                  "sequence_mask_length": self.sequence_mask_length}

        base_config = super(SequencePoolingLayer, self).get_config()

        return dict(list(base_config.items()) + list(config.items()))

    def config(self):
        """Alias of ``get_config`` kept for callers of the old method name."""
        return self.get_config()

In [7]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, concatenate, Dense, Dropout

from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


def YouTubeNet(
    sparse_input_length=1,
    dense_input_length=1,
    sparse_seq_input_length=50,

    embedding_dim = 64,
    neg_sample_num = 10,
    user_hidden_unit_list = (128, 64),

    user_id_vocab_size = 6040+1,
    gender_vocab_size = 2+1,
    age_vocab_size = 7+1,
    occupation_vocab_size = 21+1,
    zip_vocab_size = 3439+1,
    item_id_vocab_size = 3706+1
    ):
    """Build the YouTubeNet retrieval model with sampled-softmax style output.

    The user tower concatenates the embeddings of five categorical user
    features with the mean of the clicked-item embeddings and passes the
    result through a stack of ReLU dense layers. The item side embeds one
    positive and ``neg_sample_num`` negative item ids with the same item
    embedding table. The output is the softmax over the dot products between
    the user vector and those item vectors; index 0 is the positive sample
    and indices ``1..neg_sample_num`` are the negatives.

    Args:
        sparse_input_length: Length of every single-valued categorical input.
            NOTE: the concatenation with the pooled sequence embedding (time
            axis of size 1) looks like it only works for the default of 1.
        dense_input_length: Unused; kept for interface compatibility.
        sparse_seq_input_length: Length of the clicked-item id sequence.
        embedding_dim: Size of every embedding vector.
        neg_sample_num: Number of negative item ids per example.
        user_hidden_unit_list: Widths of the user-tower dense layers. The
            last width must equal ``embedding_dim`` because the user vector is
            multiplied with the item embeddings.
        user_id_vocab_size, gender_vocab_size, age_vocab_size,
        occupation_vocab_size, zip_vocab_size, item_id_vocab_size:
            ``input_dim`` of the corresponding embedding tables. The defaults
            are the values previously hard-coded in this function.

    Returns:
        A ``tf.keras.Model`` with the extra attributes ``user_input``,
        ``user_embedding``, ``item_input`` and ``item_embedding`` that expose
        the tensors needed to build separate user/item embedding models.

    Raises:
        ValueError: If ``user_hidden_unit_list`` is empty or its last entry
            differs from ``embedding_dim``.
    """
    user_hidden_unit_list = list(user_hidden_unit_list)
    if not user_hidden_unit_list or user_hidden_unit_list[-1] != embedding_dim:
        raise ValueError(
            "the last entry of user_hidden_unit_list must equal embedding_dim "
            "({0}), got {1}".format(embedding_dim, user_hidden_unit_list))

    # 1. Input layer
    user_id_input_layer = Input(shape=(sparse_input_length, ), name="user_id_input_layer")
    gender_input_layer = Input(shape=(sparse_input_length, ), name="gender_input_layer")
    age_input_layer = Input(shape=(sparse_input_length, ), name="age_input_layer")
    occupation_input_layer = Input(shape=(sparse_input_length, ), name="occupation_input_layer")
    zip_input_layer = Input(shape=(sparse_input_length, ), name="zip_input_layer")

    user_click_item_seq_input_layer = Input(shape=(sparse_seq_input_length, ), name="user_click_item_seq_input_layer")
    user_click_item_seq_length_input_layer = Input(shape=(sparse_input_length, ), name="user_click_item_seq_length_input_layer")

    pos_item_sample_input_layer = Input(shape=(sparse_input_length, ), name="pos_item_sample_input_layer")
    neg_item_sample_input_layer = Input(shape=(neg_sample_num, ), name="neg_item_sample_input_layer")

    # 2. Embedding layer
    user_id_embedding_layer = Embedding(user_id_vocab_size, embedding_dim, mask_zero=True, name='user_id_embedding_layer')(user_id_input_layer)
    gender_embedding_layer = Embedding(gender_vocab_size, embedding_dim, mask_zero=True, name='gender_embedding_layer')(gender_input_layer)
    age_embedding_layer = Embedding(age_vocab_size, embedding_dim, mask_zero=True, name='age_embedding_layer')(age_input_layer)
    occupation_embedding_layer = Embedding(occupation_vocab_size, embedding_dim, mask_zero=True, name='occupation_embedding_layer')(occupation_input_layer)
    zip_embedding_layer = Embedding(zip_vocab_size, embedding_dim, mask_zero=True, name='zip_embedding_layer')(zip_input_layer)

    # One item table shared by the positive sample, the negative samples and
    # the click history.
    item_id_embedding_layer = Embedding(item_id_vocab_size, embedding_dim, mask_zero=True, name='item_id_embedding_layer')
    pos_item_sample_embedding_layer = item_id_embedding_layer(pos_item_sample_input_layer)
    neg_item_sample_embedding_layer = item_id_embedding_layer(neg_item_sample_input_layer)

    user_click_item_seq_embedding_layer = item_id_embedding_layer(user_click_item_seq_input_layer)

    # NOTE(review): this is a plain mean over all sequence positions; the
    # sequence-length input is not used here, so any padded positions are
    # averaged in as well. SequencePoolingLayer offers a length-masked mean
    # if that is wanted -- switching would change the trained model.
    user_click_item_seq_embedding_layer  = tf.reduce_mean(user_click_item_seq_embedding_layer, 1, keepdims=True)

    ### ********** ###
    # user part
    ### ********** ###

    # 3. Concat "sparse" embedding & "sparse_seq" embedding
    user_embedding_layer = concatenate([user_id_embedding_layer, gender_embedding_layer, age_embedding_layer,
                                       occupation_embedding_layer, zip_embedding_layer, user_click_item_seq_embedding_layer],
                                       axis=-1)

    for i, u in enumerate(user_hidden_unit_list):
        user_embedding_layer = Dense(u, activation="relu", name="FC_{0}".format(i+1))(user_embedding_layer)

    ### ********** ###
    # item part
    ### ********** ###

    # Positive sample first, so it ends up at index 0 of the output.
    item_embedding_layer = concatenate([pos_item_sample_embedding_layer, neg_item_sample_embedding_layer],
                                       axis=1)

    # Swap the last two axes so the user vector can be matrix-multiplied
    # with all candidate item vectors at once.
    item_embedding_layer = tf.transpose(item_embedding_layer, [0,2,1])

    # Output
    dot_output = tf.matmul(user_embedding_layer, item_embedding_layer)
    # Softmax over the candidates: with the defaults this yields 11 values,
    # index 0 is the positive sample and indices 1-10 are the negatives.
    dot_output = tf.nn.softmax(dot_output)

    user_inputs_list = [user_id_input_layer, gender_input_layer, age_input_layer,
                        occupation_input_layer, zip_input_layer,
                        user_click_item_seq_input_layer, user_click_item_seq_length_input_layer]

    item_inputs_list = [pos_item_sample_input_layer, neg_item_sample_input_layer]

    model = Model(inputs = user_inputs_list + item_inputs_list,
                  outputs = dot_output)

    # Expose the tower inputs/outputs so callers can build standalone
    # user-embedding and item-embedding models from the trained graph.
    model.user_input = user_inputs_list
    model.user_embedding = user_embedding_layer

    model.item_input = pos_item_sample_input_layer
    model.item_embedding = pos_item_sample_embedding_layer

    return model

### 模型训练

In [2]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

from data_generator import file_generator

In [3]:
# 1. Load data

val_path = test_path
batch_size = 1000

# Count the examples (one per line); the context managers close the files
# again instead of leaking the handles.
with open(train_path) as f:
    n_train = sum(1 for _ in f)
with open(val_path) as f:
    n_val = sum(1 for _ in f)

train_generator = file_generator(train_path, batch_size)
val_generator = file_generator(val_path, batch_size)

# Integer ceiling division: a trailing partial batch still counts as a step.
steps_per_epoch = (n_train + batch_size - 1) // batch_size
validation_steps = (n_val + batch_size - 1) // batch_size

print("n_train: ", n_train)
print("n_val: ", n_val)

print("steps_per_epoch: ", steps_per_epoch)
print("validation_steps: ", validation_steps)

n_train:  988129
n_val:  6040
steps_per_epoch:  989
validation_steps:  7


In [8]:
# 2. Train model

# Stop when the validation loss has not improved for 10 epochs and roll back
# to the best weights seen.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
callbacks = [early_stopping_cb]


model = YouTubeNet()

# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases reject.
model.compile(loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-3),
    metrics=['sparse_categorical_accuracy'])
# For how loss="sparse_categorical_crossentropy" is applied here, see:
# https://mp.weixin.qq.com/s/H4ET0bO_xPm8TNqltMt3Fg


# NOTE(review): Keras documents `shuffle` as ignored for generator input;
# it is kept in case `file_generator` returns something else -- confirm.
history = model.fit(train_generator,
                    epochs=2,
                    steps_per_epoch = steps_per_epoch,
                    callbacks = callbacks,
                    validation_data = val_generator,
                    validation_steps = validation_steps,
                    shuffle=True
                   )


model_path = "../../../data/ml-1m/youtube/model/YouTubeNet_model.h5"

# Make sure the target directory exists so saving does not fail on a fresh
# checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

model.save_weights(model_path)

Epoch 1/2
Epoch 2/2


### 模型预测

In [11]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model

from data_generator import init_output


# 1. Load model

# Rebuild the architecture and restore the weights saved after training.
re_model = YouTubeNet()
re_model.load_weights(model_path)


# 2. Load data

# `zip_code` instead of `zip` so the builtin `zip` is not shadowed for the
# rest of the session.
user_id, gender, age, occupation, zip_code, \
        hist_movie_id, hist_len, pos_movie_id, neg_movie_id = init_output()

# Each line is tab-separated: user id, gender, age, occupation, zip,
# comma-separated history of movie ids, history length, positive movie id.
with open(val_path, 'r') as f:
    for line in f:

        buf = line.strip().split('\t')

        user_id.append(int(buf[0]))
        gender.append(int(buf[1]))
        age.append(int(buf[2]))
        occupation.append(int(buf[3]))
        zip_code.append(int(buf[4]))
        hist_movie_id.append(np.array([int(i) for i in buf[5].strip().split(",")]))
        hist_len.append(int(buf[6]))
        pos_movie_id.append(int(buf[7]))


user_id = np.array(user_id, dtype='int32')
gender = np.array(gender, dtype='int32')
age = np.array(age, dtype='int32')
occupation = np.array(occupation, dtype='int32')
zip_code = np.array(zip_code, dtype='int32')
hist_movie_id = np.array(hist_movie_id, dtype='int32')
hist_len = np.array(hist_len, dtype='int32')
pos_movie_id = np.array(pos_movie_id, dtype='int32')


# 3. Generate user features for testing and full item features for retrieval

# Order must match the `user_input` list exposed by the model.
test_user_model_input = [user_id, gender, age, occupation, zip_code, hist_movie_id, hist_len]
# Every item id 0..3706, as an int32 array like the other model inputs.
all_item_model_input = np.arange(0, 3706+1, dtype='int32')

user_embedding_model = Model(inputs=re_model.user_input, outputs=re_model.user_embedding)
item_embedding_model = Model(inputs=re_model.item_input, outputs=re_model.item_embedding)

user_embs = user_embedding_model.predict(test_user_model_input)
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

print(user_embs.shape)
print(item_embs.shape)


# Drop the singleton middle axis; the width is taken from the arrays rather
# than hard-coded.
user_embs = np.reshape(user_embs, (-1, user_embs.shape[-1]))
item_embs = np.reshape(item_embs, (-1, item_embs.shape[-1]))

print(user_embs[:2])

(6040, 1, 64)
(3707, 1, 64)
[[3.7275229  0.77930844 0.         0.         0.         3.0729573
  0.6435612  0.         0.         0.         0.         0.36303705
  0.         0.         0.         0.         1.1476315  1.9882993
  0.         0.51794946 1.6225233  0.8736892  0.7667916  0.
  0.97278404 0.         0.9066596  0.39286005 0.         1.0374975
  1.3326021  1.3058554  0.         1.1266472  1.0915216  1.4909037
  3.4928114  0.91215193 0.         0.30906323 0.         2.2884986
  0.         0.08073004 2.4222858  0.26510972 0.         0.08399317
  0.5417748  1.5366259  0.         1.343622   2.1907806  0.
  0.         2.3240263  0.         3.224169   0.         0.
  1.44854    0.         0.5975563  0.        ]
 [0.         9.178487   1.9666482  0.         1.2365543  0.
  0.2566222  2.4942667  1.0633035  1.7844172  0.         0.
  0.         0.         0.76999754 5.2367306  0.         1.0767261
  0.9573877  0.8556204  0.         0.         0.         0.41146037
  0.         1.4084