# MIND

## movielens 数据集处理

In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model

import random
from tqdm import tqdm
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [2]:
input_dir = "../../../data/ml-1m/"
output_dir = "../../../data/ml-1m/mind/"

train_path = os.path.join(output_dir, "train.txt")
test_path = os.path.join(output_dir, "test.txt")



In [3]:
def gen_data_set(data, negsample=0):

    data.sort_values("Timestamp", inplace=True)
    item_ids = data['MovieID'].unique()

    train_set = []
    test_set = []
    for reviewerID, histlist in tqdm(data.groupby('UserID')):
        pos_list = histlist['MovieID'].tolist()
        rating_list = histlist['Rating'].tolist()

        if negsample > 0:
            candidate_set = list(set(item_ids) - set(pos_list))
            neg_list = np.random.choice(candidate_set,size=len(pos_list)*negsample,replace=True)
        for i in range(1, len(pos_list)):
            hist = pos_list[:i]
            if i != len(pos_list) - 1:
                train_set.append((reviewerID, hist[::-1], pos_list[i], 1,len(hist[::-1]),rating_list[i]))
                for negi in range(negsample):
                    train_set.append((reviewerID, hist[::-1], neg_list[i*negsample+negi], 0,len(hist[::-1])))
            else:
                test_set.append((reviewerID, hist[::-1], pos_list[i],1,len(hist[::-1]),rating_list[i]))

    random.shuffle(train_set)
    random.shuffle(test_set)

    print(len(train_set),len(test_set))

    return train_set,test_set

def gen_model_input(train_set,user_profile,seq_max_len):

    train_uid = np.array([line[0] for line in train_set])
    train_seq = [line[1] for line in train_set]
    train_iid = np.array([line[2] for line in train_set])
    train_label = np.array([line[3] for line in train_set])
    train_hist_len = np.array([line[4] for line in train_set])

    train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', value=0)
    train_model_input = {"UserID": train_uid, "MovieID": train_iid, "hist_movie_id": train_seq_pad,
                         "hist_len": train_hist_len}

    for key in ["Gender", "Age", "Occupation", "Zip-code"]:
        train_model_input[key] = user_profile.loc[train_model_input['UserID']][key].values

    return train_model_input,train_label

In [4]:
# Loda Data
users_path = os.path.join(input_dir, "users.dat")
movies_path = os.path.join(input_dir, "movies.dat")
ratings_path = os.path.join(input_dir, "ratings.dat")
users_fields = "UserID::Gender::Age::Occupation::Zip-code".split("::")
movies_fields = "MovieID::Title::Genres".split("::")
ratings_fields="UserID::MovieID::Rating::Timestamp".split("::")
users = pd.read_csv(users_path, sep="::", header=None, engine="python",encoding="latin1", names=users_fields)
movies = pd.read_csv(movies_path, sep="::", header=None, engine="python",encoding="latin1", names=movies_fields)
ratings = pd.read_csv(ratings_path, sep="::", header=None, engine="python",encoding="latin1", names=ratings_fields)

data = pd.merge(pd.merge(ratings, movies), users)

In [5]:
#print("users:", type(users), users.shape, '\n', users.head(3))
#print("movies:", type(movies), movies.shape, '\n', movies.head(3))
#print("ratings:", type(ratings), ratings.shape, '\n', ratings.head(3))
print("data:", type(data), data.shape, '\n', data.head(3))

data: <class 'pandas.core.frame.DataFrame'> (1000209, 10) 
    UserID  MovieID  Rating  Timestamp                                   Title  \
0       1     1193       5  978300760  One Flew Over the Cuckoo's Nest (1975)   
1       1      661       3  978302109        James and the Giant Peach (1996)   
2       1      914       3  978301968                     My Fair Lady (1964)   

                         Genres Gender  Age  Occupation Zip-code  
0                         Drama      F    1          10    48067  
1  Animation|Children's|Musical      F    1          10    48067  
2               Musical|Romance      F    1          10    48067  


In [6]:
# 2. Label Encoding for sparse features, 
# and process sequence features with `gen_date_set` and `gen_model_input`
sparse_features = ["MovieID", "UserID", "Gender", "Age", "Occupation", "Zip-code"]
SEQ_LEN = 50
negsample = 0


features = ['UserID', 'MovieID', 'Gender', 'Age', 'Occupation', 'Zip-code']
feature_max_idx = {}

for feature in features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature]) + 1
    feature_max_idx[feature] = data[feature].max() + 1


user_profile = data[["UserID", "Gender", "Age", "Occupation", "Zip-code"]].drop_duplicates('UserID')
item_profile = data[["MovieID"]].drop_duplicates('MovieID')

user_profile.set_index("UserID", inplace=True)
user_item_list = data.groupby("UserID")['MovieID'].apply(list)
print("data:", type(data), data.shape, '\n', data.head(3))
#print("user_profile:", type(user_profile), user_profile.shape, '\n', user_profile.head(3))
#print("item_profile:", type(item_profile), item_profile.shape, '\n', item_profile.head(3))
#print("user_item_list:", type(user_item_list), user_item_list.shape, '\n', user_item_list.head(3))


data: <class 'pandas.core.frame.DataFrame'> (1000209, 10) 
    UserID  MovieID  Rating  Timestamp                                   Title  \
0       1     1105       5  978300760  One Flew Over the Cuckoo's Nest (1975)   
1       1      640       3  978302109        James and the Giant Peach (1996)   
2       1      854       3  978301968                     My Fair Lady (1964)   

                         Genres  Gender  Age  Occupation  Zip-code  
0                         Drama       1    1          11      1589  
1  Animation|Children's|Musical       1    1          11      1589  
2               Musical|Romance       1    1          11      1589  


In [7]:
train_set, test_set = gen_data_set(data, negsample)
train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)


100%|██████████| 6040/6040 [00:10<00:00, 566.28it/s] 


988129 6040


In [8]:
#print('train_model_input', type(train_model_input), train_model_input)
print('train_label',type(train_label),len(train_label))

train_label <class 'numpy.ndarray'> 988129


In [9]:
train_neg_sample_list = []
test_neg_sample_list = []
all_movie_list = set(data['MovieID'])
neg_sample_num = 10

for i in tqdm(range(len(train_label))):
    a = set(train_model_input['hist_movie_id'][i] + train_model_input['MovieID'][i])
    neg_list = random.sample(list(all_movie_list - a), neg_sample_num)
    train_neg_sample_list.append(np.array(neg_list))
    
for i in tqdm(range(len(test_label))):
    a = set(test_model_input['hist_movie_id'][i] + test_model_input['MovieID'][i])
    neg_list = random.sample(list(all_movie_list - a), neg_sample_num)
    test_neg_sample_list.append(np.array(neg_list))

100%|██████████| 988129/988129 [01:57<00:00, 8441.30it/s]
100%|██████████| 6040/6040 [00:00<00:00, 8537.24it/s]


In [10]:
print('train_neg_sample_list', type(train_neg_sample_list), len(train_neg_sample_list),train_neg_sample_list[0])

train_neg_sample_list <class 'list'> 988129 [1234  292  957  694 1480 1177 3304  465  415  238]


In [11]:
# 4. Write to .txt
train = open(train_path, "w")
test = open(test_path, "w")

for i in range(len(train_label)):
    a = train_model_input["UserID"][i]
    b = train_model_input["Gender"][i]
    c = train_model_input["Age"][i]
    d = train_model_input["Occupation"][i]
    e = train_model_input["Zip-code"][i]
    f = train_model_input["hist_movie_id"][i]
    g = train_model_input["hist_len"][i]
    
    h = train_model_input["MovieID"][i]
    m = train_neg_sample_list[i]
    
    train.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"\
               %(str(a), str(b), str(c), str(d), str(e), ','.join([str(ii) for ii in f]), str(g), str(h), ','.join([str(ii) for ii in m])))
    
train.close()



for i in range(len(test_label)):
    a = test_model_input["UserID"][i]
    b = test_model_input["Gender"][i]
    c = test_model_input["Age"][i]
    d = test_model_input["Occupation"][i]
    e = test_model_input["Zip-code"][i]
    f = test_model_input["hist_movie_id"][i]
    g = test_model_input["hist_len"][i]
    
    h = test_model_input["MovieID"][i]
    m = test_neg_sample_list[i]
    
    test.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"\
               %(str(a), str(b), str(c), str(d), str(e), ','.join([str(ii) for ii in f]), str(g), str(h), ','.join([str(ii) for ii in m])))
    
test.close()

#### 产出的数据格式如下
第 1 列 user_id	用户id

第 2 列	gender	用户性别

第 3 列	age	用户年龄

第 4 列	occupation	用户工作

第 5 列	zip	用户邮编

第 6 列	hist_movie_id	用户历史观看电影序列

第 7 列	hist_len	用户历史观看电影长度

第 8 列	pos_movie_id	用户下一步观看的电影（正样本）

第 9 列	neg_movie_id	用户下一步未观看的电影（抽样作为负样本）

### 模型定义与训练

In [7]:
import tensorflow as tf

from tensorflow.keras.layers import Layer
from tensorflow.keras.initializers import RandomNormal, Zeros


class SequencePoolingLayer(Layer):
    """The SequencePoolingLayer is used to apply pooling operation(sum,mean,max) on variable-length sequence feature/multi-value feature.

      Input shape
        - A list of two  tensor [seq_value,seq_len]

        - seq_value is a 3D tensor with shape: ``(batch_size, T, embedding_size)``

        - seq_len is a 2D tensor with shape : ``(batch_size, 1)``,indicate valid length of each sequence.

      Output shape
        - 3D tensor with shape: ``(batch_size, 1, embedding_size)``.

      Arguments
        - **mode**:str.Pooling operation to be used,can be sum,mean or max.

        - **supports_masking**:If True,the input need to support masking.
    """

    def __init__(self, mode='mean', supports_masking=False, **kwargs):

        if mode not in ['sum', 'mean', 'max']:
            raise ValueError("mode must be sum or mean")
        self.mode = mode
        self.eps = tf.constant(1e-8, tf.float32)
        super(SequencePoolingLayer, self).__init__(**kwargs)

        self.supports_masking = supports_masking

    def build(self, input_shape):
        if not self.supports_masking:
            self.seq_len_max = int(input_shape[0][1])
        super(SequencePoolingLayer, self).build(input_shape)  # Be sure to call this somewhere!
        print('input_shape',input_shape, self.seq_len_max)

    def call(self, seq_value_len_list, mask=None, **kwargs):
        if self.supports_masking:
            if mask is None:
                raise ValueError(
                    "When supports_masking=True,input must support masking")
            uiseq_embed_list = seq_value_len_list
            mask = tf.cast(mask, tf.float32)  # tf.to_float(mask)
            user_behavior_length = tf.reduce_sum(mask, axis=-1, keepdims=True)
            mask = tf.expand_dims(mask, axis=2)
        else:
            uiseq_embed_list, user_behavior_length = seq_value_len_list

            mask = tf.sequence_mask(user_behavior_length,
                                    self.seq_len_max, dtype=tf.float32)
            mask = tf.transpose(mask, (0, 2, 1))

        embedding_size = uiseq_embed_list.shape[-1]

        mask = tf.tile(mask, [1, 1, embedding_size])

        if self.mode == "max":
            hist = uiseq_embed_list - (1-mask) * 1e9
            return tf.reduce_max(hist, 1, keepdims=True)

        hist = tf.reduce_sum(uiseq_embed_list * mask, 1, keepdims=False)

        if self.mode == "mean":
            hist = tf.divide(hist, tf.cast(user_behavior_length, tf.float32) + self.eps)

        hist = tf.expand_dims(hist, axis=1)
        return hist

    def compute_output_shape(self, input_shape):
        if self.supports_masking:
            return (None, 1, input_shape[-1])
        else:
            return (None, 1, input_shape[0][-1])

    def compute_mask(self, inputs, mask):
        return None

    def get_config(self, ):
        config = {'mode': self.mode, 'supports_masking': self.supports_masking}
        base_config = super(SequencePoolingLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
        
        

class LabelAwareAttention(Layer):
    def __init__(self, k_max, pow_p=1, **kwargs):
        self.k_max = k_max
        self.pow_p = pow_p
        super(LabelAwareAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        # Be sure to call this somewhere!
        self.embedding_size = input_shape[0][-1]
        super(LabelAwareAttention, self).build(input_shape)

    def call(self, inputs, training=None, **kwargs):
        keys = inputs[0]
        query = inputs[1]
        weight = tf.reduce_sum(keys * query, axis=-1, keepdims=True)
        weight = tf.pow(weight, self.pow_p)  # [x,k_max,1]
        print('keys',keys)
        print('query',query)
        print('weight',weight)


        if len(inputs) == 3:
            k_user = tf.cast(tf.maximum(
                1.,
                tf.minimum(
                    tf.cast(self.k_max, dtype="float32"),  # k_max
                    tf.math.log1p(tf.cast(inputs[2], dtype="float32")) / tf.math.log(2.)  # hist_len
                )
            ), dtype="int64")
            print('k_user',k_user)
            
            seq_mask = tf.transpose(tf.sequence_mask(k_user, self.k_max), [0, 2, 1])
            padding = tf.ones_like(seq_mask, dtype=tf.float32) * (-2 ** 32 + 1)  # [x,k_max,1]
            weight = tf.where(seq_mask, weight, padding)
            print('seq_mask',seq_mask)
            print('padding',padding)
            print('weight',weight)
        
        weight = tf.nn.softmax(weight, name="weight")
        output = tf.reduce_sum(keys * weight, axis=1)
        print('attention output', output)

        return output

    def compute_output_shape(self, input_shape):
        return (None, self.embedding_size)

    def get_config(self, ):
        config = {'k_max': self.k_max, 'pow_p': self.pow_p}
        base_config = super(LabelAwareAttention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
        
        

class CapsuleLayer(Layer):
    def __init__(self, input_units, out_units, max_len, k_max, iteration_times=3,
                 init_std=1.0, **kwargs):
        self.input_units = input_units
        self.out_units = out_units
        self.max_len = max_len
        self.k_max = k_max
        self.iteration_times = iteration_times
        self.init_std = init_std
        super(CapsuleLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        print('input_shape', type(input_shape),input_shape)
        self.routing_logits = self.add_weight(shape=[1, self.k_max, self.max_len],
                                              initializer=RandomNormal(stddev=self.init_std),
                                              trainable=False, name="B", dtype=tf.float32)
        print('routing_logits', self.routing_logits.shape)
        self.bilinear_mapping_matrix = self.add_weight(shape=[self.input_units, self.out_units],
                                                       initializer=RandomNormal(stddev=self.init_std),
                                                       name="S", dtype=tf.float32)
        print('bilinear_mapping_matrix', self.bilinear_mapping_matrix.shape)

        super(CapsuleLayer, self).build(input_shape)

    def call(self, inputs, **kwargs):
        behavior_embddings, seq_len = inputs
        batch_size = tf.shape(behavior_embddings)[0]
        seq_len_tile = tf.tile(seq_len, [1, self.k_max])
        print('seq_len_tile', seq_len_tile.shape)


        for i in range(self.iteration_times):
            mask = tf.sequence_mask(seq_len_tile, self.max_len)
            print('mask', mask.shape, mask)
            pad = tf.ones_like(mask, dtype=tf.float32) * (-2 ** 32 + 1)
            routing_logits_with_padding = tf.where(mask, tf.tile(self.routing_logits, [batch_size, 1, 1]), pad)
            weight = tf.nn.softmax(routing_logits_with_padding)
            behavior_embdding_mapping = tf.tensordot(behavior_embddings, self.bilinear_mapping_matrix, axes=1)
            Z = tf.matmul(weight, behavior_embdding_mapping)
            interest_capsules = squash(Z)
            
            delta_routing_logits = tf.reduce_sum(
                tf.matmul(interest_capsules, tf.transpose(behavior_embdding_mapping, perm=[0, 2, 1])),
                axis=0, keepdims=True
            )
            print('interest_capsules', interest_capsules.shape,interest_capsules)
            print('behavior_embdding_mapping', behavior_embdding_mapping.shape,behavior_embdding_mapping)
            print('delta_routing_logits', delta_routing_logits.shape,delta_routing_logits)
            self.routing_logits.assign_add(delta_routing_logits)

        interest_capsules = tf.reshape(interest_capsules, [-1, self.k_max, self.out_units])
        return interest_capsules

    def compute_output_shape(self, input_shape):
        return (None, self.k_max, self.out_units)

    def get_config(self, ):
        config = {'input_units': self.input_units, 'out_units': self.out_units, 'max_len': self.max_len,
                  'k_max': self.k_max, 'iteration_times': self.iteration_times, "init_std": self.init_std}
        base_config = super(CapsuleLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
    


def squash(inputs):
    vec_squared_norm = tf.reduce_sum(tf.square(inputs), axis=-1, keepdims=True)
    scalar_factor = vec_squared_norm / (1 + vec_squared_norm) / tf.sqrt(vec_squared_norm + 1e-8)
    vec_squashed = scalar_factor * inputs
    
    return vec_squashed

In [8]:
# create model

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, concatenate, Flatten, Dense, Dropout

from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam


def tile_user_otherfeat(user_other_feature, k_max):
        return tf.tile(tf.expand_dims(user_other_feature, -2), [1, k_max, 1])


def mind(
    sparse_input_length=1,
    dense_input_length=1,
    sparse_seq_input_length=50,
    
    embedding_dim = 64,
    neg_sample_num = 10,
    user_hidden_unit_list = [128, 64],
    k_max = 5,
    p = 1,
    dynamic_k = True
    ):
    

    
    # 1. Input layer
    user_id_input_layer = Input(shape=(sparse_input_length, ), name="user_id_input_layer")
    gender_input_layer = Input(shape=(sparse_input_length, ), name="gender_input_layer")
    age_input_layer = Input(shape=(sparse_input_length, ), name="age_input_layer")
    occupation_input_layer = Input(shape=(sparse_input_length, ), name="occupation_input_layer")
    zip_input_layer = Input(shape=(sparse_input_length, ), name="zip_input_layer")
    
    
    user_click_item_seq_input_layer = Input(shape=(sparse_seq_input_length, ), name="user_click_item_seq_input_layer")
    user_click_item_seq_length_input_layer = Input(shape=(sparse_input_length, ), name="user_click_item_seq_length_input_layer")
    
    
    pos_item_sample_input_layer = Input(shape=(sparse_input_length, ), name="pos_item_sample_input_layer")
    neg_item_sample_input_layer = Input(shape=(neg_sample_num, ), name="neg_item_sample_input_layer")


    
    # 2. Embedding layer
    user_id_embedding_layer = Embedding(6040+1, embedding_dim, mask_zero=True, name='user_id_embedding_layer')(user_id_input_layer)
    gender_embedding_layer = Embedding(2+1, embedding_dim, mask_zero=True, name='gender_embedding_layer')(gender_input_layer)
    age_embedding_layer = Embedding(7+1, embedding_dim, mask_zero=True, name='age_embedding_layer')(age_input_layer)
    occupation_embedding_layer = Embedding(21+1, embedding_dim, mask_zero=True, name='occupation_embedding_layer')(occupation_input_layer)
    zip_embedding_layer = Embedding(3439+1, embedding_dim, mask_zero=True, name='zip_embedding_layer')(zip_input_layer)
    
    item_id_embedding_layer = Embedding(3706+1, embedding_dim, mask_zero=True, name='item_id_embedding_layer')
    pos_item_sample_embedding_layer = item_id_embedding_layer(pos_item_sample_input_layer)
    neg_item_sample_embedding_layer = item_id_embedding_layer(neg_item_sample_input_layer)
    
    user_click_item_seq_embedding_layer = item_id_embedding_layer(user_click_item_seq_input_layer)

    

    
    ### ********** ###
    # 3. user part
    ### ********** ###
    
    # 3.1 pooling layer
    user_click_item_seq_embedding_layer_pooling = SequencePoolingLayer()\
        ([user_click_item_seq_embedding_layer, user_click_item_seq_length_input_layer])
    
    print("user_click_item_seq_embedding_layer_pooling", user_click_item_seq_embedding_layer_pooling)
    
    
    # 3.2 capsule layer
    high_capsule = CapsuleLayer(input_units=embedding_dim,
                                out_units=embedding_dim, max_len=sparse_seq_input_length,
                                k_max=k_max)\
                        ([user_click_item_seq_embedding_layer, user_click_item_seq_length_input_layer])
    
    print("high_capsule: ", high_capsule)
    

    # 3.3 Concat "sparse" embedding & "sparse_seq" embedding, and tile embedding
    other_user_embedding_layer = concatenate([user_id_embedding_layer, gender_embedding_layer, \
                                                        age_embedding_layer, occupation_embedding_layer, \
                                                        zip_embedding_layer, user_click_item_seq_embedding_layer_pooling], 
                                       axis=-1)
                                    
    print('other_user_embedding_layer0', other_user_embedding_layer)

    other_user_embedding_layer = tf.tile(other_user_embedding_layer, [1, k_max, 1])
            
    print("other_user_embedding_layer: ", other_user_embedding_layer)
    
    
    
    # 3.4 user dnn part
    user_deep_input = concatenate([other_user_embedding_layer, high_capsule], axis=-1)
    print("user_deep_input0: ", user_deep_input)

    
    for i, u in enumerate(user_hidden_unit_list):
        user_deep_input = Dense(u, activation="relu", name="FC_{0}".format(i+1))(user_deep_input)
        #user_deep_input = Dropout(0.3)(user_deep_input)
        
    print("user_deep_input: ", user_deep_input)
    

    if dynamic_k:
        user_embedding_final = LabelAwareAttention(k_max=k_max, pow_p=p, )(\
                                    [user_deep_input, pos_item_sample_embedding_layer, user_click_item_seq_length_input_layer])
    else:
        user_embedding_final = LabelAwareAttention(k_max=k_max, pow_p=p, )(\
                                    [user_deep_input, pos_item_sample_embedding_layer])
    
    print("user_embedding_final0: ", user_embedding_final)

    user_embedding_final = tf.expand_dims(user_embedding_final, 1)
    print("user_embedding_final: ", user_embedding_final)
    
    
    
    ### ********** ###
    # 4. item part
    ### ********** ###

    item_embedding_layer = concatenate([pos_item_sample_embedding_layer, neg_item_sample_embedding_layer], \
                                       axis=1)
    
    item_embedding_layer = tf.transpose(item_embedding_layer, [0,2,1])
    
    print("item_embedding_layer: ", item_embedding_layer)




    ### ********** ###
    # 5. Output
    ### ********** ###
    
    dot_output = tf.matmul(user_embedding_final, item_embedding_layer)
    dot_output = tf.nn.softmax(dot_output) # 输出11个值，index为0的值是正样本，负样本的索引位置为[1-10]
    
    print('dot_output', dot_output)
    
    user_inputs_list = [user_id_input_layer, gender_input_layer, age_input_layer, \
                        occupation_input_layer, zip_input_layer, \
                        user_click_item_seq_input_layer, user_click_item_seq_length_input_layer]
    
    item_inputs_list = [pos_item_sample_input_layer, neg_item_sample_input_layer]

    model = Model(inputs = user_inputs_list + item_inputs_list,
                  outputs = dot_output)
    
    
    #print(model.summary())
    #tf.keras.utils.plot_model(model, to_file='MIND_model.png', show_shapes=True)


    model.__setattr__("user_input", user_inputs_list)
    model.__setattr__("user_embedding", user_deep_input)
    
    model.__setattr__("item_input", pos_item_sample_input_layer)
    model.__setattr__("item_embedding", pos_item_sample_embedding_layer)
    
    return model

In [9]:
from tensorflow.keras.callbacks import EarlyStopping
from data_generator import file_generator

# 1. Load data

train_path = train_path
val_path = test_path
batch_size = 1000

n_train = sum([1 for i in open(train_path)])
n_val = sum([1 for i in open(val_path)])

train_steps = n_train / batch_size
train_steps_ = n_train // batch_size
validation_steps = n_val / batch_size
validation_steps_ = n_val // batch_size


train_generator = file_generator(train_path, batch_size)
val_generator = file_generator(val_path, batch_size)

steps_per_epoch = train_steps_ if train_steps==train_steps_ else train_steps_ + 1
validation_steps = validation_steps_ if validation_steps==validation_steps_ else validation_steps_ + 1

print("n_train: ", n_train)
print("n_val: ", n_val)

print("steps_per_epoch: ", steps_per_epoch)
print("validation_steps: ", validation_steps)

n_train:  988129
n_val:  6040
steps_per_epoch:  989
validation_steps:  7


In [10]:
# 2. Train model


early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
callbacks = [early_stopping_cb]


model = mind()

model.compile(loss='sparse_categorical_crossentropy', \
    optimizer=Adam(lr=1e-3), \
    metrics=['sparse_categorical_accuracy'])
    
# loss="sparse_categorical_accuracy"的应用方式参见：https://mp.weixin.qq.com/s/H4ET0bO_xPm8TNqltMt3Fg



history = model.fit(train_generator, \
                    epochs=2, \
                    steps_per_epoch = steps_per_epoch, \
                    callbacks = callbacks, 
                    validation_data = val_generator, \
                    validation_steps = validation_steps, \
                    shuffle=True
                   )
                   
                   
model.save_weights('mind_model.h5')

input_shape [TensorShape([None, 50, 64]), TensorShape([None, 1])] 50
user_click_item_seq_embedding_layer_pooling KerasTensor(type_spec=TensorSpec(shape=(None, 1, 64), dtype=tf.float32, name=None), name='sequence_pooling_layer_1/ExpandDims:0', description="created by layer 'sequence_pooling_layer_1'")
input_shape <class 'list'> [TensorShape([None, 50, 64]), TensorShape([None, 1])]
routing_logits (1, 5, 50)
bilinear_mapping_matrix (64, 64)
seq_len_tile (None, 5)
mask (None, 5, 50) Tensor("capsule_layer_1/SequenceMask/Less:0", shape=(None, 5, 50), dtype=bool)
interest_capsules (None, 5, 64) Tensor("capsule_layer_1/mul_1:0", shape=(None, 5, 64), dtype=float32)
behavior_embdding_mapping (None, 50, 64) Tensor("capsule_layer_1/Tensordot:0", shape=(None, 50, 64), dtype=float32)
delta_routing_logits (1, 5, 50) Tensor("capsule_layer_1/Sum_1:0", shape=(1, 5, 50), dtype=float32)
mask (None, 5, 50) Tensor("capsule_layer_1/SequenceMask_1/Less:0", shape=(None, 5, 50), dtype=bool)
interest_capsules (N

## 预测

In [12]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model
from data_generator import init_output



# 1. Load model

re_model = mind()
re_model.load_weights('mind_model.h5')

print(re_model.summary())



# 2. Load data

user_id, gender, age, occupation, zip, \
        hist_movie_id, hist_len, pos_movie_id, neg_movie_id = init_output()

with open("test.txt", 'r') as f:
    for line in f.readlines():

        buf = line.strip().split('\t')

        user_id.append(int(buf[0]))
        gender.append(int(buf[1]))
        age.append(int(buf[2]))
        occupation.append(int(buf[3]))
        zip.append(int(buf[4]))
        hist_movie_id.append(np.array([int(i) for i in buf[5].strip().split(",")]))
        hist_len.append(int(buf[6]))
        pos_movie_id.append(int(buf[7]))
        

user_id = np.array(user_id, dtype='int32')
gender = np.array(gender, dtype='int32')
age = np.array(age, dtype='int32')
occupation = np.array(occupation, dtype='int32')
zip = np.array(zip, dtype='int32')
hist_movie_id = np.array(hist_movie_id, dtype='int32')
hist_len = np.array(hist_len, dtype='int32')
pos_movie_id = np.array(pos_movie_id, dtype='int32')




# 3. Generate user features for testing and full item features for retrieval

test_user_model_input = [user_id, gender, age, occupation, zip, hist_movie_id, hist_len]
all_item_model_input = list(range(0, 3706+1))

user_embedding_model = Model(inputs=re_model.user_input, outputs=re_model.user_embedding)
item_embedding_model = Model(inputs=re_model.item_input, outputs=re_model.item_embedding)

user_embs = user_embedding_model.predict(test_user_model_input)
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

print(user_embs.shape)
print(item_embs.shape)


user_embs = np.reshape(user_embs, (-1, 64))
item_embs = np.reshape(item_embs, (-1, 64))

print(user_embs[:2])
print(item_embs.shape)

input_shape [TensorShape([None, 50, 64]), TensorShape([None, 1])] 50
user_click_item_seq_embedding_layer_pooling KerasTensor(type_spec=TensorSpec(shape=(None, 1, 64), dtype=tf.float32, name=None), name='sequence_pooling_layer_3/ExpandDims:0', description="created by layer 'sequence_pooling_layer_3'")
input_shape <class 'list'> [TensorShape([None, 50, 64]), TensorShape([None, 1])]
routing_logits (1, 5, 50)
bilinear_mapping_matrix (64, 64)
seq_len_tile (None, 5)
mask (None, 5, 50) Tensor("capsule_layer_3/SequenceMask/Less:0", shape=(None, 5, 50), dtype=bool)
interest_capsules (None, 5, 64) Tensor("capsule_layer_3/mul_1:0", shape=(None, 5, 64), dtype=float32)
behavior_embdding_mapping (None, 50, 64) Tensor("capsule_layer_3/Tensordot:0", shape=(None, 50, 64), dtype=float32)
delta_routing_logits (1, 5, 50) Tensor("capsule_layer_3/Sum_1:0", shape=(1, 5, 50), dtype=float32)
mask (None, 5, 50) Tensor("capsule_layer_3/SequenceMask_1/Less:0", shape=(None, 5, 50), dtype=bool)
interest_capsules (N