In [None]:
# Remove TensorFlow (the cell targets 2.7.0) and install the pinned
# Keras 2.2.4 / tensorflow-gpu 1.14.0 / h5py 2.10.0 stack used by the cells below.
!pip uninstall tensorflow==2.7.0 -y
!pip install -U keras==2.2.4 tensorflow-gpu==1.14.0 h5py==2.10.0

In [None]:
import tensorflow
# Sanity check: show the TensorFlow version that was actually imported and
# the GPU device name TensorFlow reports.
print(tensorflow.__version__)
print(tensorflow.test.gpu_device_name())

### Hyperparams (hypers.py)

In [None]:
import os
import numpy as np
# Number of negative candidates sampled for each positive click
# (training rows have 1 + npratio candidates).
npratio = 4
# Maximum number of title tokens kept per news article.
MAX_SENTENCE = 30
# Maximum number of clicked news kept per user history.
MAX_ALL = 50
# NOTE(review): the two constants below repeat the values above and are not
# referenced by any cell visible in this part of the notebook.
MAX_SENT_LENGTH=30
MAX_SENTS=50

### Additional functions

In [None]:
def get_raw_entity_list_dict():
    """Collect the entity annotations of every distinct news article (MIND-large).

    Reads ``news.tsv`` from the train, validation and test folders (in that
    order), keeps the first row seen for each news id and parses the
    second-to-last tab-separated column of that row as JSON.

    Returns:
        list[dict]: one ``{"doc_id": ..., "entities": ...}`` record per
        distinct news id, in first-appearance order.
    """
    news_files = (
        '/content/data/train_set_large/news.tsv',
        '/content/data/val_set_large/news.tsv',
        '/content/data/test_set_large/news.tsv',
    )

    rows = []
    for news_file in news_files:
        with open(news_file) as handle:
            rows.extend(handle.readlines())

    records = []
    seen_ids = set()
    for row in rows:
        fields = row.strip().split('\t')
        doc_id = fields[0]
        if doc_id in seen_ids:
            # Later splits repeat articles; only the first occurrence is kept.
            continue
        records.append({"doc_id":doc_id,"entities":json.loads(fields[-2])})
        seen_ids.add(doc_id)
    return records


def mini_get_raw_entity_list_dict():
    """Collect the entity annotations of every distinct news article (small dataset).

    Same procedure as the large-dataset variant, but reads ``news.tsv`` only
    from the ``train_set`` and ``val_set`` folders.

    Returns:
        list[dict]: one ``{"doc_id": ..., "entities": ...}`` record per
        distinct news id, in first-appearance order.
    """
    news_files = (
        '/content/data/train_set/news.tsv',
        '/content/data/val_set/news.tsv',
    )

    rows = []
    for news_file in news_files:
        with open(news_file) as handle:
            rows.extend(handle.readlines())

    records = []
    seen_ids = set()
    for row in rows:
        fields = row.strip().split('\t')
        doc_id = fields[0]
        if doc_id in seen_ids:
            # Only the first occurrence of a news id is kept.
            continue
        records.append({"doc_id":doc_id,"entities":json.loads(fields[-2])})
        seen_ids.add(doc_id)
    return records

In [None]:
def get_raw_pretrained_entity_embedding():
    """Load the pretrained entity vectors shipped with the MIND-large splits.

    Reads ``entity_embedding.vec`` from the train, validation and test folders
    (in that order). Each line is tab-separated: the entity id followed by the
    vector components. The first vector seen for an entity id wins.

    Returns:
        dict[str, numpy.ndarray]: entity id -> float64 vector.
    """
    import numpy as np

    vec_files = (
        '/content/data/train_set_large/entity_embedding.vec',
        '/content/data/val_set_large/entity_embedding.vec',
        '/content/data/test_set_large/entity_embedding.vec',
    )

    rows = []
    for vec_file in vec_files:
        with open(vec_file,'r') as f:
            rows += f.readlines()

    raw_entity_embedding = {}
    for row in rows:
        fields = row.strip().split('\t')
        entity_id = fields[0]
        if entity_id not in raw_entity_embedding:
            # `np.float` was only an alias of the builtin `float`; it is
            # deprecated since NumPy 1.20 and removed in 1.24, so use the
            # builtin directly (identical dtype: float64).
            raw_entity_embedding[entity_id] = np.array(fields[1:]).astype(float)

    return raw_entity_embedding


def mini_get_raw_pretrained_entity_embedding():
    """Load the pretrained entity vectors shipped with the small dataset splits.

    Reads ``entity_embedding.vec`` from the ``train_set`` and ``val_set``
    folders (in that order). Each line is tab-separated: the entity id followed
    by the vector components. The first vector seen for an entity id wins.

    Returns:
        dict[str, numpy.ndarray]: entity id -> float64 vector.
    """
    import numpy as np

    # The train path used to be relative ('data/train_set/...') while the
    # validation path was absolute; both are now anchored at /content so the
    # function no longer depends on the current working directory.
    vec_files = (
        '/content/data/train_set/entity_embedding.vec',
        '/content/data/val_set/entity_embedding.vec',
    )

    rows = []
    for vec_file in vec_files:
        with open(vec_file,'r') as f:
            rows += f.readlines()

    raw_entity_embedding = {}
    for row in rows:
        fields = row.strip().split('\t')
        entity_id = fields[0]
        if entity_id not in raw_entity_embedding:
            # `np.float` (deprecated in NumPy 1.20, removed in 1.24) was an
            # alias of the builtin `float`; the resulting dtype is unchanged.
            raw_entity_embedding[entity_id] = np.array(fields[1:]).astype(float)

    return raw_entity_embedding

### Data preparation (ProcessRawData.ipynb)

In [None]:
# Download the MIND-large train / dev / test archives into one folder per
# split, then extract each archive inside its own folder.
!wget https://mind201910small.blob.core.windows.net/release/MINDlarge_train.zip -P data/train_set_large/
!wget https://mind201910small.blob.core.windows.net/release/MINDlarge_dev.zip -P data/val_set_large/
!wget https://mind201910small.blob.core.windows.net/release/MINDlarge_test.zip -P data/test_set_large/
!unzip data/test_set_large/MINDlarge_test.zip -d data/test_set_large/
!unzip data/train_set_large/MINDlarge_train.zip -d data/train_set_large/
!unzip data/val_set_large/MINDlarge_dev.zip -d data/val_set_large/

In [None]:
# GloVe: download the glove.840B.300d archive and extract it into
# /content/embedding/, where load_matrix() looks for glove.840B.300d.txt.
!wget https://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip /content/glove.840B.300d.zip -d /content/embedding/

In [None]:
# MIND Large
# Merge the raw MIND-large splits into the flat files read by the
# preprocessing cells: docs.tsv, train.tsv, val.tsv and test.tsv.
raw_data_path = './data'
output_data_path = './input_data'

# Create the output folder on the first run.
if os.path.exists(output_data_path) == False:
  os.mkdir(output_data_path)

with open(os.path.join(raw_data_path,'train_set_large','news.tsv')) as f:
    news1 = f.readlines()
with open(os.path.join(raw_data_path,'val_set_large','news.tsv')) as f:
    news2 = f.readlines()
with open(os.path.join(raw_data_path,'test_set_large','news.tsv')) as f:
    news3 = f.readlines()

# De-duplicate articles across the three splits by news id (first column),
# keeping the first row seen for each id.
news = []
news_dict = {}
for l in news1 + news2 + news3:
    nid = l.strip('\n').split('\t')[0]
    if not nid in news_dict:
        news_dict[nid] = 1
        news.append(l)

with open(os.path.join(output_data_path,'docs.tsv'),'w') as f:
    for i in range(len(news)):
        f.write(news[i])

with open(os.path.join(raw_data_path,'train_set_large','behaviors.tsv')) as f:
    behaviors1 = f.readlines()
with open(os.path.join(raw_data_path,'val_set_large','behaviors.tsv')) as f:
    behaviors2 = f.readlines()
with open(os.path.join(raw_data_path,'test_set_large','behaviors.tsv')) as f:
    behaviors3 = f.readlines()

# Training uses the train behaviors plus ALL validation behaviors; the test
# behaviors are kept for the submission file.
# NOTE(review): val_behaviors is never filled, so val.tsv is written empty.
train_behaviors = []
val_behaviors = []
test_behaviors = []
# NOTE(review): `num` (90% of the validation rows) is computed but not used
# anywhere in this cell.
num = int(0.9*len(behaviors2))

# Get train data (behaviors of all train data + val data)
for i in range(len(behaviors1)):
    train_behaviors.append(behaviors1[i])
for i in range(len(behaviors2)):
    train_behaviors.append(behaviors2[i])

# Keep only a contiguous slice of the combined training rows; with the
# fractions below this is the last 40% (from 60% to 100%).
start_portion = int(len(train_behaviors)*0.6)
end_portion = int(len(train_behaviors)*1)
train_behaviors = train_behaviors[start_portion:end_portion]
print(f"Training start from portion {start_portion} to {end_portion}")

# Get test data
# Remember this is MIND Large (test data doesn't contain labels)
test_behaviors = behaviors3

with open(os.path.join(output_data_path,'train.tsv'),'w') as f:
    for i in range(len(train_behaviors)):
        f.write(train_behaviors[i])
with open(os.path.join(output_data_path,'val.tsv'),'w') as f:
    for i in range(len(val_behaviors)):
        f.write(val_behaviors[i])
with open(os.path.join(output_data_path,'test.tsv'),'w') as f:
    for i in range(len(test_behaviors)):
        f.write(test_behaviors[i])

Training start from portion 1565531 to 2609219


### Preprocessing (preprocessing.py)

In [None]:
# Folder holding docs.tsv / train.tsv / val.tsv / test.tsv written by the data-preparation cell.
data_root_path = '/content/input_data/'
# Folder that contains glove.840B.300d.txt (read by load_matrix).
embedding_path = '/content/embedding/'
# NOTE(review): passed to load_news_entity / load_entity_embedding, neither of
# which reads it in the visible code.
KG_root_path = '/content/HieRec_KGData'

In [None]:
from datetime import datetime
import time
import random
import numpy as np
import os
from nltk.tokenize import word_tokenize
import json

def trans2tsp(timestr):
    """Convert a ``'%m/%d/%Y %I:%M:%S %p'`` timestamp string to integer epoch seconds.

    The string is parsed as a naive datetime and converted with
    ``time.mktime``, i.e. it is interpreted in the machine's local time zone.
    """
    parsed = datetime.strptime(timestr, '%m/%d/%Y %I:%M:%S %p')
    seconds = time.mktime(parsed.timetuple())
    return int(seconds)

def newsample(nnn,ratio):
    """Draw ``ratio`` items from ``nnn`` without replacement, oversampling if needed.

    Args:
        nnn: sequence of candidates (e.g. the non-clicked news ids of an impression).
        ratio: number of items to return.

    Returns:
        list: ``ratio`` sampled items. When ``nnn`` holds fewer than ``ratio``
        items it is first repeated enough times to allow the draw, so items
        may appear more than once. When ``nnn`` is empty an empty list is
        returned (previously this raised ``ZeroDivisionError``); callers that
        fill a zero-initialised array then simply leave the slots at 0.
    """
    pool_size = len(nnn)
    if pool_size == 0:
        # Nothing to sample from: avoid dividing by zero below.
        return []
    if ratio > pool_size:
        # Repeat the pool so that it contains at least `ratio` items.
        return random.sample(nnn*(ratio//pool_size+1),ratio)
    return random.sample(nnn,ratio)

def shuffle(pn,labeler,pos):
    """Return index-copied versions of the three arrays.

    NOTE(review): despite the name, no permutation is applied. Rows and
    columns are both indexed with ``np.arange`` (identity order), so the
    returned arrays hold the same values in the same order as the inputs.

    Args:
        pn: 2-D array with ``npratio + 1`` columns.
        labeler: 2-D array indexed like ``pn``.
        pos: array whose first axis matches ``pn``.

    Returns:
        tuple: ``(pn, labeler, pos)`` as new arrays produced by index selection.
    """
    row_order = np.arange(pn.shape[0])
    pn, labeler, pos = pn[row_order], labeler[row_order], pos[row_order]

    for row in range(pn.shape[0]):
        col_order = np.arange(npratio+1)
        pn[row,:] = pn[row,col_order]
        labeler[row,:] = labeler[row,col_order]
    return pn,labeler,pos

def read_news(path,filenames):
    """Parse a news TSV file into lookup tables.

    Each line is tab-separated; the first four columns are read as
    news id, category, subcategory and title. Titles are lower-cased and
    tokenised with ``word_tokenize``.

    Args:
        path: folder containing the file.
        filenames: name of the TSV file inside ``path``.

    Returns:
        tuple:
            news: news id -> ``[category, subcategory, title_tokens]``.
            news_index: news id -> 1-based row number of the news in the file.
            category_dict: category -> 1-based id.
            subcategory_dict: subcategory -> 1-based id.
            word_dict: token -> 1-based id, in first-appearance order.

        Index 0 is never assigned in any table, so it stays free for padding.
    """
    news={}
    news_index={}
    word_dict={}
    categories=set()
    subcategories=set()

    with open(os.path.join(path,filenames)) as f:
        lines=f.readlines()

    for index, line in enumerate(lines, start=1):
        doc_id,vert,subvert,title = line.strip('\n').split('\t')[0:4]
        news_index[doc_id]=index
        categories.add(vert)
        subcategories.add(subvert)
        # The title is lower-cased once here, so the tokens need no further
        # lower-casing before the vocabulary lookup.
        title=word_tokenize(title.lower())
        news[doc_id]=[vert,subvert,title]
        for word in title:
            if word not in word_dict:
                word_dict[word]=len(word_dict)+1

    # Iterating a set of strings yields a different order on every interpreter
    # run (hash randomisation), which made category ids irreproducible between
    # sessions. Sorting makes the id assignment deterministic.
    category_dict={c:i for i,c in enumerate(sorted(categories),start=1)}
    subcategory_dict={c:i for i,c in enumerate(sorted(subcategories),start=1)}
    return news,news_index,category_dict,subcategory_dict,word_dict

def get_doc_input(news,news_index,category,subcategory,word_dict):
    """Encode every news article as integer arrays.

    Args:
        news: news id -> ``[category, subcategory, title_tokens]``.
        news_index: news id -> row index in the output arrays.
        category: category -> integer id.
        subcategory: subcategory -> integer id.
        word_dict: token -> integer id.

    Returns:
        tuple of int32 arrays with ``len(news) + 1`` rows:
            news_title ``(rows, MAX_SENTENCE)``: token ids of the first
            ``MAX_SENTENCE`` title tokens, zero-padded on the right;
            news_vert ``(rows,)``: category ids;
            news_subvert ``(rows,)``: subcategory ids.
        Rows that no news id maps to stay all zero.
    """
    row_count = len(news)+1
    news_title = np.zeros((row_count,MAX_SENTENCE),dtype='int32')
    news_vert = np.zeros((row_count,),dtype='int32')
    news_subvert = np.zeros((row_count,),dtype='int32')

    for doc_id, (vert, subvert, title) in news.items():
        row = news_index[doc_id]
        news_vert[row] = category[vert]
        news_subvert[row] = subcategory[subvert]
        # Titles longer than MAX_SENTENCE are truncated.
        for position, token in enumerate(title[:MAX_SENTENCE]):
            news_title[row,position] = word_dict[token.lower()]

    return news_title,news_vert,news_subvert

def load_matrix(embedding_path,word_dict):
    """Build the word-embedding matrix from ``glove.840B.300d.txt``.

    Args:
        embedding_path: folder containing ``glove.840B.300d.txt``.
        word_dict: token -> row index in the returned matrix.

    Returns:
        tuple:
            embedding_matrix: float array ``(len(word_dict) + 1, 300)``; rows
            of tokens missing from the file stay zero.
            have_word: tokens for which a vector was found, in file order.
    """
    embedding_matrix = np.zeros((len(word_dict)+1,300))
    have_word = []
    glove_file = os.path.join(embedding_path,'glove.840B.300d.txt')

    # The file is read as bytes and split on ASCII whitespace only.
    with open(glove_file,'rb') as stream:
        for raw_line in stream:
            parts = raw_line.split()
            token = parts[0].decode()
            if token not in word_dict:
                continue
            row = word_dict[token]
            embedding_matrix[row] = np.array([float(value) for value in parts[1:]])
            have_word.append(token)

    return embedding_matrix,have_word

def read_clickhistory(news_index,data_root_path,filename):
    """Parse a behaviors file into per-impression sessions.

    Each line must hold exactly five tab-separated fields: impression id,
    user id, time, space-separated click history, and space-separated
    impressions of the form ``<news id>-<label>``.

    Args:
        news_index: container of known news ids; history entries not in it are dropped.
        data_root_path: folder containing the file.
        filename: name of the behaviors file.

    Returns:
        list: one ``[history, positives, negatives]`` triple per line, where
        positives are the impression news ids labelled ``'1'`` and negatives
        are all others. Impression ids are not filtered by ``news_index``.
    """
    with open(os.path.join(data_root_path,filename)) as f:
        rows = f.readlines()

    sessions = []
    for row in rows:
        _, _uid, _event_time, history, impressions = row.strip().split('\t')

        # An empty history field splits into an empty list.
        known_clicks = [nid for nid in history.split() if nid in news_index]

        pos, neg = [], []
        for item in impressions.split():
            docid, label = item.split('-')
            bucket = pos if label == '1' else neg
            bucket.append(docid)

        sessions.append([known_clicks,pos,neg])
    return sessions

def parse_user(news_index,session):
    """Turn each session's click history into a fixed-length row of news indices.

    Args:
        news_index: news id -> integer index.
        session: list of ``[history, positives, negatives]`` triples.

    Returns:
        dict: ``{'click': int32 array (len(session), MAX_ALL)}``. Histories
        longer than ``MAX_ALL`` keep their last ``MAX_ALL`` entries; shorter
        ones are left-padded with 0.
    """
    user_num = len(session)
    user = {'click': np.zeros((user_num,MAX_ALL),dtype='int32'),}

    for row, (click, pos, neg) in enumerate(session):
        history = [news_index[nid] for nid in click]

        if len(history) > MAX_ALL:
            history = history[-MAX_ALL:]
        else:
            padding = [0]*(MAX_ALL-len(history))
            history = padding + history

        user['click'][row] = np.array(history)
    return user

def get_train_input(news_index,session):
    """Build training samples: one positive plus ``npratio`` sampled negatives each.

    Args:
        news_index: news id -> integer index.
        session: list of ``[history, positives, negatives]`` triples.

    Returns:
        tuple:
            sess_all: int32 array ``(samples, 1 + npratio)``; column 0 is the
            positive news index, the remaining columns the sampled negatives.
            user_id: int32 array ``(samples,)`` with the index of the session
            each sample came from.
            label: float array ``(samples, 1 + npratio)`` with 1 in column 0.
    """
    positives, sampled_negatives, owners = [], [], []
    for sess_id, (_, poss, negs) in enumerate(session):
        for pos in poss:
            # One independent negative draw per positive click.
            drawn = newsample(negs,npratio)
            positives.append(pos)
            sampled_negatives.append(drawn)
            owners.append(sess_id)

    sample_num = len(positives)
    sess_all = np.zeros((sample_num,1+npratio),dtype='int32')
    label = np.zeros((sample_num,1+npratio))
    for row, (pos, drawn) in enumerate(zip(positives, sampled_negatives)):
        sess_all[row,0] = news_index[pos]
        for column, neg in enumerate(drawn, start=1):
            sess_all[row,column] = news_index[neg]
        label[row,0] = 1

    user_id = np.array(owners, dtype='int32')
    return sess_all, user_id, label

def get_test_input(news_index,session):
    """Build evaluation impressions: every candidate of each session with its label.

    Args:
        news_index: news id -> integer index.
        session: list of ``[history, positives, negatives]`` triples.

    Returns:
        tuple:
            Impressions: one dict per session with ``'docs'`` (news indices,
            positives first, then negatives) and ``'labels'`` (1 for
            positives, 0 for negatives).
            userid: int32 array ``(len(session),)`` holding 0..len(session)-1.
    """
    Impressions = []
    for _, poss, negs in session:
        positive_docs = [news_index[docid] for docid in poss]
        negative_docs = [news_index[docid] for docid in negs]
        Impressions.append({
            'labels': [1]*len(positive_docs) + [0]*len(negative_docs),
            'docs': positive_docs + negative_docs,
        })

    userid = np.arange(len(session),dtype='int32')
    return Impressions, userid

def load_news_entity(news_index,KG_root_path):
    """Index the entities attached to each known news article.

    Entity records come from ``get_raw_entity_list_dict()``; records whose
    news id is not in ``news_index`` are ignored. ``KG_root_path`` is accepted
    but not read.

    Args:
        news_index: news id -> integer row index.
        KG_root_path: unused.

    Returns:
        tuple:
            news_entity_id: news id -> set of Wikidata ids of its entities.
            news_entity_np: int32 array ``(known news + 1, 5)``; each row holds
            up to five distinct entity indices of that news, zero-padded.
            EntityId2Index: Wikidata id -> 1-based index, in first-appearance order.

    Raises:
        KeyError: if ``news_index`` contains a news id for which no entity
        record was loaded.
    """
    records = get_raw_entity_list_dict()

    EntityId2Index = {}
    doc_entities = {}
    for record in records:
        docid = record['doc_id']
        if docid not in news_index:
            continue
        triples = []
        doc_entities[docid] = triples
        for entity in record['entities']:
            label = entity['Label']
            eid = entity['WikidataId']
            if eid not in EntityId2Index:
                # Indices start at 1; 0 is kept for padding.
                EntityId2Index[eid] = len(EntityId2Index) + 1
            triples.append([label,eid,EntityId2Index[eid]])

    news_entity_id = {
        docid: set([triple[1] for triple in triples])
        for docid, triples in doc_entities.items()
    }

    padded_indices = {}
    for docid, triples in doc_entities.items():
        # Distinct entity indices, capped at five and right-padded with 0.
        distinct = list(set([triple[2] for triple in triples]))[:5]
        distinct = distinct + [0]*(5-len(distinct))
        padded_indices[docid] = np.array(distinct)

    news_entity_np = np.zeros((len(padded_indices)+1,5),dtype='int32')
    for nid, nix in news_index.items():
        news_entity_np[nix] = padded_indices[nid]

    return news_entity_id,news_entity_np,EntityId2Index

def load_entity_embedding(KG_root_path,EntityId2Index):
    """Build the entity-embedding matrix for the indexed entities.

    Vectors come from ``get_raw_pretrained_entity_embedding()``;
    ``KG_root_path`` is accepted but not read.

    Args:
        KG_root_path: unused.
        EntityId2Index: entity id -> row index in the returned matrix.

    Returns:
        numpy.ndarray: float array ``(len(EntityId2Index) + 1, 100)``; rows of
        entities without a pretrained vector stay zero.
    """
    entity_emb = np.zeros((len(EntityId2Index)+1,100))
    pretrained = get_raw_pretrained_entity_embedding()

    for eid, eix in EntityId2Index.items():
        if eid not in pretrained:
            continue
        entity_emb[eix] = pretrained[eid]
    return entity_emb

### Utils (utils.py)

In [None]:
import numpy as np
from sklearn.metrics import roc_auc_score


def dcg_score(y_true, y_score, k=10):
    """Discounted cumulative gain of the top-``k`` items ranked by ``y_score``.

    Args:
        y_true: relevance of each item.
        y_score: predicted score of each item (higher ranks first).
        k: number of top-ranked items taken into account.

    Returns:
        Sum over the top-``k`` ranks of ``(2**rel - 1) / log2(rank + 1)``,
        with ranks starting at 1.
    """
    top_k = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_k)
    gains = 2 ** relevance - 1
    discounts = np.log2(np.arange(len(relevance)) + 2)
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10):
    """Normalised DCG at ``k``: the DCG of the predicted ranking divided by the
    DCG obtained when items are ranked by their true relevance.

    No guard is applied when the ideal DCG is zero (no relevant item); the
    division is performed as is.
    """
    ideal = dcg_score(y_true, y_true, k)
    achieved = dcg_score(y_true, y_score, k)
    return achieved / ideal


def mrr_score(y_true, y_score):
    """Mean reciprocal rank of the relevant items.

    Items are ranked by descending ``y_score``; the result is the sum of
    ``y_true / rank`` divided by the sum of ``y_true``. No guard is applied
    when ``y_true`` sums to zero.
    """
    ranked_truth = np.take(y_true, np.argsort(y_score)[::-1])
    ranks = np.arange(len(ranked_truth)) + 1
    return np.sum(ranked_truth / ranks) / np.sum(ranked_truth)

### Models (models.py)

In [None]:
import numpy
import keras
from keras.utils.np_utils import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding, concatenate
from keras.layers import Dense, Input, Flatten, average,Lambda

from keras.layers import *
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras import backend as K
from keras.layers import Layer, InputSpec
from keras import initializers 
from keras.utils.vis_utils import plot_model
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from keras.optimizers import *

Using TensorFlow backend.


In [None]:
class Attention(Layer):
    """Multi-head scaled dot-product attention layer.

    The layer is called on a list of tensors, either ``[Q, K, V]`` or
    ``[Q, K, V, Q_len, V_len]``. Each of Q, K and V is projected to
    ``nb_head * size_per_head`` features, split into heads, attended with a
    softmax over the key positions, and the heads are concatenated again.

    Args:
        nb_head: number of attention heads.
        size_per_head: feature size of each head.

    Output shape: ``(batch, query_length, nb_head * size_per_head)``.
    """

    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.output_dim = nb_head*size_per_head
        super(Attention, self).__init__(**kwargs)

    def get_config(self):
        """Expose the constructor arguments so the layer can be re-created
        from a saved model configuration."""
        config = {'nb_head': self.nb_head,
                  'size_per_head': self.size_per_head}
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def build(self, input_shape):
        # One projection matrix per input, sized from that input's last axis.
        self.WQ = self.add_weight(name='WQ',
                                  shape=(input_shape[0][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WK = self.add_weight(name='WK',
                                  shape=(input_shape[1][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WV = self.add_weight(name='WV',
                                  shape=(input_shape[2][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        super(Attention, self).build(input_shape)

    def Mask(self, inputs, seq_len, mode='mul'):
        """Mask positions at or beyond ``seq_len`` along axis 1 of ``inputs``.

        ``mode='mul'`` zeroes the masked positions; ``mode='add'`` subtracts
        1e12 from them (for use before a softmax). With ``seq_len=None`` the
        input is returned unchanged.
        """
        # Identity test: comparing a tensor with `== None` relies on the
        # backend's `__eq__` and is not a reliable "no mask" check.
        if seq_len is None:
            return inputs
        mask = K.one_hot(seq_len[:,0], K.shape(inputs)[1])
        # The cumulative sum turns the one-hot at `seq_len` into ones from
        # that position onward; subtracting from 1 keeps only earlier positions.
        mask = 1 - K.cumsum(mask, 1)
        for _ in range(len(inputs.shape)-2):
            mask = K.expand_dims(mask, 2)
        if mode == 'mul':
            return inputs * mask
        if mode == 'add':
            return inputs - (1 - mask) * 1e12
        raise ValueError("Mask mode must be 'mul' or 'add', got %r" % (mode,))

    def call(self, x):
        if len(x) == 3:
            Q_seq,K_seq,V_seq = x
            Q_len,V_len = None,None
        elif len(x) == 5:
            Q_seq,K_seq,V_seq,Q_len,V_len = x
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError.
            raise ValueError('Attention expects 3 or 5 input tensors, got %d' % len(x))

        # Project, then reshape to (batch, heads, length, size_per_head).
        Q_seq = K.dot(Q_seq, self.WQ)
        Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
        Q_seq = K.permute_dimensions(Q_seq, (0,2,1,3))
        K_seq = K.dot(K_seq, self.WK)
        K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
        K_seq = K.permute_dimensions(K_seq, (0,2,1,3))
        V_seq = K.dot(V_seq, self.WV)
        V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
        V_seq = K.permute_dimensions(V_seq, (0,2,1,3))

        # Scaled dot-product scores; the permutes move the key axis to
        # position 1 so that Mask() can act on it, then move it back.
        A = K.batch_dot(Q_seq, K_seq, axes=[3,3]) / self.size_per_head**0.5
        A = K.permute_dimensions(A, (0,3,2,1))
        A = self.Mask(A, V_len, 'add')
        A = K.permute_dimensions(A, (0,3,2,1))
        A = K.softmax(A)

        # Weighted sum of the values, then merge the heads back together.
        O_seq = K.batch_dot(A, V_seq, axes=[3,2])
        O_seq = K.permute_dimensions(O_seq, (0,2,1,3))
        O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, 'mul')
        return O_seq

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.output_dim)

In [None]:
def AttentivePooling(dim1,dim2):
    """Build a model that pools a ``(dim1, dim2)`` sequence into one ``dim2`` vector.

    After dropout, each of the ``dim1`` vectors is scored by a
    Dense(200, tanh) -> Dense(1) network; the scores are softmax-normalised
    and used to take a weighted sum of the (dropped-out) vectors.
    """
    sequence_in = Input(shape=(dim1,dim2),dtype='float32')
    dropped = Dropout(0.2)(sequence_in)

    hidden = Dense(200,activation='tanh')(dropped)
    flatten = keras.layers.Flatten()
    raw_scores = flatten(Dense(1)(hidden))
    weights = Activation('softmax')(raw_scores)

    # Contract the sequence axis of the vectors with the weight axis.
    pooled = keras.layers.Dot((1,1))([dropped,weights])
    return Model(sequence_in,pooled)

def ConDot():
    """Build a model that takes an 800-d vector, splits it into two 400-d
    halves and returns their dot product."""
    packed = keras.layers.Input(shape=(400*2,))
    first_half = keras.layers.Lambda(lambda x:x[:,:400])(packed)
    second_half = keras.layers.Lambda(lambda x:x[:,400:])(packed)
    dot_product = keras.layers.Dot(axes=-1)([first_half,second_half])
    return Model(packed,dot_product)

def get_doc_encoder(title_word_embedding_matrix,entity_emb_matrix):
    """Build the news encoder: 35 integer ids -> one 400-d news vector.

    The input row is split into the first 30 ids (title tokens) and the
    remaining 5 ids (entities). Each part goes through an embedding layer,
    multi-head self-attention and attentive pooling; the two pooled vectors
    are concatenated and projected to 400 dimensions.

    Args:
        title_word_embedding_matrix: initial word-embedding weights with 300 columns.
        entity_emb_matrix: initial entity-embedding weights with 100 columns.

    Returns:
        keras Model mapping ``(batch, 35)`` int32 ids to ``(batch, 400)``.
    """

    # NOTE(review): 35 = 30 title slots + 5 entity slots are hard-coded here
    # rather than derived from MAX_SENTENCE - confirm they stay in sync.
    news_input = Input(shape=(35,),dtype='int32')
    
    
    # Title branch: ids 0..29 -> 300-d embeddings -> 20 heads x 20 = 400-d -> pooled 400-d.
    sentence_input = keras.layers.Lambda(lambda x:x[:,:30])(news_input)
    title_word_embedding_layer = Embedding(title_word_embedding_matrix.shape[0], 300, weights=[title_word_embedding_matrix],trainable=True)
    word_vecs = title_word_embedding_layer(sentence_input)
    droped_vecs = Dropout(0.2)(word_vecs)
    word_rep = Attention(20,20)([droped_vecs]*3)
    droped_rep = Dropout(0.2)(word_rep)
    title_vec = AttentivePooling(30,400)(droped_rep)
    
    # Entity branch: ids 30..34 -> 100-d embeddings -> 5 heads x 20 = 100-d -> pooled 100-d.
    entity_input = keras.layers.Lambda(lambda x:x[:,30:])(news_input)
    entity_embedding_layer = Embedding(entity_emb_matrix.shape[0], 100, weights=[entity_emb_matrix],trainable=True)
    entity_vecs = entity_embedding_layer(entity_input)
    droped_vecs = Dropout(0.2)(entity_vecs)
    entity_rep = Attention(5,20)([droped_vecs]*3)
    droped_rep = Dropout(0.2)(entity_rep)
    entity_vec = AttentivePooling(5,100)(droped_rep)
    
    # Fuse both views (400 + 100) and project to the final 400-d news vector.
    vec = keras.layers.Concatenate(axis=-1)([title_vec,entity_vec])
    vec = keras.layers.Dense(400)(vec)
    
    
    sentEncodert = Model(news_input, vec)
    return sentEncodert

In [None]:
class CategoryEmbLayer(Layer):
    """Adds a trainable, zero-initialised 400-d offset to its input.

    Args:
        n: number of rows of the offset. With ``n > 1`` the weight has shape
            ``(n, 400)`` (one offset per row of the input); otherwise a single
            ``(400,)`` offset is broadcast over the input.

    Input and output shapes are identical.
    """

    def __init__(self,n, **kwargs):
        super(CategoryEmbLayer, self).__init__(**kwargs)
        self.n = n

    def get_config(self):
        """Expose ``n`` so the layer can be re-created from a saved model
        configuration (it is a required constructor argument)."""
        config = {'n': self.n}
        base_config = super(CategoryEmbLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def build(self, input_shape):
        weight_shape = (self.n,400) if self.n>1 else (400,)
        self.W = self.add_weight(name='W',
                                 shape=weight_shape,
                                 initializer=keras.initializers.Constant(value=np.zeros(weight_shape)),
                                 trainable=True)
        # Mark the layer as built, consistent with the other custom layers in this file.
        super(CategoryEmbLayer, self).build(input_shape)

    def call(self,x):
        return x+self.W

    def compute_output_shape(self, input_shape):
        return input_shape


class Weighter(Layer):
    """Combines three score tensors with fixed (non-trainable) scalar weights.

    The weights are created as 0.15/0.15, 0.15/0.15 and 0.7/0.15 and are
    applied to the first, second and third input respectively.
    """

    def __init__(self, **kwargs):
        super(Weighter, self).__init__(**kwargs)

    def build(self, input_shape):
        def frozen_scalar(name, value):
            # A single-element weight that is never updated by training.
            return self.add_weight(name=name,
                                   shape=(1,),
                                   initializer=keras.initializers.Constant(value=value),
                                   trainable=False)

        self.w1 = frozen_scalar('w1', 0.15/0.15)
        self.w2 = frozen_scalar('w2', 0.15/0.15)
        self.w3 = frozen_scalar('w3', 0.7/0.15)

        super(Weighter, self).build(input_shape)

    def call(self,x):
        first, second, third = x[0], x[1], x[2]
        return self.w1*first+self.w2*second+self.w3*third

    def compute_output_shape(self, input_shape):
        # The output has the shape of the first input.
        return input_shape[0]

In [None]:
def HirUserEncoder(category_dict,subcategory_dict):
    """Build the hierarchical user encoder.

    From the 50 clicked-news vectors (400-d each) the model derives three
    levels of user representation: one vector per subcategory, one per
    category, and one global vector. Below, ``n_sub = len(subcategory_dict)``
    and ``n_cat = len(category_dict)``.

    Inputs (in order): clicked news vectors ``(50, 400)``, clicked-category
    indicator ``(n_cat, 50)``, category mask ``(n_cat,)``, clicked-subcategory
    indicator ``(n_sub, 50)``, subcategory mask ``(n_sub,)``,
    category/subcategory mask ``(n_cat, n_sub)``, per-category click counts
    ``(n_cat,)`` and per-subcategory click counts ``(n_sub,)``.

    NOTE(review): ``clicked_vert_input`` and ``clicked_subvert_mask_input`` are
    declared and listed as model inputs but are not referenced by any layer in
    this function.

    Returns:
        keras Model with outputs ``[user_subvert_rep (n_sub, 400),
        user_vert_rep (n_cat, 400), user_global_rep (400,)]``.
    """
    
    # NOTE(review): AttTrainable is assigned but not used below.
    AttTrainable = True
    
    clicked_title_input = Input(shape=(50,400,), dtype='float32')
    
    clicked_vert_input = Input(shape=(len(category_dict),50,), dtype='float32')
    clicked_vert_mask_input = Input(shape=(len(category_dict),), dtype='float32')
    
    clicked_subvert_input = Input(shape=(len(subcategory_dict),50,), dtype='float32')
    clicked_subvert_mask_input = Input(shape=(len(subcategory_dict),), dtype='float32')
    
    vert_subvert_mask_input = Input(shape=(len(category_dict),len(subcategory_dict)),dtype='float32')

    vert_num_input = Input(shape=(len(category_dict),),dtype='int32')
    subvert_num_input = Input(shape=(len(subcategory_dict),),dtype='int32')

    # Click counts (embedding table of 51 rows) are embedded and scored by a
    # single Dense(1).
    subvert_num_embedding_layer = Embedding(51, 128,trainable=True)
    subvert_num_scorer = Dense(1)


    # The category level reuses the very same embedding and scorer objects
    # (shared weights) as the subcategory level.
    vert_num_embedding_layer = subvert_num_embedding_layer #Embedding(51, 128,trainable=True)
    vert_num_scorer = subvert_num_scorer

    title_vecs = clicked_title_input
    
    trainable = True
    
    # --- Subcategory level -------------------------------------------------
    # One attention logit per clicked news; the kernel starts at zero.
    user_subvert_att = Dense(1,trainable=trainable,use_bias=False,kernel_initializer=keras.initializers.Constant(value=np.zeros((400,1))),)(title_vecs)

    user_subvert_att = keras.layers.Reshape((50,))(user_subvert_att)
    user_subvert_att = keras.layers.RepeatVector(len(subcategory_dict))(user_subvert_att)
    # Subtract 100 from the logits wherever the indicator is 0 so those
    # positions get a negligible softmax weight.
    user_subvert_att = keras.layers.Lambda(lambda x:x[0]-100*(1-x[1]))([user_subvert_att,clicked_subvert_input])    
    user_subvert_att = keras.layers.Activation('softmax')(user_subvert_att) # (n_sub, 50)

    # Multiply by the indicator again so positions with indicator 0 contribute exactly 0.
    user_subvert_att = keras.layers.Lambda(lambda x:x[0]*x[1])([user_subvert_att,clicked_subvert_input]) # (n_sub, 50)
    user_subvert_rep = keras.layers.Dot(axes=[-1,-2])([user_subvert_att,title_vecs]) # (n_sub, 400)
    user_subvert_rep = CategoryEmbLayer(len(subcategory_dict))(user_subvert_rep)  # (n_sub, 400)
    
    subvert_num_emb = subvert_num_embedding_layer(subvert_num_input)
    subvert_num_score = subvert_num_scorer(subvert_num_emb)
    subvert_num_score = Reshape((len(subcategory_dict),))(subvert_num_score) # (n_sub,)
    
    # --- Category level ----------------------------------------------------
    # Score each subcategory vector and add its click-count score.
    user_vert_att = Dense(1,trainable=trainable,use_bias=False,kernel_initializer=keras.initializers.Constant(value=np.zeros((400,1))))(user_subvert_rep)
    user_vert_att = Reshape((len(subcategory_dict),))(user_vert_att) # (n_sub,)
    user_vert_att = Add()([user_vert_att,subvert_num_score]) # (n_sub,)
    
    user_vert_att = RepeatVector(len(category_dict))(user_vert_att) # (n_cat, n_sub)
    user_vert_att = Lambda(lambda x:x[0]-100*(1-x[1]))([user_vert_att,vert_subvert_mask_input]) # (n_cat, n_sub)
    user_vert_att = Softmax()(user_vert_att)
    
    user_vert_rep = keras.layers.Dot(axes=[-1,-2])([user_vert_att,user_subvert_rep]) # (n_cat, 400)
    user_vert_rep = CategoryEmbLayer(len(category_dict))(user_vert_rep) # (n_cat, 400)

    # --- Global level ------------------------------------------------------
    user_global_att = Dense(1,trainable=trainable,use_bias=False,kernel_initializer=keras.initializers.Constant(value=np.zeros((400,1))))(user_vert_rep)
    user_global_att = Reshape((len(category_dict),))(user_global_att) # (n_cat,)

    vert_num_emb = vert_num_embedding_layer(vert_num_input)
    vert_num_score = vert_num_scorer(vert_num_emb)
    vert_num_score = Reshape((len(category_dict),))(vert_num_score) # (n_cat,)

    user_global_att = Add()([user_global_att,vert_num_score]) # (n_cat,)
    user_global_att = Lambda(lambda x:x[0]-100*(1-x[1]))([user_global_att,clicked_vert_mask_input]) # (n_cat,)
    user_global_att = Softmax()(user_global_att)
    
        
    user_global_rep = Dot(axes=[-1,-2])([user_global_att,user_vert_rep]) # (400,)
    
    return Model([clicked_title_input,clicked_vert_input,clicked_vert_mask_input,clicked_subvert_input,clicked_subvert_mask_input,vert_subvert_mask_input,vert_num_input,subvert_num_input],
                 [user_subvert_rep,user_vert_rep,user_global_rep])

In [None]:
def create_model(category_dict,subcategory_dict,title_word_embedding_matrix,entity_emb_matrix):
    """Assemble and compile the full click-prediction model.

    Every candidate (1 positive + ``npratio`` negatives) receives three scores
    against the user: a global score, a category-level score and a
    subcategory-level score. The last two are each multiplied by a per-candidate
    input weight, the three are combined by ``Weighter`` and softmax-normalised
    over the candidates.

    Args:
        category_dict: category -> id (only its length is used here).
        subcategory_dict: subcategory -> id (only its length is used here).
        title_word_embedding_matrix: initial word-embedding weights.
        entity_emb_matrix: initial entity-embedding weights.

    Returns:
        tuple: ``(model, news_encoder, user_encoder, rwer)`` - the compiled
        training model, the shared news encoder, the user encoder and the
        ``Weighter`` layer instance.
    """
    # NOTE(review): MAX_LENGTH is assigned but not used; the literal 35 is used instead.
    MAX_LENGTH = 35    
    news_encoder = get_doc_encoder(title_word_embedding_matrix,entity_emb_matrix)

    user_encoder = HirUserEncoder(category_dict,subcategory_dict)
    
    # User-side inputs: 50 clicked news (35 ids each) plus the category /
    # subcategory indicator and mask tensors expected by the user encoder.
    clicked_title_input = Input(shape=(50,35,), dtype='int32')
    clicked_vert_input = Input(shape=(len(category_dict),50,), dtype='float32')
    clicked_vert_mask_input = Input(shape=(len(category_dict),), dtype='float32')
    clicked_subvert_input = Input(shape=(len(subcategory_dict),50,), dtype='float32')
    clicked_subvert_mask_input = Input(shape=(len(subcategory_dict),), dtype='float32')
    vert_subvert_mask_input = Input(shape=(len(category_dict),len(subcategory_dict)), dtype='float32')
    
    # Candidate-side inputs: news ids plus, per candidate, a vector over
    # categories and a vector over subcategories.
    title_inputs = Input(shape=(1+npratio,35,),dtype='int32') 
    vert_inputs = Input(shape=(1+npratio,len(category_dict),),dtype='float32')  # (1+npratio, n_categories)
    subvert_inputs = Input(shape=(1+npratio,len(subcategory_dict),),dtype='float32')  # (1+npratio, n_subcategories)
    
    vert_num_input = Input(shape=(len(category_dict),),dtype='int32')
    subvert_num_input = Input(shape=(len(subcategory_dict),),dtype='int32')
    
    # Per-candidate multipliers applied to the category / subcategory scores.
    rw_vert_input = Input(shape=(1+npratio,),dtype='float32')
    rw_subvert_input = Input(shape=(1+npratio,),dtype='float32')

    # The same news encoder instance (shared weights) encodes clicked and candidate news.
    clicked_title_vecs = TimeDistributed(news_encoder)(clicked_title_input)
    news_vecs = TimeDistributed(news_encoder)(title_inputs)
    
    news_vecs = Dropout(0.25)(news_vecs)
    clicked_title_vecs = Dropout(0.25)(clicked_title_vecs)

    user_subvert_rep,user_vert_rep,user_global_rep = user_encoder([clicked_title_vecs,clicked_vert_input,clicked_vert_mask_input,clicked_subvert_input,clicked_subvert_mask_input,vert_subvert_mask_input,vert_num_input,subvert_num_input])
    
    
    # Combine the user's category / subcategory vectors using each
    # candidate's category / subcategory input vector.
    vs_user_vec = keras.layers.Dot(axes=(-1,-2))([vert_inputs,user_vert_rep]) #(batch_size,1+npratio,400)
    svs_user_vec = keras.layers.Dot(axes=(-1,-2))([subvert_inputs,user_subvert_rep]) #(batch_size,1+npratio,400)


    # Global score: candidate vector . global user vector.
    score1 = keras.layers.Dot(axes=-1)([news_vecs,user_global_rep])

    # Category-level score: per-candidate dot product, computed by
    # concatenating both 400-d vectors and applying ConDot to each candidate.
    vs_vecs = keras.layers.Concatenate(axis=-1)([news_vecs,vs_user_vec])
    score2 = TimeDistributed(ConDot())(vs_vecs)
    score2 = keras.layers.Reshape((1+npratio,))(score2)
    
    # Subcategory-level score, computed the same way.
    svs_vecs = keras.layers.Concatenate(axis=-1)([news_vecs,svs_user_vec])
    score3 = TimeDistributed(ConDot())(svs_vecs)
    score3 = keras.layers.Reshape((1+npratio,))(score3)
    
    
    score2 = Multiply()([rw_vert_input,score2])
    score3 = Multiply()([rw_subvert_input,score3])

    rwer = Weighter()
    scores = rwer([score1,score2,score3])
    
    
    logits = keras.layers.Activation(keras.activations.softmax,name = 'recommend')(scores)     

    model = Model([title_inputs,vert_inputs,subvert_inputs,
                   clicked_title_input,clicked_vert_input,clicked_vert_mask_input,
                   clicked_subvert_input,clicked_subvert_mask_input,
                   vert_subvert_mask_input,vert_num_input,subvert_num_input,
                  rw_vert_input,rw_subvert_input],logits) # max prob_click_positive
    model.compile(loss=['categorical_crossentropy'],
                  optimizer=Adam(lr=0.0001,amsgrad=True),
                  metrics=['acc'])

    
    return model,news_encoder,user_encoder,rwer

### Pipelines (Main.ipynb)

In [None]:
import nltk
# Fetch the 'punkt' tokenizer data needed by nltk.tokenize.word_tokenize (used in read_news).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import tensorflow as tf
import keras.backend as KTF
# import tensorflow.compat.v1.keras.backend as KTF
# config = tf.compat.v1.ConfigProto()
# config = tf.ConfigProto()
# config.gpu_options.allow_growth=True
# session = tf.compat.v1.Session(config=config)
# session = tf.Session(config=config)
 
# KTF.set_session(session)

In [None]:
# Parse the merged news file and encode titles, categories and subcategories as integer arrays.
news,news_index,category_dict,subcategory_dict,word_dict = read_news(data_root_path,'docs.tsv')
news_title,news_vert,news_subvert=get_doc_input(news,news_index,category_dict,subcategory_dict,word_dict)
news_entity,news_entity_np,EntityId2Index = load_news_entity(news_index,KG_root_path)
# One row per news index: MAX_SENTENCE title-token ids followed by 5 entity ids.
news_info = np.concatenate([news_title,news_entity_np],axis=-1)

In [None]:
# Read the training behaviors, build each impression's fixed-length click
# history, and sample npratio negatives for every positive click.
train_session = read_clickhistory(news_index,data_root_path,'train.tsv')
train_user = parse_user(news_index,train_session)
train_sess, train_user_id, train_label = get_train_input(news_index,train_session)

In [None]:
# Initial weights for the word and entity embedding layers of the news encoder.
title_word_embedding_matrix, have_word = load_matrix(embedding_path,word_dict)
entity_emb_matrix = load_entity_embedding(KG_root_path,EntityId2Index)

In [None]:
# Reverse lookup: integer news index -> news id.
index2nid = dict(zip(news_index.values(), news_index.keys()))

In [None]:
# Table with a leading batch axis of size 1: entry [0, c, s] is 1 when some
# news article has category id c + 1 together with subcategory id s + 1.
vert_subvert_mask_table = np.zeros((1,len(category_dict),len(subcategory_dict)))
# Row 0 of news_vert / news_subvert is skipped; ids are shifted to 0-based positions.
vert_subvert_mask_table[0, news_vert[1:]-1, news_subvert[1:]-1] = 1

In [None]:
from keras.utils import Sequence

class get_hir_train_generator(Sequence):
    """Keras ``Sequence`` that builds training batches for the hierarchical model.

    Each batch combines the candidate news of a slice of training samples with
    the click history of the matching users, and derives category /
    subcategory features from both.

    Besides its constructor arguments, the class reads the module-level names
    ``category_dict``, ``subcategory_dict`` and ``vert_subvert_mask_table``
    every time a batch is built, so they must be defined before the generator
    is iterated.

    Args:
        mask_prob: threshold compared against a uniform(0, 1) draw per sample;
            ``rw_vert`` / ``rw_subvert`` are kept only where the draw exceeds
            it and are zeroed otherwise.
        news_scoring: per-news feature array, indexed by news index.
        index2nid: stored on the instance; not used when building batches.
        news_vert: per-news category ids (id 0 is dropped after one-hot
            encoding).
        subvert: per-news subcategory ids (id 0 is dropped likewise).
        news_entity: per-news entity array; stored on the instance and
            gathered with the other news features, but not part of the
            returned batch.
        news_entity_id: stored on the instance; not used when building batches.
        clicked_news: clicked news indices, indexed by user id.
        user_id: user id of every training sample.
        news_id: candidate news indices of every training sample.
        label: training labels; ``label.shape[0]`` is the number of samples.
        batch_size: number of samples per batch.
    """

    def __init__(self,mask_prob,news_scoring,index2nid,news_vert, subvert,news_entity, news_entity_id, clicked_news,user_id, news_id, label, batch_size):
        self.news_emb = news_scoring
        self.vert = news_vert
        self.subvert = subvert
        self.entity = news_entity
        self.entity_id = news_entity_id
        self.index2nid = index2nid

        self.clicked_news = clicked_news

        self.user_id = user_id
        self.doc_id = news_id
        self.label = label

        self.mask_prob = mask_prob

        self.batch_size = batch_size
        self.ImpNum = self.label.shape[0]  # number of training samples

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(self.ImpNum / float(self.batch_size)))

    def __get_news(self,docids):
        """Gather feature rows, category ids, subcategory ids and entity rows for ``docids``."""
        news_emb = self.news_emb[docids]
        vert = self.vert[docids]
        subvert = self.subvert[docids]
        entity = self.entity[docids]
        return news_emb, vert, subvert, entity

    @staticmethod
    def _one_hot_history(class_ids, num_classes):
        """One-hot encode clicked-news class ids and drop class id 0.

        For 2-D ``class_ids`` (batch, history) the result is laid out as
        (batch, num_classes, history).
        """
        encoded = keras.utils.to_categorical(class_ids, num_classes + 1)
        return encoded.transpose((0, 2, 1))[:, 1:, :]

    def __getitem__(self, idx):
        """Build batch ``idx``.

        Returns:
            ``(inputs, [label])`` where ``inputs`` is the list
            ``[title, vert, subvert, user_title, user_vert, user_vert_mask,
            user_subvert, user_subvert_mask, vert_subvert_mask_input,
            user_vert_num, user_subvert_num, rw_vert, rw_subvert]``.
        """
        start = idx*self.batch_size
        ed = min((idx+1)*self.batch_size, self.ImpNum)  # final batch may be short

        label = self.label[start:ed]

        # Candidate news of the samples in this batch (entity rows are unused).
        title, vert, subvert, _ = self.__get_news(self.doc_id[start:ed])

        # Click history of the users behind those samples.
        clicked_ids = self.clicked_news[self.user_id[start:ed]]
        user_title, user_vert, user_subvert, _ = self.__get_news(clicked_ids)

        n_vert = len(category_dict)
        n_subvert = len(subcategory_dict)

        # One-hot history per class, laid out as (bz, n_class, history).
        user_vert = self._one_hot_history(user_vert, n_vert)
        user_subvert = self._one_hot_history(user_subvert, n_subvert)

        # Click counts per class; each reduction is computed once and reused
        # for both the integer counts and the 0/1 presence masks.
        vert_counts = user_vert.sum(axis=-1)
        subvert_counts = user_subvert.sum(axis=-1)
        user_vert_num = np.array(vert_counts, dtype='int32')
        user_subvert_num = np.array(subvert_counts, dtype='int32')
        user_vert_mask = np.array(vert_counts > 0, dtype='float32')
        user_subvert_mask = np.array(subvert_counts > 0, dtype='float32')

        # (category, subcategory) pairs touched by each user: a subcategory the
        # user clicked, restricted to the pairs present in the global table.
        # The subcategory presence mask is repeated over the category axis in
        # place of a Python loop over every clicked item; the values are the same.
        vert_subvert_mask_input = np.repeat(user_subvert_mask[:, np.newaxis, :], n_vert, axis=1)
        vert_subvert_mask_input = np.array(
            vert_subvert_mask_input * vert_subvert_mask_table > 0, dtype='float32')

        # One-hot candidates without class id 0: (bz, n_candidates, n_class).
        vert = keras.utils.to_categorical(vert, n_vert + 1)[:, :, 1:]
        subvert = keras.utils.to_categorical(subvert, n_subvert + 1)[:, :, 1:]

        # Share of the user's clicks per class; the epsilon guards against an
        # empty history. Shapes: (bz, n_vert) and (bz, n_subvert).
        rw_vert = user_vert_num/(user_vert_num.sum(axis=-1).reshape((len(user_vert_num),1))+10**(-8))
        rw_subvert = user_subvert_num/(user_subvert_num.sum(axis=-1).reshape((len(user_subvert_num),1))+10**(-8))

        # Insert a candidate axis, (bz, 1, n_class), then pick for every
        # candidate the share belonging to that candidate's own class.
        rw_vert = rw_vert.reshape((rw_vert.shape[0],1,rw_vert.shape[1]))
        rw_subvert = rw_subvert.reshape((rw_subvert.shape[0],1,rw_subvert.shape[1]))
        rw_vert = (rw_vert*vert).sum(axis=-1)
        rw_subvert = (rw_subvert*subvert).sum(axis=-1)

        # Randomly zero the share features per sample: they survive only where
        # the uniform draw exceeds mask_prob.
        train_mask = np.random.uniform(0,1,size=(ed-start,1)) > self.mask_prob
        train_mask = np.array(train_mask,dtype='float32')

        rw_vert = rw_vert*train_mask
        rw_subvert = rw_subvert*train_mask

        return ([title,vert,subvert,user_title, user_vert,user_vert_mask,user_subvert,user_subvert_mask,vert_subvert_mask_input,user_vert_num,user_subvert_num,rw_vert,rw_subvert],[label])
    
    
class get_hir_user_generator(Sequence):
    """Keras ``Sequence`` that builds user-side input batches from click histories.

    Each batch covers a contiguous slice of rows of ``clicked_news`` and
    returns the user-side inputs only (no candidate news, no labels).

    Besides its constructor arguments, the class reads the module-level names
    ``category_dict``, ``subcategory_dict`` and ``vert_subvert_mask_table``
    every time a batch is built, so they must be defined before the generator
    is iterated.

    Args:
        news_emb: per-news feature array, indexed by news index.
        news_vert: per-news category ids (id 0 is dropped after one-hot
            encoding).
        news_subvert: per-news subcategory ids (id 0 is dropped likewise).
        news_entity: per-news entity array; stored on the instance and
            gathered with the other news features, but not part of the
            returned batch.
        clicked_news: clicked news indices, one row per user;
            ``clicked_news.shape[0]`` is the number of rows batched over.
        batch_size: number of rows per batch.
    """

    def __init__(self,news_emb,news_vert,news_subvert,news_entity, clicked_news,batch_size):
        self.news_emb = news_emb
        self.vert = news_vert
        self.subvert = news_subvert
        self.entity = news_entity

        self.clicked_news = clicked_news

        self.batch_size = batch_size
        self.ImpNum = self.clicked_news.shape[0]  # number of click-history rows

    def __len__(self):
        """Return the number of batches (the last one may be partial)."""
        return int(np.ceil(self.ImpNum / float(self.batch_size)))

    def __get_news(self,docids):
        """Gather feature rows, category ids, subcategory ids and entity rows for ``docids``."""
        news_emb = self.news_emb[docids]
        vert = self.vert[docids]
        subvert = self.subvert[docids]
        entity = self.entity[docids]
        return news_emb, vert, subvert, entity

    @staticmethod
    def _one_hot_history(class_ids, num_classes):
        """One-hot encode clicked-news class ids and drop class id 0.

        For 2-D ``class_ids`` (batch, history) the result is laid out as
        (batch, num_classes, history).
        """
        encoded = keras.utils.to_categorical(class_ids, num_classes + 1)
        return encoded.transpose((0, 2, 1))[:, 1:, :]

    def __getitem__(self, idx):
        """Build batch ``idx``.

        Returns:
            The list ``[user_title, user_vert, user_vert_mask, user_subvert,
            user_subvert_mask, vert_subvert_mask_input, user_vert_num,
            user_subvert_num]``.
        """
        start = idx*self.batch_size
        ed = min((idx+1)*self.batch_size, self.ImpNum)  # final batch may be short

        clicked_ids = self.clicked_news[start:ed]
        # Entity rows are gathered by the helper but not used here.
        user_title, user_vert, user_subvert, _ = self.__get_news(clicked_ids)

        n_vert = len(category_dict)
        n_subvert = len(subcategory_dict)

        # One-hot history per class, laid out as (bz, n_class, history).
        user_vert = self._one_hot_history(user_vert, n_vert)
        user_subvert = self._one_hot_history(user_subvert, n_subvert)

        # Click counts per class; each reduction is computed once and reused
        # for both the integer counts and the 0/1 presence masks.
        vert_counts = user_vert.sum(axis=-1)
        subvert_counts = user_subvert.sum(axis=-1)
        user_vert_num = np.array(vert_counts, dtype='int32')
        user_subvert_num = np.array(subvert_counts, dtype='int32')
        user_vert_mask = np.array(vert_counts > 0, dtype='float32')
        user_subvert_mask = np.array(subvert_counts > 0, dtype='float32')

        # (category, subcategory) pairs touched by each user: a subcategory the
        # user clicked, restricted to the pairs present in the global table.
        # The subcategory presence mask is repeated over the category axis in
        # place of a Python loop over every clicked item; the values are the same.
        vert_subvert_mask_input = np.repeat(user_subvert_mask[:, np.newaxis, :], n_vert, axis=1)
        vert_subvert_mask_input = np.array(
            vert_subvert_mask_input * vert_subvert_mask_table > 0, dtype='float32')

        return [user_title, user_vert,user_vert_mask,user_subvert,user_subvert_mask,vert_subvert_mask_input,user_vert_num,user_subvert_num]

In [None]:
# Build the full training model together with its sub-models.
(model,
 news_encoder,
 user_encoder,
 rews) = create_model(
    category_dict,
    subcategory_dict,
    title_word_embedding_matrix,
    entity_emb_matrix,
)

### Load checkpoint 

In [None]:
# Weights file to restore into `model`; the path is relative to the working directory.
checkpath = '12_13_hierec_v14_portion_30_60.hdf5'
model.load_weights(checkpath)

### Pipelines (continued)

In [None]:
# Training batches of 16 samples. With mask_prob=0.9999 the generator zeroes
# the rw_vert / rw_subvert inputs for nearly every sample.
train_generator = get_hir_train_generator(
    mask_prob=0.9999,
    news_scoring=news_info,
    index2nid=index2nid,
    news_vert=news_vert,
    subvert=news_subvert,
    news_entity=news_entity_np,
    news_entity_id=news_entity,
    clicked_news=train_user['click'],
    user_id=train_user_id,
    news_id=train_sess,
    label=train_label,
    batch_size=16,
)
# One pass over the training generator.
model.fit_generator(train_generator, epochs=1, verbose=1)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/1

### Save checkpoint 

In [None]:
# Destination file for the weights of `model`; the path is relative to the working directory.
checkpath = '12_14_hierec_v14_portion_60_100.hdf5'
model.save_weights(checkpath)