# prepare

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf 
import random
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model
import keras_tuner 

# Report whether a GPU is visible. `tf.test.is_gpu_available()` is deprecated in
# TF 2.x; the documented replacement is `tf.config.list_physical_devices('GPU')`,
# so the boolean is derived from that list instead.
print(len(tf.config.list_physical_devices('GPU')) > 0)
print(tf.config.list_physical_devices('GPU'))
# TF1-compat session configured so that GPU memory is allocated on demand
# rather than reserved up front.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)

def seed_tensorflow(seed):
    """Seed every RNG used in this notebook and install a single-threaded TF session.

    Seeds Python's `random`, NumPy and TensorFlow, exports PYTHONHASHSEED, and
    registers a TF1-compat session limited to one intra-op and one inter-op
    thread as the Keras backend session.

    Args:
        seed: Value passed to each seeding call (stringified for PYTHONHASHSEED).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    single_thread_conf = tf.compat.v1.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    tf.compat.v1.keras.backend.set_session(
        tf.compat.v1.Session(config=single_thread_conf)
    )

True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# parameters

In [4]:
# Central configuration dictionary (paths, seed, optimiser and network sizes)
# read by the cells below.
para = {'data_path':'./input/',  # prefix for action.csv and feedid_emb_64.npy
        'embedding_dim':64,  # output width of the trainable ID embeddings built in mlp_bias
        'seed':40,  # passed to seed_tensorflow before each model-building cell
        'lr':1e-5,  # learning rate handed to the Adam optimiser
        # batch_size / epochs / verbose are read by the model.fit call in the training cell
        'batch_size':512,
        'epochs':64,
        'verbose':1,
        'pretrain':False,
        # settings for the EarlyStopping / ModelCheckpoint callbacks in the training cell
        'callback':{
                     'monitor':'val_loss',
                     'patience':5,
                     'save_model_path':"./model/test_fp.h5"
                     },
        'mlp_dims':[300,200,100],  # units of the successive Dense layers in mlp_bias
        'mlp_act':'relu',
        'mlp_dps':[.2,.2,.2],  # dropout rate applied after each of those Dense layers
        'bias_dim':64,  # units of the Dense layer inside bias()
       }

In [5]:
# Load the action table and the feed embedding matrix (later used as frozen
# pretrained Embedding weights in mlp_bias).
# os.path.join keeps both paths valid whether or not para['data_path'] ends
# with a path separator.
action = pd.read_csv(os.path.join(para['data_path'], "action.csv"))
feed_emb = np.load(os.path.join(para['data_path'], "feedid_emb_64.npy"))

In [None]:
# Chronological per-user split: after sorting by `date_`, each user's first 60%
# of rows go to train, the next 20% to valid and the remainder to test.
feed = action.sort_values(by=['date_'])


def _split_per_user(sorted_df, train_frac=0.6, valid_frac=0.8):
    """Split a date-sorted frame into per-user train / valid / test partitions.

    Vectorised replacement for three `groupby('userid').apply(lambda x: x[a:b])`
    passes: each row's position inside its user group is compared with the
    truncated cut points `int(n * train_frac)` and `int(n * valid_frac)`, where
    `n` is that user's row count.

    Args:
        sorted_df: DataFrame with a `userid` column, already in the desired
            within-user order. Assumes `userid` has no missing values.
        train_frac: Cumulative fraction of each user's rows that ends the train part.
        valid_frac: Cumulative fraction of each user's rows that ends the valid part.

    Returns:
        Tuple `(train, valid, test)` of DataFrames, each ordered by `userid`
        (within-user order preserved) and carrying a fresh RangeIndex.
    """
    by_user = sorted_df.groupby('userid')
    pos = by_user.cumcount()                      # 0-based rank of the row within its user
    size = by_user['userid'].transform('size')    # rows per user, broadcast to every row
    train_end = (size * train_frac).astype(int)   # truncation matches int(len(x) * frac)
    valid_end = (size * valid_frac).astype(int)

    def _collect(mask):
        # A stable sort groups rows by user without disturbing the date order inside a user.
        return (sorted_df[mask]
                .sort_values(by='userid', kind='stable')
                .reset_index(drop=True))

    return (_collect(pos < train_end),
            _collect((pos >= train_end) & (pos < valid_end)),
            _collect(pos >= valid_end))


train, valid, test = _split_per_user(feed)

# model


In [None]:
class Fusion(tf.keras.layers.Layer):
    """Fuse a base logit with a bias logit and emit two sigmoid outputs.

    The layer takes `[o_base, o_bias]`, fuses them according to `fusion_mode`
    and returns `(base_output, bias_output)`. The second output is produced by
    fusing a learnable scalar (`kernel`) with `o_bias` in place of `o_base`.
    During training a weighted cross term between the two fused logits is
    registered through `add_loss`.

    Args:
        alpha: Stored on the layer; not used by the active code path.
        kl: Weight applied to the auxiliary loss added during training.
        fusion_mode: One of 'sum', 'hm' or 'rubi' (see `_fusion_fuction`).
        activation: Accepted for signature compatibility; not used.
        **kwargs: Forwarded to `tf.keras.layers.Layer`.
    """
    # Class-level name kept from the original definition. The compile cell refers
    # to the two outputs as 'fusion' and 'fusion_1' — presumably this is what
    # pins those names; confirm before removing.
    name = 'fusion'

    def __init__(self,
                 alpha = 1.0,
                 kl = 0.1,
                 fusion_mode='sum',
                 activation=None, **kwargs):
        super().__init__(**kwargs)
        self.alpha = alpha
        self.kl = kl
        self.fusion_mode = fusion_mode

    def build(self, input_shape):
        # Single scalar weight that stands in for the base logit when computing
        # the bias-only output.
        self.kernel = self.add_weight(
            name="kernel",
            shape=(1,),
            initializer="one",
        )
        super().build(input_shape)

    def call(self, inputs, training=False):
        """Return `(base_output, bias_output)` for `inputs = [o_base, o_bias]`.

        When `training` is truthy the weighted auxiliary loss is added to the
        layer; otherwise a constant 0.0 loss is added.
        """
        o_base, o_bias = inputs

        base_logit = self._fusion_fuction(o_base, o_bias)
        bias_logit = self._fusion_fuction(self.kernel, o_bias)

        base_output = tf.keras.activations.sigmoid(base_logit)
        bias_output = tf.keras.activations.sigmoid(bias_logit)

        # Only build the auxiliary loss when it is actually used.
        if training:
            kl_loss = self.kl * self._kl_loss(base_logit, bias_logit)
        else:
            kl_loss = 0.0
        self.add_loss(kl_loss, inputs=True)

        return base_output, bias_output

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super().get_config()
        config.update({
            "alpha": self.alpha,
            "kl": self.kl,
            "fusion_mode": self.fusion_mode,
        })
        return config

    def _fusion_fuction(self, o1, o2):
        """Combine two logits according to `self.fusion_mode`.

        - 'sum':  log(sigmoid(o1 + o2))
        - 'hm':   log(p + eps) - log1p(p) with p = sigmoid(o1) * sigmoid(o2)
        - 'rubi': o1 * sigmoid(o2)

        Raises:
            ValueError: If `self.fusion_mode` is not one of the modes above
                (previously this surfaced as an UnboundLocalError).
        """
        eps = 1e-12  # keeps the log finite when the product of sigmoids is 0
        if self.fusion_mode == "sum":
            return tf.math.log(tf.sigmoid(o1 + o2))
        if self.fusion_mode == "hm":
            o = tf.keras.activations.sigmoid(o1) * tf.keras.activations.sigmoid(o2)
            return tf.math.log(o + eps) - tf.math.log1p(o)
        if self.fusion_mode == 'rubi':
            return o1 * tf.keras.activations.sigmoid(o2)
        raise ValueError(
            "Unknown fusion_mode %r; expected 'sum', 'hm' or 'rubi'" % (self.fusion_mode,)
        )

    def _kl_loss(self, o1_fusion, o2_fusion):
        """Mean of -sigmoid(o1_fusion) * log(sigmoid(o2_fusion)) over all elements."""
        p_te = tf.nn.sigmoid(o1_fusion)
        p_nde = tf.nn.sigmoid(o2_fusion)
        kl_loss = -p_te * tf.math.log(p_nde)
        return tf.reduce_mean(kl_loss)


In [None]:
seed_tensorflow(para['seed'])

def get_layer(shape,name,dtype='int32',d1=None,d2=None,pretrain=None,trainable=False):
    """Build a two-layer Sequential model: an Input followed by an Embedding.

    Args:
        shape: Shape passed to the Input layer.
        name: Prefix for the layer names ('<name>_input', '<name>_embedding').
        dtype: Dtype of the Input layer.
        d1: Embedding vocabulary size; when None, both sizes are read from `pretrain.shape`.
        d2: Embedding output dimension (overwritten from `pretrain` when `d1` is None).
        pretrain: Optional 2-D weight matrix used as the initial embedding weights.
        trainable: Whether the embedding weights are trainable.

    Returns:
        The tf.keras.Sequential model.
    """
    net = tf.keras.Sequential()
    net.add(tf.keras.layers.Input(shape=shape, name=name+'_input', dtype=dtype))

    if d1 is None:
        # No explicit sizes given: take them from the pretrained matrix.
        d1, d2 = pretrain.shape[0], pretrain.shape[1]

    initial_weights = None if pretrain is None else [pretrain]
    embedding = tf.keras.layers.Embedding(
        input_dim=int(d1),
        output_dim=int(d2),
        weights=initial_weights,
        trainable=trainable,
        name=name+'_embedding',
    )
    net.add(embedding)
    return net

def mlp_bias():
    """Build the backbone model: ID embeddings -> concat -> BatchNorm -> MLP.

    Six embedding sub-models are created (user, device, feed, author, frozen
    pretrained feed embedding, duration level). Their outputs are concatenated,
    squeezed on axis 1, batch-normalised and passed through the Dense/Dropout
    stack configured by para['mlp_dims'], para['mlp_act'] and para['mlp_dps'].

    Returns:
        tf.keras.Model whose inputs are the six embedding inputs, in the order
        listed above, and whose outputs are `[mlp_vector, duration_lay.outputs]`.
    """
    emb_dim = para['embedding_dim']

    # Series.max() is vectorised; the builtin max() walks the column in Python.
    uid_lay = get_layer((1,),'uid',d1 = action['userid'].max()+1,d2 = emb_dim,trainable=True)
    did_lay = get_layer((1,),'did',d1 = action['device'].max()+1,d2 = emb_dim // 2,trainable=True)

    vid_lay = get_layer((1,),'vid',d1 = action['feedid'].max()+1,d2 = emb_dim,trainable=True)
    aid_lay = get_layer((1,),'aid',d1 = action['authorid'].max()+1,d2 = emb_dim,trainable=True)
    pre_lay = get_layer((1,),'pretrain',pretrain = feed_emb,trainable=False)

    duration_lay = get_layer((1,),'duration_id',d1 = action['duration_level'].max()+1, d2=emb_dim,trainable=True)

    Lay_bi = [uid_lay,did_lay,vid_lay,aid_lay,pre_lay,duration_lay]
    lay_bi_outs = [out for lay in Lay_bi for out in lay.outputs]

    vec = tf.keras.layers.concatenate(lay_bi_outs,axis=-1)
    vec = tf.squeeze(vec,axis=1)
    vec = tf.keras.layers.BatchNormalization()(vec)

    for i, (units, drop_rate) in enumerate(zip(para['mlp_dims'], para['mlp_dps'])):
        # Activation now comes from the config instead of a hard-coded 'relu'.
        vec = tf.keras.layers.Dense(units,
                                    activation = para['mlp_act'],
                                    name='mlp_dense'+str(i))(vec)
        vec = tf.keras.layers.Dropout(drop_rate)(vec)

    model_inputs = [inp for lay in Lay_bi for inp in lay.inputs]

    model = tf.keras.Model(inputs=model_inputs,outputs=[vec,duration_lay.outputs])

    return model

def bias(duration_vec):
    """Residual block over the duration embedding: relu(x + Dense_relu(x)).

    Args:
        duration_vec: Tensor that is squeezed on axis 1 before the block. The
            Add layer requires the squeezed width to match para['bias_dim'].

    Returns:
        Output tensor of the final ReLU activation.
    """
    print(duration_vec.shape)
    x = tf.squeeze(duration_vec,axis=1)
    # h = f(x)
    h = tf.keras.layers.Dense(para['bias_dim'],'relu')(x)
    # g(x + f(x))
    out_activation = tf.keras.layers.Activation("relu")
    residual_sum = tf.keras.layers.Add()([x,h])
    return out_activation(residual_sum)


In [None]:
seed_tensorflow(para['seed'])
def crvdd(alpha,fusion_mode,kl):
    """Assemble the full two-output model.

    The backbone from mlp_bias() provides the main feature vector and the
    duration embedding; the latter goes through bias(). Each branch is mapped
    to a single logit and both logits are combined by a Fusion layer.

    Args:
        alpha: Forwarded to Fusion.
        fusion_mode: Forwarded to Fusion.
        kl: Forwarded to Fusion.

    Returns:
        tf.keras.Model with the backbone inputs and the two Fusion outputs.
    """
    backbone = mlp_bias()

    main_vec = backbone.outputs[0]
    bias_vec = bias(backbone.outputs[1])

    # One scalar logit per branch.
    logits1 = tf.keras.layers.Dense(1)(main_vec)
    logits2 = tf.keras.layers.Dense(1)(bias_vec)

    fusion = Fusion(alpha=alpha,fusion_mode=fusion_mode,kl=kl)
    output1,output2 = fusion([logits1,logits2])

    return tf.keras.Model(inputs=backbone.inputs,
                          outputs=[output1,output2])

# model pics

In [None]:
# Render the model architecture with tensor shapes.
# NOTE(review): in the visible notebook `model` is first assigned in the training
# section further down, so on a fresh kernel (Restart & Run All) this cell raises
# NameError unless that cell has been executed first — consider moving this cell
# after the model is built.
plot_model(model,show_shapes=True)

# training

In [None]:
# Column order fed to the model; 'feedid' appears twice on purpose
# (it is listed once per feed-based model input).
features = ['userid','device','feedid','authorid','feedid','duration_level']

def get_input(df,is_test=False):
    """Turn a DataFrame into Keras-style inputs and labels.

    Args:
        df: DataFrame containing every column in `features` plus the label column.
        is_test: When True the label is read from 'test_label', otherwise from
            'finish_playing'.

    Returns:
        (X, y): X is a list with one (n, 1) array per entry of `features`;
        y is a single-element list holding the (n, 1) label array.
    """
    label_col = 'test_label' if is_test else 'finish_playing'
    X = [df[col].values.reshape(-1,1) for col in features]
    y = [df[label_col].values.reshape(-1,1)]
    return X,y

# Train / valid labels come from 'finish_playing'; the test split uses 'test_label'.
X_train, y_train = get_input(train)
X_valid, y_valid = get_input(valid)
X_test, y_test = get_input(test, is_test=True)

In [None]:
seed_tensorflow(para['seed'])

# Build the model with alpha=0, 'sum' fusion and kl weight 0.8.
model = crvdd(0, 'sum', 0.8)
# Names under which the two model outputs are addressed in compile().
o_list = ['fusion', 'fusion_1']
adam = tf.keras.optimizers.Adam(learning_rate=para['lr'])

# Both outputs share the same loss, weight and metric.
model.compile(
    optimizer=adam,
    loss={out_name: 'binary_crossentropy' for out_name in o_list},
    loss_weights={out_name: 1.0 for out_name in o_list},
    metrics={out_name: [tf.keras.metrics.AUC(name='auc')] for out_name in o_list},
)
    
# es_callback = tf.keras.callbacks.EarlyStopping(monitor=para['callback']['monitor'],
#                                                mode='min',
#                                                patience=para['callback']['patience'])

# checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=para['callback']['save_model_path'],
#                                  monitor=para['callback']['monitor'],
#                                  mode = 'min',
#                                  save_weights_only=True,
#                                  save_best_only=True,) 

# hist = model.fit(X_train,
#           {o_list[0]: y_train, o_list[1]: y_train},
#           epochs= para['epochs'],
#           batch_size=para['batch_size'],
#           shuffle=True,
#           callbacks = [es_callback,checkpoint],
#           verbose=para['verbose'],
#           validation_data=(X_valid,(y_valid,y_valid)))


# evaluate

In [None]:
class Topk:
    """Top-K ranking metrics computed per user and averaged over users.

    `R` and `T` in the metric methods are parallel lists with one entry per
    user: `R[i]` is the ranked recommendation list (at most K items) and
    `T[i]` is the list of relevant items for the same user.

    Args:
        K: Cut-off used for the ranking lists and in the metric names.
        test_df: DataFrame read by `evaluate` (columns 'userid', 'feedid',
            'play', 'test_label', 'pred') and by `AUTC` ('userid', 'play', 'pred').
    """

    def __init__(self,K,test_df):
        self.K = K
        self.topk = K
        self.test_df = test_df

    def TOPK(self,R,T):
        """Return a dict with every metric at K, each rounded to 4 decimals."""
        p = 4
        k = str(self.K)
        res = {}
        res['PRECISION@'+k] = round(self.PRECISION(R,T),p)
        res['RECALL@'+k] = round(self.RECALL(R,T),p)
        res['HR@'+k] = round(self.HR(R,T),p)
        res['MAP@'+k] = round(self.MAP(R,T),p)
        res['MRR@'+k] = round(self.MRR(R,T),p)
        res['NDCG@'+k] = round(self.NDCG(R,T),p)
        res['AUTC@'+k] = round(self.AUTC(),p)
        return res

    def evaluate(self):
        """Build per-user ranked / relevant lists from `test_df` and print
        RECALL, MAP, NDCG and AUTC at K.

        Only users with at least one row where `test_label == 1` are evaluated
        for the list-based metrics. Returns None.
        """
        cols = self.test_df[['userid','feedid','play','test_label','pred']]
        top_col = 'top'+str(self.topk)

        # Relevant items per user.
        true_df = (cols[cols.test_label == 1]
                   .groupby(['userid'])
                   .agg({'feedid':lambda x: list(x)})
                   .reset_index()
                   .sort_values(by=['userid']))

        # Rank every item of those users by descending prediction.
        scored = cols[cols.userid.isin(true_df['userid'])]
        scored = scored.sort_values(by=['userid','pred'],ascending=False)
        rank_df = (scored.groupby(['userid'])
                   .agg({'feedid':lambda x: list(x)})
                   .reset_index()
                   .sort_values(by=['userid']))
        # Slicing already copes with lists shorter than K.
        rank_df[top_col] = rank_df['feedid'].apply(lambda x: x[:self.topk])

        assert len(true_df) == len(rank_df)

        df = pd.merge(left=true_df,right=rank_df[['userid',top_col]],on=['userid'])

        assert len(df) == len(df.dropna(how='any'))

        T = df['feedid'].tolist()
        R = df[top_col].tolist()

        res = self.TOPK(R,T)
        print("[%.4f , %.4f ,%.4f,%.4f]" %(res['RECALL@'+str(self.topk)],
                                           res['MAP@'+str(self.topk)],
                                           res['NDCG@'+str(self.topk)],
                                          res['AUTC@'+str(self.topk)]))

    def PRECISION(self,R,T):
        """Mean over users of |R[i] & T[i]| / |R[i]|.

        The denominator is the current user's list length (the previous code
        always used the length of the first user's list). Users with an empty
        recommendation list contribute 0.
        """
        assert len(R) == len(T)
        res = 0
        for rec, tgt in zip(R,T):
            if len(rec) > 0:
                res += len(set(rec)&set(tgt)) / len(rec)
        return res/len(R)

    def RECALL(self,R,T):
        """Mean over users of |R[i] & T[i]| / |T[i]| (0 for users with empty T[i])."""
        assert len(R) == len(T)
        res = 0
        for rec, tgt in zip(R,T):
            if len(tgt) > 0:
                res += len(set(rec)&set(tgt)) / len(tgt)
        return res/len(R)

    def HR(self,R,T):
        """Fraction of users with at least one relevant item in their list."""
        assert len(R) == len(T)
        up = 0
        down = len(R)
        for rec, tgt in zip(R,T):
            if len(set(rec)&set(tgt)) > 0:
                up += 1
        return up / down

    def MAP(self,R,T):
        """Mean over users of (sum of precision at each hit position) / |T[i]|."""
        assert len(R) == len(T)
        up = 0
        down = len(R)
        for rec, tgt in zip(R,T):
            targets = set(tgt)  # constant-time membership checks
            temp = 0
            hit = 0
            for j, item in enumerate(rec):
                if item in targets:
                    hit += 1
                    temp += hit/(j+1)
            if hit > 0:
                up += temp/len(tgt)
        return up / down

    def MRR(self,R,T):
        """Mean over users of 1 / (rank of the first relevant item), 0 if none."""
        assert len(R) == len(T)
        up = 0
        down = len(R)
        for rec, tgt in zip(R,T):
            targets = set(tgt)
            for j, item in enumerate(rec):
                if item in targets:
                    up += 1/(j+1)
                    break
        return up / down

    def dcg(self,hits):
        """Discounted cumulative gain: sum of hits[i] / log2(i + 2)."""
        res = 0
        for i, gain in enumerate(hits):
            res += (gain/np.log2(i+2))
        return res

    def NDCG(self,R,T):
        """Mean over users of DCG(hits) / (DCG of |T[i]| ones + 1).

        NOTE(review): the `+ 1` in the denominator and the use of |T[i]| rather
        than min(K, |T[i]|) differ from the textbook NDCG definition, so a
        perfect ranking scores below 1. Kept unchanged so reported numbers stay
        comparable — confirm whether this is intentional.
        """
        assert len(R) == len(T)
        up = 0
        down = len(R)
        for rec, tgt in zip(R,T):
            targets = set(tgt)
            hits = [1.0 if item in targets else 0.0 for item in rec]
            if sum(hits) > 0:
                # Per the Wikipedia definition, the ideal DCG is computed over the ranked targets.
                ideal = self.dcg([1.0] * len(tgt)) + 1
                up += self.dcg(hits) / ideal
        return up / down

    def AUTC(self):
        """Mean over all users in `test_df` of the top-K share of play time."""
        t = self.test_df.groupby(['userid']).apply(lambda x:self.play_time(x,self.topk))
        return t.sum() / len(t)

    def play_time(self,x,topk):
        """Share of a user's total 'play' captured by the `topk` highest-'pred' rows.

        Returns 0.0 when the user's total play time is zero, so the division
        never produces NaN / a divide-by-zero warning.
        """
        sum_time = x['play'].sum()
        if sum_time == 0:
            return 0.0
        top_rows = x.sort_values(by=['pred'],ascending=False)[:topk]
        return top_rows['play'].sum() / sum_time

In [None]:
# model.load_weights('./参数/alpha/sum/sum_0.1_wo_kl_0.8_fp.h5')
# Score the test split; `pred` holds one array per model output (fused, bias-only).
# NOTE(review): in the visible notebook both the model.fit call and the
# load_weights line are commented out, so unless weights are loaded elsewhere
# these predictions come from freshly initialised weights.
pred = model.predict(X_test)
# These assignments add columns to `test` in place; later cells depend on them.
test['base'] = pred[0]
test['bias'] = pred[1]
# Ranking score: fused output minus the bias-branch output.
test['pred'] = test['base'] - test['bias']
test['preds'] = tf.math.sigmoid(test['pred'])

# Print RECALL / MAP / NDCG / AUTC at K=3 and K=5.
Topk(3,test).evaluate()
Topk(5,test).evaluate()

In [None]:
# Mean of the sigmoid-transformed score ('preds') for each duration level.
test.groupby(['duration_level'])['preds'].mean().tolist()

In [None]:
# Count how often each duration level appears among every user's top-k feeds.
temp=test.sort_values(by=['userid','pred'],ascending=False)
# One duration level per feed id.
maps = test[['feedid','duration_level']].drop_duplicates(['feedid'])

topk=5
rank_df = temp.groupby(['userid']).agg({'feedid':lambda x: list(x)}).reset_index().sort_values(by=['userid'])
# Always translate feed ids to duration levels. The previous fallback for users
# with fewer than `topk` feeds stored the raw feed ids, which were then counted
# as if they were duration levels. Slicing already copes with short lists.
rank_df['top'+str(topk)] = rank_df['feedid'].apply(
    lambda x: maps[maps.feedid.isin(x[:topk])]['duration_level'].tolist())

# Read the column through `topk` rather than the hard-coded 'top5'.
x = rank_df['top'+str(topk)].tolist()
temp = [level for levels in x for level in levels]
# Frequency of duration levels 0..5.
fre = [temp.count(i) for i in range(6)]
fre