In [1]:
!pip install keras==2.2.4

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Collecting keras==2.2.4
  Downloading http://mirrors.tencentyun.com/pypi/packages/5e/10/aa32dad071ce52b5502266b5c659451cfd6ffcbf14e6c8c4f16c0ff5aaab/Keras-2.2.4-py2.py3-none-any.whl (312 kB)
[K     |████████████████████████████████| 312 kB 824 kB/s eta 0:00:01
Installing collected packages: keras
Successfully installed keras-2.2.4


In [1]:
from keras.models import Model
from keras.optimizers import SGD,Adam,RMSprop
# from keras.layers import Dense, Input, LSTM, Embedding,Dropout,Bidirectional,Flatten
from keras.layers import *
import os

# from __future__ import print_function
from keras import backend as K
from keras.engine.topology import Layer
import h5py

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd
import numpy as np
import keras

Using TensorFlow backend.


In [2]:
# Position_Embedding
#! -*- coding: utf-8 -*-
#%%

class Position_Embedding(Layer):
 
    def __init__(self, size=None, mode='sum', **kwargs):
        self.size = size #必须为偶数
        self.mode = mode
        super(Position_Embedding, self).__init__(**kwargs)
 
    def call(self, x):
        if (self.size == None) or (self.mode == 'sum'):
            self.size = int(x.shape[-1])
        batch_size,seq_len = K.shape(x)[0],K.shape(x)[1]
        position_j = 1. / K.pow(10000., \
                                 2 * K.arange(self.size / 2, dtype='float32' \
                               ) / self.size)
        position_j = K.expand_dims(position_j, 0)
        position_i = K.cumsum(K.ones_like(x[:,:,0]), 1)-1 #K.arange不支持变长，只好用这种方法生成
        position_i = K.expand_dims(position_i, 2)
        position_ij = K.dot(position_i, position_j)
        position_ij = K.concatenate([K.cos(position_ij), K.sin(position_ij)], 2)
        if self.mode == 'sum':
            return position_ij + x
        elif self.mode == 'concat':
            return K.concatenate([position_ij, x], 2)
 
    def compute_output_shape(self, input_shape):
        if self.mode == 'sum':
            return input_shape
        elif self.mode == 'concat':
            return (input_shape[0], input_shape[1], input_shape[2]+self.size)

In [3]:
# attention

class Attention(Layer):
 
    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.output_dim = nb_head*size_per_head
        super(Attention, self).__init__(**kwargs)
 
    def build(self, input_shape):
        self.WQ = self.add_weight(name='WQ',
                                  shape=(input_shape[0][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WK = self.add_weight(name='WK',
                                  shape=(input_shape[1][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WV = self.add_weight(name='WV',
                                  shape=(input_shape[2][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        super(Attention, self).build(input_shape)
 
    def Mask(self, inputs, seq_len, mode='mul'):
        if seq_len == None:
            return inputs
        else:
            mask = K.one_hot(seq_len[:,0], K.shape(inputs)[1])
            mask = 1 - K.cumsum(mask, 1)
            for _ in range(len(inputs.shape)-2):
                mask = K.expand_dims(mask, 2)
            if mode == 'mul':
                return inputs * mask
            if mode == 'add':
                return inputs - (1 - mask) * 1e12
 
    def call(self, x):
        #如果只传入Q_seq,K_seq,V_seq，那么就不做Mask
        #如果同时传入Q_seq,K_seq,V_seq,Q_len,V_len，那么对多余部分做Mask
        if len(x) == 3:
            Q_seq,K_seq,V_seq = x
            Q_len,V_len = None,None
        elif len(x) == 5:
            Q_seq,K_seq,V_seq,Q_len,V_len = x
        #对Q、K、V做线性变换
        Q_seq = K.dot(Q_seq, self.WQ)
        Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
        Q_seq = K.permute_dimensions(Q_seq, (0,2,1,3))
        K_seq = K.dot(K_seq, self.WK)
        K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
        K_seq = K.permute_dimensions(K_seq, (0,2,1,3))
        V_seq = K.dot(V_seq, self.WV)
        V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
        V_seq = K.permute_dimensions(V_seq, (0,2,1,3))
        #计算内积，然后mask，然后softmax
        A = K.batch_dot(Q_seq, K_seq, axes=[3,3]) / self.size_per_head**0.5
        A = K.permute_dimensions(A, (0,3,2,1))
        A = self.Mask(A, V_len, 'add')
        A = K.permute_dimensions(A, (0,3,2,1))
        A = K.softmax(A)
        #输出并mask
        O_seq = K.batch_dot(A, V_seq, axes=[3,2])
        O_seq = K.permute_dimensions(O_seq, (0,2,1,3))
        O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, 'mul')
        return O_seq
 
    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.output_dim)

In [4]:

def buid_model():
    # LSTM 模型
    print('lstm model start...\n')

    # 标题输入：接收一个含有 200 个整数的序列，每个整数在 1 到 3812202 之间。
    S_inputs1 = Input(shape=(200,),name='main_input1',dtype='int32')
    with h5py.File('../../get_w2v_feat/w2v1_pre_ad/embeddings_matrix_ad_size_32_w200_count_0.h5','r') as f:
        embeddings_matrix1 = np.array(f.get('embeddings_matrix'))
    embeddings1 = Embedding(input_dim = len(embeddings_matrix1), # 字典长度
                                output_dim = 32, # 词向量 长度（32）
                                weights=[embeddings_matrix1], # 重点：预训练的词向量系数
                                input_length=200, # 每句话的 最大长度（必须padding） 
                                trainable=True, # 是否在 训练的过程中 更新词向量
                                mask_zero = False)(S_inputs1)

    S_inputs2 = Input(shape=(200,),name='main_input2',dtype='int32')
    with h5py.File('../../get_w2v_feat/w2v1_pre_advertiser/embeddings_matrix_advertiser_size_32_w200_count_5.h5','r') as f:
        embeddings_matrix2 = np.array(f.get('embeddings_matrix'))
    embeddings2 = Embedding(input_dim = len(embeddings_matrix2), # 字典长度
                                output_dim = 32, # 词向量 长度（32）
                                weights=[embeddings_matrix2], # 重点：预训练的词向量系数
                                input_length=200, # 每句话的 最大长度（必须padding） 
                                trainable=True, # 是否在 训练的过程中 更新词向量
                                mask_zero = False)(S_inputs2)
    
    S_inputs3 = Input(shape=(200,),name='main_input3',dtype='int32')
    with h5py.File('../../get_w2v_feat/w2v1_pre_creative_id/embeddings_matrix_creative_size_32_w200_count_5.h5','r') as f:
        embeddings_matrix3 = np.array(f.get('embeddings_matrix'))
    embeddings3 = Embedding(input_dim = len(embeddings_matrix3), # 字典长度
                                output_dim = 32, # 词向量 长度（32）
                                weights=[embeddings_matrix3], # 重点：预训练的词向量系数
                                input_length=200, # 每句话的 最大长度（必须padding） 
                                trainable=True, # 是否在 训练的过程中 更新词向量
                                mask_zero = False)(S_inputs3)
    
    emb = keras.layers.concatenate([embeddings1, embeddings2, embeddings3])
    O_seq = Attention(8,4)([emb,emb,emb])
    O_seq = GlobalAveragePooling1D()(O_seq)
    outputs = Dense(10, activation='softmax', name='main_output')(O_seq)

    # 定义一个具有两个输入输出的模型
    model = keras.models.Model(inputs=[S_inputs1,S_inputs2,S_inputs3],#,auxiliary_input],
                               outputs=[outputs])  # 这里的输入输出顺序与fit时一致就好
#     opt = RMSprop(lr=0.01,  clipnorm=1.0)
    opt = Adam(lr=0.005)
    model.compile(optimizer=opt,
                  sample_weight_mode='None',#"temporal",
                  loss={'main_output': 'categorical_crossentropy'},
                 metrics=['accuracy'])
    print(model.summary())
    return model

def data_load():
    print('loading data ... \n')

    with h5py.File('../../get_w2v_feat/w2v1_pre_ad/word_train_ad_w2v_w200.h5', 'r') as f:
        data = np.array(f.get('word_data'))
        
    label = pd.read_csv('../../train_preliminary/user.csv').sort_values(by=['user_id'])
    
    y_age0 = label['age'].values - 1
    y_age = keras.utils.np_utils.to_categorical(y_age0, num_classes=10)
    y_gender = label['gender'].values - 1
    
    print('get data ... \n')

    return data,y_age,y_age0,y_gender

def load_data2():
    
    with h5py.File('../../get_w2v_feat/w2v1_pre_advertiser/word_train_advertiser_w2v_w200.h5', 'r') as f:
        data = np.array(f.get('word_data'))
    return data

def load_data3():
    
    with h5py.File('../../get_w2v_feat/w2v1_pre_creative_id/word_train_creative_w2v_w200.h5', 'r') as f:
        data = np.array(f.get('word_data'))
    return data



In [5]:
data1,y_age,y_age0,y_gender = data_load()
data2 = load_data2()
data3 = load_data3()

def get_filename_for_saving(save_dir):
    return os.path.join(save_dir,
                        "cross_multi_attention_aver_age_adm_w200_{val_loss:.3f}-{val_acc:.3f}-{epoch:03d}-{loss:.3f}-{acc:.3f}.hdf5")

print('lstm model fit...\n')
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=get_filename_for_saving(''),
    save_best_only=False)
stopping = keras.callbacks.EarlyStopping(patience=8)
reduce_lr = keras.callbacks.ReduceLROnPlateau(factor=0.1, patience=2, min_lr=0.0001)


from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
cvscores = []
for train, test in kfold.split(data1,y_age0):
    
    model = buid_model()
    print('lstm model geted...\n')
    
    # Fit the model
    model.fit({'main_input1': data1[train],'main_input2': data2[train],'main_input3': data3[train] },#,'aux_input': train_x_sta},
              {'main_output': y_age[train]},
              epochs=1, 
              batch_size=256,
              validation_data=({'main_input1': data1[test],'main_input2': data2[test],'main_input3': data3[test]},
                {'main_output': y_age[test]}),
              callbacks=[checkpointer, reduce_lr, stopping])
    # evaluate the model
#     scores = model.evaluate(X[test], Y[test], verbose=0)
#     print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
#     cvscores.append(scores[1] * 100)
# print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))

loading data ... 

get data ... 

lstm model fit...

lstm model start...







__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input1 (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
main_input2 (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
main_input3 (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 32)      32319296    main_input1[0][0]                
_____________________________

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



lstm model start...

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input1 (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
main_input2 (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
main_input3 (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 200, 32)      32319296    main_input1[0][0]                
________________________________________________________________________________________

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [6]:

def buid_model_gender():
    # LSTM 模型
    print('lstm model start...\n')

    # 标题输入：接收一个含有 200 个整数的序列，每个整数在 1 到 3812202 之间。
    S_inputs1 = Input(shape=(200,),name='main_input1',dtype='int32')
    with h5py.File('../../get_w2v_feat/w2v1_pre_ad/embeddings_matrix_ad_size_32_w200_count_0.h5','r') as f:
        embeddings_matrix1 = np.array(f.get('embeddings_matrix'))
    embeddings1 = Embedding(input_dim = len(embeddings_matrix1), # 字典长度
                                output_dim = 32, # 词向量 长度（32）
                                weights=[embeddings_matrix1], # 重点：预训练的词向量系数
                                input_length=200, # 每句话的 最大长度（必须padding） 
                                trainable=True, # 是否在 训练的过程中 更新词向量
                                mask_zero = False)(S_inputs1)

    S_inputs2 = Input(shape=(200,),name='main_input2',dtype='int32')
    with h5py.File('../../get_w2v_feat/w2v1_pre_advertiser/embeddings_matrix_advertiser_size_32_w200_count_5.h5','r') as f:
        embeddings_matrix2 = np.array(f.get('embeddings_matrix'))
    embeddings2 = Embedding(input_dim = len(embeddings_matrix2), # 字典长度
                                output_dim = 32, # 词向量 长度（32）
                                weights=[embeddings_matrix2], # 重点：预训练的词向量系数
                                input_length=200, # 每句话的 最大长度（必须padding） 
                                trainable=True, # 是否在 训练的过程中 更新词向量
                                mask_zero = False)(S_inputs2)
    
    S_inputs3 = Input(shape=(200,),name='main_input3',dtype='int32')
    with h5py.File('../../get_w2v_feat/w2v1_pre_creative_id/embeddings_matrix_creative_size_32_w200_count_5.h5','r') as f:
        embeddings_matrix3 = np.array(f.get('embeddings_matrix'))
    embeddings3 = Embedding(input_dim = len(embeddings_matrix3), # 字典长度
                                output_dim = 32, # 词向量 长度（32）
                                weights=[embeddings_matrix3], # 重点：预训练的词向量系数
                                input_length=200, # 每句话的 最大长度（必须padding） 
                                trainable=True, # 是否在 训练的过程中 更新词向量
                                mask_zero = False)(S_inputs3)
    
    S_inputs4 = Input(shape=(200,),name='main_input4',dtype='int32')
    with h5py.File('../../get_w2v_feat/w2v1_pre_industry/embeddings_matrix_industry_size_32_w200_count_5.h5','r') as f:
        embeddings_matrix4 = np.array(f.get('embeddings_matrix'))
    embeddings4 = Embedding(input_dim = len(embeddings_matrix4), # 字典长度
                                output_dim = 32, # 词向量 长度（32）
                                weights=[embeddings_matrix4], # 重点：预训练的词向量系数
                                input_length=200, # 每句话的 最大长度（必须padding） 
                                trainable=True, # 是否在 训练的过程中 更新词向量
                                mask_zero = False)(S_inputs4)
    S_inputs5 = Input(shape=(200,),name='main_input5',dtype='int32')
    with h5py.File('../../get_w2v_feat/w2v1_pre_product_id/embeddings_matrix_product_id_size_32_w200_count_5.h5','r') as f:
        embeddings_matrix5 = np.array(f.get('embeddings_matrix'))
    embeddings5 = Embedding(input_dim = len(embeddings_matrix5), # 字典长度
                                output_dim = 32, # 词向量 长度（32）
                                weights=[embeddings_matrix5], # 重点：预训练的词向量系数
                                input_length=200, # 每句话的 最大长度（必须padding） 
                                trainable=True, # 是否在 训练的过程中 更新词向量
                                mask_zero = False)(S_inputs5)

    
#     emb = keras.layers.concatenate([O_seq1, O_seq2, O_seq3])
    emb = keras.layers.concatenate([embeddings1, embeddings2, embeddings3, embeddings4, embeddings5])

    O_seq = Attention(8,4)([emb,emb,emb])
#     O_seq = Attention(8,8)([O_seq,O_seq,O_seq])

    O_seq = GlobalAveragePooling1D()(O_seq)

#     O_seq = Dropout(0.5)(O_seq)#尽量不要用

#     lstm_out = Bidirectional(LSTM(10,activation='softsign',return_sequences=True))(O_seq)
#     lstm_out = GlobalAveragePooling1D()(lstm_out)

    outputs = Dense(1, activation='sigmoid', name='main_output')(O_seq)

    # 定义一个具有两个输入输出的模型
    model = keras.models.Model(inputs=[S_inputs1,S_inputs2,S_inputs3,S_inputs4,S_inputs5],#,auxiliary_input],
                               outputs=[outputs])  # 这里的输入输出顺序与fit时一致就好
#     opt = RMSprop(lr=0.01,  clipnorm=1.0)
    opt = Adam(lr=0.005)
    model.compile(optimizer=opt,
                  sample_weight_mode='None',#"temporal",
                  loss={'main_output': 'binary_crossentropy'},
                 metrics=['accuracy'])
    print(model.summary())
    return model


def data_load():
    print('loading data ... \n')

    with h5py.File('../../get_w2v_feat/w2v1_pre_ad/word_train_ad_w2v_w200.h5', 'r') as f:
        data = np.array(f.get('word_data'))
        
    label = pd.read_csv('../../train_preliminary/user.csv').sort_values(by=['user_id'])
    
    y_age = label['age'].values - 1
    y_age = keras.utils.np_utils.to_categorical(y_age, num_classes=10)
    y_gender = label['gender'].values - 1
    
    print('get data ... \n')

    return data,y_age,y_gender

def load_data2():
    
    with h5py.File('../../get_w2v_feat/w2v1_pre_advertiser/word_train_advertiser_w2v_w200.h5', 'r') as f:
        data = np.array(f.get('word_data'))
    return data

def load_data3():
    
    with h5py.File('../../get_w2v_feat/w2v1_pre_creative_id/word_train_creative_w2v_w200.h5', 'r') as f:
        data = np.array(f.get('word_data'))
    return data

def load_data4():
    
    print('loading data1 ... \n')
    with h5py.File('../../get_w2v_feat/w2v1_pre_industry/word_train_industry_w2v_w200.h5', 'r') as f:
        data = np.array(f.get('word_data'))
    return data

def load_data5():
    
    print('loading data1 ... \n')
    with h5py.File('../../get_w2v_feat/w2v1_pre_product_id/word_train_product_id_w2v_w200.h5', 'r') as f:
        data = np.array(f.get('word_data'))
    return data



In [None]:
data1,y_age,y_gender = data_load()
data2 = load_data2()
data3 = load_data3()
data4 = load_data4()
data5 = load_data5()

def get_filename_for_saving(save_dir):
    return os.path.join(save_dir,
                        "cross_multi_attention_aver_gender_adm_w200_{val_loss:.3f}-{val_acc:.3f}-{epoch:03d}-{loss:.3f}-{acc:.3f}.hdf5")

print('lstm model fit...\n')
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=get_filename_for_saving(''),
    save_best_only=False)
stopping = keras.callbacks.EarlyStopping(patience=8)
reduce_lr = keras.callbacks.ReduceLROnPlateau(factor=0.1, patience=2, min_lr=0.0001)


from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
cvscores = []
for train, test in kfold.split(data1,y_gender):
    
    model = buid_model_gender()
    print('lstm model geted...\n')
    
    # Fit the model
    model.fit({'main_input1': data1[train],'main_input2': data2[train],'main_input3': data3[train],
              'main_input4': data4[train],'main_input5': data5[train]},#,'aux_input': train_x_sta},
              {'main_output': y_gender[train]},
              epochs=1, 
              batch_size=256,
              validation_data=({'main_input1': data1[test],'main_input2': data2[test],'main_input3': data3[test],
                'main_input4': data4[test],'main_input5': data5[test]},#,'aux_input': train_x_sta},
                {'main_output': y_gender[test]}),
              callbacks=[checkpointer, reduce_lr, stopping])
    # evaluate the model
#     scores = model.evaluate(X[test], Y[test], verbose=0)
#     print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
#     cvscores.append(scores[1] * 100)
# print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))

loading data ... 

get data ... 

loading data1 ... 

loading data1 ... 

lstm model fit...

lstm model start...

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input1 (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
main_input2 (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
main_input3 (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
main_input4 (InputLayer)        (None, 200)          0                                        

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



lstm model start...

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input1 (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
main_input2 (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
main_input3 (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
main_input4 (InputLayer)        (None, 200)          0                                            
________________________________________________________________________________________