In [2]:
!pip install keras

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Collecting keras
  Downloading http://mirrors.tencentyun.com/pypi/packages/ad/fd/6bfe87920d7f4fd475acd28500a42482b6b84479832bdc0fe9e589a60ceb/Keras-2.3.1-py2.py3-none-any.whl (377 kB)
[K     |████████████████████████████████| 377 kB 835 kB/s eta 0:00:01     |██████████████████████████      | 307 kB 835 kB/s eta 0:00:01
Installing collected packages: keras
Successfully installed keras-2.3.1


In [6]:
from keras.layers import Dense, Input, LSTM, Embedding, BatchNormalization,Bidirectional
import os
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd
import numpy as np
import keras
import h5py
from keras.optimizers import RMSprop


def get_filename_for_saving(save_dir):
    """Return the checkpoint path template located under ``save_dir``.

    The ``{val_loss}``, ``{val_accuracy}``, ``{epoch}``, ``{loss}`` and
    ``{accuracy}`` placeholders are returned unformatted; this notebook hands
    the result to ``keras.callbacks.ModelCheckpoint`` as its ``filepath``.

    Args:
        save_dir: Directory prefix; an empty string yields the bare file name.

    Returns:
        ``save_dir`` joined with the checkpoint file-name template.
    """
    template = "lstm_comb_age_{val_loss:.3f}-{val_accuracy:.3f}-{epoch:03d}-{loss:.3f}-{accuracy:.3f}.hdf5"
    checkpoint_path = os.path.join(save_dir, template)
    return checkpoint_path


def buid_model(vocab_sizes=(3812203, 62966, 4445721),
               seq_len=200,
               embedding_dim=15,
               lstm_units=10,
               num_classes=10,
               learning_rate=0.01):
    """Build and compile the multi-input bidirectional-LSTM classifier.

    One integer-sequence input is created per entry of ``vocab_sizes``. Each
    input is embedded separately (``mask_zero=True``, so id 0 acts as the
    padding/mask value), the embeddings are concatenated along the feature
    axis, fed through a single bidirectional LSTM and classified by a softmax
    layer. Called with no arguments, the function builds exactly the network
    the notebook originally hard-coded.

    Args:
        vocab_sizes: ``input_dim`` of each embedding, i.e. largest token id + 1.
            The i-th entry (1-based) produces an input named ``main_input<i>``.
        seq_len: Length of every input sequence.
        embedding_dim: Output width of each embedding layer.
        lstm_units: Units per direction of the bidirectional LSTM.
        num_classes: Size of the softmax output (named ``main_output``).
        learning_rate: Learning rate passed to RMSprop.

    Returns:
        A compiled ``keras.models.Model`` with categorical cross-entropy loss
        and an accuracy metric.

    Raises:
        ValueError: If ``vocab_sizes`` is empty.
    """
    print('lstm model start...\n')

    vocab_sizes = tuple(vocab_sizes)
    if not vocab_sizes:
        raise ValueError('vocab_sizes must contain at least one vocabulary size')

    # One (input, embedding) pair per vocabulary, created in order so the
    # auto-generated layer names match the original hand-written version.
    inputs = []
    embedded = []
    for idx, vocab_size in enumerate(vocab_sizes, start=1):
        seq_input = Input(shape=(seq_len,), name='main_input%d' % idx, dtype='int32')
        inputs.append(seq_input)
        embedded.append(
            Embedding(output_dim=embedding_dim, input_dim=vocab_size,
                      input_length=seq_len, mask_zero=True)(seq_input))

    # concatenate() rejects a single-element list, so pass a lone embedding through.
    emb = embedded[0] if len(embedded) == 1 else keras.layers.concatenate(embedded)

    lstm_out = Bidirectional(LSTM(lstm_units, activation='softsign'))(emb)

    # Main (and only) output: class probabilities.
    main_output = Dense(num_classes, activation='softmax', name='main_output')(lstm_out)

    # The input order here must match the order/names used when calling fit().
    model = keras.models.Model(inputs=inputs, outputs=[main_output])

    # `lr` (not `learning_rate`) is kept as the keyword the notebook already uses.
    opt = RMSprop(lr=learning_rate, clipnorm=1.0)
    model.compile(optimizer=opt,
                  loss={'main_output': 'categorical_crossentropy'},
                  metrics=['accuracy'])

    print('lstm model geted...\n')
    return model


def data_load():
    """Load the ad_id sequences and the user labels, then split them 80/20.

    Reads the ``word_data`` dataset from ``lstm_model_ad_id/word_train_ad.h5``
    and the label table from ``../train_preliminary/user.csv`` (sorted by
    ``user_id``), and splits both together with ``test_size=0.2`` and
    ``random_state=2020``.

    NOTE(review): pairing rows of ``word_data`` with the sorted label table
    assumes the HDF5 rows are stored in ascending ``user_id`` order - confirm
    against the code that wrote the file.

    Returns:
        Tuple ``(train_x, test_x, train_y_age, train_y_gender, test_y_age,
        test_y_gender)``. The age labels are one-hot encoded over 10 classes
        after subtracting 1; the gender labels are the raw column minus 1.

    Raises:
        KeyError: If the HDF5 file has no ``word_data`` dataset.
    """
    print('loading data ... \n')

    with h5py.File('lstm_model_ad_id/word_train_ad.h5', 'r') as f:
        # Index rather than .get(): a missing dataset raises KeyError here
        # instead of silently turning into np.array(None).
        data = np.array(f['word_data'])

    label = pd.read_csv('../train_preliminary/user.csv').sort_values(by=['user_id'])

    train_x, test_x, train_y, test_y = train_test_split(data, label, test_size=0.2, random_state=2020)

    # Subtract 1 so class ids start at 0 before one-hot encoding
    # (presumably the CSV stores 1-based ids - verify).
    train_y_age = keras.utils.to_categorical(train_y['age'].values - 1, num_classes=10)
    train_y_gender = train_y['gender'].values - 1

    test_y_age = keras.utils.to_categorical(test_y['age'].values - 1, num_classes=10)
    test_y_gender = test_y['gender'].values - 1

    print('get data ... \n')

    return train_x, test_x, train_y_age, train_y_gender, test_y_age, test_y_gender

def _load_and_split_sequences(h5_path):
    """Read the ``word_data`` dataset from ``h5_path`` and split it 80/20.

    Uses the same ``test_size`` and ``random_state`` as ``data_load``.
    NOTE(review): the resulting rows only line up with ``data_load``'s split if
    every file holds the same number of rows in the same order - confirm.

    Raises:
        KeyError: If the file has no ``word_data`` dataset.
    """
    with h5py.File(h5_path, 'r') as f:
        # Index rather than .get() so a missing dataset fails loudly here
        # instead of becoming np.array(None).
        data = np.array(f['word_data'])

    train_x, test_x = train_test_split(data, test_size=0.2, random_state=2020)
    return train_x, test_x


def load_data2():
    """Return ``(train_x, test_x)`` for the advertiser_id sequence file."""
    return _load_and_split_sequences('lstm_model_advertiser_id/word_train_advertiser_id.h5')


def load_data3():
    """Return ``(train_x, test_x)`` for the creative_id sequence file."""
    return _load_and_split_sequences('lstm_model_creative_id/word_train_creative_id.h5')

In [4]:
# Build and compile the LSTM classifier; `model` is reused by the summary and
# training cells below.
model = buid_model()

lstm model start...

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
lstm model geted...



In [14]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 200)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 200, 15)           944490    
_________________________________________________________________
lstm_2 (LSTM)                (None, 10)                1040      
_________________________________________________________________
main_output (Dense)          (None, 10)                110       
Total params: 945,640
Trainable params: 945,640
Non-trainable params: 0
_________________________________________________________________
None


In [5]:
# First model input (read from the ad_id HDF5 file) together with the age and
# gender labels, already split into train/test parts.
train_x, test_x, train_y_age, train_y_gender,test_y_age,test_y_gender = data_load()

# Second model input, read from the advertiser_id HDF5 file.
train_x2, test_x2 = load_data2()

# Third model input, read from the creative_id HDF5 file.
train_x3, test_x3 = load_data3()

loading data ... 

get data ... 



In [7]:
print('lstm model fit...\n')

# Callbacks: shrink the learning rate 10x after 2 stagnant epochs (floor 1e-4),
# stop after 8 stagnant epochs, and write a checkpoint on every epoch into the
# current directory. Each callback watches Keras' default monitored quantity.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    factor=0.1,
    patience=2,
    min_lr=0.0001,
)
stopping = keras.callbacks.EarlyStopping(patience=8)
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=get_filename_for_saving(''),
    save_best_only=False,
)

# Inputs and targets are passed by layer name, so each key must match a name
# defined in buid_model().
model.fit(
    {
        'main_input1': train_x,
        'main_input2': train_x2,
        'main_input3': train_x3,
    },
    {'main_output': train_y_age},
    epochs=100,
    batch_size=256,
    validation_data=(
        {
            'main_input1': test_x,
            'main_input2': test_x2,
            'main_input3': test_x3,
        },
        {'main_output': test_y_age},
    ),
    callbacks=[checkpointer, reduce_lr, stopping],
)

lstm model fit...


Train on 720000 samples, validate on 180000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
  3840/720000 [..............................] - ETA: 1:06:21 - loss: 0.5666 - accuracy: 0.8156

KeyboardInterrupt: 

In [None]:



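# NOTE: stale snippet kept for reference only. It predates the three-input
# model: `model.predict` would need test_x, test_x2 and test_x3, and `test_y`
# is not a notebook-level name (the one-hot age labels are `test_y_age`).
# pre = model.predict(test_x,verbose=1)

# # Evaluate the results
# from sklearn.metrics import confusion_matrix, classification_report
# y_ = np.reshape(np.argmax(test_y,axis=1),[-1])
# pre_ = np.reshape(np.argmax(pre, axis=1),[-1])
# # Per-class metrics
# cm = confusion_matrix(y_, pre_)
# # np.set_printoptions(precision=3)
# cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# print(cm_normalized)
# print(classification_report(y_, pre_))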