请根据要训练的模型,自行修改下方代码中的模型配置路径(config_path、checkpoint_path、dict_path)

6类模型:
1. biobert v1.1 * 3
2. biobert v1.0 pubmed_pmc * 2
3. biobert_dish * 1
4. biobert_v1.0 pubmed
5. biobert_v1.0 pmc
6. scibert

In [None]:
#coding:utf-8
from keras.callbacks import *
import numpy as np
from tqdm import tqdm
import time
import logging
from sklearn.model_selection import StratifiedKFold
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
from keras.optimizers import Adam
import keras.backend.tensorflow_backend as KTF
import tensorflow as tf
import pandas as pd
from keras.utils.np_utils import to_categorical
from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score
from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.layers import Input, Embedding, LSTM, Dense

In [None]:
# Global run configuration: pre-trained checkpoint location, sequence length,
# number of classes, TensorFlow session and the WordPiece tokenizer.

# Files of the pre-trained checkpoint to fine-tune (swap these three paths to
# train a different model).
config_path = './biobert_v1.1_pubmed/bert_config.json'
checkpoint_path = './biobert_v1.1_pubmed/bert_model.ckpt'
dict_path = './biobert_v1.1_pubmed/vocab.txt'

# Length passed to the tokenizer as `max_len` (224 was tried previously).
MAX_LEN = 300
# Width of the softmax output layer / one-hot label vectors.
n_class = 2

# Allocate GPU memory on demand instead of reserving all of it up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
# Make Keras use this session.
KTF.set_session(session)

# Map every vocabulary entry (one per line) to an id. The id is the size of
# the dictionary at the moment the line is read.
token_dict = {}
with open(dict_path, 'r', encoding='utf-8') as vocab_file:
    for vocab_line in vocab_file:
        token_dict[vocab_line.strip()] = len(token_dict)

# NOTE(review): cased=True presumably keeps the input casing to match a cased
# vocabulary -- confirm against the keras-bert Tokenizer documentation.
tokenizer = Tokenizer(token_dict, cased=True)

class data_generator:
    """Endless batch generator that tokenizes text pairs for a two-input model.

    Args:
        data: Three equal-length indexable sequences
            ``[first_texts, second_texts, labels]``.
        batch_size: Maximum number of samples per yielded batch.

    ``len(generator)`` is the number of batches needed to cover the data once
    (the last batch may be smaller). Iterating yields
    ``([T, T_], Y)`` forever, walking the data in its original order on every
    pass. ``T`` and ``T_`` are the two arrays returned by
    ``tokenizer.encode`` and ``Y`` holds the matching labels.

    Relies on the module-level ``tokenizer`` and ``MAX_LEN``.
    """

    def __init__(self, data, batch_size=16):
        self.data = data
        self.batch_size = batch_size
        # Ceiling division: a trailing partial batch counts as one more step.
        full_batches, remainder = divmod(len(self.data[0]), self.batch_size)
        self.steps = full_batches + (1 if remainder else 0)

    def __len__(self):
        """Return the number of batches in one pass over the data."""
        return self.steps

    def __iter__(self):
        """Yield ``([T, T_], Y)`` batches, restarting after each full pass."""
        while True:
            X1, X2, y = self.data
            n_samples = len(X1)
            if n_samples == 0:
                # Nothing to yield: stop instead of spinning forever in the
                # enclosing `while True` without ever producing a batch.
                return
            T, T_, Y = [], [], []
            for i in range(n_samples):
                # Encode the pair as one sequence of length MAX_LEN.
                t, t_ = tokenizer.encode(first=X1[i], second=X2[i], max_len=MAX_LEN)
                T.append(t)
                T_.append(t_)
                Y.append(y[i])
                # Flush on a full batch, or on the final sample so the
                # trailing partial batch is not lost.
                if len(T) == self.batch_size or i == n_samples - 1:
                    yield [np.array(T), np.array(T_)], np.array(Y)
                    T, T_, Y = [], [], []

def get_model():
    """Build and compile the fine-tuning classifier on top of the BERT checkpoint.

    Loads the pre-trained model from the module-level ``config_path`` /
    ``checkpoint_path``, marks all of its layers trainable, and feeds the
    vector at sequence position 0 into a softmax layer with ``n_class`` units.

    Returns:
        A compiled Keras ``Model`` taking two inputs (the two arrays produced
        by the tokenizer) and returning class probabilities.
    """
    bert = load_trained_model_from_checkpoint(config_path, checkpoint_path)
    # Fine-tune the whole encoder, not just the new classification head.
    for layer in bert.layers:
        layer.trainable = True

    first_input = Input(shape=(None,))
    second_input = Input(shape=(None,))

    encoded = bert([first_input, second_input])
    # Keep only position 0 of every sequence (presumably the [CLS] token --
    # confirm against the tokenizer's output layout).
    pooled = Lambda(lambda x: x[:, 0])(encoded)
    probabilities = Dense(n_class, activation='softmax')(pooled)

    classifier = Model([first_input, second_input], probabilities)
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        metrics=['accuracy'],
    )
    return classifier

# Load the training and test tables and prepare labels, CV splitter and
# prediction buffers.
train = pd.read_csv('./new_data/train_data3_bm2540_tfidf20_6.csv')

# `test` is required below. Every load line used to be commented out, which
# made a fresh run fail with NameError. The first candidate is restored here;
# NOTE(review): confirm this is the intended file, or switch to one of the
# alternatives.
test = pd.read_csv('./new_data/stage2_test_data3_bm2580_tfidf20_8.csv')
# test = pd.read_csv('./new_data/stage2_test_data3_bm2590_tfidf20_10.csv')
# test = pd.read_csv('./new_data/stage2_test_data3_bm25100_tfidf20_11.csv')

# test = test[:256]  # uncomment to run on a small slice

train_achievements = train['text'].values
train_requirements = train['text_b'].values

labels = train['label'].astype(int).values
# One-hot labels for the softmax / categorical cross-entropy head.
labels_cat = to_categorical(labels, n_class).astype(np.int32)

test_achievements = test['text'].values
test_requirements = test['text_b'].values
# The test set has no labels; all-zero-class placeholders keep the generator
# interface uniform.
test_cat = to_categorical([0] * len(test_achievements), n_class)

n_flods = 5

skf = StratifiedKFold(n_splits=n_flods, shuffle=True, random_state=256)

# Out-of-fold buffers, one probability row per sample.
oof_train = np.zeros((len(train), labels_cat.shape[1]), dtype=np.float32)
oof_test = np.zeros((len(test), labels_cat.shape[1]), dtype=np.float32)

oof_test_list = []

submit_D = data_generator([test_achievements, test_requirements, test_cat], batch_size=512)

import os

# Per-fold test predictions are written into this directory. Create it up
# front so a missing directory cannot fail the run after a fold has trained.
os.makedirs('tmp', exist_ok=True)

# Stratified K-fold fine-tuning: train one model per fold, reload its best
# checkpoint, predict the test set, and average the predictions into oof_test.
for fold, (train_index, valid_index) in enumerate(skf.split(train_achievements, labels)):
    # Manual knob: tighten this condition to re-run only selected folds.
    if fold >= 0:
        # Drop the previous fold's graph before building a new model.
        K.clear_session()
        x1 = train_achievements[train_index]
        x2 = train_requirements[train_index]
        y = labels_cat[train_index]

        val_x1 = train_achievements[valid_index]
        val_x2 = train_requirements[valid_index]
        val_y = labels[valid_index]
        val_cat = labels_cat[valid_index]

        train_D = data_generator([x1, x2, y], batch_size=8)
        valid_D = data_generator([val_x1, val_x2, val_cat], batch_size=512)

        checkpoint_file = "./checkpoint_%d.hdf5" % (fold)
        # Both callbacks track validation accuracy, so both maximise it.
        checkpointer = ModelCheckpoint(filepath=checkpoint_file, monitor='val_acc', verbose=True,
                                       save_best_only=True, mode='max')
        early = EarlyStopping(monitor='val_acc', patience=2, verbose=0, mode='max')

        model = get_model()
        # model.load_weights("./checkpoint_0_bert5_single_fold.hdf5")
        if fold == 0:
            # summary() prints by itself; wrapping it in print() only adds "None".
            model.summary()

        # Manual knob: make this false to skip training and only predict from
        # an existing checkpoint file.
        if fold >= 0:
            model.fit_generator(
                iter(train_D),
                steps_per_epoch=len(train_D),
                epochs=1,
                validation_data=iter(valid_D),
                # Only the first validation batch is scored each epoch; use
                # len(valid_D) to score the whole validation fold.
                validation_steps=1,
                callbacks=[checkpointer, early],
                verbose=True,
            )
        # Restore the weights saved by the checkpoint callback.
        model.load_weights(checkpoint_file)
        # To fill the out-of-fold training predictions as well:
        # oof_train[valid_index] = model.predict_generator(iter(valid_D), steps=len(valid_D), verbose=1)

        tmp_tmp = model.predict_generator(iter(submit_D), steps=len(submit_D), verbose=1)
        pd.DataFrame(tmp_tmp).to_csv('tmp/biobert6_fold'+str(fold)+'_dataset3.csv', index=False)
        # Accumulate the mean over folds.
        oof_test += tmp_tmp / n_flods