# 模型融合&&trick
我们比赛中使用的stacking模型结构如下图所示

  ![img](../img/stacking.png)
 
### Snapshot Ensemble
   在stacking第二层模型中我们还加入了深度融合的方法，[论文地址](https://arxiv.org/abs/1704.00109)
   
### Pseudo Labeling
   我们使用的另外一个trick就是pseudo-labeling方法，它适用于所有给定测试集的比赛 [教程](https://shaoanlu.wordpress.com/2017/04/10/a-simple-pseudo-labeling-function-implementation-in-keras/)


In [9]:
# 导入相应的包
import glob
import os
import pickle

from keras.layers import *
from keras.models import *
from keras.utils import np_utils
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

from config import Config
from model.snapshot import SnapshotCallbackBuilder

In [10]:
# Training text file; data_prepare reads the integer label from its third tab-separated column.
TRAIN_X = '../data/All_cut_train_text.txt'
# Test text file; save_result reads the sample id from its first tab-separated column.
TEXT_X = '../data/' + 'News_cut_test_text.txt'
# Project configuration object; data_prepare takes the feature pickle paths from it.
config = Config()

#### 准备基本特征和OOF文件

In [23]:
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    # NOTE: pickle executes arbitrary code on load; these files are assumed to
    # be artifacts produced by this project, never untrusted input.
    with open(path, 'rb') as f:
        return pickle.load(f)


def data_prepare():
    """Assemble the inputs of the second-level (stacking) model.

    Reads the training labels from ``TRAIN_X``, loads the hand-crafted and OCR
    feature pickles named in ``config``, min-max scales them with one scaler
    fitted on all four feature sets together, and concatenates them
    column-wise with the ``*_250.pkl`` text inputs and the out-of-fold (OOF)
    prediction files found under ``../data/result/``.

    Returns:
        tuple: ``(train, train_y, test)`` where ``train`` and ``test`` are the
        column-wise concatenated feature matrices and ``train_y`` is the
        one-hot encoded label matrix.

    Raises:
        FileNotFoundError: if no file matches ``../data/result/*oof*``.
    """
    # The label is the third tab-separated field of every training line.
    with open(TRAIN_X, 'r', encoding='utf-8') as f:
        train_y = [int(line.strip().split('\t')[2]) for line in f]
    train_y = np_utils.to_categorical(train_y)

    features = _load_pickle(config.FEATURES_FILE)
    test_features = _load_pickle(config.FEATURES_test_FILE)
    ocr_features = _load_pickle(config.OCR_FEATURES_FILE)
    ocr_test_features = _load_pickle(config.OCR_FEATURES_test_FILE)

    # A single scaler is fitted on train + test, text + OCR features together,
    # so all four matrices end up on one common [0, 1] scale.
    scaler = MinMaxScaler()
    scaler.fit(np.concatenate(
        [features, test_features, ocr_test_features, ocr_features], axis=0))
    features = scaler.transform(features)
    test_features = scaler.transform(test_features)
    ocr_features = scaler.transform(ocr_features)
    ocr_test_features = scaler.transform(ocr_test_features)

    train_x = _load_pickle('../data/train_x_250.pkl')
    test_x = _load_pickle('../data/' + 'test_x_250.pkl')

    # Inputs extracted from the OCR text.
    train_ocr_x = _load_pickle('../data/ocr_train_x_250.pkl')
    test_ocr_x = _load_pickle('../data/ocr_test_x_250.pkl')

    # Sorted so the column order of the stacked features is reproducible;
    # glob itself returns files in arbitrary order.
    oof_paths = sorted(glob.glob('../data/result/*oof*'))
    if not oof_paths:
        raise FileNotFoundError(
            "no out-of-fold prediction files match '../data/result/*oof*'")

    n_train = len(train_x)
    n_test = len(test_x)
    oof_blocks = []
    test_blocks = []
    for oof_path in oof_paths:
        # NOTE(review): this replace() is a no-op, so the "test" predictions
        # are read from the very same file as the OOF predictions. It
        # presumably should map to the matching test-prediction file name --
        # confirm the naming scheme and fix the second argument.
        test_path = oof_path.replace('_oof_', '_oof_')
        oof_blocks.append(np.asarray(_load_pickle(oof_path))[:n_train])
        test_blocks.append(np.asarray(_load_pickle(test_path))[:n_test])

    # Every base model contributes its own block of columns. The previous
    # code appended the files row-wise and then kept only the first
    # len(train_x) rows, which silently discarded all files but the first.
    train = np.concatenate(
        [train_x, train_ocr_x, features, ocr_features] + oof_blocks, axis=-1)
    test = np.concatenate(
        [test_x, test_ocr_x, test_features, ocr_test_features] + test_blocks,
        axis=-1)
    return train, train_y, test

In [24]:
# Build the stacked feature matrices and the one-hot training labels.
train, train_y, test = data_prepare()




#### 这里只是使用了简单的DNN来做模型stacking

In [25]:
def get_model(train_x):
    """Build the uncompiled feed-forward network used as the stacking model.

    Args:
        train_x: 2-D feature matrix; only ``train_x.shape[1]`` is read, to
            size the input layer.

    Returns:
        A Keras ``Model`` mapping the ``'news'`` input through two
        Dense+Dropout(0.5) stages (256 and 128 ReLU units) to a 3-way softmax.
    """
    news_input = Input(shape=(train_x.shape[1],), name='news')

    hidden = news_input
    for units in (256, 128):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(0.5)(hidden)

    class_probs = Dense(3, activation="softmax")(hidden)
    return Model(inputs=[news_input], outputs=class_probs)

def check_accuracy(pred, label, test_index, num_labeled=48480):
    """Return the argmax accuracy of *pred* over genuinely labelled rows.

    Args:
        pred: per-row class scores, shape ``(n, n_classes)``.
        label: one-hot (or score) labels aligned row-by-row with *pred*.
        test_index: dataset index of each row of *pred*.
        num_labeled: rows whose dataset index is ``>= num_labeled`` are
            skipped. The stacking code appends pseudo-labelled test rows
            after the training rows, so this keeps them out of the score.
            Defaults to 48480, the value that was previously hard-coded.

    Returns:
        float: fraction of scored rows whose predicted class equals the
        labelled class, or ``0.0`` when no row is scored (previously a
        ``ZeroDivisionError``).
    """
    pred = np.asarray(pred)
    n_rows = len(pred)
    # Only the first len(pred) entries are consulted, as in the original loop.
    label = np.asarray(label)[:n_rows]
    keep = np.asarray(test_index)[:n_rows] < num_labeled

    total = int(np.count_nonzero(keep))
    if total == 0:
        return 0.0

    hits = np.argmax(pred[keep], axis=-1) == np.argmax(label[keep], axis=-1)
    return int(np.count_nonzero(hits)) / total

In [26]:
# Mini-batch size passed to fit() by stacking_first and stacking_pseudo.
BATCH_SIZE = 64


#### 准备stacking模型

In [31]:
# First stacking pass
def stacking_first(train, train_y, test):
    """Train the stacking DNN with 6-fold CV and average its snapshots.

    For every fold a fresh model from ``get_model`` is trained for 16 epochs
    with the callbacks supplied by ``SnapshotCallbackBuilder``. Each ``.h5``
    file found in the fold's directory is then loaded in turn, and the
    predictions of all of them are averaged.

    Args:
        train: 2-D training feature matrix.
        train_y: one-hot labels aligned with the rows of *train*.
        test: 2-D test feature matrix.

    Returns:
        Array of shape ``(test.shape[0], 3)``: test predictions averaged over
        the weight files of each fold and then over the folds.
    """
    out_dir = './stack_/'
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    num_folds = 6
    splitter = KFold(n_splits=num_folds, shuffle=True, random_state=10)
    predict = np.zeros((test.shape[0], 3))
    oof_predict = np.zeros((train.shape[0], 3))
    scores = []

    for fold_no, (train_index, test_index) in enumerate(splitter.split(train)):
        fold_y, valid_y = train_y[train_index], train_y[test_index]
        fold_x, valid_x = train[train_index], train[test_index]

        # One directory of saved weights per fold.
        model_prefix = out_dir + 'DNN' + str(fold_no)
        if not os.path.exists(model_prefix):
            os.mkdir(model_prefix)

        n_snapshots = 4      # number of snapshots
        initial_lr = 1e-3    # initial learning rate
        snap_epoch = 16
        snapshot = SnapshotCallbackBuilder(snap_epoch, n_snapshots, initial_lr)

        res_model = get_model(train)
        res_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        res_model.fit(
            fold_x, fold_y,
            batch_size=BATCH_SIZE,
            epochs=snap_epoch,
            verbose=1,
            validation_data=(valid_x, valid_y),
            callbacks=snapshot.get_callbacks(model_save_place=model_prefix),
        )

        # Every .h5 file in the fold directory takes part in the average,
        # including any left over from an earlier run.
        weight_files = [name for name in os.listdir(model_prefix) if '.h5' in name]

        test_avg = np.zeros((test.shape[0], 3))
        valid_avg = np.zeros((len(valid_x), 3))
        for weight_file in weight_files:
            res_model.load_weights(os.path.join(model_prefix, weight_file))
            test_avg += res_model.predict(test, verbose=1) / len(weight_files)
            valid_avg += res_model.predict(valid_x, batch_size=128) / len(weight_files)

        predict += test_avg / num_folds
        oof_predict[test_index] = valid_avg

        accuracy = check_accuracy(oof_predict[test_index], valid_y, test_index)
        print('the kflod cv is : ', str(accuracy))
        scores.append(accuracy)

    print('total scores is ', np.mean(scores))
    return predict


In [32]:
# First pass: fold- and snapshot-averaged test predictions of the stacking DNN.
predicts = stacking_first(train, train_y, test)


Train on 40400 samples, validate on 8080 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
the kflod cv is :  0.7391089108910891
Train on 40400 samples, validate on 8080 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
the kflod cv is :  0.7397277227722773
Train on 40400 samples, validate on 8080 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
the kflod cv is :  0.7320544554455446
Train on 40400 samples, validate on 8080 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 

### 这里使用pseudo-labeling方法
具体思路如下图所示
![img](../img/pesudo.png)

In [34]:
# Second stacking pass, using pseudo-labeling
def stacking_pseudo(train, train_y, test, results):
    """Retrain the stacking DNN on train + pseudo-labelled test rows.

    The arg-max class of every row of *results* becomes that test row's
    pseudo-label. The test rows are appended after the training rows and the
    same 6-fold / snapshot-averaging procedure as ``stacking_first`` is run on
    the enlarged set.

    Args:
        train: 2-D training feature matrix.
        train_y: one-hot labels aligned with the rows of *train*.
        test: 2-D test feature matrix.
        results: class scores for the rows of *test*, shape
            ``(test.shape[0], n_classes)``, e.g. the return value of
            ``stacking_first``.

    Returns:
        Array of shape ``(test.shape[0], 3)``: test predictions averaged over
        the weight files of each fold and then over the folds.
    """
    # Pin the one-hot width to train_y's. Left to infer it from the data,
    # to_categorical emits fewer columns whenever the highest class is never
    # predicted, and the concatenation below then fails.
    pseudo_y = np_utils.to_categorical(
        np.argmax(results, axis=1), num_classes=train_y.shape[1])
    train_y = np.concatenate([train_y, pseudo_y], axis=0)
    train = np.concatenate([train, test], axis=0)

    # Directory name kept as-is so existing runs and paths stay valid.
    savepath = './pesudo_/'
    if not os.path.exists(savepath):
        os.mkdir(savepath)

    num_folds = 6
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=10)
    # 3 columns to match the 3-way softmax built by get_model.
    predict = np.zeros((test.shape[0], 3))
    oof_predict = np.zeros((train.shape[0], 3))
    scores = []

    for count_kflod, (train_index, test_index) in enumerate(kf.split(train)):
        y_train, y_test = train_y[train_index], train_y[test_index]
        kfold_X_train, kfold_X_valid = train[train_index], train[test_index]

        # One directory of saved weights per fold.
        model_prefix = savepath + 'DNN' + str(count_kflod)
        if not os.path.exists(model_prefix):
            os.mkdir(model_prefix)

        M = 4  # number of snapshots
        alpha_zero = 1e-3  # initial learning rate
        snap_epoch = 16
        snapshot = SnapshotCallbackBuilder(snap_epoch, M, alpha_zero)

        res_model = get_model(train)
        res_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        res_model.fit(kfold_X_train, y_train, batch_size=BATCH_SIZE, epochs=snap_epoch, verbose=1,
                      validation_data=(kfold_X_valid, y_test),
                      callbacks=snapshot.get_callbacks(model_save_place=model_prefix))

        # Every .h5 file in the fold directory takes part in the average,
        # including any left over from an earlier run.
        evaluations = [name for name in os.listdir(model_prefix) if '.h5' in name]
        print(evaluations)

        preds1 = np.zeros((test.shape[0], 3))
        preds2 = np.zeros((len(kfold_X_valid), 3))
        for weight_file in evaluations:
            res_model.load_weights(os.path.join(model_prefix, weight_file))
            preds1 += res_model.predict(test, verbose=1) / len(evaluations)
            preds2 += res_model.predict(kfold_X_valid, batch_size=128) / len(evaluations)

        predict += preds1 / num_folds
        oof_predict[test_index] = preds2

        # Validation folds contain pseudo-labelled rows too; check_accuracy
        # leaves out rows whose index is past the labelled training rows.
        accuracy = check_accuracy(oof_predict[test_index], y_test, test_index)
        print('the kflod cv is : ', str(accuracy))
        scores.append(accuracy)

    print('total scores is ', np.mean(scores))
    return predict


In [35]:
# Second pass: retrain with the first-pass predictions as pseudo-labels for the test rows.
predicts = stacking_pseudo(train, train_y, test, predicts)


Train on 40429 samples, validate on 8086 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
['Model-1.h5', 'Model-4.h5', 'Model-3.h5', 'Model-2.h5']
the kflod cv is :  0.744988864142539
Train on 40429 samples, validate on 8086 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
['Model-1.h5', 'Model-4.h5', 'Model-3.h5', 'Model-2.h5']
the kflod cv is :  0.7317405298341173
Train on 40429 samples, validate on 8086 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
['Model-1.h5', 'Model-4.h5', 'Model-3.h5', 'Model-2.h5']
the kflod cv is :  0.7370961752692164
Train on 40429 sampl

In [36]:
def save_result(predict):
    """Persist the predictions and write the tab-separated result file.

    Dumps *predict* to ``../data/pickle.pkl``, then writes one line per line
    of ``TEXT_X`` to ``../data/result.txt`` in the form
    ``<id>\\t<class>\\tNULL\\tNULL``, where ``<id>`` is the first tab-separated
    field of the input line and ``<class>`` is the arg-max of the matching row
    of *predict*. Finally prints the number of class-1 rows followed by the
    class-1, class-0 and class-2 counts divided by ``len(predict)``.

    Args:
        predict: per-row class scores, indexable by the line number of
            ``TEXT_X``.
    """
    with open('../data/pickle.pkl', 'wb') as dump_file:
        pickle.dump(predict, dump_file)

    tally = {0: 0, 1: 0, 2: 0}
    with open(TEXT_X, 'r', encoding='utf-8') as src, open('../data/' + 'result.txt', 'w', encoding='utf-8') as dst:
        for row_no, raw_line in enumerate(src):
            sample_id = raw_line.strip().split('\t')[0]
            flag = int(np.argmax(predict[row_no]))
            # Classes other than 0/1/2 are written out but not counted.
            if flag in tally:
                tally[flag] += 1
            dst.write('\t'.join([sample_id, str(flag), 'NULL', 'NULL']))
            dst.write('\n')

    print(tally[1])
    for cls in (1, 0, 2):
        print(tally[cls] / len(predict))

In [37]:
# Write the final predictions to ../data/pickle.pkl and ../data/result.txt.
save_result(predicts)

3
0.08571428571428572
0.6571428571428571
0.2571428571428571
