## 线下模型加权融合
模拟梯度下降的方法对各个模型进行线性加权融合。

In [1]:
from __future__ import division
from __future__ import print_function

import pandas as pd
import numpy as np
import pickle
import os
import sys
import time 
sys.path.append('..')
from evaluator import score_eval

# # 求 softmax
def _softmax(score):
    """对一个样本的输出类别概率进行 softmax 归一化.
    score: arr.shape=[1999].
    """
    max_sc = np.max(score)   # 最大分数
    score = score - max_sc
    exp_sc = np.exp(score)
    sum_exp_sc = np.sum(exp_sc)
    softmax_sc = exp_sc / sum_exp_sc
    return softmax_sc    # 归一化的结果
    
def softmax(scores):
    """对所有样本的输出概率进行 softmax 归一化处理。
    scores: arr.shape=[n_sample, 1999].
    """
    softmax_scs = map(_softmax, scores)
    return np.asarray(softmax_scs)

marked_labels_list = np.load('data/marked_labels_list.npy')
scores_path = 'local_scores/'

## 不同模型的表现能力

In [7]:
time0 = time.time()
scores_names =[
    'p1-1-bigru-512.npy',
    'p1-2-bigru-512-true.npy',
    'textcnn-fc-drop-title-content-256-3457-drop0.5.npy',
    'f1-1-cnn-256-23457-11.npy',

    'han-cnn-title-content-256-345.npy',
    'han-cnn-title-content-256-23457-1234.npy',
    'm7-rnn-cnn-256-100.npy',
    'p2-1-rnn-cnn-256-256.npy',

    'p3-2-cnn-256-2357.npy',
    'p3-cnn-512-23457.npy',
    'textcnn-fc-drop-title-content-256-345.npy',   # 提高了两个千分点
    'textcnn-fc-drop-title-content-256-3457-drop0.2.npy',

    'm9-han-bigru-title-content-512-30.npy',
    'm9-2-han-bigru-title-content-512-30.npy',
    'han-bigru-title-content-256-30.npy',
    'm8-han-bigru-256-30.npy',

    'attention-bigru-title-content-256.npy',
    'm7-2-rnn-cnn-128-100.npy',
    'textcnn-fc-title-content-256-345.npy',
    'm1-2-fasttext-topicinfo.npy',

    'ch3-1-cnn-256-2345.npy',
    'ch3-2-cnn-256-23457.npy', 
    'ch4-1-han-bigru-256-52.npy',    
    'ch5-1-2embed-rnn256-cnn2345.npy',

    'p4-1-han-bigru-256.npy',
    'ch6-1-han-cnn-2345-1234.npy',
    'p5-1-2embed-rnn256-cnn2345.npy',
    'ch5-2-2embed-rnn512-cnn3457.npy',

    'c1-1-cnn-max-256-23457.npy',
    'c1-2-cnn-256-345710.npy',     
    'c2-1-bigru-256.npy',
    
    'textcnn-fc-drop-title-content-256-345-cross3cross0.npy',
    'textcnn-fc-drop-title-content-256-345-cross3cross1.npy',
    'textcnn-fc-drop-title-content-256-345-cross3cross2.npy',
    'p3-3-cnn-max-256-345710.npy',
    'textcnn-title-256-len50.npy',
    'ch7-1-2embed-rnn256-hcnn-2345-1234.npy',
]  
   

soft_scores_path = 'local_scores/'
scores_name_num = len(scores_names)
f1s = list()
for i in xrange(scores_name_num):
    scores_name = scores_names[i]
    print('%d/%d, scores_name= %s' % (i+1, scores_name_num, scores_name))
    score = np.vstack(np.load(soft_scores_path + scores_name))
    score = softmax(score)
    predict_labels_list = map(lambda label: label.argsort()[-1:-6:-1], score) # 取最大的5个下标
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    print('f1=%g' % f1)
    f1s.append(f1)
    
weights = [  9.75938817,   8.63945014,  2.98289344,   3.72323394,   5.04378259,
   0.06551187,  -0.79412528,  -0.21665029,   4.90162676,   1.17452791,
  -1.46124679,  -0.25384273,   5.50925013,   2.84186738,  -0.93016907,
   5.16519035,  -0.47061662,   2.75998217,   2.58152296,  -1.24553333,
   2.43288558,   6.17376317,   5.59323762,  10.46123521,   5.29952925,
   3.72042086,   5.46707444,   5.51516916,   5.82352659,   1.27847427,
  -0.52930247,  -1.99052155,  -3.0938045,   -2.07007845,   4.19963813,
   2.10593832,   1.74174258]
    
df_result = pd.DataFrame({'model_name': scores_names, 'f1': f1s, 'weight': weights})

1/37, scores_name= p1-1-bigru-512.npy
f1=0.413952
2/37, scores_name= p1-2-bigru-512-true.npy
f1=0.413378
3/37, scores_name= textcnn-fc-drop-title-content-256-3457-drop0.5.npy
f1=0.40947
4/37, scores_name= f1-1-cnn-256-23457-11.npy
f1=0.412223
5/37, scores_name= han-cnn-title-content-256-345.npy
f1=0.410232
6/37, scores_name= han-cnn-title-content-256-23457-1234.npy
f1=0.409706
7/37, scores_name= m7-rnn-cnn-256-100.npy
f1=0.406457
8/37, scores_name= p2-1-rnn-cnn-256-256.npy
f1=0.410234
9/37, scores_name= p3-2-cnn-256-2357.npy
f1=0.40933
10/37, scores_name= p3-cnn-512-23457.npy
f1=0.406415
11/37, scores_name= textcnn-fc-drop-title-content-256-345.npy
f1=0.408236
12/37, scores_name= textcnn-fc-drop-title-content-256-3457-drop0.2.npy
f1=0.404698
13/37, scores_name= m9-han-bigru-title-content-512-30.npy
f1=0.408178
14/37, scores_name= m9-2-han-bigru-title-content-512-30.npy
f1=0.403398
15/37, scores_name= han-bigru-title-content-256-30.npy
f1=0.408218
16/37, scores_name= m8-han-bigru-256-30

In [13]:
df_result = df_result.loc[:, ['model_name', 'f1', 'weight']]
df_result = df_result.sort_values('weight', ascending=False)
df_result.to_csv('model_f1.csv', index=False)
df_result

Unnamed: 0,model_name,f1,weight
23,ch5-1-2embed-rnn256-cnn2345.npy,0.399414,10.461235
0,p1-1-bigru-512.npy,0.413952,9.759388
1,p1-2-bigru-512-true.npy,0.413378,8.63945
21,ch3-2-cnn-256-23457.npy,0.397764,6.173763
28,c1-1-cnn-max-256-23457.npy,0.413798,5.823527
22,ch4-1-han-bigru-256-52.npy,0.397745,5.593238
27,ch5-2-2embed-rnn512-cnn3457.npy,0.398458,5.515169
12,m9-han-bigru-title-content-512-30.npy,0.408178,5.50925
26,p5-1-2embed-rnn256-cnn2345.npy,0.406252,5.467074
24,p4-1-han-bigru-256.npy,0.407782,5.299529


### 手动初始化
初始化根据单模型的表现能力赋值，也可以通过直接平均加权来探索单模型对于整体的提高贡献来赋值。

In [17]:
time0 = time.time()
scores_names =[
    'p1-1-bigru-512.npy',
    'p1-2-bigru-512-true.npy',
    'textcnn-fc-drop-title-content-256-3457-drop0.5.npy',
    'f1-1-cnn-256-23457-11.npy',

    'han-cnn-title-content-256-345.npy',
    'han-cnn-title-content-256-23457-1234.npy',
    'm7-rnn-cnn-256-100.npy',
    'p2-1-rnn-cnn-256-256.npy',

    'p3-2-cnn-256-2357.npy',
    'p3-cnn-512-23457.npy',
    'textcnn-fc-drop-title-content-256-345.npy',   # 提高了两个千分点
    'textcnn-fc-drop-title-content-256-3457-drop0.2.npy',

    'm9-han-bigru-title-content-512-30.npy',
    'm9-2-han-bigru-title-content-512-30.npy',
    'han-bigru-title-content-256-30.npy',
    'm8-han-bigru-256-30.npy',

    'attention-bigru-title-content-256.npy',
    'm7-2-rnn-cnn-128-100.npy',
    'textcnn-fc-title-content-256-345.npy',
    'm1-2-fasttext-topicinfo.npy',

    'ch3-1-cnn-256-2345.npy',
    'ch3-2-cnn-256-23457.npy', 
    'ch4-1-han-bigru-256-52.npy',    
    'ch5-1-2embed-rnn256-cnn2345.npy',

    'p4-1-han-bigru-256.npy',
    'ch6-1-han-cnn-2345-1234.npy',
    'p5-1-2embed-rnn256-cnn2345.npy',
    'ch5-2-2embed-rnn512-cnn3457.npy',

    'c1-1-cnn-max-256-23457.npy',
    'c1-2-cnn-256-345710.npy',     
    'c2-1-bigru-256.npy',
    
    'textcnn-fc-drop-title-content-256-345-cross3cross0.npy',
    'textcnn-fc-drop-title-content-256-345-cross3cross1.npy',
    'textcnn-fc-drop-title-content-256-345-cross3cross2.npy',
    'p3-3-cnn-max-256-345710.npy',
    'textcnn-title-256-len50.npy',
    'ch7-1-2embed-rnn256-hcnn-2345-1234.npy',
]  
   

weights = [  9.75938817,   8.63945014,  2.98289344,   3.72323394,   5.04378259,
   0.06551187,  -0.79412528,  -0.21665029,   4.90162676,   1.17452791,
  -1.46124679,  -0.25384273,   5.50925013,   2.84186738,  -0.93016907,
   5.16519035,  -0.47061662,   2.75998217,   2.58152296,  -1.24553333,
   2.43288558,   6.17376317,   5.59323762,  10.46123521,   5.29952925,
   3.72042086,   5.46707444,   5.51516916,   5.82352659,   1.27847427,
  -0.52930247,  -1.99052155,  -3.0938045,   -2.07007845,   4.19963813,
   2.10593832,   1.74174258]

soft_scores_path = 'local_scores/'
print(len(scores_names), len(weights))
print('All %d models' % len(weights))
sum_scores = np.zeros((len(marked_labels_list), 1999), dtype=float)
scores_name_num = len(scores_names)
for i in xrange(len(weights)):
    scores_name = scores_names[i]
    print('%d/%d, scores_name= %s' % (i+1, scores_name_num, scores_name))
    score = np.vstack(np.load(soft_scores_path + scores_name))
    score = softmax(score)
    sum_scores = sum_scores + score* weights[i]
print('sum_scores.shape=',sum_scores.shape)
predict_labels_list = map(lambda label: label.argsort()[-1:-6:-1], sum_scores) # 取最大的5个下标
predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
print('local valid p=%g, r=%g, f1=%g;' % ( precision, recall, f1))
print('Finished , costed time %g s' % (time.time() - time0))
last_f1 = f1

37 37
All 37 models
1/37, scores_name= p1-1-bigru-512.npy
2/37, scores_name= p1-2-bigru-512-true.npy
3/37, scores_name= textcnn-fc-drop-title-content-256-3457-drop0.5.npy
4/37, scores_name= f1-1-cnn-256-23457-11.npy
5/37, scores_name= han-cnn-title-content-256-345.npy
6/37, scores_name= han-cnn-title-content-256-23457-1234.npy
7/37, scores_name= m7-rnn-cnn-256-100.npy
8/37, scores_name= p2-1-rnn-cnn-256-256.npy
9/37, scores_name= p3-2-cnn-256-2357.npy
10/37, scores_name= p3-cnn-512-23457.npy
11/37, scores_name= textcnn-fc-drop-title-content-256-345.npy
12/37, scores_name= textcnn-fc-drop-title-content-256-3457-drop0.2.npy
13/37, scores_name= m9-han-bigru-title-content-512-30.npy
14/37, scores_name= m9-2-han-bigru-title-content-512-30.npy
15/37, scores_name= han-bigru-title-content-256-30.npy
16/37, scores_name= m8-han-bigru-256-30.npy
17/37, scores_name= attention-bigru-title-content-256.npy
18/37, scores_name= m7-2-rnn-cnn-128-100.npy
19/37, scores_name= textcnn-fc-title-content-256-3

### 模拟梯度下降方式进行权重调整
- 在初期可以把学习率设置大些，比如 0.5;
- 后期可能权重基本不能变动或者线下的 f1 值变动不大，这时候需要手动调整一下权重，比如把权值千分位及后面的小数点全部去掉，加入扰动以后 f1 值继续取得一定的提高。
- 迭代到一定程度会对线下数据集过拟合，所以需要线上评测来验证最后的结果。

In [None]:
from multiprocessing import  Pool

time0 = time.time()
last_f1 = f1
best_f1 = f1
lr = 0.15


def get_update_weight(score_name):
    """根据线下验证集的 f1 值变化趋势来调整模型的融合权重。
    Args:
        score_name: 需要调整的模型。
    Returns:
        lr: 模型的权重变化。
    """
    global lr   # 权重调整率
    score = np.vstack(np.load(soft_scores_path + score_name))
    new_score = sum_scores + score*lr
    predict_labels_list = map(lambda label: label.argsort()[-1:-6:-1], new_score) # 取最大的5个下标
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    if f1 > last_f1:
        return lr
    else:
        new_score = sum_scores - score*lr
        predict_labels_list = map(lambda label: label.argsort()[-1:-6:-1], new_score) # 取最大的5个下标
        predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
        precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
        if f1 > last_f1:
            return -lr
    return 0.0
    
# 更新权重 
f1_list = list()
w_list = list()
decay1 = 0.995
decay2 = 0.95
decay = decay1
for i in xrange(200):
    if i == 50:
        decay = decay2    # 增加下降速度
    lr = lr * decay
    p = Pool(8)
    weights = np.asarray(weights)
    print('=='*10, i, lr)
    print('LAST_F1=', last_f1)
    update_w = p.map(get_update_weight, scores_names)
    update_w = np.asarray(update_w)
    p.close()#关闭进程池，不再接受新的进程
    p.join()#主进程阻塞等待子进程的退出, 必须要退出以后才能更新里边的全局变量
    print('update_w=', update_w)
    weights =  weights + update_w              # 更新
    print('new_w=', weights)
    sum_scores = np.zeros((100000, 1999), dtype=float)
    for i in xrange(len(weights)):       # 新的权重组合
        scores_name = scores_names[i]
        score = np.vstack(np.load(soft_scores_path + scores_name))
        score = softmax(score)
        sum_scores = sum_scores + score*weights[i]     # 新的 sum_scores
    predict_labels_list = map(lambda label: label.argsort()[-1:-6:-1], sum_scores) # 取最大的5个下标
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    print('NEW_F1=',f1)
    if f1 > best_f1:
        best_f1 = f1
        np.save('best_weights.npy', weights)
    f1_list.append(f1)
    w_list.append(weights)
    last_f1 = f1 # 更新 f1 
    print('**Best_f1=%f; Speed: %g s / epoch.' % (best_f1, time.time() - time0))
    time0 = time.time()


LAST_F1= 0.431113164863
update_w= [ 0.14925  0.       0.      -0.14925 -0.14925  0.      -0.14925  0.
  0.14925  0.       0.       0.       0.       0.       0.       0.       0.
  0.       0.       0.      -0.14925  0.       0.      -0.14925  0.       0.
  0.       0.       0.       0.       0.       0.       0.       0.       0.
  0.14925  0.     ]
new_w= [ 13.21925  12.3       3.3       6.27075   7.40075  -1.03      0.62075
   0.13      5.99925   4.12     -1.62      1.31      7.58      2.98     -0.61
   5.75      1.24      4.85      4.57      0.37      3.18075   9.82      8.26
  14.24075   7.57      6.33      8.29      8.9       9.79      2.69      0.29
  -0.58     -5.12     -2.13      5.14      3.40925   3.88   ]
NEW_F1= 0.431089025211
**Best_f1=0.431113; Speed: 336.325 s / epoch.
LAST_F1= 0.431089025211
update_w= [-0.14850375  0.14850375 -0.14850375  0.          0.14850375 -0.14850375
  0.14850375 -0.14850375 -0.14850375 -0.14850375  0.14850375 -0.14850375
  0.14850375 -0.14850375

Process PoolWorker-423:
Process PoolWorker-421:
Process PoolWorker-420:
Process PoolWorker-424:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/common/anaconda2/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/home/common/anaconda2/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/home/common/anaconda2/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/home/common/anaconda2/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
    self.run()
    self.run()
    self.run()
  File "/home/common/anaconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/home/common/anaconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/home/common/anaconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/home/common/anaconda2/lib/pyth