In [1]:
import pandas as pd
from minepy import MINE
from joblib  import Parallel,delayed
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## 1.展示36个xgb的mic矩阵

In [7]:
def mic(x,y):
    '''
    描述：
        计算两个序列的mic
    参数：
        输入格式为dataframe
    '''
    if type(x) == pd.DataFrame:
        x = x.score.values.ravel()
        y = y.score.values.ravel()
    m = MINE(est ='mic_e')
    m.compute_score(x,y)
    return m.mic()

def cal_row(x, pred_list, n_jobs=-1):
    '''
    描述：
        计算一组序列中两两元素序列的的mic，返回list
    '''
    if n_jobs == -1:
        n_jobs = len(pred_list)
    result = Parallel(n_jobs=len(pred_list),verbose=10)(delayed(mic)(x,y) for y in pred_list)
    return result

def cal_matrix(pred_list, n_jobs = -1):
    '''
    描述：
        计算mic matrix，思路是两层for循环，内层循环在cal_row函数中实现
        缺点是计算量很大，会占用很多进程，可以通过自由设置njob来解决
    '''
    if n_jobs == -1:
        n_jobs = len(pred_list)
    
    result = Parallel(n_jobs=n_jobs,verbose=10)(delayed(cal_row)(x, pred_list, n_jobs) for x in pred_list)
    return np.array(result)

def plot_mic_matrix(mic_matrix,ticks):
    '''
    描述：
        根据mic矩阵绘制热力图
    '''
    plt.figure(figsize=(20,20))
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    sns.heatmap(mic_matrix,linewidths=0.1,vmax=1.0,
            square=True,linecolor='white',annot=True,xticklabels=ticks,yticklabels =ticks)

In [9]:
%%time
file_names = [file for file in os.listdir('./preds/') if file[0]!='.']
file_names = sorted(file_names,key=lambda x:int(x.split('.')[0][3:]))
pred_list = []
for file in file_names:
    pred = pd.read_csv(os.path.join('./preds/',file))
    pred_list.append(pred)
    
mic_matrix = cal_matrix(pred_list,4)
joblib.dump(mic_matrix,'mic_matrix')

In [None]:
plot_mic_matrix(mic_matrix,file_names)

### 2.blending

In [10]:
# 给file_names排序
file_names = [file for file in os.listdir('./preds/') if file[0]!='.']
file_names = sorted(file_names,key=lambda x:int(x.split('.')[0][3:]))

In [11]:
# 根据热力图，挑选相关度较低的预测结果
index = [0,1,3,11,16,17,21,24,26,30,32]
pred = pd.DataFrame()
pred_0 = pd.read_csv(os.path.join('./preds/',file_names[0]))
pred['id'] = pred_0.id
pred_prob = pred_0.score
for idx in index:
    pred_tmp = pd.read_csv(os.path.join('./preds/',file_names[idx]))
    pred_prob+=pred_tmp.score
pred['prob'] = pred_prob.values
pred.to_csv('avg_pred.txt',index=False) 

### 线上valid-acu
auc: 0.83046852887673