In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import glob
import pandas as pd
import numpy as np
HOME = os.path.expanduser('~')
sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
sys.path.append(f"../py/")
import MS_utils
import utils, ml_utils, kaggle_utils
from utils import logger_func
# Keep the logger from an earlier run of this cell when one exists and is
# truthy; logger_func() is only called when `logger` is undefined or falsy.
try:
    logger
except NameError:
    logger = None
if not logger:
    logger = logger_func()
import time

# Columns: the key column, the target column and the list of columns to ignore,
# as returned by MS_utils.get_basic_var().
key, target, ignore_list = MS_utils.get_basic_var()

2019-03-04 13:22:05,380 utils 346 [INFO]    [logger_func] start 


In [8]:
import re
from sklearn.metrics import roc_auc_score, mean_squared_error

#========================================================================
# CV CHECKER 
# Data Load
def get_cv_score(base_path='../input/base_group*', model_path='', metric=None):
    """Score a saved model's predictions against the labelled rows of the base data.

    Parameters
    ----------
    base_path : str
        Path/pattern handed to ``utils.read_df_pkl``; the loaded frame must
        contain the ``key`` and ``target`` columns.
    model_path : str
        Path handed to ``utils.read_pkl_gzip``; the loaded frame must expose
        ``key`` (as index or column) and a 'pred_mean' or 'prediction' column.
    metric : {'rmse', 'auc'}, optional
        Metric to compute. When omitted, the notebook-level variable ``metric``
        is used, which is what earlier versions of this function relied on.

    Returns
    -------
    float
        The computed score (it is also printed).

    Raises
    ------
    ValueError
        If the metric is neither 'rmse' nor 'auc'.
    KeyError
        If the model file has no 'pred_mean' / 'prediction' column.
    """
    if metric is None:
        # Backward compatible fallback to the notebook global of the same name.
        metric = globals().get('metric')
    if metric not in ('rmse', 'auc'):
        raise ValueError(f"metric must be 'rmse' or 'auc', got {metric!r}")

    base = utils.read_df_pkl(base_path)[[key, target]].set_index(key)
    # Rows with a target value are the training rows; copy so that adding a
    # column below does not write into a slice of `base`.
    train = base.loc[base[target].notnull()].copy()

    model_pred = utils.read_pkl_gzip(model_path)
    pred_col = next(
        (col for col in ('pred_mean', 'prediction') if col in model_pred.columns),
        None,
    )
    if pred_col is None:
        raise KeyError(
            f"{model_path}: neither 'pred_mean' nor 'prediction' column found")
    model_pred = model_pred.reset_index()[[key, pred_col]].set_index(key)

    # Index-aligned assignment: predictions are matched to train rows by key.
    train['pred'] = model_pred[pred_col]

    y_train = train[target].values
    y_pred = train['pred'].values

    if metric == 'rmse':
        score = np.sqrt(mean_squared_error(y_train, y_pred))
    else:
        score = roc_auc_score(y_train, y_pred)
    print(f"{metric}: {score}")

    return score
#========================================================================


#========================================================================
# Correlation

def _read_prediction(path):
    """Load a prediction file with utils.read_pkl_gzip and return its prediction column.

    The result is a Series indexed by `key`, taken from 'pred_mean' when that
    column exists and from 'prediction' otherwise.
    """
    pred = utils.read_pkl_gzip(path).reset_index()
    for col in ('pred_mean', 'prediction'):
        if col in pred.columns:
            return pred.set_index(key)[col]
    raise KeyError(f"{path}: neither 'pred_mean' nor 'prediction' column found")


def _extract_cv_label(path, lb_pattern):
    """Return the text captured after 'CV' in `path` (dots replaced by '-'), or None.

    `lb_pattern` is used when the raw path contains 'LB'; otherwise the plain
    'CV...' pattern is used.
    """
    pattern = lb_pattern if path.count('LB') else r'CV([^/.]*)'
    found = re.search(pattern, path.replace('.', '-'))
    return found.group(1) if found else None


def get_corr_ensemble(path, base, out_part = '', base_cv=None):
    '''
    Correlation between saved model predictions and a base frame.

    1. `path` is a str: correlate that single model with the base model
       (`base['base_pred']`).
    2. `path` is a list/tuple: join the predictions of every model in it onto
       `base` and return the Pearson correlation matrix.

    Parameters
    ----------
    path : str or list of str
        Prediction file(s) readable by utils.read_pkl_gzip.
    base : pandas.DataFrame
        Base frame, indexed by `key` or carrying it as a column. It is copied,
        so the caller's frame is not modified.
    out_part : str
        Not used by this function; kept for existing callers.
    base_cv : str, optional
        Label of the base model used in the log line. Defaults to the
        notebook-level `cv1` when it exists, else 'base'.

    Returns
    -------
    (corr, path) in single-path mode, where (0, '') means "rejected":
    path contains 'CV1', no CV label could be parsed, or corr is not in (0, 0.98).
    A correlation-matrix DataFrame in list mode.

    Raises
    ------
    TypeError
        If `path` is neither a str nor a list/tuple.
    KeyError
        If a prediction file has no 'pred_mean' / 'prediction' column.
    '''

    # Work on a private frame: set_index already returns a new object,
    # otherwise copy explicitly so columns added below do not leak to the caller.
    base = base.set_index(key) if key in base.columns else base.copy()

    #========================================================================
    # Single model: correlation against the base model
    if isinstance(path, str):

        if path.count('CV1'):
            return 0, ''

        cv2 = _extract_cv_label(path, r'CV([^/.]*)_LB.gz')
        if cv2 is None:
            return 0, ''

        # Key-aligned assignment (both sides are indexed by `key`).
        base['tmp_pred'] = _read_prediction(path)

        # min() of the 2x2 matrix is the off-diagonal entry (diagonal is 1).
        corr = np.corrcoef(base['base_pred'], base['tmp_pred'].values).min()

        if corr>0 and corr<0.98:
            if base_cv is None:
                base_cv = globals().get('cv1', 'base')
            logger.info(f"CORR: {corr} | CV{str(base_cv)[:7]} vs CV{cv2[:7]}")

            return corr, path
        return 0, ''
    #========================================================================

    #========================================================================
    # Several models: full correlation matrix
    if isinstance(path, (list, tuple)):
        for model_path in path:
            cv2 = _extract_cv_label(model_path, r'CV([^/.]*)_LB')
            if cv2 is None:
                # Report the file whose name could not be parsed and skip it.
                print(model_path.replace('.', '-'))
                continue

            base[f'pr_{cv2[:9]}'] = _read_prediction(model_path)

        drop_cols = [col for col in base.columns if col.count('country_') or col in ignore_list]
        mx_corr = (
            base.drop(columns=drop_cols)
            .sort_index(axis=1)
            .corr(method='pearson')
        )

        return mx_corr
    #========================================================================

    raise TypeError(f"path must be a str or a list of str, got {type(path).__name__}")
#========================================================================

In [9]:
# Build the correlation matrix of all LightGBM ensemble candidates.
base_path = '../input/base_group*'
base = utils.read_df_pkl(base_path)[[key, target]].set_index(key)

# glob returns matches in arbitrary order; sort so repeated runs see the
# same file order.
ens_list = sorted(glob.glob('../ensemble/lgb/*_lgb_*.gz'))
if not ens_list:
    # Fail here with a clear message instead of later on an empty matrix.
    raise FileNotFoundError("no prediction files match '../ensemble/lgb/*_lgb_*.gz'")

out_part = 'all'
top_corr = get_corr_ensemble(path=ens_list, base=base, out_part=out_part)
print(top_corr.shape)
display(top_corr.head())

(6, 6)


Unnamed: 0,pr_0-69268-g,pr_0-73311,pr_0-73875,pr_0-74117,pr_0-74212,pr_0-74426
pr_0-69268-g,1.0,0.759289,0.736999,0.758554,0.779508,0.748036
pr_0-73311,0.759289,1.0,0.916064,0.911821,0.951586,0.899496
pr_0-73875,0.736999,0.916064,1.0,0.910716,0.913838,0.921034
pr_0-74117,0.758554,0.911821,0.910716,1.0,0.952783,0.976492
pr_0-74212,0.779508,0.951586,0.913838,0.952783,1.0,0.939812


In [15]:
#========================================================================
# Get Low Corr Model For Ensemble
# Correlation ceiling used when picking models (0.985 was the alternative tried).
thres_corr = 0.97
# The selection starts from the first model of the correlation matrix.
base_model = top_corr.columns[0]
select_list = [base_model]

def ensemble_corr_checker(model, corr_matrix=None, selected=None, threshold=None,
                          skip_patterns=('3-60', '3-7', '3-8')):
    """Find the next model that is weakly correlated with everything selected so far.

    Candidates are the columns of `corr_matrix` located at or after the
    position of `model` in the matrix index, visited in matrix order. The first
    candidate is returned that
      * contains none of `skip_patterns` in its name,
      * is not already in `selected`,
      * has correlation with `model` below `threshold`, and
      * has correlation below `threshold` with every model in `selected`.

    Parameters
    ----------
    model : str
        Row label of the model to start from.
    corr_matrix : pandas.DataFrame, optional
        Square correlation matrix; defaults to the notebook-level `top_corr`.
    selected : list of str, optional
        Already chosen models; defaults to the notebook-level `select_list`.
    threshold : float, optional
        Correlation ceiling; defaults to the notebook-level `thres_corr`.
    skip_patterns : iterable of str
        Substrings that exclude a candidate.

    Returns
    -------
    (name, corr) of the chosen candidate, or ('', 0) when none qualifies.
    """
    if corr_matrix is None:
        corr_matrix = top_corr
    if selected is None:
        selected = select_list
    if threshold is None:
        threshold = thres_corr

    start = list(corr_matrix.index).index(model)
    candidates = corr_matrix.loc[model].iloc[start:]

    # items() yields (label, value) pairs, so no positional Series indexing is needed.
    for cv, corr in candidates.items():
        if any(pattern in cv for pattern in skip_patterns):
            continue
        if cv in selected or not corr < threshold:
            continue

        # Reject the candidate if it is too close to any already selected model.
        too_close = (corr_matrix.loc[selected, cv] >= threshold).any()
        if not too_close:
            return cv, corr

    return '', 0

# cv, corr = ensemble_corr_checker(base_model)
# if corr>=thres_corr:
#     continue
    
# Greedy chain: starting from the base model, keep asking for the next model
# that is weakly correlated with everything selected so far, always continuing
# from the most recently added model. Stop at the first miss, including a miss
# on the very first step, which previously left the loop spinning forever.
is_no_stack = 0  # not read in the visible cells; kept in case a later cell uses it
is_loop = True
while is_loop:
    cv, corr = ensemble_corr_checker(select_list[-1])
    if corr:
        select_list.append(cv)
    else:
        is_loop = False
#========================================================================

In [16]:
# Number of selected models, then their names (one print, one item per line).
print(len(select_list), select_list, sep='\n')

# Correlation sub-matrix restricted to the selected models; shown as the cell output.
df_select = top_corr.loc[select_list, select_list]
df_select

45
['pr_3-6154197', 'pr_3-6195263', 'pr_3-6216744', 'pr_3-6221003', 'pr_3-6236254', 'pr_3-6239069', 'pr_3-6243721', 'pr_3-6277513', 'pr_3-6297296', 'pr_3-6316576', 'pr_3-6319548', 'pr_3-6333204', 'pr_3-6335126', 'pr_3-6358877', 'pr_3-6377968', 'pr_3-6388111', 'pr_3-6390038', 'pr_3-6394066', 'pr_3-6397627', 'pr_3-6401138', 'pr_3-6405007', 'pr_3-6406494', 'pr_3-6410169', 'pr_3-6418387', 'pr_3-6418563', 'pr_3-6419588', 'pr_3-6421311', 'pr_3-6421744', 'pr_3-6423881', 'pr_3-6430089', 'pr_3-6436952', 'pr_3-6438394', 'pr_3-6439728', 'pr_3-6441394', 'pr_3-6445607', 'pr_3-6445624', 'pr_3-6448721', 'pr_3-6470229', 'pr_3-6484876', 'pr_3-6488963', 'pr_3-6506615', 'pr_3-6514559', 'pr_3-6536747', 'pr_3-6557007', 'pr_3-6569439']


Unnamed: 0,pr_3-6154197,pr_3-6195263,pr_3-6216744,pr_3-6221003,pr_3-6236254,pr_3-6239069,pr_3-6243721,pr_3-6277513,pr_3-6297296,pr_3-6316576,pr_3-6319548,pr_3-6333204,pr_3-6335126,pr_3-6358877,pr_3-6377968,pr_3-6388111,pr_3-6390038,pr_3-6394066,pr_3-6397627,pr_3-6401138,pr_3-6405007,pr_3-6406494,pr_3-6410169,pr_3-6418387,pr_3-6418563,pr_3-6419588,pr_3-6421311,pr_3-6421744,pr_3-6423881,pr_3-6430089,pr_3-6436952,pr_3-6438394,pr_3-6439728,pr_3-6441394,pr_3-6445607,pr_3-6445624,pr_3-6448721,pr_3-6470229,pr_3-6484876,pr_3-6488963,pr_3-6506615,pr_3-6514559,pr_3-6536747,pr_3-6557007,pr_3-6569439
pr_3-6154197,1.0,0.965365,0.960435,0.962805,0.952636,0.942651,0.962099,0.915516,0.947538,0.946314,0.950182,0.938905,0.934038,0.945462,0.933794,0.930257,0.897368,0.889727,0.919702,0.924274,0.922536,0.919174,0.925383,0.921807,0.916099,0.919699,0.917982,0.92169,0.894228,0.923796,0.894242,0.890548,0.890348,0.919679,0.919294,0.894373,0.919866,0.922275,0.896263,0.895315,0.879844,0.912337,0.897514,0.904549,0.898446
pr_3-6195263,0.965365,1.0,0.963512,0.965474,0.955198,0.931878,0.965677,0.910793,0.950326,0.953347,0.951347,0.941613,0.936828,0.93218,0.920972,0.939026,0.893161,0.883971,0.925461,0.931947,0.927711,0.92671,0.915074,0.927727,0.922832,0.924802,0.924723,0.927576,0.889774,0.926369,0.888773,0.886321,0.886589,0.925719,0.925267,0.890436,0.925124,0.916843,0.892348,0.890959,0.874715,0.913465,0.901434,0.900061,0.895678
pr_3-6216744,0.960435,0.963512,1.0,0.963711,0.957339,0.931845,0.966408,0.918132,0.955625,0.951274,0.957567,0.966394,0.941277,0.936206,0.92776,0.941306,0.898918,0.891389,0.914053,0.942985,0.915304,0.931617,0.92789,0.915875,0.911909,0.916351,0.913469,0.914606,0.894532,0.949906,0.896372,0.892954,0.892628,0.915305,0.915794,0.89535,0.914546,0.923721,0.899771,0.898267,0.880084,0.923477,0.912785,0.908745,0.908951
pr_3-6221003,0.962805,0.965474,0.963711,1.0,0.954998,0.928477,0.965896,0.908045,0.95089,0.957253,0.949723,0.94267,0.935267,0.927277,0.91551,0.930557,0.888687,0.880548,0.911111,0.924204,0.912835,0.917565,0.911726,0.914387,0.910391,0.91254,0.911519,0.91237,0.885379,0.925284,0.885329,0.883613,0.881939,0.911815,0.913031,0.885018,0.912135,0.912493,0.888856,0.887222,0.869527,0.912898,0.89864,0.894792,0.891342
pr_3-6236254,0.952636,0.955198,0.957339,0.954998,1.0,0.929134,0.960624,0.913862,0.967603,0.941974,0.950743,0.936978,0.966991,0.930185,0.926758,0.935094,0.893106,0.886009,0.903109,0.931472,0.905116,0.928939,0.916274,0.905005,0.904606,0.904534,0.903524,0.905483,0.890375,0.920856,0.889925,0.887623,0.888147,0.90591,0.906263,0.891046,0.905762,0.914643,0.895061,0.891074,0.87459,0.932237,0.905487,0.904225,0.896251
pr_3-6239069,0.942651,0.931878,0.931845,0.928477,0.929134,1.0,0.935531,0.964915,0.923496,0.918395,0.933115,0.910748,0.917035,0.924311,0.925654,0.917617,0.947242,0.939801,0.884467,0.91791,0.88944,0.917167,0.914694,0.886185,0.88499,0.88646,0.884671,0.888448,0.942317,0.900635,0.943483,0.943259,0.94295,0.886753,0.88546,0.946274,0.888285,0.902903,0.941752,0.944429,0.930143,0.901934,0.890126,0.895847,0.887368
pr_3-6243721,0.962099,0.965677,0.966408,0.965896,0.960624,0.935531,1.0,0.919394,0.954714,0.956737,0.96026,0.944988,0.947957,0.934906,0.927612,0.944318,0.899588,0.890892,0.911678,0.935897,0.913566,0.933872,0.919043,0.915131,0.912237,0.913385,0.911362,0.91408,0.897622,0.932962,0.895724,0.895481,0.894853,0.914746,0.912517,0.897574,0.913316,0.926304,0.900406,0.897888,0.881313,0.925935,0.912709,0.914357,0.908655
pr_3-6277513,0.915516,0.910793,0.918132,0.908045,0.913862,0.964915,0.919394,1.0,0.912934,0.903136,0.929121,0.900716,0.909623,0.915166,0.921586,0.916814,0.960457,0.953503,0.870658,0.919592,0.8715,0.921395,0.911599,0.871444,0.868877,0.871379,0.869887,0.873803,0.948941,0.897831,0.955162,0.948733,0.949816,0.872446,0.870653,0.956957,0.873553,0.904338,0.95345,0.952825,0.940247,0.89968,0.89172,0.903878,0.896223
pr_3-6297296,0.947538,0.950326,0.955625,0.95089,0.967603,0.923496,0.954714,0.912934,1.0,0.940534,0.947601,0.937961,0.96052,0.937354,0.929137,0.94127,0.893458,0.885119,0.903715,0.932915,0.904777,0.933286,0.918316,0.90678,0.905425,0.906881,0.906458,0.906393,0.889465,0.933444,0.889937,0.887246,0.888022,0.905262,0.908448,0.891664,0.907636,0.924067,0.895312,0.892013,0.876122,0.939917,0.909963,0.908885,0.898765
pr_3-6316576,0.946314,0.953347,0.951274,0.957253,0.941974,0.918395,0.956737,0.903136,0.940534,1.0,0.945206,0.93232,0.927618,0.921305,0.908531,0.930945,0.884416,0.875842,0.906395,0.925161,0.9063,0.915573,0.908137,0.909813,0.908249,0.90863,0.908117,0.908297,0.881823,0.921704,0.88126,0.87982,0.880322,0.907038,0.908832,0.880672,0.909233,0.921923,0.88564,0.882871,0.865249,0.909332,0.897843,0.90311,0.901313


In [17]:
# ================================================================
# Decide which prediction files to move, based on the correlation matrix
import shutil

# Move every prediction file whose path contains the label of a selected model
# into the staging directory.
dest_dir = '../ensemble/dir_stack_blend/tmp'
# Make sure the destination is a directory. If it did not exist, shutil.move
# would rename the first file to 'tmp' and later moves would overwrite it.
os.makedirs(dest_dir, exist_ok=True)

# Column labels are 'pr_' + label, so drop the 3-character prefix.
sel_keys = [sel[3:] for sel in select_list]

for path in ens_list:
    # One move per file, even when several labels match the same path.
    if not any(sel_key in path for sel_key in sel_keys):
        continue
    try:
        shutil.move(path, dest_dir)
    except FileNotFoundError:
        # The file is already gone (e.g. moved by an earlier run of this cell).
        logger.info(f"skip (source not found): {path}")
# ================================================================

100%|██████████| 1/1 [00:00<00:00,  2.90it/s]


Unnamed: 0_level_0,target,clf_pred,no_out_flg
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C_ID_92a2005557,-0.820283,0.000444,1.0
C_ID_3d0044924f,0.392913,0.007831,0.0
C_ID_d639edf6cd,0.688056,0.004074,0.0
C_ID_186d6a6901,0.142495,0.000797,0.0
C_ID_cdbd2c0db2,-0.159749,0.000251,1.0
