In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# 二级stacking

In [None]:
%%time
# 导入训练数据集
data_date = pd.read_csv('../../preprocess_data/train_x_date.csv').drop(columns=['id','loan_hour'])
data_raw = pd.read_csv('../../preprocess_data_new/train_ax_nodup.csv',nrows=33465).drop(columns=['id','loan_dt','tag'])
# data_raw = pd.read_csv('../../preprocess_data/train_x_33465.csv').drop(columns=['id','loan_dt','tag'])
# data_null = pd.read_csv('../../preprocess_data/train_x_null.csv').drop(columns=['id'])
data_null = pd.read_csv('../../preprocess_data_new/train_ax_row_null.csv',nrows=33465).drop(columns=['id'])
data_tag = pd.read_csv('../../preprocess_data/train_x_33465.csv',usecols=['tag'])

data = pd.concat([data_date,data_raw,data_null,data_tag],axis=1)
data_label = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])
x = data.fillna(-1).values
y = data_label.values.ravel()

## 不同用途的数据集产生（3种）

In [None]:
# 1.本地验证
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=2018,test_size=0.3)

# 2. 过采样
# from imblearn.over_sampling import BorderlineSMOTE
# x_resampled, y_resampled = BorderlineSMOTE().fit_resample(x_train, y_train)

# 线上测试
# x_train = x
# y_train = y



In [215]:
def SelectModel(model_name):
    if model_name == 'GBC':
        from sklearn.ensemble import GradientBoostingClassifier
        model = GradientBoostingClassifier(loss='deviance',
                                           learning_rate =0.1,
                                           n_estimators=300,
                                           subsample=0.9,
                                           max_depth=3,
#                                            verbose=1,
                                          random_state=2018)
    elif model_name == 'XGB':
        from xgboost import XGBClassifier

        model = XGBClassifier(max_depth=6,
                              learning_rate =0.04, 
                              booster='gbtree',
                              objective='binary:logistic',
                              early_stopping_rounds=100,
                              scale_pos_weight=float(len(y_train)-np.sum(y_train))/float(np.sum(y_train)),
                              eval_metric='auc',
                              gamma=1,
                              reg_lambda=1,
                              subsample=0.9,
                              min_child_weight=1,
                              seed=2018,
                              silent=False,
                              n_jobs=24,
                              num_boost_round =300
                             )
    elif model_name == 'RFC':
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=1500,
                                       n_jobs =36,
                                       max_features='sqrt',
                                       class_weight='balanced',
#                                        verbose =1,
                                       random_state=2018)
    elif model_name == 'LGB':
        from lightgbm import LGBMClassifier
        model = LGBMClassifier(boost='gbdt',
                    num_leaves=135, 
                    scale_pos_weight=float(len(y_train)-np.sum(y_train.ravel()))/float(np.sum(y_train.ravel())),
                    max_depth=-1,
                    learning_rate=.05,
                    max_bin=200,
                    min_data_in_leaf= 60,
                    objective='binary',
                    metric='auc',
                    num_threads=32,
                    slient=False,
                    num_boost_round =300)
    else:
        pass
    return model

def get_oof(clf,n_folds,x_train,y_train,x_test):
    '''
    描述：
        stacking的关键代码，通过k折法训练及预测训练集，避免data leak。
    参数：
        input：
            clf: sklearn接口的学习器对象
            其他：略
    '''
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    num_class = len(np.unique(y_train))
    kf = KFold(n_splits=n_folds,random_state=2018)
    oof_train = np.zeros((n_train,num_class-1))
    oof_test = np.zeros((n_test,num_class-1))
    
    for i,(train_index,test_index)in enumerate(kf.split(x_train,y_train)):
        print(i)
        kf_x_train = x_train[train_index]
        kf_y_train = y_train[train_index]
        kf_x_test = x_train[test_index]
        
        clf.fit(kf_x_train,kf_y_train)
        tmp_trafold = clf.predict_proba(kf_x_test)
        tmp_test = clf.predict_proba(x_test)
        if tmp_trafold.ndim != 1:
            tmp_trafold = tmp_trafold[:,1:]
            tmp_test = tmp_test[:,1:]
        oof_train[test_index] = tmp_trafold
        oof_test += tmp_test
        
    oof_test = oof_test/n_folds
    return oof_train,oof_test

# stacking——第一级

In [None]:
%%time
# 1.训练第一级中的学习器
model_list = ['XGB','RFC','LGB'] # GBC运行时间过长，所以未把它加入第一级
new_train_list = []
new_test_list = []
for model in model_list:
    clf = SelectModel(model)
    oof_train,oof_test = get_oof(clf,5,x_train,y_train,x_test)
    new_train_list.append(oof_train)
    new_test_list.append(oof_test)
# 2.整合第一级的输出作为第二级的输出
new_train = np.hstack(tuple(new_train_list))
new_test = np.hstack(tuple(new_test_list))

## stacking——第二级

In [None]:

# 在第二级中，使用多个模型来融合第一层的输出，选出效果最佳的组合
# lightgbm
from lightgbm import LGBMClassifier
lgb_model = LGBMClassifier(boost='gbdt',
            num_leaves=145, 
            scale_pos_weight=float(len(y_train)-np.sum(y_train.ravel()))/float(np.sum(y_train.ravel())),
            max_depth=3,
            learning_rate=.01,
            max_bin=200,
            min_data_in_leaf= 3,
            objective='binary',
            metric='auc',
            num_threads=32,
            slient=False,
            num_boost_round =120)
lgb_model.fit(new_train,y_train)

# random forest
from sklearn.ensemble import RandomForestClassifier
rfc_model = RandomForestClassifier(n_estimators=800,
                               n_jobs =36,
                               criterion='gini',
                               max_depth=5,
                               min_samples_split=8,
                               max_features='sqrt',
                               class_weight='balanced',
                               verbose =1,
                               random_state=2018)
rfc_model.fit(new_train,y_train)

# xgboost
from xgboost import XGBClassifier
xgb_model = XGBClassifier(max_depth=3,
                      learning_rate =0.02, 
                      booster='gbtree',
                      objective='binary:logistic',
                      early_stopping_rounds=100,
                      scale_pos_weight=float(len(y_train)-np.sum(y_train))/float(np.sum(y_train)),
                      eval_metric='auc',
                      gamma=0.1,
                      reg_lambda=1,
                      subsample=0.9,
                      min_child_weight=1,
                      seed=2018,
                      silent=False,
                      n_jobs=24,
                      num_boost_round =250
                     )
xgb_model.fit(new_train,y_train)

# gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
gbc_model = GradientBoostingClassifier(loss='deviance',
                                   learning_rate =0.05,
                                   n_estimators=300,
                                   subsample=0.9,
                                   max_depth=3,
                                   verbose=1,
                                  random_state=2018)
gbc_model.fit(new_train,y_train)

# logistic regression
from sklearn.linear_model import LogisticRegression
lg_model = LogisticRegression(penalty='l2',
                        class_weight='balanced',
                        random_state=2018,
                        solver='liblinear',
                        max_iter=150)
lg_model.fit(new_train,y_train)


In [226]:
# stacking第2层做blending,得到stack_pred
y_pred = lgb_model.predict_proba(new_test)[:,1] + \
         rfc_model.predict_proba(new_test)[:,1] + \
         lgb_model.predict_proba(new_test)[:,1] + \
         3*gbc_model.predict_proba(new_test)[:,1] + \
         lg_model.predict_proba(new_test)[:,1]

stack_pred = pd.DataFrame({'id':test_id})
stack_pred['prob'] = y_pred
stack_pred.to_csv('stack_pred.csv',index=False)

# stacking第2层用单一模型，得到stack_pred1
y_pred = lg_model.predict_proba(new_test)[:,1] 
stack_pred = pd.DataFrame({'id':test_id})
stack_pred['prob'] = y_pred
stack_pred.to_csv('stack_pred1.csv',index=False)

线上分数：  
stack_pred AUC:0.8297  
stack_pred1 AUC: 0.82757885054343