In [29]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import logging
import warnings
from pathlib import Path 

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import lightgbm as lgb 
import xgboost as xgb 
from catboost import CatBoostClassifier
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score

# 设置显示全部列
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

### 数据预操作

In [8]:
# reduce_mem_usage 函数通过调整数据类型，帮助我们减少数据在内存中占用的空间
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum()  / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum()  / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [9]:
data = pd.read_csv('data/data_for_model.csv')
data = reduce_mem_usage(data)  # 节约n
logging.info(f"data shape: {data.shape}")
data.head()

Memory usage of dataframe is 366.21 MB
2021-01-16 11:11:20,384 INFO: data shape: (1000000, 48)
Memory usage after optimization is: 97.27 MB
Decreased by 73.4%


Unnamed: 0,id,loanAmnt,term,interestRate,installment,employmentLength,homeOwnership,annualIncome,verificationStatus,isDefault,...,n13,n14,issueDateDT,employmentTitle_cnts,employmentTitle_rank,postCode_cnts,postCode_rank,title_cnts,title_rank,sample
0,0,35008.0,5,19.515625,918.0,2.0,2,110000.0,2,1.0,...,0.0,2.0,2587,1121,1121,2075,2075,7006,7006,train
1,1,18000.0,5,18.484375,462.0,5.0,0,46000.0,2,0.0,...,0.0,2.0,1888,125,125,3789,3789,28,28,train
2,2,12000.0,5,16.984375,298.25,8.0,0,74000.0,2,0.0,...,0.0,4.0,3044,1,1,1754,1754,393334,393334,train
3,3,11000.0,3,7.261719,341.0,10.0,1,118000.0,1,0.0,...,0.0,1.0,2983,2,2,551,551,148211,148211,train
4,4,3000.0,3,12.992188,101.0625,,1,29000.0,2,0.0,...,0.0,4.0,3196,51149,51149,1722,1722,4731,4731,train


In [10]:
#训练数据/测试数据

features_to_drop = ['id','isDefault', 'sample']

X_train = data.loc[data['sample'] == 'train', :].drop(features_to_drop, axis=1)
X_test = data.loc[data['sample'] == 'test', :].drop(features_to_drop, axis=1)
y_train = data.loc[data['sample'] == 'train', 'isDefault']

X_train.shape, X_test.shape, y_train.shape

((800000, 45), (200000, 45), (800000,))

### 模型融合

In [30]:
def lgb_model(X_train, y_train, X_test, y_test=None):
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.01,
        'min_child_weight': 0.32,
        'num_leaves': 14,
        'max_depth': 4,
        'feature_fraction': 0.81,
        'bagging_fraction': 0.61,
        'bagging_freq': 9,
        'min_data_in_leaf': 13,
        'min_split_gain': 0.27,
        'reg_alpha': 9.58,
        'reg_lambda': 4.62,
        'seed': 2020,
        'n_jobs':-1,
        'silent': True,
        'verbose': -1,
    }
    
    model = lgb.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=500, early_stopping_rounds=200)
    # 计算在验证集上的得分
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    print('调参后lightgbm单模型在验证集上的AUC：{}'.format(roc_auc))

    # 对测试集进行预测
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    return test_pred

def xgb_model(X_train, y_train, X_test, y_test=None):
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    train_matrix = xgb.DMatrix(X_train_split, label=y_train_split)
    valid_matrix = xgb.DMatrix(X_val, label=y_val)
    test_matrix = xgb.DMatrix(X_test)

    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.04,
        'tree_method': 'exact',
        'seed': 2020,
        'n_jobs': -1,
        "silent": True,
    }

    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
    model = xgb.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
    val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    print('调参后xgboost单模型在验证集上的AUC：{}'.format(roc_auc))

    # 对测试集进行预测
    test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

    return test_pred

def cat_model(X_train, y_train, X_test, y_test=None):
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        task_type='CPU',
        learning_rate=0.1,
        iterations=500,
        random_seed=2020,
        od_type='Iter',
        depth=7
    )
    model.fit(X_train_split, y_train_split, eval_set=(X_val, y_val), verbose=500, cat_features=col)
    val_pred = model.predict(X_val)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    print('调参后catboost单模型在验证集上的AUC：{}'.format(roc_auc))

    # 对测试集进行预测
    test_pred = model.predict(X_test, prediction_type='Probability')[:, -1]
    return test_pred

In [31]:
from heamy.dataset import Dataset
from heamy.estimator import Classifier

model_dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test)
model_xgb = Classifier(dataset=model_dataset, estimator=xgb_model, name='xgb', use_cache=False)
model_lgb = Classifier(dataset=model_dataset, estimator=lgb_model, name='lgb', use_cache=False)
model_cat = Classifier(dataset=model_dataset, estimator=cat_model, name='cat', use_cache=False)

#### 使用Stacking方法进行模型融合

In [34]:
from heamy.pipeline import ModelsPipeline

pipeline = ModelsPipeline(model_cat, model_xgb, model_lgb)
pipeline

<heamy.pipeline.ModelsPipeline at 0x7fd636e2e0a0>

In [22]:
%%time 

# 构建第一层新特征， 其中K默认为5， 表示5折交叉验证， full_test=True, 对全部训练集进行训练得到基学习器，然后对测试集进行预测
stack_ds = pipeline.stack(k=5, seed=2020, full_test=True)

2021-01-16 15:06:49,228 INFO: Calculating xgb's fold #1
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.69992	eval-auc:0.69903
[200]	train-auc:0.73367	eval-auc:0.72818
[400]	train-auc:0.74162	eval-auc:0.73212
[600]	train-auc:0.74716	eval-auc:0.73379
[800]	train-auc:0.75177	eval-auc:0.73481
[1000]	train-auc:0.75591	eval-auc:0.73522
[1200]	train-auc:0.75970	eval-auc:0.73558
[1400]	train-auc:0.76335	eval-auc:0.73580
[1600]	train-auc:0.76683	eval-auc:0.73584
[1755]	train-auc:0.76938	eval-auc:0.73576
调参后xgboost单模型在验证集上的AUC：0.7358874704122116
2021-01-16 15:20:49,873 INFO: Calculating xgb's fold #2
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XG

In [23]:
# 第二层使用逻辑回归进行 stack
from sklearn.linear_model import LogisticRegression

stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'})
test_pred = stacker.predict()
test_pred

array([0.09313586, 0.35738364, 0.71145548, ..., 0.15995634, 0.18775493,
       0.07224607])

In [24]:
# 生成提交格式的DataFrame
df_result = pd.DataFrame({'id': data.loc[data['sample'] == 'test', 'id'].values, 'isDefault': test_pred})
df_result.sort_values(by='id').head(10)

Unnamed: 0,id,isDefault
0,800000,0.093136
1,800001,0.357384
2,800002,0.711455
3,800003,0.294181
4,800004,0.380386
5,800005,0.068497
6,800006,0.264772
7,800007,0.076216
8,800008,0.780822
9,800009,0.075935


In [25]:
df_result.to_csv('data/tc/pred_stacking.csv', index=False)

#### 使用blending方法进行模型融合

In [35]:
# 构建第一层新特征，将训练集切分为8：2
blend_ds = pipeline.blend(proportion=0.2, seed=111)
blender = Classifier(dataset=blend_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'})
test_pred = blender.predict()
test_pred

NameError: name 'col' is not defined

In [27]:
# 生成提交格式的DataFrame
df_result = pd.DataFrame({'id': data.loc[data['sample'] == 'test', 'id'].values, 'isDefault': test_pred})
df_result.sort_values(by='id').head(10)

Unnamed: 0,id,isDefault
0,800000,0.092102
1,800001,0.356927
2,800002,0.657772
3,800003,0.284317
4,800004,0.398475
5,800005,0.067065
6,800006,0.289313
7,800007,0.076159
8,800008,0.748041
9,800009,0.075634


In [28]:
df_result.to_csv('data/tc/pred_blending_3.csv', index=False)