In [21]:
import sys,random
import joblib
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_curve
from sklearn import metrics
import time

In [22]:
def timer (func):
    '''
    描述：
        装饰器函数：计时器
    '''
    def wrapper(*args,**kwargs): 
        start = time.time()
        result = func(*args,**kwargs)
        end = time.time()
        print(func.__name__+'运行时间：',end-start)
        return result
    return wrapper

## 1.加载有标签的训练集（前33465行）

In [4]:
%%time
input_dir = '../../preprocess_data_new/'
data_date = joblib.load(input_dir + 'train_ax_date.lz4')[:33465]
data_nodup = joblib.load(input_dir + 'train_ax_nodup.lz4').drop(columns=['id','loan_dt'])[:33465]
data_label = joblib.load(input_dir + 'train_y_33465.lz4')

In [14]:
# data = pd.concat([data_date,data_nodup],axis=1,ignore_index=True, copy=False)  速度过于缓慢，原因未知！
# 特征拼接
data_nodup.fillna(-1,inplace=True)
x = np.hstack((data_date.values, data_nodup.values))

y = data_label['label'].values


## 2.划分本地的训练集和测试集，测试各种学习器在本数据集上的性能

In [47]:
def SelectModel(model_name):
    '''
    描述：
        选择模型，返回初始化的模型对象（都是sklearn接口的）
    '''
    if model_name == 'GBC':
        from sklearn.ensemble import GradientBoostingClassifier
        model = GradientBoostingClassifier(loss='deviance',
                                           learning_rate =0.1,
                                           n_estimators=200,
                                           subsample=0.9,
                                           max_depth=3,
                                          random_state=2018)
    elif model_name == 'XGB':
        from xgboost import XGBClassifier

        model = XGBClassifier(max_depth=5,
                              learning_rate =0.05, 
                              booster='gbtree',
                              objective='binary:logistic',
                              early_stopping_rounds=100,
                              scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),
                              eval_metric='auc',
                              seed=2018,
                              n_jobs=8,
                              num_boost_round = 200
                             )
    elif model_name == 'RFC':
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=1000,
                                       n_jobs =8,
                                       max_features='sqrt',
                                       class_weight='balanced',
                                       random_state=2018)
    elif model_name == 'LGB':
        # 参数修改记录：
        # 对比1：max_depth: -1 ->5
        # 对比3：max_depth：5->8,min_data_in_leaf:60->20
        # 对比4：max_depth：8->4,min_data_in_leaf:20->40, num_boost_round:200->300, num_leaves = 135->100，num_threads：8->24
        # 对比5：max_depth：4->3,min_data_in_leaf:40->100, num_boost_round:300->500, num_leaves = 100->200，num_threads：8->24，max_bin：200->250
        # 对比5：max_depth：4->6,min_data_in_leaf:100->150, num_boost_round:500->350, num_leaves = 200->300，num_threads：8->24，max_bin：200->250
        from lightgbm import LGBMClassifier
        model = LGBMClassifier(boost='gbdt',
                    num_leaves=300, 
                    scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),
                    max_depth=6,
                    learning_rate=.05,
                    max_bin=250,
                    min_data_in_leaf= 150,
                    objective='binary',
                    metric='auc',
                    num_threads=24,
                    slient=False,
                    num_boost_round = 350)
    else:
        pass
    return model

@timer
def train_model(x_train, x_test, y_train, y_test, model_name):
    '''
    描述:
        训练模型，并打印训练时间和auc分数
    '''
    model = SelectModel(model_name)
    model.fit(x_train,y_train)
    if not os.path.exists('./comparation_model'):
        os.mkdir('comparation_model')
    joblib.dump(model, './comparation_model/%s_model'%model_name)
    pred_test = model.predict(x_test)
    auc = metrics.roc_auc_score(y_test, pred_test)
    print('test-auc(%s):'%model_name,auc)

### 对比0（学习器性能对比）

In [33]:
#1.分割数据
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)

In [36]:
#2.训练模型 
model_names = ['RFC','XGB','LGB','GBC']
for model_name in model_names:
    train_model(x_train, x_test, y_train, y_test, model_name)


test-auc(RFC): 0.5082499734460394
train_model运行时间： 80.6416265964508
test-auc(XGB): 0.7180516756363665
train_model运行时间： 62.886208057403564




test-auc(LGB): 0.5549797310197829
train_model运行时间： 125.06994533538818
test-auc(GBC): 0.5287540295227912
train_model运行时间： 1509.906184911728


### 对比1：
把lightgbm的参数max_depth由-1改为5，发现auc由0.55提升至0.70，运行时间由125s缩短至32s，性能提升！

In [38]:
train_model(x_train, x_test, y_train, y_test, 'LGB')



test-auc(LGB): 0.7035308948000503
train_model运行时间： 31.934845447540283


### 对比2：
更换特征data_nodup 为 data_raw  
auc由0.7035变为0.6998，看起来data_nodup效果更佳，但是也有可能是因为lightgbm设置的参数

In [39]:
data_raw = joblib.load(input_dir + 'train_ax.lz4').drop(columns=['id','loan_dt'])[:33465]
data_raw.fillna(-1,inplace=True)
x = np.hstack((data_date.values, data_raw.values))

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)
train_model(x_train, x_test, y_train, y_test, 'LGB')



test-auc(LGB): 0.6998177388276248
train_model运行时间： 36.55933976173401


### 对比3

In [42]:
# lightgbm 使用新的参数组合
# 1.使用data_raw
x = np.hstack((data_date.values, data_raw.values))
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)
print('1.使用data_raw')
train_model(x_train, x_test, y_train, y_test, 'LGB')

# 2.使用data_nodup
x = np.hstack((data_date.values, data_nodup.values))
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)
print('2.使用data_nodup')
train_model(x_train, x_test, y_train, y_test, 'LGB')

1.使用data_raw




test-auc(LGB): 0.6118573371362515
train_model运行时间： 75.4497618675232
2.使用data_nodup




test-auc(LGB): 0.6001546971838466
train_model运行时间： 68.99368691444397


### 对比4

In [44]:
# lightgbm 使用新的参数组合
# 1.使用data_raw
x = np.hstack((data_date.values, data_raw.values))
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)
print('1.使用data_raw')
train_model(x_train, x_test, y_train, y_test, 'LGB')

# 2.使用data_nodup
x = np.hstack((data_date.values, data_nodup.values))
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)
print('2.使用data_nodup')
train_model(x_train, x_test, y_train, y_test, 'LGB')

1.使用data_raw




test-auc(LGB): 0.7206630866063126
train_model运行时间： 24.63110637664795
2.使用data_nodup




test-auc(LGB): 0.7137290818324644
train_model运行时间： 23.690422773361206


### 对比5

In [46]:
# lightgbm 使用新的参数组合
# 1.使用data_raw
x = np.hstack((data_date.values, data_raw.values))
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)
print('1.使用data_raw')
train_model(x_train, x_test, y_train, y_test, 'LGB')

# 2.使用data_nodup
x = np.hstack((data_date.values, data_nodup.values))
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)
print('2.使用data_nodup')
train_model(x_train, x_test, y_train, y_test, 'LGB')

1.使用data_raw




test-auc(LGB): 0.7325156016736488
train_model运行时间： 29.611547231674194
2.使用data_nodup




test-auc(LGB): 0.7390925126953997
train_model运行时间： 26.568469762802124


### 对比6

In [48]:
# lightgbm 使用新的参数组合
# 1.使用data_raw
x = np.hstack((data_date.values, data_raw.values))
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)
print('1.使用data_raw')
train_model(x_train, x_test, y_train, y_test, 'LGB')

# 2.使用data_nodup
x = np.hstack((data_date.values, data_nodup.values))
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)
print('2.使用data_nodup')
train_model(x_train, x_test, y_train, y_test, 'LGB')

1.使用data_raw




test-auc(LGB): 0.669891099438229
train_model运行时间： 42.93098282814026
2.使用data_nodup




test-auc(LGB): 0.6643234018262608
train_model运行时间： 39.20240902900696


### 对比总结
1. 各个学习器的性能（对比1、2）  
test-auc(RFC): 0.5082499734460394, train_model运行时间： 80.6416265964508  
test-auc(XGB): 0.7180516756363665, train_model运行时间： 62.886208057403564  
test-auc(LGB): 0.5549797310197829, train_model运行时间： 125.06994533538818  
test-auc(GBC): 0.5287540295227912, train_model运行时间： 1509.906184911728
2. data_nodup降维的效果验证  
通过对比2、3、4、5，我们发现：  
降维之后的数据集data_nodup和降维之前的数据集data_raw在test-auc分数基本相同，  
可见data_nodup在降低维度的同时又保留data_raw绝大部分的信息，这是非常好的！

# 4.历史的一些记录，已经没有代码了

###  4.1 (代码整理前的记录)测试本地集auc分数
保留原始数据的记录：  
rfc AUC: 0.805099223842394  
gbc AUC: 0.8293387542510773   
xgb AUC: 0.8313812914152184  
lgb AUC: 0.8430383986560764

 
linearn-svm AUC: 0.5075515879920058  线性不可分    
logistic regreesion AUC: 0.551876338694301 不适用于此类数据  


### 4.2 (代码整理前的记录)线上valid-auc分数

1.xgboost：
raw 去除nan列 + 统计null AUC: 0.82751113123698  
nodup + null + tag AUC: 0.82803008109167  
nodup + null + tag（rank融合）AUC:0.8279914450872  
nodup + null + tag (fillna(-1)) AUC:0.82979480823375  

2.GradientBoostingClassifier AUC:0.81958272831703  

3.RFC AUC:0.78512734928805   

4.20个lgb AUC:0.8079  