In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split


### 特征组合

In [3]:
%%time
# 导入训练数据集
data_date = pd.read_csv('../../preprocess_data/train_x_date.csv').drop(columns=['id','loan_hour'])
data_raw = joblib.load('../../preprocess_data_new/train_ax_nodup.lz4').head(33465).drop(columns=['loan_dt','id','tag'])
data_null = pd.read_csv('../../preprocess_data_new/train_ax_row_null.csv',nrows=33465).drop(columns=['id'])
data_tag = pd.read_csv('../../preprocess_data/train_x_33465.csv',usecols=['tag'])
week_df = joblib.load('../../preprocess_data_discrete/week_df.lz4').head(33465)

# maj_cnt_df = joblib.load('../../preprocess_data_discrete/maj_cnt_df.lz4').head(33465)
# cat_df = joblib.load('../../preprocess_data_discrete/cat_df.lz4').head(33465)
# discrete_df = joblib.load('../../preprocess_data_discrete/discrete_df.lz4').head(33465)
# scale_df = joblib.load('../../preprocess_data_discrete/scale_df.lz4').head(33465)
# major_df = joblib.load('../../preprocess_data_discrete/major_df.lz4').head(33465)
# day_val_df = joblib.load('../../preprocess_data_discrete/day_val_df.lz4').head(33465)

data_label = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])

CPU times: user 21.4 s, sys: 4.26 s, total: 25.7 s
Wall time: 27.1 s


In [20]:
# Preview the first rows of the weekday-derived features.
# week_df is loaded above but is not part of the feature concat in the next cell.
week_df.head()

Unnamed: 0,loan_week_day,week_day_val,week_day_state
0,2,0.066868,1
1,3,0.063787,0
2,6,0.075942,1
3,3,0.063787,0
4,6,0.075942,1


In [46]:
# Assemble the feature table by placing the feature groups side by side.
feature_frames = [data_date,data_raw,data_null,data_tag]
data = pd.concat(feature_frames,axis=1,copy=False)

# A column-wise concat aligns rows on index labels: if any frame is shorter or
# carries different labels, the result silently gains NaN rows, which the later
# fillna(-1) would hide. Fail loudly instead.
frame_lengths = [len(frame) for frame in feature_frames]
if any(n_rows != len(data) for n_rows in frame_lengths):
    raise ValueError(
        'Feature frames are not row-aligned: input lengths %s, concatenated length %d'
        % (frame_lengths, len(data))
    )

In [47]:
# Encode missing feature values as -1 and expose the table as a plain ndarray.
x = data.fillna(value=-1).values
# Flatten the single-column label frame into a 1-D target vector.
y = np.ravel(data_label.values)
x.shape

(33465, 6702)

## 数据线上和线下

In [48]:
%%time
# 本地验证
# 1.根据有标签数据，生成一个和线上测试集分布相近的本地验证集
# tag=0 取600个左右
x_train_0, x_test_0, y_train_0, y_test_0 = train_test_split(x[:30465],y[:30465],
                                                          random_state=3096,test_size=0.02,stratify=y[:30465])

# tag=1 取1500个左右 
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(x[30465:],y[30465:],
                                                            random_state=2018,test_size=0.5,stratify=y[30465:])

x_train = np.vstack((x_train_0,x_train_1))
x_test = np.vstack((x_test_0,x_test_1))
y_train = np.hstack((y_train_0,y_train_1))
y_test = np.hstack((y_test_0,y_test_1))
# 2. 过采样
# from imblearn.over_sampling import BorderlineSMOTE
# x_resampled, y_resampled = BorderlineSMOTE().fit_resample(x_train, y_train)



CPU times: user 15.7 s, sys: 1.54 s, total: 17.2 s
Wall time: 17.2 s


In [49]:
# Show the shapes of the training and hold-out feature matrices.
x_train.shape,x_test.shape

((31355, 6702), (2110, 6702))

In [50]:
# Show the shapes of the training and hold-out label vectors.
y_train.shape,y_test.shape

((31355,), (2110,))

## 本地

In [59]:
%%time
#2.建立模型 
model_names = ['rfc','gbc','xgb','lgb']
model_name = model_names[0]
if model_name == 'rfc':
    # RandomForestClassifier
    from sklearn.ensemble import RandomForestClassifier
    rfc  =RandomForestClassifier(n_estimators=1000,oob_score=True,max_depth=100,
                                 n_jobs =40,max_features='sqrt',class_weight='balanced',verbose =1,random_state=2018)
elif model_name == 'gbc':
    from sklearn.ensemble import GradientBoostingClassifier
    gbc = GradientBoostingClassifier(loss='deviance',learning_rate =0.1,n_estimators=300,subsample=0.9,max_depth=3,verbose=1,random_state=2018)
elif model_name == 'xgb':
    import xgboost as xgb
    params={
    'booster':'gbtree',
    'objective': 'binary:logistic',
    'early_stopping_rounds':100,
    'scale_pos_weight': float(len(data_label)-np.sum(data_label.values))/float(np.sum(data_label.values)),  # 负例样本除以正例样本
    'eval_metric': 'auc',
    'gamma':1,
    'max_depth':6,
    'lambda':1,
    'subsample':0.9,
    'colsample_bytree':0.9,
    'min_child_weight':1, 
    'eta': 0.04,
    'seed':2010,
    'nthread':32
        }
elif model_name == 'lgb':
    import lightgbm as lgb
    parameters = {
        'boost':'gbdt',
        'num_leaves':135, 
        'scale_pos_weight':float(len(y_train)-np.sum(y_train.ravel()))/float(np.sum(y_train.ravel())),
        'max_depth':-1,
        'learning_rate':.05,
        'max_bin':200,
        'min_data_in_leaf' : 60,
        'objective':'binary',
        'metric':'auc',
        'verbose':1,
        'num_threads':36
    }
       
else:
    pass

#3.训练模型
print('开始训练！')
import joblib
if model_name == 'rfc':
    rfc.fit(x_train,y_train)
#     joblib.dump(rfc,'./model/rfc_model')
elif model_name == 'gbc':
    gbc.fit(x_train,y_train)
#     joblib.dump(gbc,'./model/rgbc_model')
elif model_name =='xgb':
    dtrain = xgb.DMatrix(x_train,y_train)
    dvalid = xgb.DMatrix(x_test,y_test)
    dtest = xgb.DMatrix(x_test)
    watchlist  = [(dvalid,'valid')]
    xgb_model = xgb.train(params,dtrain,num_boost_round=300,evals=watchlist)
#     xgb_model.save_model('./model/xgb_model')
elif model_name == 'lgb':
    lgb_train = lgb.Dataset(x_train, y_train.ravel())
    lgb_model = lgb.train(parameters,lgb_train,num_boost_round=300)
#     lgb_model.save_model('./model/lgb_model')
else:
    pass
# lg.fit(x_train,y_train.ravel())
# lsvm.fit(x_train,y_train)
# history = model.fit(x_train,y_train,epochs=2,batch_size=1024,class_weight = 'auto',validation_data=(x_test,y_test))

开始训练！


[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 120 tasks      | elapsed:    8.6s
[Parallel(n_jobs=40)]: Done 370 tasks      | elapsed:   23.5s
[Parallel(n_jobs=40)]: Done 720 tasks      | elapsed:   44.7s
[Parallel(n_jobs=40)]: Done 1000 out of 1000 | elapsed:  1.0min finished


CPU times: user 38min 32s, sys: 2min 47s, total: 41min 19s
Wall time: 2min 50s


In [60]:
# 4. Predict on the local hold-out split and score it with ROC AUC.
from sklearn import metrics

print('开始预测！')
if model_name=='rfc':
    # Second column of predict_proba is used as the positive-class score.
    y_pre = rfc.predict_proba(x_test)[:,1]
elif model_name=='gbc':
    y_pre = gbc.predict_proba(x_test)[:,1]
elif model_name=='xgb':
    dtest = xgb.DMatrix(x_test)
    y_pre = xgb_model.predict(dtest).ravel()
elif model_name=='lgb':
    y_pre = lgb_model.predict(x_test)
else:
    # Falling through would either raise NameError below or, worse, score a
    # stale y_pre left in the kernel by an earlier run.
    raise ValueError('Unknown model_name: %r' % (model_name,))

# Earlier experiments (disabled):
# y_pre = lsvm.decision_function(X_test)
# y_pre = lg.predict(X_test)
auc = metrics.roc_auc_score(y_test, y_pre)
print('AUC:',auc)

开始预测！


[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 120 tasks      | elapsed:    0.1s
[Parallel(n_jobs=40)]: Done 370 tasks      | elapsed:    0.2s
[Parallel(n_jobs=40)]: Done 720 tasks      | elapsed:    0.4s


AUC: 0.7800153853269748


[Parallel(n_jobs=40)]: Done 1000 out of 1000 | elapsed:    0.5s finished


In [63]:
# Out-of-bag score of the forest fitted above (available because it was built
# with oob_score=True); only meaningful when model_name == 'rfc'.
# NOTE(review): presumably this is OOB accuracy rather than AUC, so it is not
# directly comparable with the AUC printed by the previous cell -- confirm.
rfc.oob_score_

0.9382554616488599

## 未作stratify
- nodup + null + tag + fillna(-1): AUC 0.82785
- nodup + null + tag + fillna(-1) + maj_cnt_df: AUC (not recorded)
- nodup + null + tag + fillna(-1) + major_df + maj_cnt_df: AUC 0.832029 (200)
- nodup + null + tag + fillna(-1) + major_df: AUC 0.829731
- nodup + null + tag + fillna(-1) + major_df + maj_cnt_df + discrete_df: AUC 0.8301
- data_raw 被替换: AUC 0.821537
- nodup + null + tag + fillna(-1) + major_df + maj_cnt_df + week_df: AUC 0.826666
- nodup + null + tag + fillna(-1) + maj_cnt_df + week_df: AUC 0.828063
- nodup + null + tag + fillna(-1) + maj_cnt_df + maj_cnt_df + day_val_df: AUC 0.828095 (sic: maj_cnt_df is listed twice; verify which feature group was meant)

## stratify
#### xgboost  
- nodup + null + tag + fillna(-1): AUC 0.814429 (200)
- nodup + null + tag + fillna(-1) + week_df: AUC 0.816316 (232)
- nodup + null + tag + fillna(-1) + week_df + maj_cnt_df: AUC 0.815836 (168)
#### rfc  
