In [4]:
import pandas as pd
import numpy as np
import joblib

In [2]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split


### joblib保存

In [3]:
%%time
# 导入训练数据集
data_date = pd.read_csv('../../preprocess_data/train_x_date.csv').drop(columns=['id','loan_hour'])
data_null = pd.read_csv('../../preprocess_data_new/train_ax_row_null.csv',nrows=33465).drop(columns=['id'])
major_df = joblib.load('../../preprocess_data_discrete/major_df.lz4').head(33465)
# 以下3个数据代替原先的data_raw
cat_df = joblib.load('../../preprocess_data_discrete/cat_df.lz4').head(33465)
discrete_df = joblib.load('../../preprocess_data_discrete/discrete_df.lz4').head(33465)
scale_df = joblib.load('../../preprocess_data_discrete/scale_df.lz4').head(33465)
data_tag = pd.read_csv('../../preprocess_data/train_x_33465.csv',usecols=['tag'])
maj_cnt_df = joblib.load('../../preprocess_data_discrete/maj_cnt_df.lz4').head(33465)

data = pd.concat([data_date,data_null,major_df,cat_df,discrete_df,scale_df,data_tag,maj_cnt_df],axis=1)
data_label = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])
x = data.fillna(-1).values
y = data_label.values.ravel()

CPU times: user 57 s, sys: 54.5 s, total: 1min 51s
Wall time: 1min 58s


In [23]:
%%time
joblib.dump(data,'./train_data.lz4',compress='lz4')

['./train_data.lz4']

In [12]:
%%time
# 导入测试数据
valid_date = pd.read_csv('../../preprocess_data/valid_date.csv').drop(columns=['id','loan_hour'])
valid_null = pd.read_csv('../../preprocess_data_new/valid_row_null.csv').drop(columns=['id'])
major_test = joblib.load('../../preprocess_data_discrete/major_test.lz4')
cat_test = joblib.load('../../preprocess_data_discrete/cat_test.lz4')
discrete_test = joblib.load('../../preprocess_data_discrete/discrete_test.lz4')
scale_test = joblib.load('../../preprocess_data_discrete/scale_test.lz4')
valid_tag = pd.read_csv('../predict_tag/tag.csv',usecols=['tag'])
maj_cnt_test = joblib.load('../../preprocess_data_discrete/maj_cnt_test.lz4')

CPU times: user 8.92 s, sys: 860 ms, total: 9.78 s
Wall time: 9.86 s


In [13]:
valid = pd.concat([valid_date,valid_null,major_test,cat_test,discrete_test,scale_test,valid_tag,maj_cnt_test],axis=1)

In [14]:
%%time
joblib.dump(valid,'./valid_data.lz4',compress='lz4')

CPU times: user 4.66 s, sys: 532 ms, total: 5.19 s
Wall time: 9.37 s


['./valid_data.lz4']

### 加载数据

In [3]:
%%time
# train
data = joblib.load('./train_data.lz4')
data_label = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])
x = data.fillna(-1).values
y = data_label.values.ravel()

CPU times: user 9.3 s, sys: 11.1 s, total: 20.4 s
Wall time: 20.4 s


In [18]:
%%time
# test
valid = joblib.load('./valid_data.lz4')
x_test = valid.fillna(-1).values

CPU times: user 844 ms, sys: 776 ms, total: 1.62 s
Wall time: 1.62 s


In [50]:
# 线上测试
x_train = x
y_train = y

In [51]:
def SelectModel(model_name):
    if model_name == 'GBC':
        from sklearn.ensemble import GradientBoostingClassifier
        model = GradientBoostingClassifier(loss='deviance',
                                           learning_rate =0.1,
                                           n_estimators=300,
                                           subsample=0.9,
                                           max_depth=3,
#                                            verbose=1,
                                          random_state=2018)
    elif model_name == 'XGB':
        from xgboost import XGBClassifier

        model = XGBClassifier(max_depth=6,
                              learning_rate =0.04, 
                              booster='gbtree',
                              objective='binary:logistic',
                              early_stopping_rounds=100,
                              scale_pos_weight=float(len(y_train)-np.sum(y_train))/float(np.sum(y_train)),
                              eval_metric='auc',
                              gamma=1,
                              reg_lambda=1,
                              subsample=0.9,
                              min_child_weight=1,
                              seed=2018,
                              silent=False,
                              n_jobs=24,
                              num_boost_round =400
                             )
    elif model_name == 'RFC':
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=1500,
                                       n_jobs =36,
                                       max_features='sqrt',
                                       class_weight='balanced',
                                       verbose =1,
                                       random_state=2018)
    elif model_name == 'LGB':
        from lightgbm import LGBMClassifier
        model = LGBMClassifier(boost='gbdt',
                    num_leaves=135, 
                    scale_pos_weight=float(len(y_train)-np.sum(y_train.ravel()))/float(np.sum(y_train.ravel())),
                    max_depth=-1,
                    learning_rate=.04,
                    max_bin=200,
                    min_data_in_leaf= 60,
                    objective='binary',
                    metric='auc',
                    num_threads=32,
                    slient=False,
                    num_boost_round =400)
    else:
        pass
    return model


In [None]:
%%time
rfc_model = SelectModel('RFC')
rfc_model.fit(x_train[:,:10800],y_train)
joblib.dump(rfc_model,'rfc_model')

lgb_model = SelectModel('LGB')
lgb_model.fit(x_train,y_train)
joblib.dump(lgb_model,'lgb_model')

xgb_model = SelectModel('XGB')
xgb_model.fit(x_train,y_train)
joblib.dump(xgb_model,'xgb_model')

gbc_model = SelectModel('GBC')
gbc_model.fit(x_train,y_train)
joblib.dump(gbc_model,'gbc_model')



[Parallel(n_jobs=36)]: Using backend ThreadingBackend with 36 concurrent workers.
[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    5.4s
[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:   14.2s
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:   26.2s


### voting

In [26]:
test_id = pd.read_csv('../../preprocess_data_new/valid_date.csv',usecols=['id']).values.ravel()

In [1]:
import joblib
rfc_model = joblib.load('rfc_model')
lgb_model = joblib.load('lgb_model')
xgb_model = joblib.load('xgb_model')
gbc_model = joblib.load('gbc_model')

In [29]:
rfc_pred = rfc_model.predict_proba(x_test[:,:10800])[:,1]
lgb_pred = lgb_model.predict_proba(x_test)[:,1]
xgb_pred = xgb_model.predict_proba(x_test)[:,1]
gbc_pred = gbc_model.predict_proba(x_test)[:,1]

[Parallel(n_jobs=36)]: Using backend ThreadingBackend with 36 concurrent workers.
[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    0.2s
[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:    0.3s
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:    0.6s
[Parallel(n_jobs=36)]: Done 1178 tasks      | elapsed:    0.9s
[Parallel(n_jobs=36)]: Done 1500 out of 1500 | elapsed:    1.1s finished


In [35]:
pred = pd.DataFrame()
pred['id'] = test_id
pred['prob'] = (rfc_pred + lgb_pred + xgb_pred + gbc_pred)/4
import os
if not os.path.exists('./pred'):
    os.mkdir('pred')
pred.to_csv('./pred/voting_pred.txt',index=False)

voting_pred AUC:0.8121  