In [6]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split 
import numpy as np
import time
import os

### 导入数据

In [5]:
# 计时器
def timer (func):
    def wrapper(*args,**kwargs): 
        start = time.time()
        result = func(*args,**kwargs)
        end = time.time()
        print(func.__name__+'运行时间：',end-start)
        return result
    return wrapper

@timer
def load_train():
    # 导入数据
    y_train_df = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])
    x_train_df = joblib.load('../semi_super/x_train.lz4')
    x_train_df = x_train_df.fillna(-1)

    x_train_arr = x_train_df.values
    y_train_arr = y_train_df.values.ravel()
    return x_train_arr,y_train_arr

@timer
def load_unlabel():
    prob_unlabel_df = pd.read_csv('../predict_unlabel/preds/unlabel_pred.csv',usecols=['prob'])
    x_unlabel_df = joblib.load('../semi_super/x_unlabel.lz4')
    unlabel_df = pd.concat([x_unlabel_df,prob_unlabel_df],axis=1)
    unlabel_df = unlabel_df.sort_values(by=['prob'],ascending=False)
    unlabel_df = unlabel_df.reset_index(drop=True)
    unlabel_df = unlabel_df.fillna(-1)


    y_unlabel_df = unlabel_df[['prob']].copy()
    x_unlabel_df = unlabel_df.drop(columns=['prob'])

    y_unlabel_df.loc[:int(0.1*len(y_unlabel_df)),'prob'] = 1
    y_unlabel_df.loc[int(0.1*len(y_unlabel_df)):,'prob'] = 0
    # 按prob 由高到低排列
    x_unlabel_arr = x_unlabel_df.values
    y_unlabel_arr = y_unlabel_df.values.ravel()


    # 颠倒一下正负样本的先后次序
    x_unlabel_0 = x_unlabel_arr[int(0.1*len(x_unlabel_df)):]
    x_unlabel_1 = x_unlabel_arr[:int(0.1*len(x_unlabel_df))]
    x_unlabel_arr = np.vstack((x_unlabel_0,x_unlabel_1))

    y_unlabel_0 = y_unlabel_arr[int(0.1*len(y_unlabel_df)):]
    y_unlabel_1 = y_unlabel_arr[:int(0.1*len(y_unlabel_df))]
    y_unlabel_arr = np.hstack((y_unlabel_0,y_unlabel_1))
    
    return x_unlabel_arr,y_unlabel_arr,-int(0.1*len(x_unlabel_df))

### 选择合适的正负比例及数量的unlabel数据加入train集

In [6]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier

# @timer
def gen_semiset(x_unlabel_arr,y_unlabel_arr,start=-6653,num=10,ratio=0.1):
    num_0 = num-int(num*ratio)
    num_1 = int(num*ratio)
    tmp = x_unlabel_arr[start-num_0:start+num_1]
    y_tmp = y_unlabel_arr[start-num_0:start+num_1]
    return tmp,y_tmp

# @timer
def combine_data(x_train_arr,y_train_arr,tmp,y_tmp):


    # 容易出错vstack hstack
    x_train_tmp = np.vstack((x_train_arr,tmp))
    y_train_tmp = np.hstack((y_train_arr,y_tmp))
    return x_train_tmp,y_train_tmp

# @timer
def gen_local_train(x_unlabel_arr,y_unlabel_arr,x_train_arr,y_train_arr,
                    start=-6653,num=10,ratio=0.1):
    tmp,y_tmp = gen_semiset(x_unlabel_arr,y_unlabel_arr,start,num,ratio)
    return combine_data(x_train_arr,y_train_arr,tmp,y_tmp)


# @timer
def train(x_train,y_train,x_test,y_test,model_name='rfc'):
    #2.建立模型 
#     model_names = ['rfc','gbc','xgb','lgb','dtc']
    if model_name == 'rfc':
        # RandomForestClassifier
        
        rfc  =RandomForestClassifier(n_estimators=10,
                                     n_jobs =32,
                                     max_features='sqrt',
                                     class_weight='balanced',
#                                      verbose =1,
                                     random_state=2018)
    elif model_name == 'gbc':
        
        gbc = GradientBoostingClassifier(loss='deviance',learning_rate =0.1,n_estimators=300,subsample=0.9,max_depth=3,verbose=1,random_state=2018)
    elif model_name == 'xgb':
        
        params={
        'booster':'gbtree',
        'objective': 'binary:logistic',
        'early_stopping_rounds':100,
        'scale_pos_weight': float(len(data_label)-np.sum(data_label.values))/float(np.sum(data_label.values)),  # 负例样本除以正例样本
        'eval_metric': 'auc',
        'gamma':1,
        'max_depth':6,
        'lambda':1,
        'subsample':0.9,
        'colsample_bytree':0.9,
        'min_child_weight':1, 
        'eta': 0.04,
        'seed':2010,
        'nthread':32
            }
    elif model_name == 'lgb':
        
        parameters = {
            'boost':'gbdt',
            'num_leaves':135, 
            'scale_pos_weight':float(len(y_train)-np.sum(y_train.ravel()))/float(np.sum(y_train.ravel())),
            'max_depth':-1,
            'learning_rate':.05,
            'max_bin':200,
            'min_data_in_leaf' : 60,
            'objective':'binary',
            'metric':'auc',
            'verbose':1,
            'num_threads':36
        }
    elif model_name == 'dtc':
        dtc = DecisionTreeClassifier(class_weight='balanced')
    else:
        pass

    #3.训练模型
#     print('开始训练！')
    if model_name == 'rfc':
        rfc.fit(x_train,y_train)
    #     joblib.dump(rfc,'./model/rfc_model')
    elif model_name == 'gbc':
        gbc.fit(x_train,y_train)
    #     joblib.dump(gbc,'./model/rgbc_model')
    elif model_name =='xgb':
        dtrain = xgb.DMatrix(x_train,y_train)
        dvalid = xgb.DMatrix(x_test,y_test)
        dtest = xgb.DMatrix(x_test)
        watchlist  = [(dvalid,'valid')]
        xgb_model = xgb.train(params,dtrain,num_boost_round=300,evals=watchlist)
    #     xgb_model.save_model('./model/xgb_model')
    elif model_name == 'lgb':
        lgb_train = lgb.Dataset(x_train, y_train.ravel())
        lgb_model = lgb.train(parameters,lgb_train,num_boost_round=50)
    #     lgb_model.save_model('./model/lgb_model')
    elif model_name == 'dtc':
        dtc.fit(x_train,y_train)
    else:
        pass
    # lg.fit(x_train,y_train.ravel())
    # lsvm.fit(x_train,y_train)
    # history = model.fit(x_train,y_train,epochs=2,batch_size=1024,class_weight = 'auto',validation_data=(x_test,y_test))
     #4.预测结果
#     print('开始预测！')
    if model_name=='rfc':
        y_pre = rfc.predict_proba(x_test)[:,1]
    elif model_name=='gbc':
        y_pre = gbc.predict_proba(x_test)[:,1]
    elif model_name=='xgb':
        dtest = xgb.DMatrix(x_test)
        y_pre = xgb_model.predict(dtest).ravel()
    elif model_name=='lgb':
        y_pre = lgb_model.predict(x_test)
    elif model_name == 'dtc':
        y_pre = dtc.predict(x_test)[:,1]
    else:
        pass
    
    auc = metrics.roc_auc_score(y_test, y_pre)
#     print('AUC:',auc)
    return auc
        

In [7]:
# 导入所有训练数据
x_train_arr,y_train_arr = load_train()
x_unlabel_arr,y_unlabel_arr,start = load_unlabel()
START = start

# 生成有效训练数据
STEP = 6
NUM = 15000
x_train_new,y_train_new = gen_local_train(x_unlabel_arr,y_unlabel_arr,x_train_arr,y_train_arr,start=START,num=NUM,ratio=1/STEP)

load_train运行时间： 10.11999797821045
load_unlabel运行时间： 34.36000037193298


In [9]:
x_train_new.shape,y_train_new.shape

((48465, 6704), (48465,))

### bagging train

In [17]:
def SelectModel(model_name):
    if model_name == 'XGB':
        from xgboost import XGBClassifier

        model = XGBClassifier(max_depth=6,
                              learning_rate =0.04, 
                              booster='gbtree',
                              objective='binary:logistic',
                              early_stopping_rounds=100,
                              scale_pos_weight=float(len(y_train_arr)-np.sum(y_train_arr))/float(np.sum(y_train_arr)),
                              eval_metric='auc',
                              gamma=1,
                              reg_lambda=1,
                              subsample=0.9,
                              min_child_weight=1,
                              seed=2018,
                              silent=False,
                              n_jobs=40,
                              num_boost_round =300
                             )
    elif model_name == 'RFC':
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=1500,
                                       n_jobs =40,
                                       max_features='sqrt',
                                       class_weight='balanced',
#                                        verbose =1,
                                       random_state=2018)
    elif model_name == 'LGB':
        from lightgbm import LGBMClassifier
        model = LGBMClassifier(boost='gbdt',
                    num_leaves=135, 
                    scale_pos_weight=float(len(y_train_arr)-np.sum(y_train_arr))/float(np.sum(y_train_arr)),
                    max_depth=-1,
                    learning_rate=.04,
                    max_bin=200,
                    min_data_in_leaf= 60,
                    objective='binary',
                    metric='auc',
                    num_threads=40,
                    slient=False,
                    num_boost_round =300)
    else:
        pass
    return model

In [18]:
import os
# 训练多个模型
if not os.path.exists('./model'):
    os.mkdir('model')
model_list = ['XGB','RFC','LGB']
for model in model_list:
    print(model)
    clf = SelectModel(model)
    clf.fit(x_train_arr,y_train_arr)
    joblib.dump(clf,'./model/{}'.format(model))

XGB
RFC
LGB




### 预测结果

In [13]:
# 导入valid数据
tag_files = ['predict_tag','predict_tag_new']
TAG_FILE = tag_files[1]
def load_valid(tag_file): 
    valid_date = pd.read_csv('../../preprocess_data/valid_date.csv').drop(columns=['id','loan_hour'])
    valid_raw = joblib.load('../../preprocess_data_new/valid_nodup.lz4').drop(columns=['id','loan_dt'])
    valid_tag = pd.read_csv('../{}/valid_tag.csv'.format(tag_file),usecols=['tag'])
    valid_null = pd.read_csv('../../preprocess_data_new/valid_row_null.csv').drop(columns=['id'])
    maj_cnt_test = joblib.load('../../preprocess_data_discrete/maj_cnt_test.lz4')

    valid = pd.concat([valid_date,valid_raw,valid_tag,valid_null,maj_cnt_test],axis=1)
    valid = valid.fillna(-1)
    x_test_arr = valid.values
    return x_test_arr

x_test_arr = load_valid(TAG_FILE)

In [14]:
# 参数
valid_save_path = './{}_valid_preds'.format(TAG_FILE)
model_list = ['XGB','RFC','LGB']
valid_id = pd.read_csv('../../preprocess_data_new/valid_date.csv',usecols=['id']).values.ravel()

def predict_score(save_path,model_list,id):
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    # model_list = 
    for model in model_list:
        pred = pd.DataFrame()
        pred['id'] = id
        clf = joblib.load('./model/{}'.format(model))
        pred['score'] = clf.predict_proba(x_test_arr)[:,1]
        pred.to_csv(os.path.join(save_path,'{}.csv'.format(model)),index=False)
        
predict_score(valid_save_path,model_list,valid_id)

In [15]:
valid_pred_dir = valid_save_path

def combine_score_to_prob(pred_dir,model_list,test_id):
    pred_path = os.path.join(pred_dir,'{}.csv'.format(model_list[0]))
    score = pd.read_csv(pred_path)['score']
    for model in model_list[1:]:
        pred_path = os.path.join(pred_dir,'{}.csv'.format(model))
        score += pd.read_csv(pred_path).score
    score = score/len(model_list)
    pred = pd.DataFrame()
    pred['id'] = test_id
    pred['prob'] = score.values
    pred.to_csv('./{}_bagging_pred.csv'.format(TAG_FILE),index=False)

combine_score_to_prob(valid_pred_dir,model_list,valid_id)

predict_tag_bagging_pred  AUC:0.81929804644399  
predict_tag_new_bagging_pred AUC:0.8185711623187