In [1]:
#coding:utf-8
#稳定函数模块
from tqdm import tqdm
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import time
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.preprocessing import Binarizer
from sklearn.metrics import roc_auc_score,classification_report,f1_score,accuracy_score,recall_score



def binarize(result):
    """Threshold scores at 0.5, in place.

    Each element ``>= 0.5`` is overwritten with 1, every other element with 0.
    Works on a list or a 1-D NumPy array.

    Returns:
        The very object that was passed in, after it has been overwritten.
    """
    for pos in range(len(result)):
        result[pos] = 1 if result[pos] >= 0.5 else 0
    return result
#样例
#result = binarize(result)

def cal_f1(result_valid,lb_valid):
    """Print precision, recall and F1 of binary predictions against binary labels.

    Args:
        result_valid: Predicted labels (0/1); list, NumPy array or pandas Series.
        lb_valid: Ground-truth labels (0/1). Its length decides how many
            leading predictions are compared.

    Returns:
        Always ``True``. The three metrics are only printed, not returned.

    Raises:
        IndexError: If ``result_valid`` has fewer elements than ``lb_valid``.

    A metric whose denominator is zero (no predicted positives, no actual
    positives, or precision + recall == 0) is reported as 0.0 rather than
    raising ZeroDivisionError.
    """
    # Work on plain arrays so positional comparison also holds for pandas
    # objects whose index does not start at 0.
    truth = np.asarray(lb_valid)
    pred = np.asarray(result_valid)[:len(truth)]
    if len(pred) < len(truth):
        raise IndexError('result_valid has fewer elements than lb_valid')

    pred_pos = pred == 1
    true_pos = truth == 1
    TP = int(np.count_nonzero(pred_pos & true_pos))
    FP = int(np.count_nonzero(pred_pos & (truth == 0)))
    FN = int(np.count_nonzero((pred == 0) & true_pos))

    precision = TP/(TP+FP) if TP+FP else 0.0
    recall = TP/(TP+FN) if TP+FN else 0.0
    F1 = 2*precision*recall/(precision + recall) if precision + recall else 0.0
    print('precision:',precision)
    print('recall:',recall)
    print('F1:',F1)
    return True
#样例
#cal_f1(result_valid,lb_valid)

def predict(bst,sid,test,outputfile,binarize = False):
    """Predict on the test features and write a ``sid,label`` CSV file.

    Args:
        bst: Trained model exposing ``predict(test)``.
        sid: pandas Series/DataFrame of sample ids; it is joined to the
            predictions by index.
        test: Feature matrix handed to ``bst.predict``.
        outputfile: Path of the CSV file to write.
        binarize: When true, predictions ``>= 0.5`` are written as 1 and all
            others as 0; otherwise the raw scores are written.

    Returns:
        Always ``True`` once the file has been written.
    """
    result = bst.predict(test)
    if binarize:
        # The `binarize` flag hides the module-level binarize() helper inside
        # this function, so the same 0.5 cut-off is applied inline instead of
        # trying to call the flag.
        result = [1 if score >= 0.5 else 0 for score in result]
    # Put ids and predictions side by side and write the final file
    result = pd.DataFrame(result)
    final = pd.concat([sid,result], axis=1, join='outer')
    final.columns = ['sid','label']
    final.to_csv(outputfile,index = None)
    return True
#使用样例
#predict(bst,sid,test,'lgb.csv',binarize = False)

def picture_lgb_loss(metric_dict):
    """Plot the validation AUC and log-loss curves recorded during LightGBM training.

    Args:
        metric_dict: Evaluation history; it must contain the lists
            ``metric_dict['valid_0']['auc']`` and
            ``metric_dict['valid_0']['binary_logloss']``.

    Both curves are drawn on one figure and the last value of each curve is
    written next to it as a "final:<value>" label.
    """
    curve_keys = ('auc', 'binary_logloss')

    plt.figure(figsize=(8, 6))
    for key in curve_keys:
        plt.plot(metric_dict['valid_0'][key])
    plt.title('lgb-naive log')
    plt.ylabel('Loss&AUC')
    plt.xlabel('Epoch')
    plt.legend(['val_auc','valid_loss'], loc='right')

    # The number of recorded rounds is taken from the AUC curve; the labels
    # are anchored at x == that count, with the last recorded value as y.
    final = len(metric_dict['valid_0']['auc'])
    for key in curve_keys:
        last_value = metric_dict['valid_0'][key][final-1]
        plt.text(final, last_value, "final:"+str(last_value),
                 ha='center', va='bottom', fontsize=10)
    plt.show()
#样例
#picture_lgb_loss(metric_dict)    
    
def create_train_valid_test(train_x,train_y,untrain_size=0.1,random=0):
    """Split the data into a training part, a validation part and a local test part.

    Args:
        train_x: All training features (a DataFrame according to the original notes).
        train_y: All training labels (a Series according to the original notes).
        untrain_size: Share of the rows kept out of training; that share is
            then divided 50/50 between validation and local test.
        random: Seed passed as ``random_state`` to both splits, so the same
            seed reproduces the same partition.

    Returns:
        ``(train_X, valid_X, test_X, train_Y, valid_Y, test_Y)``.
    """
    # First cut: rows used for training vs. rows held out.
    train_X, held_X, train_Y, held_Y = train_test_split(
        train_x, train_y, test_size=untrain_size, random_state=random)
    # Second cut: the held-out rows are halved into validation and local test.
    valid_X, test_X, valid_Y, test_Y = train_test_split(
        held_X, held_Y, test_size=0.5, random_state=random)
    return train_X, valid_X, test_X, train_Y, valid_Y, test_Y
#样例
#train_X,valid_X,local_test_X, train_Y, valid_Y,local_test_Y = create_train_valid_test(train_x,train_y)

def create_lgb_dataset(data,categorical_feature=None,free_raw_data=True):
    """Wrap a train/validation split into LightGBM ``Dataset`` objects.

    Args:
        data: Sequence laid out as ``[train_X, valid_X, train_Y, valid_Y]``.
        categorical_feature: Features to be treated as categorical, forwarded
            to ``lgb.Dataset``. Categorical and numerical features are handled
            differently, so set this deliberately.
        free_raw_data: Forwarded to ``lgb.Dataset``. It is forced to ``False``
            whenever ``categorical_feature`` is given.

    Returns:
        ``(train_data, valid_data)``; the validation set is created with the
        training set as its ``reference``.
    """
    # Identity test instead of `!= None`: an array-like categorical_feature
    # would otherwise be compared element-wise and fail inside the `if`.
    if categorical_feature is not None:
        free_raw_data = False
    train_X, valid_X, train_Y, valid_Y = data[0], data[1], data[2], data[3]
    train_data = lgb.Dataset(train_X,train_Y,categorical_feature=categorical_feature,free_raw_data=free_raw_data)
    # Same construction as the k-fold code in this notebook, which also
    # passes the training set as reference for the validation set.
    valid_data = lgb.Dataset(valid_X,valid_Y,reference=train_data,categorical_feature=categorical_feature,free_raw_data=free_raw_data)
    return train_data,valid_data



# Custom F1 metric for LightGBM; usable for early stopping by passing
# feval=lgb_f1_score to lgb.train.
def lgb_f1_score(y_pred, data):
    """Evaluate F1 on a LightGBM dataset.

    Args:
        y_pred: Scores produced by the model for ``data``.
        data: Dataset object; its labels are read with ``get_label()``.

    Returns:
        ``('f1', score, True)`` -- metric name, value, and the flag telling
        LightGBM that a higher value is better.
    """
    y_true = data.get_label()
    # Same cut-off as binarize(): a score of exactly 0.5 counts as positive
    # (np.round would send 0.5 to 0).
    y_label = (np.asarray(y_pred) >= 0.5).astype(int)
    # scikit-learn expects the ground truth first, the predictions second.
    return 'f1', f1_score(y_true, y_label), True

In [20]:
#未加入初赛
%time data = pd.read_feather('feature/combine.bin', use_threads = 12)
data = data.drop(['sid'],axis=1)
##划分数据：
train=data[1000000:6000000]
label=train['label']

data = data.drop(['label'],axis = 1)
train=data[1000000:6000000]
test=data[6000000:].reset_index(drop=True)
del data

In [6]:
#加入初赛
%time data = pd.read_feather('feature/combine.bin', use_threads = 12)
data = data.drop(['sid'],axis=1)
##划分数据：
train=data[:6000000]
label=train['label']

data = data.drop(['label'],axis = 1)
train=data[:6000000]
test=data[6000000:].reset_index(drop=True)
del data

Wall time: 7.5 s


In [32]:
# Snapshot of the feature column names of `train`, in column order.
cols = train.columns

In [33]:
def _lgb_kfold_average(features, labels, predict_sets, params, k_split, seed,
                       num_boost_round, verbose_eval, feature_names=None,
                       show_importance=False):
    """Train one booster per stratified fold and average its predictions.

    Returns a list with one fold-averaged prediction array per matrix in
    ``predict_sets`` (same order).
    """
    features = np.array(features)
    labels = np.array(labels)
    # StratifiedKFold keeps the class ratio of `labels` in every fold.
    skf = StratifiedKFold(n_splits=k_split,shuffle=True,random_state=seed)
    totals = [np.zeros(matrix.shape[0]) for matrix in predict_sets]

    # tr / va are the row positions of the fold's training / validation part.
    for i,(tr,va) in enumerate(skf.split(features,labels)):
        print('fold:',i+1,'training')
        dtrain = lgb.Dataset(features[tr],labels[tr])
        dvalid = lgb.Dataset(features[va],labels[va],reference=dtrain)
        bst = lgb.train(params, dtrain, num_boost_round=num_boost_round, valid_sets=dvalid,
                        verbose_eval=verbose_eval,early_stopping_rounds=200,
                        feval=lgb_f1_score)
        if show_importance:
            names = feature_names if feature_names is not None else bst.feature_name()
            print("特征重要性：",list(zip(list(names),list(bst.feature_importance(importance_type = 'gain')))))
        for j, matrix in enumerate(predict_sets):
            totals[j] += bst.predict(matrix, num_iteration=bst.best_iteration)

    # Average over the number of folds actually trained.
    return [total / k_split for total in totals]


def lgb_model(train, label, test, params, k_split = 0, up_load = False):
    """Train LightGBM and return the predicted scores for ``test``.

    Args:
        train: Training features.
        label: Training labels, aligned with ``train``.
        test: Features to predict.
        params: LightGBM parameter dict handed to ``lgb.train``.
        k_split: ``0`` trains a single model; any other value is the number
            of stratified folds whose predictions are averaged.
        up_load: ``False`` (local mode) first holds out a validation part and
            a local test part, then prints precision/recall/F1 on the local
            test part. ``True`` (submission mode) trains on all rows.

    Returns:
        Array of predictions for ``test`` (fold-averaged when ``k_split != 0``).

    Notes:
        * The single-model local path plots ``auc`` and ``binary_logloss``
          via ``picture_lgb_loss``, so ``params['metric']`` has to request both.
        * NOTE(review): ``early_stopping_rounds`` / ``verbose_eval`` are the
          keyword form of ``lgb.train`` this notebook was written against;
          newer LightGBM releases may require callbacks instead -- confirm
          against the installed version.
    """
    # ---------------------------------------------------------------- local
    if not up_load:
        # Hold out validation and local-test parts from the training data.
        train_X,valid_X,local_test_X, train_Y, valid_Y,local_test_Y = create_train_valid_test(train,label)

        if k_split==0:
            train_data,valid_data = create_lgb_dataset([train_X,valid_X,train_Y,valid_Y])

            # metric_dict collects the per-round validation metrics.
            metric_dict = {}
            bst = lgb.train(params, train_data,100000,valid_sets = valid_data,early_stopping_rounds=100,
                            verbose_eval = 200, callbacks = [lgb.record_evaluation(metric_dict)],feval=lgb_f1_score)

            print("特征数：",bst.num_feature())
            f_importance = bst.feature_importance(importance_type = 'gain')
            f_name = bst.feature_name()
            print("特征重要性(信息增益)：")
            for name, gain in zip(f_name, f_importance):
                print(name+': '+str(gain))
            # Plot the validation curves, then score the local test part.
            picture_lgb_loss(metric_dict)
            cal_f1(binarize(bst.predict(local_test_X)),np.array(local_test_Y))
            return bst.predict(test)

        # k-fold on the training part: the local test part is only used for
        # the printed evaluation, the returned scores belong to `test`.
        test_pred, local_pred = _lgb_kfold_average(
            train_X, train_Y, [test, local_test_X], params, k_split,
            seed=98, num_boost_round=30000, verbose_eval=400)
        cal_f1(binarize(local_pred),np.array(local_test_Y))
        return test_pred

    # ----------------------------------------------------------- submission
    if k_split == 0:
        # All rows are used for training; there is no validation set, so the
        # number of rounds is fixed.
        train_data_full = lgb.Dataset(train,label)
        bst_full = lgb.train(params, train_data_full,3300)
        print("特征数：",bst_full.num_feature())
        print("特征重要性：",bst_full.feature_importance(importance_type = 'gain'))
        return bst_full.predict(test)

    # Feature names are read from `train` itself (before it is turned into an
    # array) instead of relying on a notebook-level variable.
    feature_names = list(train.columns) if hasattr(train, 'columns') else None
    (test_pred,) = _lgb_kfold_average(
        train, label, [test], params, k_split,
        seed=97, num_boost_round=10, verbose_eval=2,
        feature_names=feature_names, show_importance=True)
    return test_pred
            
        
# Example: local (non-submission) run without k-fold splitting
#lgb_model(train,label,test,params1, k_split = 0, up_load = False)

In [34]:

# Baseline parameters
params1 = {
    'boosting_type': 'goss',
    'objective': 'binary',
    'metric': ['auc','binary_logloss'],

    'num_leaves': 31,
    #'max_depth' : 7,
    'learning_rate': 0.05,
    #'feature_fraction': 0.9,
}

# Parameters for k-fold training
# (original note: "write as many seeds as there are features")
params2 = {
    'learning_rate': 0.01,
    'boosting_type': 'goss',
    'objective': 'binary',
    'metric': 'auc',
    'feature_fraction': 0.8,
    #'bagging_fraction': 0.8,  # with goss, bagging_fraction has to stay at 1
    'bagging_freq': 5,
    'num_leaves': 1000,
    'verbose': -1,
    'max_depth': -1,
    #'reg_alpha':2.2,
    #'reg_lambda':1.4,
    'seed':74,
}

# Submission-mode run: 5 stratified folds trained on all training rows.
result = lgb_model(train, label, test, params2, k_split=5, up_load=True)

fold: 1 training
Training until validation scores don't improve for 200 rounds.
[2]	valid_0's auc: 0.84934	valid_0's f1: 0.76062
[4]	valid_0's auc: 0.854622	valid_0's f1: 0.76062
[6]	valid_0's auc: 0.853625	valid_0's f1: 0.76062
[8]	valid_0's auc: 0.854442	valid_0's f1: 0.76062
[10]	valid_0's auc: 0.854896	valid_0's f1: 0.76062
Did not meet early stopping. Best iteration is:
[9]	valid_0's auc: 0.854996	valid_0's f1: 0.76062
特征重要性： [('h', 4768000.830466747), ('w', 2610011.2204298377), ('ppi', 4327465.291700363), ('orientation', 92685.95234853029)]
fold: 2 training
Training until validation scores don't improve for 200 rounds.
[2]	valid_0's auc: 0.848755	valid_0's f1: 0.76062
[4]	valid_0's auc: 0.853978	valid_0's f1: 0.76062
[6]	valid_0's auc: 0.852918	valid_0's f1: 0.76062
[8]	valid_0's auc: 0.853913	valid_0's f1: 0.76062
[10]	valid_0's auc: 0.854283	valid_0's f1: 0.76062
Did not meet early stopping. Best iteration is:
[9]	valid_0's auc: 0.854452	valid_0's f1: 0.76062
特征重要性： [('h', 4768

In [None]:
# Turn the model scores in `result` into 0/1 labels using binarize's 0.5 cut-off.
result = binarize(np.array(result))

In [None]:
# Build the submission file: one row per test sample, its sid next to the predicted label.
result = pd.DataFrame(result)

# The separator is passed by keyword; recent pandas versions no longer accept
# it as a second positional argument.
test_a = pd.read_csv('test_a.txt', sep='\t')
test_b = pd.read_csv('test_b.txt', sep='\t')
test = pd.concat([test_a,test_b],axis=0,sort=False).reset_index(drop=True)

sid = pd.DataFrame(test['sid'])
# concat(axis=1) aligns on the index, so a row-count mismatch would otherwise
# end up as NaN-padded rows in the submission instead of an error.
assert len(sid) == len(result), 'number of predictions does not match number of test sids'
final = pd.concat([sid,result], axis=1, join='outer')
final.columns = ['sid','label']
final.to_csv('lgb_5fold_try.csv',index = None)