In [1]:
# -*- coding: UTF-8 -*-
"""
此脚本用于展示建模调参
lightGBM
"""

# 保证脚本与Python3兼容
from __future__ import print_function

import os   #读取数据文件
import sys
import pymysql 
from sqlalchemy import create_engine
 
import datetime
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold  #划分训练集测试集使用
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.mosaicplot import mosaic
from sklearn.linear_model import LogisticRegression ,LogisticRegressionCV
#from sklearn.linear_model.coordinate_descent import ConvergenceWarning
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer 
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer #特征转换器
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import label_binarize
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix,f1_score
import lightgbm as lgb


%matplotlib inline
import warnings

warnings.filterwarnings("ignore")

def readData(path):
    """
    Load a CSV file into a pandas DataFrame.

    :param path: location of the CSV file, handed straight to ``pd.read_csv``.
    :return: a DataFrame containing every column of the file, in file order.
    """
    frame = pd.read_csv(path)
    # Re-select all columns by name, which yields a new DataFrame object.
    return frame[[name for name in frame.columns.values]]
     
 
def visualData(data):
    """
    Draw histograms to get a quick visual impression of the data.

    :param data: object exposing a pandas-style ``hist`` method
                 (a DataFrame in this notebook).
    """
    hist_style = {
        "rwidth": 0.9,
        "grid": True,
        "figsize": (8, 8),
        "alpha": 0.6,
        "bins": 10,
        "color": "blue",
    }
    data.hist(**hist_style)
    plt.show()
 
    
if __name__ == "__main__":
    # Widen pandas' console output so wide frames are not wrapped.
    pd.set_option('display.width', 1000)
    # NOTE: '__file__' is a string literal here, so abspath() resolves it against
    # the current working directory and dirname() yields that directory.
    homePath = os.path.dirname(os.path.abspath('__file__'))

    # os.path.join inserts the right separator for the running OS, so no
    # per-platform branching on os.name is needed.
    # Missing values are replaced by 0 in all three splits; fillna returns a
    # new frame, so the freshly loaded data is never mutated in place.
    dataPath = os.path.join(homePath, "train_mod.csv")
    train = readData(dataPath).fillna(0)

    dataPath = os.path.join(homePath, "val_mod.csv")
    val = readData(dataPath).fillna(0)

    dataPath = os.path.join(homePath, "test_mod.csv")
    test = readData(dataPath).fillna(0)

In [2]:
# The 42 feature columns fed to the model.
usefeatures = [
    'dt_m_1086', 'dt_m_1068', 'dt_m_1012', 'dt_m_1017', 'dt_m_1087', 'dt_m_1105',
    'dt_m_1102', 'dt_m_1096', 'dt_m_1051', 'dt_m_1003', 'dt_m_1027', 'dt_m_1043',
    'dt_m_1073', 'dt_m_1028', 'dt_m_1041', 'dt_m_1052', 'dt_m_1004', 'dt_m_1011',
    'dt_m_1618', 'dt_m_1630', 'dt_m_1032', 'dt_m_1009', 'dt_m_1108', 'dt_m_1067',
    'dt_m_1006', 'dt_m_1074', 'dt_m_1035', 'dt_m_1005', 'dt_m_1075', 'dt_m_1085',
    'app4_visits', 'dt_m_1000', 'dt_m_1601', 'app1_visits', 'dt_m_1099', 'age',
    'dt_m_1617', 'dt_m_1034', 'open_age', 'dt_m_1620', 'dt_m_1015', 'cust_point',
]
# Feature matrices: training split and validation split (the latter is named Xtest).
Xtrain = train[usefeatures]
Xtest = val[usefeatures]

# Class identifier strings -> integer class ids 0..4.
label_map = {
    '7C26FADD409BD4B9': 0,
    '816A9BEBED2D7C99': 1,
    '0F2E4CC10EDBE80F': 2,
    '56AFA2A526F96CC9': 3,
    'C7E2941B65C6CCD6': 4,
}
# NOTE: .map() turns any value that is not a key of label_map into NaN, so
# running this cell twice on the same frames wipes the labels.
train['label'] = train['label'].map(label_map)
val['label'] = val['label'].map(label_map)

# Targets as plain arrays.
Ytrain = train['label'].values
Ytest = val['label'].values
Xtrain.shape

(258208, 42)

### 第一步：学习率和迭代次数
默认参数，此时学习率为0.1，比较大，观察弱分类器数目的大致范围(采用默认参数配置，看看模型是过拟合还是欠拟合)

In [3]:
import pandas as pd
import lightgbm as lgbm

#直接调用lightgbm内嵌的交叉验证(cv)，可对连续的n_estimators参数进行快速交叉验证
#而GridSearchCV只能对有限个参数进行交叉验证，且速度相对较慢
import json
def modelfit(params , alg , X_train , y_train , early_stopping_rounds=100 , num_class=5):
    """
    Choose the number of boosting rounds with LightGBM's built-in CV, then refit ``alg``.

    The rounds kept by ``lgbm.cv`` (5 stratified, shuffled folds scored with
    multi_logloss, early stopping enabled) become ``n_estimators`` of ``alg``,
    which is then fitted on the full training data. The training-set log loss
    is printed as a quick over/under-fitting indicator.

    :param params: dict of LightGBM parameters; it is copied, never modified.
    :param alg: estimator exposing ``set_params``, ``fit`` and ``predict_proba``.
    :param X_train: training features.
    :param y_train: training labels.
    :param early_stopping_rounds: forwarded to ``lgbm.cv``.
    :param num_class: number of target classes written into the CV parameters
                      (defaults to 5, the value previously hard-coded).
    :return: None; results are printed and ``alg`` is fitted in place.
    """
    lgbm_params = params.copy()
    lgbm_params['num_class'] = num_class
    # 'silent' is not forwarded to lgbm.cv; tolerate dicts that never had it
    # instead of raising KeyError.
    lgbm_params.pop('silent', None)

    lgbmtrain = lgbm.Dataset(X_train , y_train , silent=True)

    # num_boost_round is only an upper bound on the number of weak learners:
    # early_stopping_rounds ends the CV once the metric has stopped improving
    # for that many rounds.
    cv_result = lgbm.cv(
        lgbm_params,
        lgbmtrain,
        num_boost_round=10000,
        nfold=5,
        stratified=True,
        shuffle=True,
        metrics='multi_logloss',
        early_stopping_rounds=early_stopping_rounds,
        show_stdv=True,
        seed=0,
    )

    # One entry per boosting round that was kept, so the length is the round count.
    cv_logloss = cv_result['multi_logloss-mean']
    best_n_estimators = len(cv_logloss)
    print('best n_estimators:' , best_n_estimators)
    print('best cv score:' , cv_logloss[-1])

    # Refit the estimator with the round count selected by cross-validation.
    alg.set_params(n_estimators=best_n_estimators)
    alg.fit(X_train , y_train)

    # Score on the training data itself.
    train_predprob = alg.predict_proba(X_train)
    logloss = metrics.log_loss(y_train , train_predprob)

    print("logloss of train :")
    print (logloss)
    
#X_train,X_val,y_train,y_val = train_test_split(X_smo,y_smo,test_size=0.2,random_state=2020)
#X_train,X_val,y_train,y_val = train_test_split(Xtrain,Ytrain,test_size=0.2,random_state=2020)    

# Baseline configuration used to size n_estimators at learning_rate 0.1.
params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    nthread=-1,
    silent=True,  # whether to print messages (author's note: default False)
    learning_rate=0.1,
    num_leaves=100,
    max_depth=15,
    max_bin=127,
    subsample_for_bin=50000,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=0,
    min_split_gain=0.0,
    min_child_weight=1,
    min_child_samples=20,
    scale_pos_weight=1,
)

# n_estimators=1000 is only a starting value; modelfit replaces it with the
# round count chosen by cross-validation.
lgbm1 = lgbm.sklearn.LGBMClassifier(
    num_class=5,
    n_estimators=1000,
    seed=0,
    **params
)

modelfit(params, lgbm1, Xtrain, Ytrain)
#data_train = lgb.Dataset(X_train, y_train)
#cv_results = lgb.cv(params, data_train, num_boost_round=1000, nfold=5, stratified=False, shuffle=True, metrics='multi_error',early_stopping_rounds=100,seed=0)
#print('best n_estimators:', len(cv_results['auc-mean']))
#print('best cv score:', pd.Series(cv_results['auc-mean']).max())

best n_estimators: 112
best cv score: 0.0939771305992323
logloss of train :
0.03769001029798878


### 第二步：调整树的参数：max_depth & min_child_weight
(参数的步长为1；下一步是在最佳参数周围，将步长降为0.05，进行精细调整)

第一轮参数调整得到的n_estimators最优值(112)，其余参数继续默认值

用交叉验证评价模型性能时，用scoring参数定义评价指标。评价指标是越高越好，因此用一些损失函数当评价指标时，需要再加负号，如neg_log_loss，neg_mean_squared_error详见sklearn文档:http://scikit-learn.org/stable/modules/model_evaluation.html#log-loss

In [5]:
# Author's rule of thumb: max_depth of 3-10 is suggested, and
# min_child_weight = 1 / sqrt(ratio_rare_event) = 5.5.
max_depth = range(6, 15)
min_child_weight = range(1, 10)
# Coarse grid with step 1 on both axes.
param_test2_1 = {
    'max_depth': max_depth,
    'min_child_weight': min_child_weight,
}
param_test2_1

{'max_depth': range(6, 15), 'min_child_weight': range(1, 10)}

In [6]:
#prepare cross validation
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm

#X_train,X_val,y_train,y_val = train_test_split(Xtrain,Ytrain,test_size=0.2,random_state=2020)
# Shuffled, seeded 5-fold stratified splitter shared by every grid candidate.
kfold = StratifiedKFold(n_splits=5 , shuffle=True , random_state=3)
# Fixed settings for this search; max_depth and min_child_weight below are
# only starting values, the grid overrides them per candidate.
# NOTE(review): both 'nthread' and 'n_jobs' are set here (presumably the same
# thread-count knob), and unlike the other parameter dicts in this notebook
# 'colsample_bytree' is not set - confirm both are intended.
params2 = {'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'nthread': -1,
           'n_jobs': -1,
          'silent': True,
          'learning_rate': 0.1,
          'num_leaves': 100,
          'max_depth': 15,
          'max_bin': 127,
          'subsample_for_bin': 50000,
          'subsample': 0.8,
          'subsample_freq': 1,
          'reg_alpha': 1,
          'reg_lambda': 0,
          'min_split_gain': 0.0,
          'min_child_weight': 1,
          'min_child_samples': 20,
          'scale_pos_weight': 1}

lgbm2_1 = lgbm.sklearn.LGBMClassifier(num_class=5 , n_estimators=112 , seed=0 , **params2)

# GridSearchCV arguments: estimator, parameter grid, scoring metric,
# number of parallel jobs (-1 = all cores), CV splitter.
gsearch2_1 = GridSearchCV(lgbm2_1 , param_grid=param_test2_1 , scoring='neg_log_loss' , n_jobs =-1 , cv = kfold)
gsearch2_1.fit(Xtrain , Ytrain)

# The CV table is kept under dedicated names so the `params` dict that holds
# the baseline hyper-parameters is not overwritten by a list of candidates.
cv_means = gsearch2_1.cv_results_['mean_test_score']
cv_candidates = gsearch2_1.cv_results_['params']
for mean ,param in zip(cv_means , cv_candidates):
    print("%f with:  %r" % (mean,param))
print(gsearch2_1.best_params_, gsearch2_1.best_score_)
    
    #-0.101006 with:  {'max_depth': 6, 'min_child_weight': 9}
    #-0.097327 with:  {'max_depth': 12, 'min_child_weight': 7}


-0.094423 with:  {'max_depth': 6, 'min_child_weight': 1}
-0.094104 with:  {'max_depth': 6, 'min_child_weight': 2}
-0.094099 with:  {'max_depth': 6, 'min_child_weight': 3}
-0.093939 with:  {'max_depth': 6, 'min_child_weight': 4}
-0.093865 with:  {'max_depth': 6, 'min_child_weight': 5}
-0.094041 with:  {'max_depth': 6, 'min_child_weight': 6}
-0.093805 with:  {'max_depth': 6, 'min_child_weight': 7}
-0.093848 with:  {'max_depth': 6, 'min_child_weight': 8}
-0.093882 with:  {'max_depth': 6, 'min_child_weight': 9}
-0.093888 with:  {'max_depth': 7, 'min_child_weight': 1}
-0.093561 with:  {'max_depth': 7, 'min_child_weight': 2}
-0.093394 with:  {'max_depth': 7, 'min_child_weight': 3}
-0.093362 with:  {'max_depth': 7, 'min_child_weight': 4}
-0.093367 with:  {'max_depth': 7, 'min_child_weight': 5}
-0.093171 with:  {'max_depth': 7, 'min_child_weight': 6}
-0.093107 with:  {'max_depth': 7, 'min_child_weight': 7}
-0.093137 with:  {'max_depth': 7, 'min_child_weight': 8}
-0.092957 with:  {'max_depth': 

In [7]:
#用交叉验证得到的最佳max_depth和min_child_weight进行训练及预测
import lightgbm as lgbm

# Settings for the model trained after the max_depth / min_child_weight search.
params2 = {'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'nthread': -1,
          'silent': True,  # whether to print messages (author's note: default False)
          'learning_rate': 0.1,
          'num_leaves': 100,
          'max_depth': 12,  # value taken from the second cross-validation
          'max_bin': 127,
          'subsample_for_bin': 50000,
          'subsample': 0.8,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 1,
          'reg_lambda': 0,
          'min_split_gain': 0.0,
          'min_child_weight': 9,  # value taken from the second cross-validation
          'min_child_samples': 20,
          'scale_pos_weight': 1}

lgbm2 = lgbm.sklearn.LGBMClassifier(num_class=5 , n_estimators=112 , seed=0 , **params2)

# Log the start time in two formats, then start the stopwatch.
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print(datetime.datetime.today())
start = datetime.datetime.now()

lgbm2.fit(Xtrain , Ytrain)

# Score on the training data itself.
train_predprob = lgbm2.predict_proba(Xtrain)
logloss = metrics.log_loss(Ytrain , train_predprob)

print("logloss of train :")
print (logloss)

end = datetime.datetime.now()
# total_seconds() covers the whole interval; timedelta.seconds would drop
# whole days. int() keeps the printed value a whole number of seconds.
print("程序运行时间："+str(int((end-start).total_seconds()))+"秒")

2020-02-18 14:31:31
2020-02-18 14:31:31.225301
logloss of train :
0.04953894445422604
程序运行时间：31秒


### 调试num_leaves

In [8]:
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm

#X_train,X_val,y_train,y_val = train_test_split(Xtrain,Ytrain,test_size=0.2,random_state=2020)
# Shuffled, seeded 5-fold stratified splitter shared by every grid candidate.
kfold = StratifiedKFold(n_splits=5 , shuffle=True , random_state=3)
# Grid over tree size: leaf count and depth cap.
param_test3 = {'num_leaves': range(10,150,10),
               'max_depth': range(8,14,1)
              }

# Log the start time in two formats, then start the stopwatch.
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print(datetime.datetime.today())
start = datetime.datetime.now()

# Fixed settings; num_leaves and max_depth are overridden by the grid.
params3 = {'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'nthread': -1,
          'silent': True,  # whether to print messages (author's note: default False)
          'learning_rate': 0.1,
          'num_leaves': 100,
          'max_depth': 12,  # value taken from the second cross-validation
          'max_bin': 127,
          'subsample_for_bin': 50000,
          'subsample': 0.8,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 1,
          'reg_lambda': 0,
          'min_split_gain': 0.0,
          'min_child_weight': 9,  # value taken from the second cross-validation
          'min_child_samples': 20,
          'scale_pos_weight': 1}

lgbm3 = lgbm.sklearn.LGBMClassifier(num_class=5 , n_estimators=112 , seed=0 , **params3)

# GridSearchCV arguments: estimator, parameter grid, scoring metric,
# number of parallel jobs (-1 = all cores), CV splitter.
gsearch3 = GridSearchCV(lgbm3 , param_grid=param_test3 , scoring='neg_log_loss' , n_jobs =-1 , cv = kfold)
gsearch3.fit(Xtrain , Ytrain)

# Dedicated names keep the CV table from rebinding the notebook-wide `params`.
cv_means = gsearch3.cv_results_['mean_test_score']
cv_candidates = gsearch3.cv_results_['params']
for mean ,param in zip(cv_means , cv_candidates):
    print("%f with:  %r" % (mean,param))
print(gsearch3.best_params_, gsearch3.best_score_)

end = datetime.datetime.now()
# total_seconds() covers the whole interval; timedelta.seconds would drop
# whole days, which matters for a long-running search.
print("程序运行时间："+str(int((end-start).total_seconds()))+"秒")

2020-02-18 14:42:47
2020-02-18 14:42:47.253968
-0.098364 with:  {'max_depth': 8, 'num_leaves': 10}
-0.095024 with:  {'max_depth': 8, 'num_leaves': 20}
-0.093946 with:  {'max_depth': 8, 'num_leaves': 30}
-0.093279 with:  {'max_depth': 8, 'num_leaves': 40}
-0.092947 with:  {'max_depth': 8, 'num_leaves': 50}
-0.092942 with:  {'max_depth': 8, 'num_leaves': 60}
-0.092683 with:  {'max_depth': 8, 'num_leaves': 70}
-0.092687 with:  {'max_depth': 8, 'num_leaves': 80}
-0.092716 with:  {'max_depth': 8, 'num_leaves': 90}
-0.092691 with:  {'max_depth': 8, 'num_leaves': 100}
-0.092655 with:  {'max_depth': 8, 'num_leaves': 110}
-0.092700 with:  {'max_depth': 8, 'num_leaves': 120}
-0.092569 with:  {'max_depth': 8, 'num_leaves': 130}
-0.092540 with:  {'max_depth': 8, 'num_leaves': 140}
-0.098375 with:  {'max_depth': 9, 'num_leaves': 10}
-0.094886 with:  {'max_depth': 9, 'num_leaves': 20}
-0.093805 with:  {'max_depth': 9, 'num_leaves': 30}
-0.093239 with:  {'max_depth': 9, 'num_leaves': 40}
-0.093014 wi

In [10]:
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm

#X_train,X_val,y_train,y_val = train_test_split(Xtrain,Ytrain,test_size=0.2,random_state=2020)
# Shuffled, seeded 5-fold stratified splitter shared by every grid candidate.
kfold = StratifiedKFold(n_splits=5 , shuffle=True , random_state=3)
# Grid over min_child_samples and min_child_weight.
param_test4 = {'min_child_samples': range(18,23,1),
               'min_child_weight': range(8,14,1)
              }

# Log the start time in two formats, then start the stopwatch.
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print(datetime.datetime.today())
start = datetime.datetime.now()

# Fixed settings; min_child_weight and min_child_samples are overridden by the grid.
params4 = {'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'nthread': -1,
          'silent': True,  # whether to print messages (author's note: default False)
          'learning_rate': 0.1,
          'num_leaves': 110 ,
          'max_depth': 10,  # author's note: parameter obtained from cross-validation
          'max_bin': 127,
          'subsample_for_bin': 50000,
          'subsample': 0.8,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 1,
          'reg_lambda': 0,
          'min_split_gain': 0.0,
          'min_child_weight': 9,  # author's note: parameter obtained from cross-validation
          'min_child_samples': 20,
          'scale_pos_weight': 1}

lgbm4 = lgbm.sklearn.LGBMClassifier(num_class=5 , n_estimators=112 , seed=0 , **params4)

# GridSearchCV arguments: estimator, parameter grid, scoring metric,
# number of parallel jobs (-1 = all cores), CV splitter.
gsearch4 = GridSearchCV(lgbm4 , param_grid=param_test4 , scoring='neg_log_loss' , n_jobs =-1 , cv = kfold)
gsearch4.fit(Xtrain , Ytrain)

# Dedicated names keep the CV table from rebinding the notebook-wide `params`.
cv_means = gsearch4.cv_results_['mean_test_score']
cv_candidates = gsearch4.cv_results_['params']
for mean ,param in zip(cv_means , cv_candidates):
    print("%f with:  %r" % (mean,param))
print(gsearch4.best_params_, gsearch4.best_score_)

end = datetime.datetime.now()
# total_seconds() covers the whole interval; timedelta.seconds would drop
# whole days, which matters for a long-running search.
print("程序运行时间："+str(int((end-start).total_seconds()))+"秒")

2020-02-18 16:19:25
2020-02-18 16:19:25.601614
-0.092617 with:  {'min_child_samples': 18, 'min_child_weight': 8}
-0.092619 with:  {'min_child_samples': 18, 'min_child_weight': 9}
-0.092487 with:  {'min_child_samples': 18, 'min_child_weight': 10}
-0.092547 with:  {'min_child_samples': 18, 'min_child_weight': 11}
-0.092332 with:  {'min_child_samples': 18, 'min_child_weight': 12}
-0.092475 with:  {'min_child_samples': 18, 'min_child_weight': 13}
-0.092727 with:  {'min_child_samples': 19, 'min_child_weight': 8}
-0.092619 with:  {'min_child_samples': 19, 'min_child_weight': 9}
-0.092487 with:  {'min_child_samples': 19, 'min_child_weight': 10}
-0.092547 with:  {'min_child_samples': 19, 'min_child_weight': 11}
-0.092332 with:  {'min_child_samples': 19, 'min_child_weight': 12}
-0.092475 with:  {'min_child_samples': 19, 'min_child_weight': 13}
-0.092508 with:  {'min_child_samples': 20, 'min_child_weight': 8}
-0.092487 with:  {'min_child_samples': 20, 'min_child_weight': 9}
-0.092487 with:  {'mi

In [11]:
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm

#X_train,X_val,y_train,y_val = train_test_split(Xtrain,Ytrain,test_size=0.2,random_state=2020)
# Shuffled, seeded 5-fold stratified splitter shared by every grid candidate.
kfold = StratifiedKFold(n_splits=5 , shuffle=True , random_state=3)
# Grid over bagging_fraction and feature_fraction; per the author's note these
# correspond to subsample and colsample_bytree, which is why those two keys
# are left out of the fixed settings below.
param_test5 = {'bagging_fraction': [0.6,0.7,0.8,0.9,1.0],
               'feature_fraction': [0.5,0.6,0.7,0.8,0.9]
              }

# Log the start time in two formats, then start the stopwatch.
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print(datetime.datetime.today())
start = datetime.datetime.now()

params5 = {'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'nthread': -1,
          'silent': True,  # whether to print messages (author's note: default False)
          'learning_rate': 0.1,
          'num_leaves': 110,
          'max_depth': 10,  # author's note: parameter obtained from cross-validation
          'max_bin': 127,
          'subsample_for_bin': 50000,
          'subsample_freq': 1,
          'reg_alpha': 1,
          'reg_lambda': 0,
          'min_split_gain': 0.0,
          'min_child_weight': 12,  # author's note: parameter obtained from cross-validation
          'min_child_samples': 18,
          'scale_pos_weight': 1}

lgbm5 = lgbm.sklearn.LGBMClassifier(num_class=5 , n_estimators=112 , seed=0 , **params5)

# GridSearchCV arguments: estimator, parameter grid, scoring metric,
# number of parallel jobs (-1 = all cores), CV splitter.
gsearch5 = GridSearchCV(lgbm5 , param_grid=param_test5 , scoring='neg_log_loss' , n_jobs =-1 , cv = kfold)
gsearch5.fit(Xtrain , Ytrain)

# Dedicated names keep the CV table from rebinding the notebook-wide `params`.
cv_means = gsearch5.cv_results_['mean_test_score']
cv_candidates = gsearch5.cv_results_['params']
for mean ,param in zip(cv_means , cv_candidates):
    print("%f with:  %r" % (mean,param))
print(gsearch5.best_params_, gsearch5.best_score_)

end = datetime.datetime.now()
# total_seconds() covers the whole interval; timedelta.seconds would drop
# whole days, which matters for a long-running search.
print("程序运行时间："+str(int((end-start).total_seconds()))+"秒")

2020-02-18 20:59:51
2020-02-18 20:59:51.911024
-0.092686 with:  {'bagging_fraction': 0.6, 'feature_fraction': 0.5}
-0.092678 with:  {'bagging_fraction': 0.6, 'feature_fraction': 0.6}
-0.092717 with:  {'bagging_fraction': 0.6, 'feature_fraction': 0.7}
-0.092843 with:  {'bagging_fraction': 0.6, 'feature_fraction': 0.8}
-0.092660 with:  {'bagging_fraction': 0.6, 'feature_fraction': 0.9}
-0.092647 with:  {'bagging_fraction': 0.7, 'feature_fraction': 0.5}
-0.092779 with:  {'bagging_fraction': 0.7, 'feature_fraction': 0.6}
-0.092535 with:  {'bagging_fraction': 0.7, 'feature_fraction': 0.7}
-0.092402 with:  {'bagging_fraction': 0.7, 'feature_fraction': 0.8}
-0.092516 with:  {'bagging_fraction': 0.7, 'feature_fraction': 0.9}
-0.092624 with:  {'bagging_fraction': 0.8, 'feature_fraction': 0.5}
-0.092613 with:  {'bagging_fraction': 0.8, 'feature_fraction': 0.6}
-0.092474 with:  {'bagging_fraction': 0.8, 'feature_fraction': 0.7}
-0.092332 with:  {'bagging_fraction': 0.8, 'feature_fraction': 0.8}
-

In [25]:
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm

#X_train,X_val,y_train,y_val = train_test_split(Xtrain,Ytrain,test_size=0.2,random_state=2020)
# Shuffled, seeded 5-fold stratified splitter shared by every grid candidate.
kfold = StratifiedKFold(n_splits=5 , shuffle=True , random_state=3)
# Grid over the two regularisation strengths (11 x 10 candidates); both keys
# are therefore left out of the fixed settings below.
param_test6 = {'reg_alpha': [0,1e-5,1e-4,0.001,0.005,0.01,0.03,0.08,0.3,0.5,1],
               'reg_lambda': [0,1e-5,1e-4,0.001,0.005,0.01,0.03,0.08,0.3,0.5]
              }

# Log the start time in two formats, then start the stopwatch.
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print(datetime.datetime.today())
start = datetime.datetime.now()

params6 = {'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'nthread': -1,
          'silent': True,  # whether to print messages (author's note: default False)
          'learning_rate': 0.1,
          'num_leaves': 110,
          'max_depth': 10,  # author's note: parameter obtained from cross-validation
          'max_bin': 127,
          'subsample_for_bin': 50000,
          'subsample': 0.8,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'min_split_gain': 0.0,
          'min_child_weight': 12,  # author's note: parameter obtained from cross-validation
          'min_child_samples': 18,
          'scale_pos_weight': 1}

lgbm6 = lgbm.sklearn.LGBMClassifier(num_class=5 , n_estimators=112 , seed=0 , **params6)

# GridSearchCV arguments: estimator, parameter grid, scoring metric,
# number of parallel jobs (-1 = all cores), CV splitter.
gsearch6 = GridSearchCV(lgbm6 , param_grid=param_test6 , scoring='neg_log_loss' , n_jobs =-1 , cv = kfold)
gsearch6.fit(Xtrain , Ytrain)

# Dedicated names keep the CV table from rebinding the notebook-wide `params`.
cv_means = gsearch6.cv_results_['mean_test_score']
cv_candidates = gsearch6.cv_results_['params']
for mean ,param in zip(cv_means , cv_candidates):
    print("%f with:  %r" % (mean,param))
print(gsearch6.best_params_, gsearch6.best_score_)

end = datetime.datetime.now()
# total_seconds() covers the whole interval; timedelta.seconds would drop
# whole days, which matters for a long-running search.
print("程序运行时间："+str(int((end-start).total_seconds()))+"秒")

2020-02-19 17:18:03
2020-02-19 17:18:03.766639
-0.092490 with:  {'reg_alpha': 0, 'reg_lambda': 0}
-0.092525 with:  {'reg_alpha': 0, 'reg_lambda': 1e-05}
-0.092457 with:  {'reg_alpha': 0, 'reg_lambda': 0.0001}
-0.092509 with:  {'reg_alpha': 0, 'reg_lambda': 0.001}
-0.092580 with:  {'reg_alpha': 0, 'reg_lambda': 0.005}
-0.092515 with:  {'reg_alpha': 0, 'reg_lambda': 0.01}
-0.092547 with:  {'reg_alpha': 0, 'reg_lambda': 0.03}
-0.092524 with:  {'reg_alpha': 0, 'reg_lambda': 0.08}
-0.092485 with:  {'reg_alpha': 0, 'reg_lambda': 0.3}
-0.092509 with:  {'reg_alpha': 0, 'reg_lambda': 0.5}
-0.092507 with:  {'reg_alpha': 1e-05, 'reg_lambda': 0}
-0.092505 with:  {'reg_alpha': 1e-05, 'reg_lambda': 1e-05}
-0.092526 with:  {'reg_alpha': 1e-05, 'reg_lambda': 0.0001}
-0.092498 with:  {'reg_alpha': 1e-05, 'reg_lambda': 0.001}
-0.092537 with:  {'reg_alpha': 1e-05, 'reg_lambda': 0.005}
-0.092533 with:  {'reg_alpha': 1e-05, 'reg_lambda': 0.01}
-0.092455 with:  {'reg_alpha': 1e-05, 'reg_lambda': 0.03}
-0.09

In [5]:
# Prediction on the test set with lgbm2.
# os.path.join inserts the right separator for the running OS.
testPath = os.path.join(homePath, "test_mod.csv")
# Same missing-value treatment as the training data (NaN -> 0).
testset = readData(testPath).fillna(0)

print(testset.shape)
print(testset.columns)
# lgbm2 is fitted on train[usefeatures], so it must be given exactly those
# columns; passing every column except 'user' and 'label' does not match the
# feature set the model was trained on.
testset['pred'] = lgbm2.predict(testset[usefeatures])
# Integer class ids -> original class identifier strings.
re_label_map = {0: '7C26FADD409BD4B9'   ,1: '816A9BEBED2D7C99',2: '0F2E4CC10EDBE80F',3:  '56AFA2A526F96CC9',4: 'C7E2941B65C6CCD6' }
testset['pred'] = testset['pred'].map(re_label_map)
# Output file: user and predicted label only, no index column and no header row.
testset[['user','pred']].to_csv('modlgbm.csv',index=False,header=False)

testset['pred'].value_counts()

(79407, 171)
Index(['open_age', 'age', 'user', 'cust_point', 'inet_pd_inst_cnt', 'label', 'dt_m_1000', 'dt_m_1003', 'dt_m_1004', 'dt_m_1005',
       ...
       'product_nbr_900000453', 'product_nbr_900037', 'product_nbr_902868114', 'product_nbr_917913615', 'product_nbr_918701298', 'product_nbr_999999999', 'product_nbr_Z1180000030', 'product_nbr_Z1180000045', 'product_nbr_Z1180000058', 'product_nbr_Z12114095'], dtype='object', length=171)


7C26FADD409BD4B9    71643
816A9BEBED2D7C99     4285
56AFA2A526F96CC9     2087
0F2E4CC10EDBE80F      925
C7E2941B65C6CCD6      467
Name: pred, dtype: int64

### 训练集加验证集都参与训练

In [23]:
# Refit on training + validation data together, then predict the test set.
# Widen pandas' console output so wide frames are not wrapped.
pd.set_option('display.width', 1000)
# NOTE: '__file__' is a string literal here, so this resolves to the current
# working directory.
homePath = os.path.dirname(os.path.abspath('__file__'))

# os.path.join inserts the right separator for the running OS; missing values
# are replaced by 0 in every split.
dataPath = os.path.join(homePath, "train_mod.csv")
train = readData(dataPath).fillna(0)

dataPath = os.path.join(homePath, "val_mod.csv")
val = readData(dataPath).fillna(0)

testPath = os.path.join(homePath, "test_mod.csv")
testset = readData(testPath).fillna(0)

# Stack the two labelled splits; the extra fillna covers any NaN that concat
# introduces if the two frames do not share exactly the same columns.
trainval = pd.concat([train,val],axis=0).fillna(0)
# Class identifier strings -> integer class ids 0..4.
label_map = {'7C26FADD409BD4B9': 0 ,'816A9BEBED2D7C99': 1 ,'0F2E4CC10EDBE80F': 2, '56AFA2A526F96CC9': 3,'C7E2941B65C6CCD6': 4}
trainval['label'] = trainval['label'].map(label_map)
Xtrainval = trainval[usefeatures].values
Ytrainval = trainval['label'].values

params3 = {'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'nthread': -1,
          'silent': True,  # whether to print messages (author's note: default False)
          'learning_rate': 0.1,
          'num_leaves': 110,
          'max_depth': 10,  # author's note: parameter obtained from cross-validation
          'max_bin': 127,
          'subsample_for_bin': 50000,
          'subsample': 0.8,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 0.001,
          'reg_lambda': 0.5,
          'min_split_gain': 0.0,
          'min_child_weight': 12,  # author's note: parameter obtained from cross-validation
          'min_child_samples': 18,
          'scale_pos_weight': 1}

# NOTE(review): params3 above is never passed to an estimator. The model
# refitted below is the existing lgbm2 object, which keeps the settings it was
# constructed with and is overwritten in place by this fit. Confirm whether
# the final model was meant to use params3 instead.
lgbm2.fit(Xtrainval , Ytrainval)

# Score on the combined training data itself.
train_predprob1 = lgbm2.predict_proba(Xtrainval)
logloss = metrics.log_loss(Ytrainval , train_predprob1)
print("logloss of train :")
print (logloss)

testset['pred'] = lgbm2.predict(testset[usefeatures])
# Integer class ids -> original class identifier strings.
re_label_map = {0: '7C26FADD409BD4B9'   ,1: '816A9BEBED2D7C99',2: '0F2E4CC10EDBE80F',3:  '56AFA2A526F96CC9',4: 'C7E2941B65C6CCD6' }
testset['pred'] = testset['pred'].map(re_label_map)
# Output file: user and predicted label only, no index column and no header row.
testset[['user','pred']].to_csv('modlgbm2.csv',index=False,header=False)

testset['pred'].value_counts()

logloss of train :
0.05350143532291663


7C26FADD409BD4B9    71760
816A9BEBED2D7C99     4201
56AFA2A526F96CC9     2065
0F2E4CC10EDBE80F      919
C7E2941B65C6CCD6      462
Name: pred, dtype: int64