In [1]:
import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
from sklearn import metrics 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,roc_auc_score 

%matplotlib inline

In [2]:
# Read the training and test CSV files (paths are relative to the notebook).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (r'../data/train.csv', r'../data/test.csv')
)

In [3]:
# Stack the training rows on top of the test rows so that a single PCA
# projection is fitted on, and applied to, both sets.
# sort=False pins the column order to the order of appearance: when the two
# frames do not have identical columns, older pandas sorts the columns and
# emits a FutureWarning, while newer pandas does not sort, so leaving the
# argument out makes the feature order depend on the installed version.
# The 'label' column is dropped from the stacked frame (chained, not in place,
# so no already-bound frame is mutated).
data=pd.concat([train_data,test_data],axis=0,sort=False).reset_index(drop=True).drop(columns=['label'])
label=train_data.label

# Reduce the features to 35 principal components.
pca=PCA(n_components=35, random_state=1)
data_pca=pca.fit_transform(data)

# Hold out 10% of the training rows for validation. This is a single
# train/validation split, not cross-validation.
# The first len(train_data) rows of data_pca are the training rows because
# train_data was placed first in the concat above.
# NOTE: train_test_split returns (X_train, X_test, y_train, y_test), so despite
# their names:
#   Xtrain = training features    Ytrain = hold-out features
#   xlabel = training labels      ylabel = hold-out labels
# The names are kept unchanged because the later cells refer to them.
Xtrain,Ytrain,xlabel,ylabel=train_test_split(data_pca[0:len(train_data)],label,test_size=0.1, random_state=34)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [None]:
# Baseline: gradient boosting with default hyper-parameters (only the seed is fixed).
# Ytrain / ylabel are the hold-out features / labels produced by the split cell.
gbm0 = GradientBoostingClassifier(random_state=10).fit(Xtrain, xlabel)

# Class probabilities and hard predictions on the hold-out rows.
y_predprob = gbm0.predict_proba(Ytrain)
y_pred = gbm0.predict(Ytrain)

# Hold-out accuracy, reported two ways: via the estimator's own score()
# and via accuracy_score on the predictions computed above.
print("Accuracy:%.4g" % gbm0.score(Ytrain, ylabel))
print("Accuracy: %.4g" % accuracy_score(ylabel, y_pred))

In [None]:
# Tune the number of boosting stages (n_estimators) with a 5-fold grid search;
# all other hyper-parameters are held fixed at the values given below.
starttime = datetime.datetime.now()
param_test1 = {'n_estimators':range(20,81,10)}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=300,
                                  min_samples_leaf=20,max_depth=8,max_features='sqrt', subsample=0.8,random_state=10), 
                       param_grid = param_test1, cv=5)
gsearch1.fit(Xtrain,xlabel)

print(gsearch1.best_params_)
print(gsearch1.best_score_)
endtime = datetime.datetime.now()
# Elapsed wall-clock time in seconds. total_seconds() is used because
# timedelta.seconds is only the seconds *field* (0..86399): it drops whole days
# and the fractional part, so a long search would be under-reported.
print ((endtime - starttime).total_seconds())

In [None]:
# Jointly tune the maximum tree depth (max_depth) and the minimum number of
# samples required to split an internal node (min_samples_split) with a
# 5-fold grid search; n_estimators is fixed at 80.

starttime = datetime.datetime.now()
param_test2 = {'max_depth':range(3,14,2), 'min_samples_split':range(100,801,200)}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80, min_samples_leaf=20, 
                          max_features='sqrt', subsample=0.8, random_state=10), 
                       param_grid = param_test2,  cv=5)
gsearch2.fit(Xtrain,xlabel)
print(gsearch2.best_params_)
print(gsearch2.best_score_)

endtime = datetime.datetime.now()
# Elapsed wall-clock time in seconds. total_seconds() is used because
# timedelta.seconds is only the seconds *field* (0..86399): it drops whole days
# and the fractional part, so a long search would be under-reported.
print ((endtime - starttime).total_seconds())

In [None]:
# Tune the minimum number of samples per leaf (min_samples_leaf) with a 5-fold
# grid search. Only min_samples_leaf is in the grid; min_samples_split is held
# fixed at 100 here and is NOT searched.
# NOTE(review): the later cells hard-code min_samples_split=1200, which is not
# a value this search explores -- confirm whether min_samples_split was meant
# to be part of this grid.
starttime = datetime.datetime.now()
param_test3 = {'min_samples_leaf':range(60,101,10)}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80,max_depth=11,min_samples_split=100,
                                     max_features='sqrt', subsample=0.8, random_state=10), 
                       param_grid = param_test3, cv=5)
gsearch3.fit(Xtrain,xlabel)
print(gsearch3.best_params_)
print(gsearch3.best_score_)

endtime = datetime.datetime.now()
# Elapsed wall-clock time in seconds. total_seconds() is used because
# timedelta.seconds is only the seconds *field* (0..86399): it drops whole days
# and the fractional part, so a long search would be under-reported.
print ((endtime - starttime).total_seconds())

In [None]:
# Tune the number of features considered at each split (max_features) with a
# 5-fold grid search over 7, 9, ..., 19.
starttime = datetime.datetime.now()
param_test4 = {'max_features':range(7,20,2)}
gsearch4 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80,max_depth=11, min_samples_leaf =100, 
               min_samples_split =1200, subsample=0.8, random_state=10), 
                       param_grid = param_test4, cv=5)
gsearch4.fit(Xtrain,xlabel)
print(gsearch4.best_params_)
print(gsearch4.best_score_)

endtime = datetime.datetime.now()
# Elapsed wall-clock time in seconds. total_seconds() is used because
# timedelta.seconds is only the seconds *field* (0..86399): it drops whole days
# and the fractional part, so a long search would be under-reported.
print ((endtime - starttime).total_seconds())

In [None]:
# Tune the row subsampling fraction (subsample) with a 5-fold grid search.
starttime = datetime.datetime.now()

param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
# The search is configured like the other grid searches in this notebook
# (default scoring, no `iid` argument):
#   * `iid` has been removed from GridSearchCV, so passing it raises a
#     TypeError on current scikit-learn.
#   * scoring='roc_auc' made this search's scores incomparable with the other
#     searches, which all use the estimator's default score. It is also a
#     binary-classification scorer.
#     NOTE(review): the target looks multi-class (one 'Label' per 'ImageId'
#     in the submission file) -- confirm.
gsearch5 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80,max_depth=11, min_samples_leaf =100, 
               min_samples_split =1200, max_features=9, random_state=10), 
                       param_grid = param_test5, cv=5)
gsearch5.fit(Xtrain,xlabel)

# Per-candidate mean CV score. `grid_scores_` no longer exists; `cv_results_`
# is its replacement. The results are printed explicitly because a bare
# expression in the middle of a cell is evaluated and then discarded.
for candidate_params, mean_score in zip(gsearch5.cv_results_['params'],
                                        gsearch5.cv_results_['mean_test_score']):
    print(candidate_params, mean_score)
print(gsearch5.best_params_)
print(gsearch5.best_score_)

endtime = datetime.datetime.now()
# Elapsed wall-clock time in seconds. total_seconds() is used because
# timedelta.seconds is only the seconds *field* (0..86399): it drops whole days
# and the fractional part, so a long search would be under-reported.
print ((endtime - starttime).total_seconds())

In [None]:
# Fit the model with the hand-set hyper-parameters below and report its
# accuracy on the hold-out rows (Ytrain / ylabel are the hold-out features /
# labels produced by the split cell).
starttime = datetime.datetime.now()

gbm1 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80,max_depth=11, min_samples_leaf =100, 
               min_samples_split =1200, max_features='sqrt', subsample=0.8, random_state=10)
gbm1.fit(Xtrain,xlabel)
y_pred = gbm1.predict(Ytrain)
y_predprob = gbm1.predict_proba(Ytrain)
# Hold-out accuracy, reported via the estimator's score() and via accuracy_score.
print("Accuracy:%.4g"%gbm1.score(Ytrain,ylabel))
print("Accuracy: %.4g"%accuracy_score(ylabel,y_pred))

endtime = datetime.datetime.now()
# Elapsed wall-clock time in seconds. total_seconds() is used because
# timedelta.seconds is only the seconds *field* (0..86399): it drops whole days
# and the fractional part of the elapsed time.
print ((endtime - starttime).total_seconds())

In [None]:
# Predict on the test rows: everything in data_pca after the first
# len(train_data) rows.
result=gbm1.predict(data_pca[len(train_data):])

# Write the submission file: a quoted header line, then one line per
# prediction consisting of a 1-based row number and the quoted predicted label.
print('Saving...')
with open('../out/sklearn_GBDT.csv', 'w') as writer:
    writer.write('"ImageId","Label"\n')
    for count, p in enumerate(result, start=1):
        writer.write(str(count) + ',"' + str(p) + '"\n')