In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import cross_validation, metrics



In [2]:
# Load the training set from the datasets directory next to this notebook.
train = pd.read_csv('../datasets/train.csv')

# 'Disbursed' is the binary classification target; count the rows in each class.
disbursed_counts = train['Disbursed'].value_counts()
disbursed_counts

0    19680
1      320
Name: Disbursed, dtype: int64

In [3]:
# Preview only the first rows; a bare `train` would ask the notebook to render
# the entire frame, which bloats the saved output without adding information.
train.head(10)

Unnamed: 0,Disbursed,Existing_EMI,ID,Loan_Amount_Applied,Loan_Tenure_Applied,Monthly_Income,Var4,Var5,Age,EMI_Loan_Submitted_Missing,...,Var2_2,Var2_3,Var2_4,Var2_5,Var2_6,Mobile_Verified_0,Mobile_Verified_1,Source_0,Source_1,Source_2
0,0,0.0,ID000002C20,300000,5,20000,1,0,37,1,...,0,0,0,0,1,1,0,1,0,0
1,0,0.0,ID000004E40,200000,2,35000,3,13,30,0,...,0,0,0,0,1,0,1,1,0,0
2,0,0.0,ID000007H20,600000,4,22500,1,0,34,1,...,0,0,0,0,0,0,1,0,0,1
3,0,0.0,ID000008I30,1000000,5,35000,3,10,28,1,...,0,0,0,0,0,0,1,0,0,1
4,0,25000.0,ID000009J40,500000,2,100000,3,17,31,1,...,0,0,0,0,0,0,1,0,0,1
5,0,15000.0,ID000010K00,300000,5,45000,3,17,33,0,...,0,0,0,0,0,0,1,0,0,1
6,0,0.0,ID000011L10,6,5,70000,1,0,28,1,...,0,0,0,0,0,1,0,0,1,0
7,0,2597.0,ID000012M20,200000,5,20000,3,3,40,1,...,0,0,0,0,0,0,1,0,0,1
8,0,0.0,ID000013N30,0,0,75000,5,13,43,0,...,1,0,0,0,0,0,1,1,0,0
9,0,0.0,ID000014O40,300000,3,30000,1,0,26,0,...,0,0,0,0,0,0,1,0,1,0


In [4]:
# Every column except the target ('Disbursed') and the row identifier ('ID')
# is used as a model input.
non_feature_cols = ('Disbursed', 'ID')
x_col = [col for col in train.columns if col not in non_feature_cols]

# Feature matrix and target vector shared by all later cells.
X, y = train[x_col], train['Disbursed']

In [5]:
# Baseline forest: library-default hyper-parameters, with out-of-bag scoring
# switched on and a fixed seed for repeatability.
rf1 = RandomForestClassifier(random_state=10, oob_score=True).fit(X, y)
print(rf1.oob_score_)

# Column 1 of predict_proba is taken as the score for the positive class.
# NOTE: these are predictions for the same rows the forest was fitted on, so
# the AUC below is a training-set figure, not a held-out estimate.
y_predprob = rf1.predict_proba(X)[:, 1]
train_auc = metrics.roc_auc_score(y, y_predprob)
print('AUC Score (Train):%f' % train_auc)

0.98005
AUC Score (Train):0.999833


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


可见袋外分数已经很高，而且AUC分数也很高。相对于GBDT的默认参数输出，RF的默认参数拟合效果对本例要好一些。

我们首先对n_estimators进行网格搜索：

In [6]:
# Step 1: search the number of trees (10, 20, ..., 70) while the remaining
# tree-shape parameters are held at fixed starting values.
param_test1 = {'n_estimators': range(10, 71, 10)}
rf_for_gs1 = RandomForestClassifier(
    min_samples_split=100,
    min_samples_leaf=20,
    max_depth=8,
    max_features='sqrt',
    random_state=10,
)
# Candidates are ranked by 5-fold cross-validated ROC AUC.
gs1 = GridSearchCV(
    estimator=rf_for_gs1,
    param_grid=param_test1,
    scoring='roc_auc',
    cv=5,
)
gs1.fit(X, y)
gs1.cv_results_, gs1.best_params_, gs1.best_score_



({'mean_fit_time': array([ 0.08478646,  0.15494571,  0.23470712,  0.2979835 ,  0.36880894,
          0.44927578,  0.51189919]),
  'mean_score_time': array([ 0.00647736,  0.01130652,  0.01748252,  0.01736946,  0.02387433,
          0.02848735,  0.03032112]),
  'mean_test_score': array([ 0.80680934,  0.81600252,  0.81818272,  0.81838438,  0.82034069,
          0.82113345,  0.8199191 ]),
  'mean_train_score': array([ 0.8902114 ,  0.89959868,  0.90359284,  0.90555378,  0.90597112,
          0.90670245,  0.90710504]),
  'param_n_estimators': masked_array(data = [10 20 30 40 50 60 70],
               mask = [False False False False False False False],
         fill_value = ?),
  'params': [{'n_estimators': 10},
   {'n_estimators': 20},
   {'n_estimators': 30},
   {'n_estimators': 40},
   {'n_estimators': 50},
   {'n_estimators': 60},
   {'n_estimators': 70}],
  'rank_test_score': array([7, 6, 5, 4, 2, 1, 3], dtype=int32),
  'split0_test_score': array([ 0.81797431,  0.82673558,  0.8370927 ,  

这样我们得到了最佳的弱学习器迭代次数（n_estimators=60）。接着我们对决策树最大深度max_depth和内部节点再划分所需最小样本数min_samples_split进行网格搜索。

In [7]:
# Step 2: with n_estimators fixed at 60, jointly search the maximum tree depth
# and the minimum number of samples required to split an internal node.
param_test2 = {'max_depth': range(3, 14, 2), 'min_samples_split': range(50, 201, 20)}
# oob_score is deliberately not enabled on this estimator: candidates are
# ranked by 5-fold cross-validated ROC AUC, so an out-of-bag estimate computed
# on every fit would cost time and never be read.
gs2 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=60,
                                                    min_samples_leaf=20,
                                                    max_features='sqrt',
                                                    random_state=10),
                   param_grid=param_test2, scoring='roc_auc', iid=False, cv=5)
gs2.fit(X, y)
gs2.cv_results_, gs2.best_params_, gs2.best_score_



({'mean_fit_time': array([ 0.37211461,  0.3869194 ,  0.36949687,  0.36878099,  0.36444592,
          0.37339869,  0.3674614 ,  0.36772904,  0.47705116,  0.46441655,
          0.50714378,  0.47404304,  0.48172145,  0.47168093,  0.4552835 ,
          0.4593061 ,  0.53781347,  0.50477858,  0.54097066,  0.53052921,
          0.51964259,  0.53038945,  0.51916456,  0.50352454,  0.56394987,
          0.58695683,  0.55792551,  0.54549308,  0.55879998,  0.57158704,
          0.55395083,  0.54457898,  0.59336486,  0.61364818,  0.58529634,
          0.58055592,  0.58809104,  0.59616151,  0.57141323,  0.55494571,
          0.61553984,  0.61400094,  0.65501208,  0.62466116,  0.64602866,
          0.65945106,  0.74992704,  0.77011166]),
  'mean_score_time': array([ 0.02051396,  0.02272873,  0.01838484,  0.02094684,  0.01957664,
          0.0186758 ,  0.01883764,  0.01825233,  0.02504783,  0.02241697,
          0.023101  ,  0.02276821,  0.02306771,  0.02163863,  0.0246634 ,
          0.02250986,  0.0

我们看看我们现在模型的袋外分数：

In [8]:
# Refit a single forest on the full data with the depth / split settings used
# at this stage, and report its out-of-bag accuracy.
rf2_params = dict(
    n_estimators=60,
    max_depth=13,
    min_samples_split=110,
    min_samples_leaf=20,
    max_features='sqrt',
    oob_score=True,
    random_state=10,
)
rf2 = RandomForestClassifier(**rf2_params)
rf2.fit(X, y)
print(rf2.oob_score_)

0.984


可见此时我们的袋外分数有一定的提高。也就是说模型的泛化能力增强了。

对于内部节点再划分所需最小样本数min_samples_split，我们暂时不能一起定下来，因为这个还和决策树其他的参数存在关联。下面我们再对内部节点再划分所需最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf一起调参。

In [9]:
# Step 3: with n_estimators=60 and max_depth=13 fixed, jointly search the
# minimum samples needed to split a node and the minimum samples per leaf.
param_test3 = {'min_samples_split': range(80, 150, 20), 'min_samples_leaf': range(10, 60, 10)}
# oob_score is deliberately not enabled on this estimator: candidates are
# ranked by 5-fold cross-validated ROC AUC, so an out-of-bag estimate computed
# on every fit would cost time and never be read.
gs3 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=60,
                                                    max_depth=13,
                                                    max_features='sqrt',
                                                    random_state=10),
                   param_grid=param_test3, scoring='roc_auc', iid=False, cv=5)
gs3.fit(X, y)
gs3.cv_results_, gs3.best_params_, gs3.best_score_



({'mean_fit_time': array([ 0.68654671,  0.61241045,  0.66750355,  0.61569638,  0.61762242,
          0.60995245,  0.61557779,  0.84833159,  0.70143919,  0.63522682,
          0.84366307,  0.62094936,  0.72801304,  0.87967553,  0.82541962,
          0.80159726,  0.70594206,  0.64006052,  0.6539124 ,  0.65555182]),
  'mean_score_time': array([ 0.03210192,  0.02856736,  0.03195238,  0.0281528 ,  0.02792025,
          0.02731304,  0.02856822,  0.04549084,  0.03914223,  0.027707  ,
          0.04004416,  0.02916646,  0.0372757 ,  0.0378438 ,  0.04491768,
          0.03931961,  0.03336945,  0.03264775,  0.03295007,  0.03129845]),
  'mean_test_score': array([ 0.8209294 ,  0.81913348,  0.82048399,  0.8179751 ,  0.8209429 ,
          0.82097426,  0.82486503,  0.82169239,  0.82352087,  0.82164475,
          0.82069876,  0.82141332,  0.82278249,  0.82141411,  0.82042881,
          0.82162093,  0.82224975,  0.82224975,  0.81890403,  0.81916643]),
  'mean_train_score': array([ 0.94798589,  0.940369

最后我们再对最大特征数max_features做调参：

In [10]:
# Step 4: with the tree-shape parameters fixed, search how many features each
# split may consider (3, 5, 7, 9).
param_test4 = {'max_features': range(3, 11, 2)}
# oob_score is deliberately not enabled on this estimator: candidates are
# ranked by 5-fold cross-validated ROC AUC, so an out-of-bag estimate computed
# on every fit would cost time and never be read.
gs4 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=60,
                                                    max_depth=13,
                                                    min_samples_split=120,
                                                    min_samples_leaf=20,
                                                    random_state=10),
                   param_grid=param_test4, scoring='roc_auc', iid=False, cv=5)
gs4.fit(X, y)
gs4.cv_results_, gs4.best_params_, gs4.best_score_



({'mean_fit_time': array([ 0.45424013,  0.57838116,  0.62967935,  0.72300634]),
  'mean_score_time': array([ 0.02816782,  0.03177066,  0.03040404,  0.03055687]),
  'mean_test_score': array([ 0.81981191,  0.8163868 ,  0.82486503,  0.81703506]),
  'mean_train_score': array([ 0.90445415,  0.91814913,  0.92847774,  0.9330581 ]),
  'param_max_features': masked_array(data = [3 5 7 9],
               mask = [False False False False],
         fill_value = ?),
  'params': [{'max_features': 3},
   {'max_features': 5},
   {'max_features': 7},
   {'max_features': 9}],
  'rank_test_score': array([2, 4, 1, 3], dtype=int32),
  'split0_test_score': array([ 0.81893102,  0.82697972,  0.83293834,  0.81775994]),
  'split0_train_score': array([ 0.8989037 ,  0.91926364,  0.92845625,  0.93346386]),
  'split1_test_score': array([ 0.79912387,  0.79626763,  0.79838748,  0.80414563]),
  'split1_train_score': array([ 0.90922633,  0.91967004,  0.92806709,  0.93307582]),
  'split2_test_score': array([ 0.78474935, 

用我们搜索到的最佳参数，我们再看看最终的模型拟合：

In [11]:
# Final model: refit one forest on the full data with the tuned settings and
# report its out-of-bag accuracy.
rf3_params = dict(
    n_estimators=60,
    max_depth=13,
    min_samples_split=120,
    min_samples_leaf=20,
    max_features=7,
    oob_score=True,
    random_state=10,
)
rf3 = RandomForestClassifier(**rf3_params)
rf3.fit(X, y)
print(rf3.oob_score_)

0.984


可见此时模型的袋外分数基本没有提高，主要原因是0.984已经是一个很高的袋外分数了，如果想进一步提高模型的泛化能力，我们需要更多的数据。