In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import matplotlib.pylab as plt
%matplotlib inline

In [2]:
train = pd.read_csv('../ensemble-learning/train_modified.zip_files/train_modified.csv')
target='Disbursed' # Disbursed的值就是二元分类的输出
IDcol = 'ID'
train['%s' % target].value_counts()

0    19680
1      320
Name: Disbursed, dtype: int64

In [3]:
x_columns = [x for x in train.columns if x not in [target, IDcol]]
X = train[x_columns]
y = train['Disbursed']

In [4]:
rf0 = RandomForestClassifier(oob_score=True, random_state=10)
rf0.fit(X,y)
print(rf0.oob_score_)
y_predprob = rf0.predict_proba(X)[:,1]
print("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))



0.98315
AUC Score (Train): 0.999994


In [10]:
param_test1 = {'n_estimators':range(10,71,10)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=100,
                        min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10),
                       param_grid = param_test1, scoring='roc_auc',cv=5)
gsearch1.fit(X,y)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

({'mean_fit_time': array([0.05345702, 0.11788507, 0.14900713, 0.18809733, 0.23057714,
         0.28642883, 0.33391018]),
  'std_fit_time': array([0.00257584, 0.01202385, 0.00709591, 0.00406839, 0.00159624,
         0.00715641, 0.00688195]),
  'mean_score_time': array([0.00578532, 0.00917583, 0.01256642, 0.01496034, 0.01775932,
         0.02194691, 0.02373085]),
  'std_score_time': array([3.98803110e-04, 3.98731260e-04, 7.98153968e-04, 7.29420592e-07,
         4.03100913e-04, 1.09341747e-03, 4.10485804e-04]),
  'param_n_estimators': masked_array(data=[10, 20, 30, 40, 50, 60, 70],
               mask=[False, False, False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'n_estimators': 10},
   {'n_estimators': 20},
   {'n_estimators': 30},
   {'n_estimators': 40},
   {'n_estimators': 50},
   {'n_estimators': 60},
   {'n_estimators': 70}],
  'split0_test_score': array([0.81797431, 0.82673558, 0.8370927 , 0.83676321, 0.8351753 ,
         0.83

In [11]:
param_test2 = {'max_depth':range(3,14,2), 'min_samples_split':range(50,201,20)}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 60,
                                  min_samples_leaf=20,max_features='sqrt' ,random_state=10),
   param_grid = param_test2, scoring='roc_auc', cv=5)
gsearch2.fit(X,y)
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 13, 'min_samples_split': 110}, 0.8242016800050813)

In [12]:
param_test3 = {'min_samples_split':range(80,150,20), 'min_samples_leaf':range(10,60,10)}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 60, max_depth=13,
                        max_features='sqrt' ,random_state=10),
                        param_grid = param_test3, scoring='roc_auc', cv=5)
gsearch3.fit(X,y)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

({'mean_fit_time': array([0.34149332, 0.35345788, 0.33131981, 0.32014346, 0.32912068,
         0.34429593, 0.33032775, 0.30778255, 0.30518985, 0.30079651,
         0.30079603, 0.29840894, 0.29920058, 0.29760475, 0.29561567,
         0.29501133, 0.29122748, 0.29241714, 0.29102139, 0.28862915]),
  'std_fit_time': array([0.02187503, 0.01947904, 0.00920713, 0.00676692, 0.00954506,
         0.03059857, 0.00719183, 0.00561985, 0.0018932 , 0.00286166,
         0.00287599, 0.00364305, 0.00315452, 0.00195397, 0.00162251,
         0.0009771 , 0.00209204, 0.00232143, 0.00193389, 0.00185   ]),
  'mean_score_time': array([0.02373147, 0.02393332, 0.02413964, 0.02273946, 0.02333751,
         0.02412462, 0.02353168, 0.02293916, 0.02293291, 0.02234626,
         0.02234602, 0.02234554, 0.02193556, 0.02194171, 0.02196107,
         0.02214613, 0.02193608, 0.02194223, 0.02194777, 0.02174234]),
  'std_score_time': array([3.95711641e-04, 1.08653289e-03, 3.96572969e-04, 7.46111086e-04,
         4.88694899e-04

In [13]:
rf2 = RandomForestClassifier(n_estimators= 60, max_depth=13, min_samples_split=120,
                                  min_samples_leaf=20,max_features=7 ,oob_score=True, random_state=10)
rf2.fit(X,y)
print(rf2.oob_score_)


0.984


