In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report

合并训练集和待预测集，统一处理。合并前额外添加一列‘source’，分别用0和1区分训练集和待预测集。经检查，列‘Age’的取值都在18岁以上，所以列‘Over18’没有意义，予以删除；然后对数据类型为str的列进行编码（pd.get_dummies），处理完毕后再拆分回训练集和待预测集。

In [2]:
# Load both CSV files and tag every row with its origin (0 = training file,
# 1 = file to predict) so the combined frame can be split again afterwards.
df_train = pd.read_csv('pfm_train.csv')
df_target = pd.read_csv('pfm_test.csv')
df_train['source'] = 0
df_target['source'] = 1
train_features = df_train.drop('Attrition', axis=1)
X = pd.concat([train_features, df_target], ignore_index=True)

# Cleaning: 'Over18' is removed (the notebook text says every 'Age' was checked
# to be at least 18, so the column carries no information).
X = X.drop(['Over18'], axis=1)

# Encode the listed columns with pd.get_dummies and swap the encoded columns
# in for the originals; both frames share the same index, so the join is 1:1.
column_list = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']
X_le = pd.get_dummies(X[column_list])
X = X.drop(column_list, axis=1).join(X_le)

# Split back into the two original sets and discard the helper column.
from_train = X['source'] == 0
from_target = X['source'] == 1
X_train = X[from_train].drop(['source'], axis=1)
y_train = df_train['Attrition']
X_target = X[from_target].drop(['source'], axis=1)

RandomForestClassifier默认参数，训练后预测X_target，得分0.85

In [3]:
# Baseline model: every hyper-parameter is left at the library default; only
# the seed is fixed, and oob_score=True makes fit() also compute the
# out-of-bag score that is printed below.
rfc_1 = RandomForestClassifier(random_state=10, oob_score=True)
rfc_1.fit(X_train, y_train)
print(rfc_1.oob_score_)

# Predict the unlabeled rows and store them as a single 'result' column.
result_1 = pd.DataFrame({'result': rfc_1.predict(X_target)})
result_1.to_csv('result_1.csv', index=False)

0.824545454545


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


参数调整，首先对‘n_estimators’进行搜索

In [4]:
%%time
def search_1(param_list):
    """Grid-search a random forest and return the best ``n_estimators``.

    Runs ``GridSearchCV`` (``scoring='roc_auc'``, ``cv=5``, all cores) over
    ``param_list`` with ``RandomForestClassifier(random_state=10)`` on the
    notebook-level ``X_train`` / ``y_train``, and prints the best parameters
    together with the best score.

    Parameters
    ----------
    param_list : dict
        Parameter grid passed to ``GridSearchCV``. It must contain the key
        ``'n_estimators'`` because the return value is read from the best
        parameters under that key.

    Returns
    -------
    The ``n_estimators`` value of the best parameter combination.
    """
    # Use the argument itself; the previous version silently ignored it and
    # read the global ``param_list_1`` instead.
    gs = GridSearchCV(estimator=RandomForestClassifier(random_state=10),
                      param_grid=param_list, scoring='roc_auc', cv=5, n_jobs=-1)
    gs.fit(X_train, y_train)
    print(gs.best_params_, gs.best_score_)
    return gs.best_params_['n_estimators']

# Coarse pass: forest sizes 10, 20, ..., 200.
param_list_1 = {'n_estimators': np.arange(10, 201, 10)}
best_n_estimators = search_1(param_list_1)

# Fine pass: step 1 around the coarse winner (best-10 .. best+9). The lower
# bound is clamped to 1 because a coarse winner of 10 would otherwise put 0
# trees into the grid, which is not a valid forest size.
param_list_1 = {'n_estimators': np.arange(max(1, best_n_estimators - 10), best_n_estimators + 10)}
best_n_estimators = search_1(param_list_1)

{'n_estimators': 140} 0.801725255323
{'n_estimators': 136} 0.803001174664
CPU times: user 863 ms, sys: 103 ms, total: 966 ms
Wall time: 11.7 s


参数调整，接着对‘max_depth’，‘min_samples_split’，‘min_samples_leaf’进行搜索

In [5]:
%%time
def search_2(param_list):
    """Grid-search the tree-shape parameters of a random forest.

    The forest size is fixed to the notebook-level ``best_n_estimators``; the
    search runs with ``scoring='roc_auc'`` and ``cv=5`` on ``X_train`` /
    ``y_train`` and prints the best parameters and best score.

    Parameters
    ----------
    param_list : dict
        Parameter grid for ``GridSearchCV``; must contain the keys
        ``'max_depth'``, ``'min_samples_split'`` and ``'min_samples_leaf'``.

    Returns
    -------
    tuple
        ``(max_depth, min_samples_split, min_samples_leaf)`` of the best
        parameter combination.
    """
    forest = RandomForestClassifier(n_estimators=best_n_estimators, random_state=10)
    gs = GridSearchCV(estimator=forest, param_grid=param_list,
                      scoring='roc_auc', cv=5, n_jobs=-1)
    gs.fit(X_train, y_train)
    winner = gs.best_params_
    print(winner, gs.best_score_)
    return winner['max_depth'], winner['min_samples_split'], winner['min_samples_leaf']

def _candidate_grid(start, stop, step=1, minimum=1):
    """Return ``np.arange(start, stop, step)`` without values below ``minimum``.

    The refinement windows below are centred on the previous winner and can
    reach zero or negative numbers when that winner is small; such values are
    not valid for ``max_depth`` (needs >= 1) or ``min_samples_split`` (an
    integer value needs >= 2), so they are removed instead of being searched.
    """
    values = np.arange(start, stop, step)
    return values[values >= minimum]


# Pass 1: coarse grid with step 10 for all three parameters.
param_list_2 = {'max_depth':np.arange(1,61,10),'min_samples_split':np.arange(2,61,10),'min_samples_leaf':np.arange(1,61,10)}
best_max_depth, best_min_samples_split, best_min_samples_leaf = search_2(param_list_2)

# Pass 2: step 5 in a +/-20 window around the previous winners.
param_list_2 = {'max_depth':_candidate_grid(best_max_depth-20,best_max_depth+20,5),
                'min_samples_split':_candidate_grid(best_min_samples_split-20,best_min_samples_split+20,5,minimum=2),
                'min_samples_leaf':np.arange(1,31,5)}
best_max_depth, best_min_samples_split, best_min_samples_leaf = search_2(param_list_2)

# Pass 3: step 2 in a roughly +/-10 window.
param_list_2 = {'max_depth':_candidate_grid(best_max_depth-10,best_max_depth+10,2),
                'min_samples_split':_candidate_grid(best_min_samples_split-9,best_min_samples_split+10,2,minimum=2),
                'min_samples_leaf':np.arange(1,5)}
best_max_depth, best_min_samples_split, best_min_samples_leaf = search_2(param_list_2)

# Pass 4: step 1 in a +/-5 window.
param_list_2 = {'max_depth':_candidate_grid(best_max_depth-5,best_max_depth+5),
                'min_samples_split':_candidate_grid(best_min_samples_split-5,best_min_samples_split+5,minimum=2),
                'min_samples_leaf':np.arange(1,3)}
best_max_depth, best_min_samples_split, best_min_samples_leaf = search_2(param_list_2)

{'max_depth': 21, 'min_samples_leaf': 1, 'min_samples_split': 22} 0.807887794344
{'max_depth': 11, 'min_samples_leaf': 1, 'min_samples_split': 17} 0.809948683155
{'max_depth': 13, 'min_samples_leaf': 1, 'min_samples_split': 10} 0.813730513546
{'max_depth': 13, 'min_samples_leaf': 1, 'min_samples_split': 10} 0.813730513546
CPU times: user 11.6 s, sys: 919 ms, total: 12.5 s
Wall time: 4min 26s


参数调整，最后对‘n_estimators’，‘max_depth’，‘min_samples_split’，‘min_samples_leaf’进行一次小范围联合搜索

In [6]:
%%time
def search_3(param_list):
    """Jointly grid-search forest size and tree-shape parameters.

    Uses ``RandomForestClassifier(random_state=10)`` with ``scoring='roc_auc'``
    and ``cv=5`` on the notebook-level ``X_train`` / ``y_train`` and prints the
    best parameters and best score.

    Parameters
    ----------
    param_list : dict
        Parameter grid for ``GridSearchCV``; must contain the keys
        ``'n_estimators'``, ``'max_depth'``, ``'min_samples_split'`` and
        ``'min_samples_leaf'``.

    Returns
    -------
    tuple
        ``(n_estimators, max_depth, min_samples_split, min_samples_leaf)`` of
        the best parameter combination.
    """
    gs = GridSearchCV(estimator=RandomForestClassifier(random_state=10),
                      param_grid=param_list, scoring='roc_auc', cv=5, n_jobs=-1)
    gs.fit(X_train, y_train)
    winner = gs.best_params_
    print(winner, gs.best_score_)
    wanted = ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
    return tuple(winner[key] for key in wanted)
# Final joint search: step 1 in a window of best-3 .. best+2 around every
# previous winner. Lower bounds are clamped so that a small winner cannot put
# invalid candidates into the grid (n_estimators and max_depth need >= 1, an
# integer min_samples_split needs >= 2).
param_list_3 = {'n_estimators':np.arange(max(1,best_n_estimators-3),best_n_estimators+3),
                'max_depth':np.arange(max(1,best_max_depth-3),best_max_depth+3),
                'min_samples_split':np.arange(max(2,best_min_samples_split-3),best_min_samples_split+3),
                'min_samples_leaf':np.arange(1,3)}
best_n_estimators, best_max_depth, best_min_samples_split, best_min_samples_leaf = search_3(param_list_3)

{'max_depth': 13, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 135} 0.813966688456
CPU times: user 4.01 s, sys: 306 ms, total: 4.31 s
Wall time: 1min 41s


用各个最佳参数，训练第二个模型预测X_target，得分0.86

In [7]:
%%time
# Second model: same seed as the baseline, but built from the best values
# found by the grid searches; oob_score=True again yields the out-of-bag
# score that is printed below.
rfc_2 = RandomForestClassifier(n_estimators=best_n_estimators,max_depth=best_max_depth,
                               min_samples_split=best_min_samples_split,min_samples_leaf=best_min_samples_leaf,
                               oob_score=True, random_state=10)
rfc_2.fit(X_train, y_train)
print(rfc_2.oob_score_)

# Write the tuned model's predictions to their own file; the previous version
# wrote to 'result_1.csv' and thereby overwrote the baseline predictions.
result_2 = pd.DataFrame(rfc_2.predict(X_target),columns=['result'])
result_2.to_csv('result_2.csv',index=False)

0.859090909091
CPU times: user 250 ms, sys: 4.8 ms, total: 254 ms
Wall time: 252 ms
