In [30]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
from pyecharts import Bar
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

合并训练集和待遇测集，统一处理。合并前多添加一列‘source’，分别用0和1，区分训练集和测试集。经检查，列‘Age’都是18岁以上，所以列‘Over18’无意义删除之。列‘EmployeeNumber’对分析无意义删除之。然后对个别的数据类型为str的列进行编码，处理完毕后拆分训练集和待遇测集

In [24]:
#读取数据，合并统一处理
df_train = pd.read_csv('pfm_train.csv')
df_target = pd.read_csv('pfm_test.csv')
df_train['source'] = 0
df_target['source'] = 1
X = pd.concat([df_train.drop('Attrition', axis=1),df_target],ignore_index=True)

#清洗数据
#print('under18 =',X[X['Age']<18].shape[0] == 0)
X = X.drop(['Over18','EmployeeNumber'],axis=1)
column_list = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']
X_le = pd.get_dummies(X[column_list])
X = pd.merge(X,X_le,how='outer',left_index=True,right_index=True).drop(column_list,axis=1)
X.drop(['source'],axis=1).iloc[1,:]
min_max_scaler = preprocessing.MinMaxScaler()
X = pd.DataFrame(min_max_scaler.fit_transform(X),columns=X.columns)

#拆分数据
X_train = X[X['source']==0].drop(['source'],axis=1)
y_train = df_train['Attrition']
X_target = X[X['source']==1].drop(['source'],axis=1)

PCA降维

In [28]:
pca = PCA(n_components=0.90)
pca.fit(X_train)
#print(pca.explained_variance_ratio_)#方差比例
#print(pca.explained_variance_)#方差
#print(pca.components_[0])#各个特征的方差
bar = Bar('components')
bar.add('',X.drop(['source'],axis=1).columns,pca.components_[0],
        xaxis_rotate=45,xaxis_interval=0,label_emphasis_textcolor='#000',is_random=True)
bar

X_train和X_target用PCA进行transform

In [26]:
X_train_pca = pca.transform(X_train)
X_target_pca = pca.transform(X_target)
X_train_pca.shape, X_target_pca.shape

((1100, 23), (350, 23))

先用默认参数的RandomForestClassifier，进行训练

In [33]:
rfc_1 = RandomForestClassifier(oob_score=True,random_state=10)
rfc_1.fit(X_train_pca,y_train)
rfc_1.oob_score_

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


0.83818181818181814

In [43]:
%%time
def search_1(param_list):
    gs = GridSearchCV(estimator=RandomForestClassifier(random_state=10),
                      param_grid=param_list,scoring='roc_auc',cv=5,n_jobs=-1)
    gs.fit(X_train_pca,y_train)
    print(gs.best_params_,gs.best_score_)
    return gs.best_params_['n_estimators']

param_list_1 = {'n_estimators':np.arange(100,301,10)}
best_n_estimators = search_1(param_list_1)

param_list_1 = {'n_estimators':np.arange(best_n_estimators-20,best_n_estimators+20,5)}
best_n_estimators = search_1(param_list_1)

param_list_1 = {'n_estimators':np.arange(best_n_estimators-10,best_n_estimators+10,2)}
best_n_estimators = search_1(param_list_1)

param_list_1 = {'n_estimators':np.arange(best_n_estimators-5,best_n_estimators+5)}
best_n_estimators = search_1(param_list_1)

{'n_estimators': 220} 0.81114535908
{'n_estimators': 220} 0.81114535908
{'n_estimators': 220} 0.81114535908
{'n_estimators': 223} 0.811256446452
CPU times: user 3.34 s, sys: 239 ms, total: 3.58 s
Wall time: 32.8 s


In [50]:
%%time
def search_2(param_list):
    gs = GridSearchCV(estimator=RandomForestClassifier(n_estimators=best_n_estimators,random_state=10),
                      param_grid=param_list,scoring='roc_auc',cv=5,n_jobs=-1)
    gs.fit(X_train_pca,y_train)
    print(gs.best_params_,gs.best_score_)
    return gs.best_params_['max_depth'], gs.best_params_['min_samples_split'], gs.best_params_['min_samples_leaf']

param_list_2 = {'max_depth':np.arange(1,61,10),
                'min_samples_split':np.arange(2,61,10),
                'min_samples_leaf':np.arange(1,61,10)}
best_max_depth, best_min_samples_split, best_min_samples_leaf = search_2(param_list_2)

param_list_2 = {'max_depth':np.arange(best_max_depth-10,best_max_depth+10,5),
                'min_samples_split':np.arange(2,5),
                'min_samples_leaf':np.arange(best_min_samples_leaf-10,best_min_samples_leaf+10,5)}
best_max_depth, best_min_samples_split, best_min_samples_leaf = search_2(param_list_2)

param_list_2 = {'max_depth':np.arange(best_max_depth-5,best_max_depth+5,2),
                'min_samples_split':np.arange(2,5),
                'min_samples_leaf':np.arange(best_min_samples_leaf-5,best_min_samples_leaf+5,2)}
best_max_depth, best_min_samples_split, best_min_samples_leaf = search_2(param_list_2)

param_list_2 = {'max_depth':np.arange(best_max_depth-3,best_max_depth+3),
                'min_samples_split':np.arange(2,3),
                'min_samples_leaf':np.arange(best_min_samples_leaf-3,best_min_samples_leaf+3)}
best_max_depth, best_min_samples_split, best_min_samples_leaf = search_2(param_list_2)

{'max_depth': 11, 'min_samples_leaf': 11, 'min_samples_split': 2} 0.815994491668
{'max_depth': 11, 'min_samples_leaf': 11, 'min_samples_split': 2} 0.815994491668
{'max_depth': 14, 'min_samples_leaf': 10, 'min_samples_split': 2} 0.813401317184
{'max_depth': 11, 'min_samples_leaf': 11, 'min_samples_split': 2} 0.815994491668
CPU times: user 5.14 s, sys: 410 ms, total: 5.55 s
Wall time: 2min 55s


In [51]:
%%time
def search_3(param_list):
    gs = GridSearchCV(estimator=RandomForestClassifier(random_state=10),
                      param_grid=param_list,scoring='roc_auc',cv=5,n_jobs=-1)
    gs.fit(X_train_pca,y_train)
    print(gs.best_params_,gs.best_score_)
    return gs.best_params_['n_estimators'], gs.best_params_['max_depth'], gs.best_params_['min_samples_split'], gs.best_params_['min_samples_leaf']

param_list_3 = {'n_estimators':np.arange(best_n_estimators-3,best_n_estimators+3),
                'max_depth':np.arange(best_max_depth-3,best_max_depth+3),
                'min_samples_split':np.arange(2,3),
                'min_samples_leaf':np.arange(best_min_samples_leaf-3,best_min_samples_leaf+3)}
best_n_estimators, best_max_depth, best_min_samples_split, best_min_samples_leaf = search_3(param_list_3)

{'max_depth': 11, 'min_samples_leaf': 11, 'min_samples_split': 2, 'n_estimators': 222} 0.816598730718
CPU times: user 2.15 s, sys: 174 ms, total: 2.32 s
Wall time: 1min 44s


In [54]:
rfc_2 = RandomForestClassifier(n_estimators=best_n_estimators,
                               max_depth=best_max_depth,
                               min_samples_split=best_min_samples_split,
                               min_samples_leaf=best_min_samples_leaf,oob_score=True,random_state=10)
rfc_2.fit(X_train_pca,y_train)
rfc_2.oob_score_
result_2 = pd.DataFrame(rfc_2.predict(X_target_pca),columns=['result'])
result_2.to_csv('result_2.csv',index=False)