In [74]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from pyecharts import Bar
from sklearn.decomposition import PCA

In [75]:
#读取数据，合并统一处理
df_train = pd.read_csv('pfm_train.csv')
df_target = pd.read_csv('pfm_test.csv')
df_train['source'] = 0
df_target['source'] = 1
X = pd.concat([df_train.drop('Attrition', axis=1),df_target],ignore_index=True)

#清洗数据
#print('under18 =',X[X['Age']<18].shape[0] == 0)
X = X.drop(['Over18','EmployeeNumber','StandardHours'],axis=1)
def replace_business_travel(x):
    if x == 'Non-Travel':
        return 1
    elif x == 'Travel_Rarely':
        return 2
    else:
        return 3
X['BusinessTravel'] = X['BusinessTravel'].apply(lambda x:replace_business_travel(x))
X['OverTime'] = LabelEncoder.fit_transform(X['OverTime'],X['OverTime']).astype('str')
X['MonthlyIncome_overmean'] = X['MonthlyIncome'].apply(lambda x:0 if x >= X['MonthlyIncome'].mean() else 1).astype('str')
X['PercentSalaryHike_overmean'] = X['PercentSalaryHike'].apply(lambda x:0 if x >= X['PercentSalaryHike'].mean() else 1).astype('str')
X['TotalWorkingYears_overmean'] = X['TotalWorkingYears'].apply(lambda x:0 if x >= X['TotalWorkingYears'].mean() else 1).astype('str')
X['YearsAtCompany_overmean'] = X['YearsAtCompany'].apply(lambda x:0 if x >= X['YearsAtCompany'].mean() else 1).astype('str')
X['YearsInCurrentRole_overmean'] = X['YearsInCurrentRole'].apply(lambda x:0 if x >= X['YearsInCurrentRole'].mean() else 1).astype('str')
X['YearsSinceLastPromotion_overmean'] = X['YearsSinceLastPromotion'].apply(lambda x:0 if x >= X['YearsSinceLastPromotion'].mean() else 1).astype('str')
X['YearsWithCurrManager_overmean'] = X['YearsWithCurrManager'].apply(lambda x:0 if x >= X['YearsWithCurrManager'].mean() else 1).astype('str')

column_list = ['Department','EducationField','Gender','JobRole','MaritalStatus','OverTime',
               'MonthlyIncome_overmean','PercentSalaryHike_overmean','TotalWorkingYears_overmean',
               'YearsAtCompany_overmean','YearsInCurrentRole_overmean','YearsSinceLastPromotion_overmean',
               'YearsWithCurrManager_overmean']
X_le = pd.get_dummies(X[column_list])
X = pd.merge(X,X_le,how='outer',left_index=True,right_index=True).drop(column_list,axis=1)

min_max_scaler = MinMaxScaler()
X = pd.DataFrame(min_max_scaler.fit_transform(X),columns=X.columns)

#拆分数据
X_train = X[X['source']==0].drop(['source'],axis=1)
y_train = df_train['Attrition']
X_target = X[X['source']==1].drop(['source'],axis=1)

In [81]:
rfc_1 = RandomForestClassifier(oob_score=True, random_state=10)
rfc_1.fit(X_train, y_train)
print(rfc_1.oob_score_)
result_1 = pd.DataFrame(rfc_1.predict(X_target),columns=['result'])
result_1.to_csv('result_3.csv',index=False)

0.836363636364


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


In [83]:
%%time
def search_1(param_list):
    gs = GridSearchCV(estimator=RandomForestClassifier(random_state=10),
                      param_grid=param_list,scoring='roc_auc',cv=5,n_jobs=-1)
    gs.fit(X_train,y_train)
    print(gs.best_params_,gs.best_score_)
    return gs.best_params_['n_estimators']

param_list_1 = {'n_estimators':np.arange(100,301,10)}
best_n_estimators = search_1(param_list_1)

param_list_1 = {'n_estimators':np.arange(best_n_estimators-20,best_n_estimators+20,5)}
best_n_estimators = search_1(param_list_1)

param_list_1 = {'n_estimators':np.arange(best_n_estimators-10,best_n_estimators+10,2)}
best_n_estimators = search_1(param_list_1)

param_list_1 = {'n_estimators':np.arange(best_n_estimators-5,best_n_estimators+5)}
best_n_estimators = search_1(param_list_1)

{'n_estimators': 250} 0.807449996015
{'n_estimators': 245} 0.807734241017
{'n_estimators': 251} 0.807738715847
{'n_estimators': 254} 0.80792846651
CPU times: user 2.27 s, sys: 281 ms, total: 2.55 s
Wall time: 22.5 s


In [93]:
%%time
def search_2(param_list):
    gs = GridSearchCV(estimator=RandomForestClassifier(n_estimators=best_n_estimators,random_state=10),
                      param_grid=param_list,scoring='roc_auc',cv=5,n_jobs=-1)
    gs.fit(X_train,y_train)
    print(gs.best_params_,gs.best_score_)
    return gs.best_params_['max_depth'], gs.best_params_['min_samples_split'], gs.best_params_['min_samples_leaf']

param_list_2 = {'max_depth':np.arange(1,61,10),
                'min_samples_split':np.arange(2,61,10),
                'min_samples_leaf':np.arange(1,61,10)}
best_max_depth, best_min_samples_split, best_min_samples_leaf = search_2(param_list_2)

param_list_2 = {'max_depth':np.arange(best_max_depth-10,best_max_depth+10,5),
                'min_samples_split':np.arange(best_min_samples_split-10,best_min_samples_split+10,5),
                'min_samples_leaf':np.arange(2,5)}
best_max_depth, best_min_samples_split, best_min_samples_leaf = search_2(param_list_2)

param_list_2 = {'max_depth':np.arange(best_max_depth-5,best_max_depth+5),
                'min_samples_split':np.arange(best_min_samples_split-5,best_min_samples_split+5),
                'min_samples_leaf':np.arange(2,5)}
best_max_depth, best_min_samples_split, best_min_samples_leaf = search_2(param_list_2)

{'max_depth': 21, 'min_samples_leaf': 1, 'min_samples_split': 22} 0.809786860972
{'max_depth': 16, 'min_samples_leaf': 4, 'min_samples_split': 12} 0.815253532895
{'max_depth': 15, 'min_samples_leaf': 4, 'min_samples_split': 12} 0.815314667576
CPU times: user 6.56 s, sys: 645 ms, total: 7.2 s
Wall time: 3min 34s


In [96]:
%%time
def search_3(param_list):
    gs = GridSearchCV(estimator=RandomForestClassifier(random_state=10),
                      param_grid=param_list,scoring='roc_auc',cv=5,n_jobs=-1)
    gs.fit(X_train,y_train)
    print(gs.best_params_,gs.best_score_)
    return gs.best_params_['n_estimators'], gs.best_params_['max_depth'], gs.best_params_['min_samples_split'], gs.best_params_['min_samples_leaf']

param_list_3 = {'n_estimators':np.arange(best_n_estimators-3,best_n_estimators+3),
                'max_depth':np.arange(best_max_depth-3,best_max_depth+3),
                'min_samples_split':np.arange(best_min_samples_split-3,best_min_samples_split+3),
                'min_samples_leaf':np.arange(2,5)}
best_n_estimators, best_max_depth, best_min_samples_split, best_min_samples_leaf = search_3(param_list_3)

{'max_depth': 14, 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 254} 0.815133614981
CPU times: user 6.51 s, sys: 595 ms, total: 7.11 s
Wall time: 4min 20s


In [104]:
rfc_2 = RandomForestClassifier(n_estimators=best_n_estimators,
                               max_depth=best_max_depth,
                               min_samples_split=best_min_samples_split,
                               min_samples_leaf=best_min_samples_leaf,oob_score=True,random_state=10)
rfc_2.fit(X_train,y_train)
print(rfc_2.oob_score_)
result_2 = pd.DataFrame(rfc_2.predict(X_target),columns=['result'])
result_2.to_csv('result_3.csv',index=False)

0.854545454545


In [101]:
pca = PCA(n_components=0.95)
pca.fit(X_train)
#print(pca.explained_variance_ratio_)#方差比例
#print(pca.explained_variance_)#方差
#print(pca.components_[0])#各个特征的方差
bar = Bar('components')
bar.add('',X.drop(['source'],axis=1).columns,pca.components_[0],
        xaxis_rotate=45,xaxis_interval=0,label_emphasis_textcolor='#000',is_random=True)
bar

In [102]:
X_train_pca = pca.transform(X_train)
X_target_pca = pca.transform(X_target)
X_train_pca.shape, X_target_pca.shape

((1100, 29), (350, 29))

In [106]:
rfc_1 = RandomForestClassifier(oob_score=True, random_state=10)
rfc_1.fit(X_train_pca, y_train)
print(rfc_1.oob_score_)
result_1 = pd.DataFrame(rfc_1.predict(X_target_pca),columns=['result'])
result_1.to_csv('result_3_pca.csv',index=False)

0.832727272727


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


In [122]:
df_train['DistanceFromHome'].value_counts()

2     159
1     144
10     68
3      63
8      61
9      58
7      55
4      49
5      49
6      47
16     26
24     25
29     23
11     22
18     21
23     20
15     20
20     19
25     19
26     19
19     18
13     17
14     16
17     15
28     15
22     15
12     14
21     12
27     11
Name: DistanceFromHome, dtype: int64

In [113]:
X_train['DistanceFromHome'].unique()

array([ 0.        ,  0.21428571,  0.10714286,  0.28571429,  0.03571429,
        0.75      ,  0.32142857,  0.07142857,  0.25      ,  0.5       ,
        0.64285714,  0.46428571,  0.60714286,  0.57142857,  0.78571429,
        0.82142857,  0.14285714,  1.        ,  0.89285714,  0.17857143,
        0.92857143,  0.53571429,  0.42857143,  0.71428571,  0.85714286,
        0.96428571,  0.39285714,  0.35714286,  0.67857143])