#### 员工离职预测
- 我们有员工的各种统计信息，以及该员工是否已经离职，统计的信息包括了（工资、出差、工作环境满意度、工作投入度、是否加班、是否升职、工资提升比例等）
- 现在需要你来通过训练数据得出 员工离职预测，并给出你在测试集上的预测结果"
- https://www.kaggle.com/c/bi-attrition-prediction/

In [261]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
 

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

In [262]:
def data_etl(data):
    #targte: Attrition, 需要改成0, 1; test数据没有, pass
    if 'Attrition' not in data.columns :
        pass
    else: 
        data['Attrition']  = data['Attrition'].replace({'Yes':1, 'No': 0})
    
    #drop Over18 columns
    data.drop(columns = ['Over18'], inplace = True)

    #对分类型的数据进行labelEncoder

    le = LabelEncoder()

    data['BusinessTravel'] = le.fit_transform(data['BusinessTravel'])
    data['Department'] = le.fit_transform(data['Department'])
    data['EducationField'] = le.fit_transform(data['EducationField'])
    data['JobRole'] = le.fit_transform(data['JobRole'])
    data['MaritalStatus'] = le.fit_transform(data['MaritalStatus'])
    data['Gender'] = le.fit_transform(data['Gender'])
    data['OverTime'] = le.fit_transform(data['OverTime'])

    return data

#训练不同模型并输出分数报告
def get_model_score(clf, X_train, y_train, X_test, y_test):

    clf.fit(X_train, y_train)
    y_predit = clf.predict(X_test)

    # 模型表现
    report = pd.Series()
    report['clfName'] = str(clf).split('(')[0]
    report['score']  = round(clf.score(X_train, y_train), 4)
    report['cv_score'] = round(np.mean(cross_val_score(clf, X_train, y_train, cv=10)),4)
    report['auc_score'] = round(roc_auc_score(y_test, y_predit),4)
    report['clf'] = clf
    report = pd.DataFrame(report).T

    return report

In [263]:
train_data = pd.read_csv(r'E:\bi_course\biCourse\L1\tue_course\train.csv', encoding = 'gb18030')
test_data = pd.read_csv(r'E:\bi_course\biCourse\L1\tue_course\test.csv', encoding = 'gb18030')

#数据探索
print('训练数据行列数:',train_data.shape, '; 测试数据行列数:',test_data.shape)

print('-'*20,'训练集数据探索','-'*20)
train_data.info()
print('-'*40,'-'*40)
print(train_data.describe())

print('-'*20,'测试集数据探索','-'*20)
test_data.info()
print('-'*40,'-'*40)
print(test_data.describe())

#对train 和 test 进行数据清洗: 

train_data = data_etl(train_data)
test_data = data_etl(test_data)

训练数据行列数: (1176, 36) ; 测试数据行列数: (294, 35)
-------------------- 训练集数据探索 --------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1176 entries, 0 to 1175
Data columns (total 36 columns):
user_id                     1176 non-null int64
Age                         1176 non-null int64
Attrition                   1176 non-null object
BusinessTravel              1176 non-null object
DailyRate                   1176 non-null int64
Department                  1176 non-null object
DistanceFromHome            1176 non-null int64
Education                   1176 non-null int64
EducationField              1176 non-null object
EmployeeCount               1176 non-null int64
EmployeeNumber              1176 non-null int64
EnvironmentSatisfaction     1176 non-null int64
Gender                      1176 non-null object
HourlyRate                  1176 non-null int64
JobInvolvement              1176 non-null int64
JobLevel                    1176 non-null int64
JobRole                     1

In [307]:
#特征列表

features = list(set(train_data.columns.to_list()) - set(['user_id','Attrition']) )
X_train,X_test, y_train, y_test = train_test_split(train_data[features], train_data['Attrition'], train_size = 0.7, random_state =1)   #模型训练集和测试集
x_toPredict = test_data[features] #样本外需要预测的数据

#数据规范化
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
x_toPredict = scaler.fit_transform(x_toPredict)

#特征选择

clf = XGBClassifier()
clf.fit(X_train, y_train)

importance = clf.feature_importances_

importance_df = pd.DataFrame()
importance_df['features'] = features
importance_df['importance'] = importance
importance_df.sort_values('importance', ascending= False)

features_select = list(set(importance_df[importance_df['importance']> 0.01].features))

X_train,X_test, y_train, y_test = train_test_split(train_data[features_select], train_data['Attrition'], train_size = 0.7, random_state =1)   #模型训练集和测试集
x_toPredict = test_data[features_select] #样本外需要预测的数据

#数据规范化
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
x_toPredict = scaler.fit_transform(x_toPredict)

In [308]:

#训练不同模型并得到评估结果

clfList = [DecisionTreeClassifier(), LogisticRegression(), GaussianNB(),MultinomialNB(),BernoulliNB(),svm.SVC(), ExtraTreesClassifier(),\
            RandomForestClassifier(), KNeighborsClassifier(), AdaBoostClassifier(),GradientBoostingClassifier(), XGBClassifier(), LinearDiscriminantAnalysis(),]

score_report = pd.DataFrame()

for clf in clfList:

    result = get_model_score(clf, X_train, y_train, X_test, y_test)
    score_report = score_report.append(result)

# score_report
score_report = score_report.sort_values(by = 'auc_score', ascending = False)
score_report

Unnamed: 0,clfName,score,cv_score,auc_score,clf
0,BernoulliNB,0.8433,0.8335,0.7354,"BernoulliNB(alpha=1.0, binarize=0.0, class_pri..."
0,GaussianNB,0.7436,0.7253,0.6979,"GaussianNB(priors=None, var_smoothing=1e-09)"
0,AdaBoostClassifier,0.9113,0.8554,0.6725,"(DecisionTreeClassifier(class_weight=None, cri..."
0,LinearDiscriminantAnalysis,0.87,0.8541,0.6093,"LinearDiscriminantAnalysis(n_components=None, ..."
0,DecisionTreeClassifier,1.0,0.7535,0.6064,"DecisionTreeClassifier(class_weight=None, crit..."
0,GradientBoostingClassifier,0.9696,0.8494,0.6007,([DecisionTreeRegressor(criterion='friedman_ms...
0,XGBClassifier,1.0,0.853,0.5872,"XGBClassifier(base_score=0.5, booster='gbtree'..."
0,LogisticRegression,0.8676,0.8578,0.5869,"LogisticRegression(C=1.0, class_weight=None, d..."
0,KNeighborsClassifier,0.8676,0.8153,0.5701,"KNeighborsClassifier(algorithm='auto', leaf_si..."
0,RandomForestClassifier,0.9854,0.8385,0.5681,"(DecisionTreeClassifier(class_weight=None, cri..."


In [309]:
#拿最优模型预测df_toPredict(即test数据集)

bestModel = score_report.iloc[0]['clf']
bestModel.fit(X_train, y_train)

print('bestModel:', bestModel)

sumbmission = pd.DataFrame()
sumbmission['user_id'] = test_data['user_id']
temp = pd.DataFrame(bestModel.predict_proba(x_toPredict),columns=['不流失概率','Attrition'])
sumbmission['Attrition'] = temp['Attrition'] #用最优模型对y值进行预测
sumbmission = sumbmission.set_index('user_id').sort_index()

sumbmission.to_csv('hr_submission.csv', header = True)

print('submission to csv done')

bestModel: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
submission to csv done
