# 机器学习建模

In [2]:
# 导包
import pandas as pd
import numpy as np
# 预处理
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
# 建模
from sklearn.model_selection import train_test_split

# 把Graphviz加入path环境变量
# import os
# os.environ['PATH'] += os.pathsep + r'C:\Program Files (x86)\Graphviz2.38\bin'

In [3]:
# Master preprocessing routine for the HR dataset
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,slr=False,dp=False,lower_d=False,ld_n=1,path='./data/HR.csv'):
    """Load the HR CSV, clean it, scale/encode every column and split off the label.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: per-column switches for, in order,
            'satisfaction_level', 'last_evaluation', 'number_project',
            'average_monthly_hours', 'time_spend_company', 'Work_accident'
            and 'promotion_last_5years'. False -> MinMaxScaler,
            True -> StandardScaler.
        slr, dp: switches for 'salary' and 'department'. False -> integer
            encoding followed by MinMaxScaler, True -> one-hot encoding via
            ``pd.get_dummies``.
        lower_d: when True, return the PCA projection of the features
            instead of the DataFrame.
        ld_n: number of PCA components used when ``lower_d`` is True.
        path: location of the CSV file to read.

    Returns:
        ``(features, label)``. ``features`` is a DataFrame, or the ndarray
        produced by PCA when ``lower_d`` is True; ``label`` is the 'left'
        column.
    """
    # Read the data
    df = pd.read_csv(path)
    # 1. Cleaning: drop rows without a satisfaction level, then remove the
    #    out-of-range evaluations and the bogus 'nme' salary value.
    df = df.dropna(subset=['satisfaction_level'])
    # Both conditions are combined into ONE mask. Chaining two boolean
    # selections (df[a][b]) forces pandas to re-align the second mask
    # against the already-filtered frame and emits a reindexing warning.
    keep_rows = (df['last_evaluation'] <= 1) & (df['salary'] != 'nme')
    df = df[keep_rows]
    # 2. Separate the label
    label = df['left']
    df = df.drop('left',axis=1)
    # 3. Feature selection: there are few features, so none are dropped
    # 4. Feature transformation
    # Continuous attributes
    continuous_flags = [sl,le,npr,amh,tsc,wa,pl5]
    continuous_columns = ['satisfaction_level','last_evaluation','number_project','average_monthly_hours',
                          'time_spend_company','Work_accident','promotion_last_5years']
    for use_standard,column in zip(continuous_flags,continuous_columns):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        df[column] = scaler.fit_transform(df[column].values.reshape(-1,1))
    # Discrete attributes
    # Salary gets an explicit ordinal mapping; unknown values fall back to 0
    salary_rank = {'low':0,'medium':1,'high':2}
    for one_hot,column in zip([slr,dp],['salary','department']):
        if one_hot:
            df = pd.get_dummies(df,columns=[column])
            continue
        if column == 'salary':
            df[column] = [salary_rank.get(s,0) for s in df['salary'].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Bring the integer codes into the [0, 1] range
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1))
    # 5. Dimensionality reduction
    if lower_d:
        # The label has only two classes, so LDA could keep at most one
        # component; PCA is used instead.
        # return LinearDiscriminantAnalysis(n_components=ld_n)
        return PCA(n_components=ld_n).fit_transform(df.values),label
    return df,label

# Run the preprocessing with every option left at its default, then display both parts.
preprocessed = hr_preprocessing()
features,label = preprocessed
features,label

(       satisfaction_level  last_evaluation  number_project  \
 0                0.318681         0.265625             0.0   
 1                0.780220         0.781250             0.6   
 2                0.021978         0.812500             1.0   
 3                0.692308         0.796875             0.6   
 4                0.307692         0.250000             0.0   
 ...                   ...              ...             ...   
 14994            0.340659         0.328125             0.0   
 14995            0.307692         0.187500             0.0   
 14996            0.307692         0.265625             0.0   
 14997            0.021978         0.937500             0.8   
 14998            0.307692         0.250000             0.0   
 
        average_monthly_hours  time_spend_company  Work_accident  \
 0                   0.285047               0.125            0.0   
 1                   0.775701               0.500            0.0   
 2                   0.822430         

## 数据集切分为训练集、测试集、验证集

In [20]:
def hr_modeling(features,label,random_state=None):
    """Split the data into training, test and validation parts (6:2:2).

    Args:
        features: feature DataFrame, or a 2-D array such as the one returned
            by ``hr_preprocessing(lower_d=True)``.
        label: target values aligned with ``features``.
        random_state: forwarded to both ``train_test_split`` calls; pass an
            int for a reproducible split. The default (None) keeps the
            split random.

    Returns:
        ``(X_train, Y_train, X_test, Y_test, X_validation, Y_validation,
        f_names, f_v, l_v)`` where ``f_v`` / ``l_v`` are the full feature and
        label arrays and ``f_names`` the feature names.
    """
    if isinstance(features,pd.DataFrame):
        f_v = features.values
        f_names = features.columns.values
    else:
        # Plain arrays carry no column names, so generic ones are generated.
        f_v = np.asarray(features)
        f_names = np.array(['component_%d' % i for i in range(f_v.shape[1])])
    l_v = np.asarray(label)
    # First hold out 20% for validation, then split the remaining 80% 3:1,
    # which gives 60% / 20% / 20% overall.
    X_tt,X_validation,Y_tt,Y_validation = train_test_split(f_v,l_v,test_size=0.2,random_state=random_state)
    X_train,X_test,Y_train,Y_test = train_test_split(X_tt,Y_tt,test_size=0.25,random_state=random_state)
    print(len(X_train),len(X_validation),len(X_test))
    return X_train,Y_train,X_test,Y_test,X_validation,Y_validation,f_names,f_v,l_v

# Split once and unpack the nine returned pieces into module-level names.
split_result = hr_modeling(features,label)
(X_train,Y_train,X_test,Y_test,
 X_validation,Y_validation,
 f_names,f_v,l_v) = split_result
X_train,Y_train,X_test,Y_test,X_validation,Y_validation

8999 3000 3000


(array([[0.87912088, 0.3125    , 0.2       , ..., 0.        , 0.88888889,
         0.5       ],
        [0.51648352, 0.890625  , 0.4       , ..., 0.        , 0.11111111,
         0.        ],
        [0.34065934, 0.15625   , 0.        , ..., 0.        , 0.55555556,
         0.        ],
        ...,
        [0.95604396, 0.8125    , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.79120879, 0.3125    , 0.4       , ..., 0.        , 0.11111111,
         0.5       ],
        [0.31868132, 0.296875  , 0.        , ..., 0.        , 0.77777778,
         0.        ]]),
 array([0, 0, 1, ..., 0, 0, 1], dtype=int64),
 array([[0.28571429, 0.25      , 0.6       , ..., 0.        , 0.33333333,
         0.        ],
        [0.69230769, 0.875     , 0.2       , ..., 0.        , 0.77777778,
         0.5       ],
        [0.71428571, 0.796875  , 0.6       , ..., 0.        , 0.22222222,
         1.        ],
        ...,
        [0.6043956 , 0.609375  , 0.4       , ..., 0.        , 

## 监督学习-分类

### KNN

In [5]:
# knn导包
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.metrics import accuracy_score,recall_score,f1_score
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train,Y_train)

# Score the fitted classifier on each split in turn: training, validation, test.
for split_title,X_part,Y_part in (
    ('Train',X_train,Y_train),
    ('Validation',X_validation,Y_validation),
    ('Test',X_test,Y_test),
):
    Y_pred = knn_clf.predict(X_part)
    print(split_title)
    print('ACC:',accuracy_score(Y_part,Y_pred))
    print('REC:',recall_score(Y_part,Y_pred))
    print('F-Score',f1_score(Y_part,Y_pred))

Train
ACC: 0.974997221913546
REC: 0.9603024574669187
F-Score 0.9475402191653066
Validation
ACC: 0.955
REC: 0.9298245614035088
F-Score 0.9107732980832783
Test
ACC: 0.9526666666666667
REC: 0.9299719887955182
F-Score 0.9034013605442176


In [6]:
# Persist the trained model to disk
try:
    # Standalone package; preferred because `sklearn.externals.joblib` is deprecated
    import joblib
except ImportError:
    # Fallback for environments that only provide the copy bundled with scikit-learn
    from sklearn.externals import joblib
joblib.dump(knn_clf,'knn_clf')



['knn_clf']

In [7]:
# Load the persisted model and run inference on the test split
knn_clf2 = joblib.load('knn_clf')
Y_pred = knn_clf2.predict(X_test)
print('Test2')
for metric_tag,metric_fn in (('ACC:',accuracy_score),('REC:',recall_score),('F-Score',f1_score)):
    print(metric_tag,metric_fn(Y_test,Y_pred))

Test2
ACC: 0.9526666666666667
REC: 0.9299719887955182
F-Score 0.9034013605442176


In [8]:
# Model template: every (name, estimator) pair placed in `models` is fitted and scored
models = []
models.append(('KNN',knn_clf))
def model_fit_print():
    """Fit every estimator in the global ``models`` list and print its scores.

    Each estimator is trained on ``X_train``/``Y_train`` and then scored on
    the training (0), validation (1) and test (2) splits; the split index is
    printed before its accuracy, recall and F1 values.

    Returns:
        The first ``(name, estimator)`` pair of ``models`` (now fitted), or
        None when ``models`` is empty.
    """
    first_entry = None
    xy_test = [(X_train,Y_train),(X_validation,Y_validation),(X_test,Y_test)]
    for clf_name,clf in models:
        clf.fit(X_train,Y_train)
        for i,(X_part,Y_part) in enumerate(xy_test):
            Y_pred = clf.predict(X_part)
            print(i)
            print(clf_name,"-ACC:",accuracy_score(Y_part,Y_pred))
            print(clf_name,"-REC:",recall_score(Y_part,Y_pred))
            print(clf_name,"-F1:",f1_score(Y_part,Y_pred))
        if first_entry is None:
            first_entry = (clf_name,clf)
    # The return must come after the loop: returning from inside it stopped
    # after the first model, so any further entries were never evaluated.
    return first_entry

model_fit_print()

0
KNN -ACC: 0.974997221913546
KNN -REC: 0.9603024574669187
KNN -F1: 0.9475402191653066
1
KNN -ACC: 0.955
KNN -REC: 0.9298245614035088
KNN -F1: 0.9107732980832783
2
KNN -ACC: 0.9526666666666667
KNN -REC: 0.9299719887955182
KNN -F1: 0.9034013605442176


('KNN',
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                      weights='uniform'))

### 朴素贝叶斯
朴素贝叶斯通常用于离散特征：
如果特征都是二值（0/1）的，就用伯努利贝叶斯（BernoulliNB）；若特征是连续值，BernoulliNB 会先按阈值（binarize，默认 0.0）将其二值化；
如果连续特征近似服从高斯分布，就用高斯贝叶斯（GaussianNB）。

In [9]:
from sklearn.naive_bayes import GaussianNB,BernoulliNB
# Evaluate a Gaussian naive Bayes classifier through the shared template.
models = [('GaussianNB',GaussianNB())]
model_fit_print()

0
GaussianNB -ACC: 0.8033114790532281
GaussianNB -REC: 0.695179584120983
GaussianNB -F1: 0.6243633276740238
1
GaussianNB -ACC: 0.8043333333333333
GaussianNB -REC: 0.7314439946018894
GaussianNB -F1: 0.6487133453022143
2
GaussianNB -ACC: 0.801
GaussianNB -REC: 0.6862745098039216
GaussianNB -F1: 0.62143310082435


('GaussianNB', GaussianNB(priors=None, var_smoothing=1e-09))

In [10]:
# Evaluate a Bernoulli naive Bayes classifier through the shared template.
models = [('BernoulliNB',BernoulliNB())]
model_fit_print()

0
BernoulliNB -ACC: 0.845316146238471
BernoulliNB -REC: 0.46975425330812853
BernoulliNB -F1: 0.5881656804733728
1
BernoulliNB -ACC: 0.8416666666666667
BernoulliNB -REC: 0.49527665317139
BernoulliNB -F1: 0.6071133167907362
2
BernoulliNB -ACC: 0.8303333333333334
BernoulliNB -REC: 0.44397759103641454
BernoulliNB -F1: 0.5546806649168853


('BernoulliNB',
 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))

### 决策树

In [11]:
from sklearn.tree import DecisionTreeClassifier,export_graphviz
# Evaluate a decision tree with its default settings through the shared template.
models = [('DecisionTreeGini',DecisionTreeClassifier())]
model_fit_print()

0
DecisionTreeGini -ACC: 1.0
DecisionTreeGini -REC: 1.0
DecisionTreeGini -F1: 1.0
1
DecisionTreeGini -ACC: 0.9766666666666667
DecisionTreeGini -REC: 0.9689608636977058
DecisionTreeGini -F1: 0.953519256308101
2
DecisionTreeGini -ACC: 0.9783333333333334
DecisionTreeGini -REC: 0.9495798319327731
DecisionTreeGini -F1: 0.9542575650950035


('DecisionTreeGini',
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'))

In [12]:
# Visualise the decision tree
import pydotplus
# `sklearn.externals.six` is not available in newer scikit-learn releases;
# the standard library provides StringIO directly.
from io import StringIO
# Refit through the template to obtain the fitted (name, estimator) pair
clf_name,clf = model_fit_print()
# Render the tree as Graphviz DOT source; out_file=None returns it as a string
dot_data = export_graphviz(clf,out_file=None,
                           feature_names=f_names,
                           class_names=['NL','L'],
                           filled=True,
                           rounded=True,
                           special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf('dt_tree.pdf')



0
DecisionTreeGini -ACC: 1.0
DecisionTreeGini -REC: 1.0
DecisionTreeGini -F1: 1.0
1
DecisionTreeGini -ACC: 0.9746666666666667
DecisionTreeGini -REC: 0.9676113360323887
DecisionTreeGini -F1: 0.9496688741721855
2
DecisionTreeGini -ACC: 0.9763333333333334
DecisionTreeGini -REC: 0.9481792717086834
DecisionTreeGini -F1: 0.9501754385964912


True

In [13]:
# Decision tree that chooses splits with the entropy (information gain) criterion
models = [('DecisionTreeEntropy',DecisionTreeClassifier(criterion='entropy'))]
model_fit_print()

0
DecisionTreeEntropy -ACC: 1.0
DecisionTreeEntropy -REC: 1.0
DecisionTreeEntropy -F1: 1.0
1
DecisionTreeEntropy -ACC: 0.983
DecisionTreeEntropy -REC: 0.9676113360323887
DecisionTreeEntropy -F1: 0.9656565656565657
2
DecisionTreeEntropy -ACC: 0.977
DecisionTreeEntropy -REC: 0.9481792717086834
DecisionTreeEntropy -F1: 0.9515108924806747


('DecisionTreeEntropy',
 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'))

In [14]:
# Decision tree built with min_impurity_decrease set to 0.1
models = [('DecisionTreeGini2',DecisionTreeClassifier(min_impurity_decrease=0.1))]
model_fit_print()

0
DecisionTreeGini2 -ACC: 0.821313479275475
DecisionTreeGini2 -REC: 0.7126654064272212
DecisionTreeGini2 -F1: 0.6522491349480969
1
DecisionTreeGini2 -ACC: 0.8206666666666667
DecisionTreeGini2 -REC: 0.7233468286099866
DecisionTreeGini2 -F1: 0.6658385093167702
2
DecisionTreeGini2 -ACC: 0.818
DecisionTreeGini2 -REC: 0.6820728291316527
DecisionTreeGini2 -F1: 0.6407894736842105


('DecisionTreeGini2',
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.1, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'))

### 支持向量机

In [15]:
from sklearn.svm import SVC
# Evaluate a support vector classifier (gamma='auto', C=1000) through the shared template.
models = [('SVM',SVC(gamma='auto',C=1000))]
model_fit_print()

0
SVM -ACC: 0.9605511723524837
SVM -REC: 0.9092627599243857
SVM -F1: 0.9155365215322389
1
SVM -ACC: 0.9623333333333334
SVM -REC: 0.9257759784075573
SVM -F1: 0.9239057239057238
2
SVM -ACC: 0.956
SVM -REC: 0.9061624649859944
SVM -F1: 0.9074333800841514


('SVM', SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False))

### 集成方法-随机森林

In [16]:
from sklearn.ensemble import RandomForestClassifier
# Evaluate a random forest with its default settings through the shared template.
models = [('RandomForest',RandomForestClassifier())]
model_fit_print()

0
RandomForest -ACC: 0.9972219135459496
RandomForest -REC: 0.9891304347826086
RandomForest -F1: 0.9940631678936119
1
RandomForest -ACC: 0.99
RandomForest -REC: 0.9649122807017544
RandomForest -F1: 0.9794520547945206
2
RandomForest -ACC: 0.987
RandomForest -REC: 0.9467787114845938
RandomForest -F1: 0.9719626168224299




('RandomForest',
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=None, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=10,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False))

### 集成方法-提升法AdaBoost

In [17]:
from sklearn.ensemble import AdaBoostClassifier
# Evaluate AdaBoost with its default settings through the shared template.
models = [('AdaBoost',AdaBoostClassifier())]
model_fit_print()

0
AdaBoost -ACC: 0.9592176908545393
AdaBoost -REC: 0.9083175803402647
AdaBoost -F1: 0.9128473046782237
1
AdaBoost -ACC: 0.958
AdaBoost -REC: 0.8974358974358975
AdaBoost -F1: 0.9134615384615385
2
AdaBoost -ACC: 0.9546666666666667
AdaBoost -REC: 0.8851540616246498
AdaBoost -F1: 0.9028571428571429


('AdaBoost',
 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                    n_estimators=50, random_state=None))

## 监督学习-回归

### 线性回归

In [18]:
# 引入线性回归，岭回归，lasso回归
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import mean_squared_error
def regr_test(features,label):
    """Fit plain, ridge and lasso linear regressions and report how well each fits.

    The inputs are printed first. Each model is then trained on the full data
    and evaluated on that same data, printing its name, coefficients and
    mean squared error.

    Args:
        features: DataFrame holding the predictor columns.
        label: Series holding the regression target.
    """
    print('X',features)
    print('Y',label)
    X = features.values
    y = label.values
    candidates = (
        ('LinearRegression',LinearRegression()),
        ('Ridge',Ridge(alpha=0.1)),
        ('Lasso',Lasso(alpha=0.002)),
    )
    for title,regr in candidates:
        regr.fit(X,y)
        Y_pred = regr.predict(X)
        print(title)
        print('Coef:',regr.coef_)
        print('MSE:',mean_squared_error(Y_pred,y))

# Regress last_evaluation on project count and average monthly hours.
regr_inputs = features[['number_project','average_monthly_hours']]
regr_target = features['last_evaluation']
regr_test(regr_inputs,regr_target)

X        number_project  average_monthly_hours
0                 0.0               0.285047
1                 0.6               0.775701
2                 1.0               0.822430
3                 0.6               0.593458
4                 0.0               0.294393
...               ...                    ...
14994             0.0               0.257009
14995             0.0               0.299065
14996             0.0               0.219626
14997             0.8               0.859813
14998             0.0               0.289720

[14999 rows x 2 columns]
Y 0        0.265625
1        0.781250
2        0.812500
3        0.796875
4        0.250000
           ...   
14994    0.328125
14995    0.187500
14996    0.265625
14997    0.937500
14998    0.250000
Name: last_evaluation, Length: 14999, dtype: float64
LinearRegression
Coef: [0.27268022 0.26917309]
MSE: 0.05953800649100494
Ridge
Coef: [0.27265976 0.26914916]
MSE: 0.05953800657114579
Lasso
Coef: [0.25039551 0.24227119]
MSE: 0.059

### 逻辑回归

In [19]:
from sklearn.linear_model import LogisticRegression
# Evaluate logistic regression (C=1000, tol=1e-10) through the shared template.
models = [('LogisticRegression',LogisticRegression(C=1000,tol=1e-10))]
model_fit_print()

0
LogisticRegression -ACC: 0.7977553061451272
LogisticRegression -REC: 0.3686200378071834
LogisticRegression -F1: 0.46153846153846156
1
LogisticRegression -ACC: 0.807
LogisticRegression -REC: 0.41700404858299595
LogisticRegression -F1: 0.5162907268170426
2
LogisticRegression -ACC: 0.7866666666666666
LogisticRegression -REC: 0.3403361344537815
LogisticRegression -F1: 0.4316163410301954




('LogisticRegression',
 LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='warn', tol=1e-10, verbose=0,
                    warm_start=False))

### 人工神经网络

In [26]:
from keras.models import Sequential
# Import from `keras.layers` rather than the `keras.layers.core` submodule,
# which is not present in every Keras release.
from keras.layers import Dense,Activation
from keras.optimizers import SGD
# Network: one input per feature -> 50 sigmoid units -> 2-way softmax
mdl = Sequential()
mdl.add(Dense(50,input_dim=len(f_v[0])))
mdl.add(Activation('sigmoid'))
mdl.add(Dense(2))
mdl.add(Activation('softmax'))
# NOTE(review): this SGD optimizer is created but never passed to compile(),
# which uses 'adam' instead - confirm which optimizer is intended.
sgd = SGD(lr=0.1)
mdl.compile(loss='mse',optimizer='adam')
# One-hot encode the binary target: 1 -> [0,1], anything else -> [1,0]
mdl.fit(X_train,np.array([[0,1] if i==1 else [1,0] for i in Y_train]),epochs=500,batch_size=2048)

xy_test = [(X_train,Y_train),(X_validation,Y_validation),(X_test,Y_test)]
for i,(X_part,Y_part) in enumerate(xy_test):
    # The class is the index of the largest softmax output. This replaces
    # Sequential.predict_classes, which newer Keras releases no longer provide.
    Y_pred = np.argmax(mdl.predict(X_part),axis=1)
    print(i)
    print("NN-ACC:",accuracy_score(Y_part,Y_pred))
    print("NN-REC:",recall_score(Y_part,Y_pred))
    print("NN-F1:",f1_score(Y_part,Y_pred))

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

### 回归树和提升树--梯度提升决策树GBDT

In [27]:
from sklearn.ensemble import GradientBoostingClassifier
# Evaluate gradient-boosted trees (max_depth=6, 100 estimators) through the shared template.
models = [('GBDT',GradientBoostingClassifier(max_depth=6,n_estimators=100))]
model_fit_print()

0
GBDT -ACC: 0.9962218024224914
GBDT -REC: 0.9883068288119738
GBDT -F1: 0.9920187793427231
1
GBDT -ACC: 0.9823333333333333
GBDT -REC: 0.9507959479015919
GBDT -F1: 0.9612289685442575
2
GBDT -ACC: 0.981
GBDT -REC: 0.9460916442048517
GBDT -F1: 0.9609856262833677


('GBDT', GradientBoostingClassifier(criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='deviance', max_depth=6,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=100,
                            n_iter_no_change=None, presort='auto',
                            random_state=None, subsample=1.0, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False))