# 机器学习建模

In [2]:
# 导包
import pandas as pd
import numpy as np
# 预处理
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
# 建模
from sklearn.model_selection import train_test_split

# 把Graphviz加入path环境变量
# import os
# os.environ['PATH'] += os.pathsep + 'C:\Program Files (x86)\Graphviz2.38\bin'

In [3]:
# 总的预处理函数
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,slr=False,dp=False,lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it and turn every column into a numeric feature.

    Parameters
    ----------
    sl, le, npr, amh, tsc, wa, pl5 : bool
        Scaling choice for, respectively, 'satisfaction_level',
        'last_evaluation', 'number_project', 'average_monthly_hours',
        'time_spend_company', 'Work_accident' and 'promotion_last_5years'.
        Falsy -> MinMaxScaler, truthy -> StandardScaler.
    slr, dp : bool
        Encoding choice for 'salary' and 'department'.
        Falsy -> integer encoding followed by MinMaxScaler
        ('salary' uses low=0 / medium=1 / high=2, unknown values map to 0;
        'department' uses LabelEncoder).
        Truthy -> one-hot encoding through pd.get_dummies.
    lower_d : bool
        When truthy, the features are reduced with PCA before being returned.
    ld_n : int
        Number of PCA components, only used when ``lower_d`` is truthy.

    Returns
    -------
    (features, label)
        ``features`` is a DataFrame, or the ndarray produced by PCA when
        ``lower_d`` is truthy; ``label`` is the 'left' column as a Series.
    """
    # Read the raw data.
    df = pd.read_csv('./data/HR.csv')

    # 1. Cleaning: drop rows without a satisfaction level, rows whose
    #    last_evaluation exceeds 1 and rows whose salary is the string 'nme'.
    df = df.dropna(subset=['satisfaction_level'])
    # One combined mask instead of chained df[mask1][mask2]: the chained form
    # indexes the already filtered frame with a full-length mask, which makes
    # pandas warn and reindex the mask. The selected rows are the same.
    keep_rows = (df['last_evaluation'] <= 1) & (df['salary'] != 'nme')
    # Explicit copy so the column assignments below never write into a view.
    df = df.loc[keep_rows].copy()

    # 2. Separate the label from the features.
    label = df['left']
    df = df.drop('left', axis=1)

    # 3. Feature selection: there are few features, so none are removed.

    # 4. Feature transformation.
    # Continuous columns: standardise when the flag is set, min-max otherwise.
    continuous_flags = [sl, le, npr, amh, tsc, wa, pl5]
    continuous_columns = ['satisfaction_level','last_evaluation','number_project','average_monthly_hours',
                          'time_spend_company','Work_accident','promotion_last_5years']
    for use_standard, column in zip(continuous_flags, continuous_columns):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        # fit_transform returns an (n, 1) array; flatten it to a plain column.
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    # Discrete columns: one-hot when the flag is set, otherwise integer codes
    # rescaled to [0, 1].
    salary_rank = {'low': 0, 'medium': 1, 'high': 2}
    for one_hot, column in zip([slr, dp], ['salary', 'department']):
        if one_hot:
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # Ordered mapping; anything outside the table falls back to 0.
            df[column] = [salary_rank.get(s, 0) for s in df['salary'].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

    # 5. Dimensionality reduction.
    if lower_d:
        # The label has only two classes, so LDA could keep a single
        # component at most; PCA is used instead.
        return PCA(n_components=ld_n).fit_transform(df.values),label
    return df,label

# Run the preprocessing with every option left at its default
# (min-max scaling, integer-encoded salary/department, no PCA).
features,label = hr_preprocessing()
# Bare last expression: the notebook displays the (features, label) tuple.
features,label

(       satisfaction_level  last_evaluation  number_project  \
 0                0.318681         0.265625             0.0   
 1                0.780220         0.781250             0.6   
 2                0.021978         0.812500             1.0   
 3                0.692308         0.796875             0.6   
 4                0.307692         0.250000             0.0   
 ...                   ...              ...             ...   
 14994            0.340659         0.328125             0.0   
 14995            0.307692         0.187500             0.0   
 14996            0.307692         0.265625             0.0   
 14997            0.021978         0.937500             0.8   
 14998            0.307692         0.250000             0.0   
 
        average_monthly_hours  time_spend_company  Work_accident  \
 0                   0.285047               0.125            0.0   
 1                   0.775701               0.500            0.0   
 2                   0.822430         

## 数据集切分为训练集、测试集、验证集

In [23]:
def hr_modeling(features,label,random_state=None):
    """Split the data into train / test / validation parts (6:2:2).

    Parameters
    ----------
    features : object exposing ``.values`` and ``.columns`` (a DataFrame)
        Feature table; its column names are returned as ``f_names``.
    label : object exposing ``.values`` (a Series)
        Target values aligned with ``features``.
    random_state : int or None, optional
        Forwarded to both ``train_test_split`` calls. The default ``None``
        keeps the original unseeded behaviour; pass an int to get the same
        split on every run.

    Returns
    -------
    X_train, Y_train, X_test, Y_test, X_validation, Y_validation, f_names
    """
    f_v = features.values
    l_v = label.values
    f_names = features.columns.values
    # First hold out 20% as the validation part ...
    X_tt,X_validation,Y_tt,Y_validation = train_test_split(
        f_v,l_v,test_size=0.2,random_state=random_state)
    # ... then take 25% of the remaining 80% (= 20% overall) as the test part.
    X_train,X_test,Y_train,Y_test = train_test_split(
        X_tt,Y_tt,test_size=0.25,random_state=random_state)
    # Sizes are printed in the order train, validation, test.
    print(len(X_train),len(X_validation),len(X_test))
    return X_train,Y_train,X_test,Y_test,X_validation,Y_validation,f_names

# Split the preprocessed data; the names are module-level because the
# modelling cells below read them directly.
X_train,Y_train,X_test,Y_test,X_validation,Y_validation,f_names = hr_modeling(features,label)
# Bare last expression: the notebook displays the six arrays.
X_train,Y_train,X_test,Y_test,X_validation,Y_validation

8999 3000 3000


(array([[0.84615385, 0.5       , 0.        , ..., 0.        , 0.66666667,
         1.        ],
        [0.56043956, 0.671875  , 0.6       , ..., 0.        , 0.88888889,
         0.        ],
        [0.67032967, 0.609375  , 0.2       , ..., 0.        , 1.        ,
         0.        ],
        ...,
        [0.43956044, 0.21875   , 0.4       , ..., 0.        , 0.88888889,
         0.5       ],
        [0.40659341, 0.1875    , 0.        , ..., 0.        , 1.        ,
         0.        ],
        [0.30769231, 0.421875  , 0.4       , ..., 0.        , 0.77777778,
         0.5       ]]),
 array([0, 0, 0, ..., 0, 1, 0], dtype=int64),
 array([[0.30769231, 0.15625   , 0.        , ..., 0.        , 0.77777778,
         0.        ],
        [0.47252747, 0.515625  , 0.8       , ..., 0.        , 0.55555556,
         0.5       ],
        [0.69230769, 0.5       , 0.2       , ..., 0.        , 0.66666667,
         0.5       ],
        ...,
        [0.16483516, 0.046875  , 0.4       , ..., 0.        , 

## 监督学习-分类

### KNN

In [5]:
# knn导包
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.metrics import accuracy_score,recall_score,f1_score
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train,Y_train)

# Score the fitted classifier on each part in turn: training set,
# validation set, then test set. Y_pred ends up holding the test predictions.
for split_title, X_split, Y_split in (
    ('Train', X_train, Y_train),
    ('Validation', X_validation, Y_validation),
    ('Test', X_test, Y_test),
):
    Y_pred = knn_clf.predict(X_split)
    print(split_title)
    print('ACC:', accuracy_score(Y_split, Y_pred))
    print('REC:', recall_score(Y_split, Y_pred))
    print('F-Score', f1_score(Y_split, Y_pred))

Train
ACC: 0.9774419379931103
REC: 0.963363081258807
F-Score 0.9528455284552846
Validation
ACC: 0.95
REC: 0.9070422535211268
F-Score 0.8956884561891516
Test
ACC: 0.9486666666666667
REC: 0.9180327868852459
F-Score 0.8971962616822431


In [12]:
# Save the trained model to disk.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so prefer the standalone `joblib` package and only fall back to
# the legacy location on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(knn_clf,'knn_clf')

['knn_clf']

In [7]:
# Load the saved model back and run inference on the test set.
# joblib.load unpickles the file, so only load files you trust.
knn_clf2 = joblib.load('knn_clf')
Y_pred = knn_clf2.predict(X_test)
print('Test2')
for metric_label, metric_fn in (
    ('ACC:', accuracy_score),
    ('REC:', recall_score),
    ('F-Score', f1_score),
):
    print(metric_label, metric_fn(Y_test, Y_pred))

Test2
ACC: 0.9486666666666667
REC: 0.9180327868852459
F-Score 0.8971962616822431


In [14]:
# Model template: (name, estimator) pairs are collected in `models` and
# evaluated uniformly by model_fit_print().
models = []
models.append(('KNN',knn_clf))
def model_fit_print():
    """Fit every estimator in ``models`` and print its scores on each split.

    Reads the module-level names ``models``, ``X_train``, ``Y_train``,
    ``X_validation``, ``Y_validation``, ``X_test`` and ``Y_test``.
    For each estimator it prints the split index (0 = train,
    1 = validation, 2 = test) followed by accuracy, recall and F1.

    Returns
    -------
    tuple or None
        The first ``(name, estimator)`` pair of ``models`` after fitting,
        or ``None`` when ``models`` is empty.
    """
    xy_test = [(X_train,Y_train),(X_validation,Y_validation),(X_test,Y_test)]
    first_pair = None
    for clf_name,clf in models:
        clf.fit(X_train,Y_train)
        for i,(X_part,Y_part) in enumerate(xy_test):
            Y_pred = clf.predict(X_part)
            print(i)
            print(clf_name,"-ACC:",accuracy_score(Y_part,Y_pred))
            print(clf_name,"-REC:",recall_score(Y_part,Y_pred))
            print(clf_name,"-F1:",f1_score(Y_part,Y_pred))
        if first_pair is None:
            first_pair = (clf_name,clf)
    # The return used to sit inside the loop, which stopped after the first
    # estimator; now every entry is evaluated and the first pair is still
    # what gets returned.
    return first_pair

model_fit_print()

0
KNN -ACC: 0.9774419379931103
KNN -REC: 0.963363081258807
KNN -F1: 0.9528455284552846
1
KNN -ACC: 0.95
KNN -REC: 0.9070422535211268
KNN -F1: 0.8956884561891516
2
KNN -ACC: 0.9486666666666667
KNN -REC: 0.9180327868852459
KNN -F1: 0.8971962616822431


('KNN',
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                      weights='uniform'))

### 朴素贝叶斯
朴素贝叶斯通常用于离散特征，不同变体对特征的分布假设不同：
如果特征都是二值（0/1），就用伯努利贝叶斯（BernoulliNB）；若特征是连续值，BernoulliNB 会先按 `binarize` 阈值（默认 0.0）将其二值化；
如果连续特征近似服从高斯分布，就用高斯贝叶斯（GaussianNB）。

In [17]:
from sklearn.naive_bayes import GaussianNB,BernoulliNB
# Evaluate Gaussian naive Bayes through the shared template.
models = [('GaussianNB', GaussianNB())]
model_fit_print()

0
GaussianNB -ACC: 0.8044227136348483
GaussianNB -REC: 0.7120713950211367
GaussianNB -F1: 0.6327212020033388
1
GaussianNB -ACC: 0.8013333333333333
GaussianNB -REC: 0.7084507042253522
GaussianNB -F1: 0.6279650436953808
2
GaussianNB -ACC: 0.8003333333333333
GaussianNB -REC: 0.6707650273224044
GaussianNB -F1: 0.6211258697027199


('GaussianNB', GaussianNB(priors=None, var_smoothing=1e-09))

In [19]:
# Evaluate Bernoulli naive Bayes through the shared template.
models = [('BernoulliNB', BernoulliNB())]
model_fit_print()

0
BernoulliNB -ACC: 0.8449827758639848
BernoulliNB -REC: 0.5025833724753406
BernoulliNB -F1: 0.6053748231966054
1
BernoulliNB -ACC: 0.8383333333333334
BernoulliNB -REC: 0.4535211267605634
BernoulliNB -F1: 0.5704162976085031
2
BernoulliNB -ACC: 0.8313333333333334
BernoulliNB -REC: 0.43989071038251365
BernoulliNB -F1: 0.5599999999999999


('BernoulliNB',
 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))

### 决策树

In [20]:
from sklearn.tree import DecisionTreeClassifier,export_graphviz
# Evaluate a decision tree with the default settings through the shared template.
models = [('DecisionTreeGini', DecisionTreeClassifier())]
model_fit_print()

0
DecisionTree -ACC: 1.0
DecisionTree -REC: 1.0
DecisionTree -F1: 1.0
1
DecisionTree -ACC: 0.9753333333333334
DecisionTree -REC: 0.9577464788732394
DecisionTree -F1: 0.9483960948396094
2
DecisionTree -ACC: 0.9676666666666667
DecisionTree -REC: 0.9494535519125683
DecisionTree -F1: 0.9347679892400808


('DecisionTree',
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'))

In [25]:
# Visualise the decision tree and write it to a PDF file.
import pydotplus
# `sklearn.externals.six` was removed from scikit-learn in 0.23; fall back to
# the standard-library StringIO so this cell still imports on current versions.
try:
    from sklearn.externals.six import StringIO
except ImportError:
    from io import StringIO
# model_fit_print() works on the module-level `models` list, so this cell
# expects `models` to currently hold the decision tree to be drawn.
clf_name,clf = model_fit_print()
# feature_names is passed as a plain list rather than a NumPy array
# (NOTE(review): some scikit-learn releases appear to reject an ndarray here — confirm).
dot_data = export_graphviz(clf,out_file=None,
                           feature_names=list(f_names),
                           class_names=['NL','L'],
                           filled=True,
                           rounded=True,
                           special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf('dt_tree.pdf')

0
DecisionTree -ACC: 1.0
DecisionTree -REC: 1.0
DecisionTree -F1: 1.0
1
DecisionTree -ACC: 0.9746666666666667
DecisionTree -REC: 0.9580318379160637
DecisionTree -F1: 0.9457142857142858
2
DecisionTree -ACC: 0.975
DecisionTree -REC: 0.9614325068870524
DecisionTree -F1: 0.9490142760027193


True

In [26]:
# Decision tree that uses entropy (information gain) as the split criterion.
models = [('DecisionTreeEntropy', DecisionTreeClassifier(criterion='entropy'))]
model_fit_print()

0
DecisionTreeEntropy -ACC: 1.0
DecisionTreeEntropy -REC: 1.0
DecisionTreeEntropy -F1: 1.0
1
DecisionTreeEntropy -ACC: 0.973
DecisionTreeEntropy -REC: 0.9551374819102749
DecisionTreeEntropy -F1: 0.9421841541755888
2
DecisionTreeEntropy -ACC: 0.9756666666666667
DecisionTreeEntropy -REC: 0.9614325068870524
DecisionTreeEntropy -F1: 0.9503063308373043


('DecisionTreeEntropy',
 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'))

In [28]:
# Decision tree with min_impurity_decrease set to 0.1.
models = [('DecisionTreeGini2', DecisionTreeClassifier(min_impurity_decrease=0.1))]
model_fit_print()

0
DecisionTreeGini2 -ACC: 0.8162018002000222
DecisionTreeGini2 -REC: 0.6968430826369545
DecisionTreeGini2 -F1: 0.6447594501718212
1
DecisionTreeGini2 -ACC: 0.8313333333333334
DecisionTreeGini2 -REC: 0.7380607814761215
DecisionTreeGini2 -F1: 0.6684141546526867
2
DecisionTreeGini2 -ACC: 0.8226666666666667
DecisionTreeGini2 -REC: 0.7162534435261708
DecisionTreeGini2 -F1: 0.6615776081424936


('DecisionTreeGini2',
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.1, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'))

### 支持向量机

In [31]:
from sklearn.svm import SVC
# Support vector classifier with gamma='auto' and C=100000, evaluated through the shared template.
models = [('SVM', SVC(gamma='auto', C=100000))]
model_fit_print()

0
SVM -ACC: 0.9756639626625181
SVM -REC: 0.9368616527390901
SVM -F1: 0.9485311398354876
1
SVM -ACC: 0.9676666666666667
SVM -REC: 0.9131693198263386
SVM -F1: 0.9286239882266372
2
SVM -ACC: 0.968
SVM -REC: 0.9297520661157025
SVM -F1: 0.9336099585062242


('SVM', SVC(C=100000, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False))

### 集成方法