In [36]:
# 导包
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

In [37]:
# Load the raw HR dataset from the CSV file
df = pd.read_csv('./data/HR.csv')
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14997,0.11,0.96,6,280,4,0,1,0,support,low
14998,0.37,0.52,2,158,3,0,1,0,support,low
14999,,0.52,2,158,3,0,1,0,support,low
15000,,999999.00,2,158,3,0,1,0,sale,low


In [38]:
# 1. Clean the data
# Drop rows whose satisfaction_level is missing.
df = df.dropna(subset=['satisfaction_level'])
# Keep rows with last_evaluation <= 1 and salary != 'nme'.
# Both conditions are evaluated on the same frame and combined with `&`;
# chaining df[mask1][mask2] would apply a mask built from the unfiltered
# frame to the filtered one, forcing pandas to reindex it and emit a
# "Boolean Series key will be reindexed" warning.
df = df[(df['last_evaluation'] <= 1) & (df['salary'] != 'nme')]
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,support,low
14995,0.37,0.48,2,160,3,0,1,0,support,low
14996,0.37,0.53,2,143,3,0,1,0,support,low
14997,0.11,0.96,6,280,4,0,1,0,support,low


In [39]:
# 2. Extract the label: the 'left' column is the prediction target
label = df['left']
label

0        1
1        1
2        1
3        1
4        1
        ..
14994    1
14995    1
14996    1
14997    1
14998    1
Name: left, Length: 14999, dtype: int64

In [40]:
# Remove the label column so that df only holds feature columns
df = df.drop(columns=['left'])
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,0,sales,low
1,0.80,0.86,5,262,6,0,0,sales,medium
2,0.11,0.88,7,272,4,0,0,sales,medium
3,0.72,0.87,5,223,5,0,0,sales,low
4,0.37,0.52,2,159,3,0,0,sales,low
...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,0,support,low
14995,0.37,0.48,2,160,3,0,0,support,low
14996,0.37,0.53,2,143,3,0,0,support,low
14997,0.11,0.96,6,280,4,0,0,support,low


In [41]:
# 3. Feature selection: there are only a few features, so none are removed for now
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,0,sales,low
1,0.80,0.86,5,262,6,0,0,sales,medium
2,0.11,0.88,7,272,4,0,0,sales,medium
3,0.72,0.87,5,223,5,0,0,sales,low
4,0.37,0.52,2,159,3,0,0,sales,low
...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,0,support,low
14995,0.37,0.48,2,160,3,0,0,support,low
14996,0.37,0.53,2,143,3,0,0,support,low
14997,0.11,0.96,6,280,4,0,0,support,low


In [42]:
# 4. Feature processing
# Continuous attributes
def preprocess1(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False):
    """Rescale the numeric columns of the module-level ``df`` in place.

    Every flag picks the scaler for one column:
    False -> MinMaxScaler, True -> StandardScaler.

    sl  -- satisfaction_level
    le  -- last_evaluation
    npr -- number_project
    amh -- average_monthly_hours
    tsc -- time_spend_company
    wa  -- Work_accident
    pl5 -- promotion_last_5years

    Returns None; the scaled values overwrite the original columns of ``df``.
    """
    # Column name -> "use StandardScaler?" flag, in the processing order.
    standardize_by_column = {
        'satisfaction_level': sl,
        'last_evaluation': le,
        'number_project': npr,
        'average_monthly_hours': amh,
        'time_spend_company': tsc,
        'Work_accident': wa,
        'promotion_last_5years': pl5,
    }
    for column, standardize in standardize_by_column.items():
        scaler = StandardScaler() if standardize else MinMaxScaler()
        # The scalers expect a 2-D input, hence the reshape to a single column.
        df[column] = scaler.fit_transform(df[column].values.reshape(-1,1))

# Standardize satisfaction_level and last_evaluation; the other numeric
# columns keep the default (MinMaxScaler).
preprocess1(sl=True,le=True)
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary
0,-0.936495,-1.087275,0.0,0.285047,0.125,0.0,0.0,sales,low
1,0.752814,0.840707,0.6,0.775701,0.500,0.0,0.0,sales,medium
2,-2.022479,0.957554,1.0,0.822430,0.250,0.0,0.0,sales,medium
3,0.431041,0.899131,0.6,0.593458,0.375,0.0,0.0,sales,low
4,-0.976716,-1.145699,0.0,0.294393,0.125,0.0,0.0,sales,low
...,...,...,...,...,...,...,...,...,...
14994,-0.856051,-0.853580,0.0,0.257009,0.125,0.0,0.0,support,low
14995,-0.976716,-1.379394,0.0,0.299065,0.125,0.0,0.0,support,low
14996,-0.976716,-1.087275,0.0,0.219626,0.125,0.0,0.0,support,low
14997,-2.022479,1.424944,0.8,0.859813,0.250,0.0,0.0,support,low


In [43]:
# Discrete attributes
def preprocess2(slr=False,dp=False):
    """Encode the categorical columns of the module-level ``df``.

    slr -- salary:     False -> numeric label encoding, True -> one-hot encoding
    dp  -- department: False -> numeric label encoding, True -> one-hot encoding

    With label encoding, salary is converted through ``map_salary`` and
    department through ``LabelEncoder``; the resulting codes are then
    rescaled with ``MinMaxScaler``. With one-hot encoding the global ``df``
    is rebound to the frame returned by ``pd.get_dummies``.

    Returns None.
    """
    global df
    for column, one_hot in (('salary', slr), ('department', dp)):
        if one_hot:
            df = pd.get_dummies(df,columns=[column])
            continue
        if column == 'salary':
            codes = [map_salary(value) for value in df['salary'].values]
        else:
            codes = LabelEncoder().fit_transform(df[column])
        df[column] = codes
        # Min-max scale the numeric codes
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1))

# Custom numeric mapping so that each salary level gets the value we want
def map_salary(s):
    """Return the numeric code of a salary level.

    'low' -> 0, 'medium' -> 1, 'high' -> 2; any other value maps to 0.
    """
    salary_codes = {'low': 0, 'medium': 1, 'high': 2}
    if s in salary_codes:
        return salary_codes[s]
    return 0

# Both flags default to False, so salary and department are label-encoded
# and then min-max scaled.
preprocess2()
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary
0,-0.936495,-1.087275,0.0,0.285047,0.125,0.0,0.0,0.777778,0.0
1,0.752814,0.840707,0.6,0.775701,0.500,0.0,0.0,0.777778,0.5
2,-2.022479,0.957554,1.0,0.822430,0.250,0.0,0.0,0.777778,0.5
3,0.431041,0.899131,0.6,0.593458,0.375,0.0,0.0,0.777778,0.0
4,-0.976716,-1.145699,0.0,0.294393,0.125,0.0,0.0,0.777778,0.0
...,...,...,...,...,...,...,...,...,...
14994,-0.856051,-0.853580,0.0,0.257009,0.125,0.0,0.0,0.888889,0.0
14995,-0.976716,-1.379394,0.0,0.299065,0.125,0.0,0.0,0.888889,0.0
14996,-0.976716,-1.087275,0.0,0.219626,0.125,0.0,0.0,0.888889,0.0
14997,-2.022479,1.424944,0.8,0.859813,0.250,0.0,0.0,0.888889,0.0


In [44]:
# 5. Dimensionality reduction
def preprocess3(lower_d=False,ld_n=1):
    """Optionally project the module-level ``df`` onto ``ld_n`` PCA components.

    lower_d -- when falsy nothing is computed and None is returned
    ld_n    -- number of components passed to PCA

    Returns the array produced by ``PCA.fit_transform(df.values)`` when
    ``lower_d`` is truthy, otherwise None. ``df`` itself is not modified.
    """
    if not lower_d:
        return None
    # The label has only two classes, so LDA could only reduce to a single
    # dimension; PCA is used instead of LDA.
    # return LinearDiscriminantAnalysis(n_components=ld_n)
    reducer = PCA(n_components=ld_n)
    return reducer.fit_transform(df.values)

# lower_d defaults to False, so no reduction is performed and df is unchanged
preprocess3()
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary
0,-0.936495,-1.087275,0.0,0.285047,0.125,0.0,0.0,0.777778,0.0
1,0.752814,0.840707,0.6,0.775701,0.500,0.0,0.0,0.777778,0.5
2,-2.022479,0.957554,1.0,0.822430,0.250,0.0,0.0,0.777778,0.5
3,0.431041,0.899131,0.6,0.593458,0.375,0.0,0.0,0.777778,0.0
4,-0.976716,-1.145699,0.0,0.294393,0.125,0.0,0.0,0.777778,0.0
...,...,...,...,...,...,...,...,...,...
14994,-0.856051,-0.853580,0.0,0.257009,0.125,0.0,0.0,0.888889,0.0
14995,-0.976716,-1.379394,0.0,0.299065,0.125,0.0,0.0,0.888889,0.0
14996,-0.976716,-1.087275,0.0,0.219626,0.125,0.0,0.0,0.888889,0.0
14997,-2.022479,1.424944,0.8,0.859813,0.250,0.0,0.0,0.888889,0.0


In [46]:
# Complete preprocessing pipeline
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,
                     slr=False,dp=False,lower_d=False,ld_n=1,data_path='./data/HR.csv'):
    """Load the HR dataset and run all preprocessing steps in one call.

    Parameters
    ----------
    sl, le, npr, amh, tsc, wa, pl5 : bool
        Scaler choice for satisfaction_level, last_evaluation, number_project,
        average_monthly_hours, time_spend_company, Work_accident and
        promotion_last_5years respectively.
        False -> MinMaxScaler, True -> StandardScaler.
    slr, dp : bool
        Encoding choice for salary and department respectively.
        False -> numeric label encoding followed by MinMaxScaler,
        True -> one-hot encoding via ``pd.get_dummies``.
    lower_d : bool
        When true, the features are reduced with PCA before being returned.
    ld_n : int
        Number of PCA components (only used when ``lower_d`` is true).
    data_path : str
        Location of the CSV file to read; defaults to the path the notebook
        has always used.

    Returns
    -------
    tuple
        ``(features, label)``. ``features`` is a DataFrame, or the array
        returned by ``PCA.fit_transform`` when ``lower_d`` is true.
        ``label`` is the 'left' column of the cleaned data.
    """
    # Load the data
    df = pd.read_csv(data_path)

    # 1. Clean the data
    df = df.dropna(subset=['satisfaction_level'])
    # Combine both conditions into one mask computed on the same frame.
    # Chaining df[mask1][mask2] applies a mask built from the unfiltered frame
    # to the filtered one and triggers a pandas reindexing warning.
    keep = (df['last_evaluation'] <= 1) & (df['salary'] != 'nme')
    df = df[keep]

    # 2. Extract the label
    label = df['left']
    df = df.drop('left',axis=1)

    # 3. Feature selection: there are only a few features, so none are removed

    # 4. Feature processing
    # Continuous attributes: column name -> "use StandardScaler?" flag
    standardize_by_column = {
        'satisfaction_level': sl,
        'last_evaluation': le,
        'number_project': npr,
        'average_monthly_hours': amh,
        'time_spend_company': tsc,
        'Work_accident': wa,
        'promotion_last_5years': pl5,
    }
    for column, standardize in standardize_by_column.items():
        scaler = StandardScaler() if standardize else MinMaxScaler()
        # Scalers need 2-D input; flatten the result back to 1-D for the column.
        df[column] = scaler.fit_transform(df[column].values.reshape(-1,1)).ravel()

    # Discrete attributes
    # Custom numeric mapping for salary; values not listed here map to 0.
    salary_codes = {'low': 0, 'medium': 1, 'high': 2}
    for column, one_hot in (('salary', slr), ('department', dp)):
        if one_hot:
            df = pd.get_dummies(df,columns=[column])
            continue
        if column == 'salary':
            df[column] = [salary_codes.get(s, 0) for s in df[column].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Min-max scale the numeric codes
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

    # 5. Dimensionality reduction
    if lower_d:
        # The label has only two classes, so LDA could only reduce to a single
        # dimension; PCA is used instead of LDA.
        # return LinearDiscriminantAnalysis(n_components=ld_n)
        return PCA(n_components=ld_n).fit_transform(df.values),label
    return df,label