In [15]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing

In [16]:
# Input CSV that the loading cell reads (the "full" file by default).
original_data_path = "original_data/bank-additional-full.csv"
# Small-sample dataset: intended for testing compute-intensive algorithms (e.g. SVM).
# Uncomment the next line to switch the input to it.
# original_data_path = "original_data/bank-additional.csv"
# Directory prefix for processed outputs (same prefix as the paths used in the export cell).
processed_data_path = "processed_data/"

In [17]:
# Read the semicolon-separated source file and recode pdays == 999 as -1.
# NOTE(review): 999 presumably is the dataset's "never contacted" sentinel - confirm
# against the dataset description. The variable keeps the name `bank_data_small`
# even though the full file is loaded by default.
bank_data_small = (
    pd.read_csv(original_data_path, sep=";")
    .assign(pdays=lambda frame: frame["pdays"].replace(999, -1))
)
# Summary statistics of the numeric columns.
print(bank_data_small.describe())

               age      duration      campaign         pdays      previous  \
count  41188.00000  41188.000000  41188.000000  41188.000000  41188.000000   
mean      40.02406    258.285010      2.567593     -0.741988      0.172963   
std       10.42125    259.279249      2.770014      1.510327      0.494901   
min       17.00000      0.000000      1.000000     -1.000000      0.000000   
25%       32.00000    102.000000      1.000000     -1.000000      0.000000   
50%       38.00000    180.000000      2.000000     -1.000000      0.000000   
75%       47.00000    319.000000      3.000000     -1.000000      0.000000   
max       98.00000   4918.000000     56.000000     27.000000      7.000000   

       emp.var.rate  cons.price.idx  cons.conf.idx     euribor3m   nr.employed  
count  41188.000000    41188.000000   41188.000000  41188.000000  41188.000000  
mean       0.081886       93.575664     -40.502600      3.621291   5167.035911  
std        1.570960        0.578840       4.628198    

In [18]:
# Group the column names by storage dtype (text / integer / floating point).
string_features = bank_data_small.columns[bank_data_small.dtypes.eq("object")].values
int_features = bank_data_small.columns[bank_data_small.dtypes.eq("int64")].values
float_features = bank_data_small.columns[bank_data_small.dtypes.eq("float64")].values
# All numeric columns: integer names first, then floating-point names.
numeric_features = np.concatenate([int_features, float_features])

# Hand-maintained groups that decide how each text column is encoded later:
# yes/no columns, ordered categories, and unordered categories (one-hot).
bin_features = ['default', 'housing', 'loan', 'y']
order_features = ['education']
disorder_features = [
    'poutcome',
    'job',
    'marital',
    'contact',
    'month',
    'day_of_week',
]

三种不同的缺失值填补方法

In [19]:
# Fill missing values with the per-column mean.
def Missing_value_perprocessing_mean(bank_data_small_train, bank_data_small_test):
    """Impute NaNs in both frames with column means learned from the training frame.

    The imputer is fitted on ``bank_data_small_train`` only and then applied to
    both frames, so no statistics are taken from the test frame.

    Parameters
    ----------
    bank_data_small_train, bank_data_small_test : pandas.DataFrame
        Fully numeric frames sharing the same columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Imputed train and test frames. Both are rebuilt from the imputer's
        array output, so they carry the original column names but a fresh
        default RangeIndex.
    """
    # `sklearn.preprocessing.Imputer` no longer exists in current scikit-learn;
    # `sklearn.impute.SimpleImputer` is its replacement. Fall back to the old
    # class only on installations that predate SimpleImputer.
    try:
        from sklearn.impute import SimpleImputer
    except ImportError:
        from sklearn.preprocessing import Imputer as SimpleImputer

    col = bank_data_small_train.columns
    # Column-wise mean imputation (the old `axis=0` was the default and is gone
    # from the new API).
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(bank_data_small_train)
    bank_data_small_train = pd.DataFrame(imp.transform(bank_data_small_train), columns=col)
    bank_data_small_test = pd.DataFrame(imp.transform(bank_data_small_test), columns=col)
    return bank_data_small_train, bank_data_small_test

def _impute_with_classifier(train, test, estimator):
    """Fill NaNs column by column using predictions from ``estimator``.

    Columns that contain NaN in ``train`` are the imputation targets; every
    other column is used as a predictor. For each target the estimator is
    re-fitted on the training rows where the target is known, then used to
    predict the unknown rows of both frames.

    Works on copies, keeps the original row order and index, and never calls
    ``predict`` on an empty selection.
    """
    train = train.copy()
    test = test.copy()

    # Number of missing values per column, restricted to columns that have any.
    missing_counts = {
        column: int(count)
        for column, count in train.isnull().sum().items()
        if count > 0
    }
    if not missing_counts:
        return train, test

    # Fewest missing values first (stable for ties).
    targets = sorted(missing_counts, key=missing_counts.get)
    # Predictors are the columns that are complete in the training frame.
    # NOTE(review): a NaN in one of these columns of `test` is not handled here.
    predictors = [column for column in train.columns if column not in missing_counts]

    for feature in targets:
        train_missing = train[feature].isnull()
        test_missing = test[feature].isnull()

        estimator.fit(train.loc[~train_missing, predictors],
                      train.loc[~train_missing, feature])

        # `train_missing` always selects at least one row for a target column.
        train.loc[train_missing, feature] = estimator.predict(
            train.loc[train_missing, predictors])
        # The test frame may have no gaps in this column; skip the call then.
        if test_missing.any():
            test.loc[test_missing, feature] = estimator.predict(
                test.loc[test_missing, predictors])

    return train, test


# Fill missing values with a random forest.
def Missing_value_perprocessing_rf(bank_data_small_train, bank_data_small_test, estimator=None):
    """Impute missing values with a random-forest classifier fitted on the training frame.

    Parameters
    ----------
    bank_data_small_train, bank_data_small_test : pandas.DataFrame
        Numeric frames sharing the same columns.
    estimator : object with ``fit(X, y)`` / ``predict(X)``, optional
        Model to use instead of the default
        ``RandomForestClassifier(n_estimators=100)``; it is re-fitted once per
        imputed column. Pass a forest with ``random_state`` set for
        reproducible results.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of the inputs, same row order and index, with the columns that
        had NaN in the training frame filled in.
    """
    if estimator is None:
        from sklearn.ensemble import RandomForestClassifier
        estimator = RandomForestClassifier(n_estimators=100)
    return _impute_with_classifier(bank_data_small_train, bank_data_small_test, estimator)


# Fill missing values with k-nearest neighbours.
def Missing_value_perprocessing_knn(bank_data_small_train, bank_data_small_test, estimator=None):
    """Impute missing values with a k-nearest-neighbours classifier fitted on the training frame.

    Parameters
    ----------
    bank_data_small_train, bank_data_small_test : pandas.DataFrame
        Numeric frames sharing the same columns.
    estimator : object with ``fit(X, y)`` / ``predict(X)``, optional
        Model to use instead of the default ``KNeighborsClassifier()``; it is
        re-fitted once per imputed column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of the inputs, same row order and index, with the columns that
        had NaN in the training frame filled in.
    """
    if estimator is None:
        from sklearn.neighbors import KNeighborsClassifier
        estimator = KNeighborsClassifier()
    return _impute_with_classifier(bank_data_small_train, bank_data_small_test, estimator)

In [20]:
# Convert yes/no features to 1/0.
def bin_features_perprocessing(bin_features, bank_data):
    """Encode yes/no columns as floats: 'yes' -> 1.0, 'no' -> 0.0, anything else -> NaN.

    Parameters
    ----------
    bin_features : iterable of str
        Names of the columns to encode.
    bank_data : pandas.DataFrame
        Frame holding those columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``bank_data`` object, with the listed columns replaced by
        float64 columns.
    """
    yes_no_codes = {'yes': 1.0, 'no': 0.0}
    for feature in bin_features:
        # Vectorized lookup: values outside the mapping become NaN. Unlike a
        # positional `bank_data[feature][i]` loop this does not depend on the
        # frame having a default 0..n-1 index.
        bank_data[feature] = bank_data[feature].map(yes_no_codes).astype(float)
    return bank_data

In [21]:
# Features whose values have a natural order are replaced by their rank (1, 2, ..., n),
# e.g. education level.
def order_features_perprocessing(order_features, bank_data):
    """Replace education levels by their rank and mark "unknown" as missing.

    The listed column(s) of ``bank_data`` are overwritten in place with float
    values: the seven known levels map to 1..7 in the order given below and
    "unknown" maps to a missing value. The same ``bank_data`` object is returned.
    """
    ranked_levels = [
        "illiterate",
        "basic.4y",
        "basic.6y",
        "basic.9y",
        "high.school",
        "professional.course",
        "university.degree",
    ]
    source_values = ranked_levels + ["unknown"]
    # Ranks start at 1; the trailing None is the replacement for "unknown".
    target_values = [position + 1 for position in range(len(ranked_levels))] + [None]

    recoded = bank_data[order_features].replace(source_values, target_values)
    # The cast to float turns the None placeholder into NaN.
    bank_data[order_features] = recoded.astype("float")
    return bank_data

In [22]:
# Features without a natural order are one-hot encoded.
def disorder_features_perprocessing(disorder_features, bank_data):
    """One-hot encode each listed column and drop the original column.

    For every name in ``disorder_features`` the indicator columns are named
    ``<feature>_<value>`` and appended on the right of the frame, after which
    the source column is removed. A new frame is returned; the frame passed
    in is not modified.
    """
    for column_name in disorder_features:
        # `prefix` produces the "<feature>_<value>" column names directly.
        indicator_columns = pd.get_dummies(
            bank_data[column_name], prefix=column_name, prefix_sep='_'
        )
        # Append the indicators, then remove the column they were derived from.
        bank_data = pd.concat([bank_data, indicator_columns], axis=1).drop(column_name, axis=1)
    return bank_data

In [23]:
# Min-max normalisation.
def Scale_perprocessing(Train):
    """Rescale every column of ``Train`` with a freshly fitted MinMaxScaler.

    The scaler (default settings) is fitted on the frame it is given. The
    result is a new DataFrame with the same column names and a default
    RangeIndex; the input frame is left untouched.
    """
    column_names = Train.columns
    min_max = preprocessing.MinMaxScaler()
    # Fit and transform a copy so the caller's frame is never modified.
    scaled_values = min_max.fit_transform(Train.copy())
    return pd.DataFrame(scaled_values, columns=column_names)

使用上面定义的一系列函数来处理数据

In [24]:
# Turn the text features into numeric ones, in three stages:
#   1. yes/no features            -> 1 / 0
#   2. ordered features           -> rank
#   3. unordered features         -> one-hot indicator columns
bank_data_small = (
    bank_data_small
    .pipe(lambda frame: bin_features_perprocessing(bin_features, frame))
    .pipe(lambda frame: order_features_perprocessing(order_features, frame))
    .pipe(lambda frame: disorder_features_perprocessing(disorder_features, frame))
)

In [25]:
# Shuffle the rows, then split them into a training and a test set.
import math  # not used in this cell; kept in case other cells rely on it

TRAIN_FRACTION = 0.8   # share of rows that goes to the training set
SHUFFLE_SEED = 12      # fixed seed so the shuffle is reproducible

# Shuffle into a separate name: re-running this cell then always starts from
# the same row order and therefore always produces the same split.
bank_data_shuffled = bank_data_small.sample(frac=1, random_state=SHUFFLE_SEED)

# Position of the first test row, computed once and used for both slices.
n_train_rows = round(bank_data_shuffled.shape[0] * TRAIN_FRACTION)
bank_data_small_train = bank_data_shuffled.iloc[:n_train_rows, :]
bank_data_small_test = bank_data_shuffled.iloc[n_train_rows:, :]

In [26]:
# Missing-value handling: exactly one of the three strategies below is active.
# NOTE(review): the frames passed here still contain the target column 'y'
# (it is only split off in the following cell), so the classifier-based
# imputers can use it as a predictor - confirm this is intended.
# Option 1: column mean
#bank_data_small_train,bank_data_small_test = Missing_value_perprocessing_mean(bank_data_small_train,bank_data_small_test)
# Option 2 (active): k-nearest neighbours
bank_data_small_train,bank_data_small_test = Missing_value_perprocessing_knn(bank_data_small_train,bank_data_small_test)
# Option 3: random forest
#bank_data_small_train,bank_data_small_test = Missing_value_perprocessing_rf(bank_data_small_train,bank_data_small_test) 

In [27]:
# Separate the predictors from the target column 'y'.
# The y frames get a fresh 0..n-1 index so that they line up row by row with
# the scaled X frames built below (which also carry a default index).
X_train_small = bank_data_small_train.drop(['y'], axis=1).copy()
y_train_small = bank_data_small_train[['y']].reset_index(drop=True)

X_test_small = bank_data_small_test.drop(['y'], axis=1).copy()
y_test_small = bank_data_small_test[['y']].reset_index(drop=True)

# Min-max scaling: learn the per-column min/max from the training set only and
# apply that same transformation to the test set. Fitting a second scaler on
# the test set would put the two sets on different scales.
# Test values outside the training range can therefore fall outside [0, 1].
feature_columns = X_train_small.columns
scaler = preprocessing.MinMaxScaler()
X_train_small = pd.DataFrame(scaler.fit_transform(X_train_small), columns=feature_columns)
X_test_small = pd.DataFrame(scaler.transform(X_test_small), columns=feature_columns)

In [28]:
# Export the processed splits as CSV files under `processed_data_path`.
# Create the output directory first so the writes do not fail on a fresh checkout.
os.makedirs(processed_data_path, exist_ok=True)

X_test_output_path = os.path.join(processed_data_path, "X_test.csv")
y_test_output_path = os.path.join(processed_data_path, "y_test.csv")
X_train_output_path = os.path.join(processed_data_path, "X_train.csv")
y_train_output_path = os.path.join(processed_data_path, "y_train.csv")

# File names to use instead when the small-sample dataset was loaded:
# X_test_output_path = os.path.join(processed_data_path, "X_test_small.csv")
# y_test_output_path = os.path.join(processed_data_path, "y_test_small.csv")
# X_train_output_path = os.path.join(processed_data_path, "X_train_small.csv")
# y_train_output_path = os.path.join(processed_data_path, "y_train_small.csv")

# The row index is not written; rows of the X and y files correspond by position.
X_test_small.to_csv(X_test_output_path, index=False)
y_test_small.to_csv(y_test_output_path, index=False)
X_train_small.to_csv(X_train_output_path, index=False)
y_train_small.to_csv(y_train_output_path, index=False)