### 缺失值填充
对比0填充、均值填充、随机森林预测填充的效果

In [16]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.impute import SimpleImputer#填补缺失值的类
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

使用 Boston 房价数据集（注意：`load_boston` 已在 scikit-learn 1.2 中被移除，本笔记需要在更早的版本下运行）

In [17]:
# Load the Boston housing data as a feature matrix and a target vector.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell
# only runs on older releases.
dataset = load_boston()

x_full = dataset.data
y_full = dataset.target
# Rows are samples and columns are features (506 and 13 per the original
# author's notes).
n_samples, n_features = x_full.shape

制作缺失数据

In [18]:
np.random.seed(78)
# Target fraction of matrix cells to blank out.
missing_ratio = 0.3
n_missing_values = int(n_samples * n_features * missing_ratio)

# Row indices are drawn before column indices; keep this order so the seeded
# random stream is consumed the same way on every run.
# NOTE(review): both draws are independent and with replacement, so some
# (row, column) pairs can repeat and the real share of NaNs can end up below
# missing_ratio.
missing_samples = np.random.randint(n_samples, size=n_missing_values)
missing_features = np.random.randint(n_features, size=n_missing_values)
print(missing_samples.size, missing_features.size)

# Work on a copy so the complete matrix x_full stays intact for the baseline.
masked = x_full.copy()
masked[missing_samples, missing_features] = np.nan
x_missing = pd.DataFrame(masked)

x_missing.head()

1973 1973


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,,18.0,2.31,,0.538,,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,,0.0,7.07,,0.469,,78.9,4.9671,,,17.8,,9.14
2,0.02729,,,,0.469,7.185,61.1,4.9671,2.0,,17.8,392.83,4.03
3,,,2.18,0.0,0.458,,,,3.0,,18.7,394.63,2.94
4,0.06905,,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


0填充与均值填充

In [19]:
# Baseline 1: replace every NaN with the constant 0.
zeroImputer = SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan)
x_missing_zero = zeroImputer.fit(x_missing).transform(x_missing)

# Baseline 2: replace every NaN with the mean of its column.
meanImputer = SimpleImputer(strategy='mean', missing_values=np.nan)
x_missing_mean = meanImputer.fit(x_missing).transform(x_missing)

随机森林填充

In [20]:
# Random-forest imputation: each column in turn is treated as a regression
# target and its missing entries are predicted from the remaining columns.
x_missing_reg=x_missing.copy()

# Process columns from fewest to most missing values. argsort returns
# positions, which are used as column labels below; this relies on x_missing
# having default 0..n-1 integer column labels.
sortindex=np.argsort(x_missing.isnull().sum()).values

for i in sortindex:
    df=x_missing_reg
    fillc=df.loc[:,i]

    # A fully observed column has nothing to impute. Skipping it also avoids
    # calling predict() on an empty frame, which scikit-learn rejects.
    if not fillc.isnull().any():
        continue

    # Predictors: every other column plus the target y_full.
    # NOTE(review): using y_full as a predictor leaks the label into the
    # imputed features, so downstream scores on x_missing_reg are optimistic.
    df_others=pd.concat([df.loc[:,df.columns!=i],pd.DataFrame(y_full)],axis=1)

    # Gaps that still exist in the predictor columns are temporarily set to 0.
    # Columns imputed in earlier iterations already carry their predictions.
    df_others=df_others.fillna(0)

    # Rows where column i is observed train the model; rows where it is
    # missing are the ones to predict.
    fill_y_train=fillc[fillc.notnull()]
    fill_y_test=fillc[fillc.isnull()]
    fill_x_train=df_others.loc[fill_y_train.index,:]
    fill_x_test=df_others.loc[fill_y_test.index,:]

    # No random_state is given, so the forest draws from NumPy's global RNG;
    # results repeat only when the whole notebook is re-run from the seed cell.
    rfc=RandomForestRegressor(n_estimators=100)
    rfc.fit(fill_x_train,fill_y_train)
    fill_y_pred=rfc.predict(fill_x_test)
    x_missing_reg.loc[fill_y_test.index,i]=fill_y_pred

# Sanity check: every column should now report zero missing values.
x_missing_reg.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
dtype: int64

对比完整数据与三种填充方式下的 MSE

In [21]:
# Score the same random-forest model on the complete data and on each imputed
# variant, using 5-fold cross-validated mean squared error (lower is better).
mse_labels = ['Full data', 'Zero Imputation', 'Mean Imputation', 'Regressor Imputation']
xs = [x_full, x_missing_zero, x_missing_mean, x_missing_reg]

mses = []
for x in xs:
    rfc = RandomForestRegressor(n_estimators=100, random_state=78)
    fold_scores = cross_val_score(rfc, x, y_full, cv=5, scoring='neg_mean_squared_error')
    # The scorer returns negated MSE, so flip the sign of the fold average.
    scores = fold_scores.mean()
    mses.append(-scores)

list(zip(mse_labels, mses))

[('Full data', 21.513407991166744),
 ('Zero Imputation', 36.123920118734226),
 ('Mean Imputation', 27.503262165385358),
 ('Regressor Imputation', 18.150665942302467)]

可以发现随机森林填充的 MSE 最低，效果不错。但需要注意：填充时把目标值 y 也作为了预测缺失值的特征，存在标签泄漏，因此该结果（甚至优于完整数据）可能偏乐观。