## 特征值插值

In [40]:
import pandas as pd 
import numpy as np 
import seaborn as sn
import matplotlib.pyplot as plt
import pdvega
import warnings
warnings.filterwarnings('ignore')

## 导入数据，处理年龄、哑变量

In [46]:
# Load the feature table.
data = pd.read_csv('yj_aki_finaleigen.csv')
# Handle implausible ages: every row whose age exceeds 200 is set to 91.4.
# NOTE(review): presumably these are de-identified ages of very old patients;
# confirm the meaning of the 200 / 91.4 constants against the data source.
# A single .loc[rows, column] assignment is used on purpose: the chained form
# data['age'].loc[index] = ... may write to a temporary copy and leave `data`
# unchanged (and the warning about it is silenced by warnings.filterwarnings).
index = data[data['age'] > 200].index
data.loc[index, 'age'] = 91.4
# One-hot encode the categorical columns; the resulting columns are prefixed
# with 'sex_' and 'type_'.
dummy_feature = data[['gender', 'admission_type']]
dummies = pd.get_dummies(dummy_feature, prefix=['sex', 'type'])
# Remove the encoded source columns together with 'vaso' and 'vent', then put
# the dummy columns in front of the remaining features.
data = data.drop(['gender', 'admission_type', 'vaso', 'vent'], axis=1)
datawithdummy = dummies.join(data)
data = datawithdummy
# Drop columns that are not used for modelling.
data = data.drop(['akistarttime', 'icustay_id', 'ethnicity', 'hospital_expire_flag'], axis=1)
data.keys()

Index(['sex_F', 'sex_M', 'type_ELECTIVE', 'type_EMERGENCY', 'type_URGENT',
       'creat', 'classlabel', 'hr_max', 'hr_min', 'hr_avg', 'hr_std', 'hr_mid',
       'hr_25', 'hr_75', 'rr_max', 'rr_min', 'rr_avg', 'rr_std', 'rr_mid',
       'rr_25', 'rr_75', 'sbp_max', 'sbp_min', 'sbp_avg', 'sbp_std', 'sbp_mid',
       'sbp_25', 'sbp_75', 'dbp_max', 'dbp_min', 'dbp_avg', 'dbp_std',
       'dbp_mid', 'dbp_25', 'dbp_75', 'mbp_max', 'mbp_min', 'mbp_avg',
       'mbp_std', 'mbp_mid', 'mbp_25', 'mbp_75', 'si_max', 'si_min', 'si_avg',
       'si_std', 'si_mid', 'si_25', 'si_75', 'spo2_max', 'spo2_min',
       'spo2_avg', 'spo2_std', 'spo2_mid', 'spo2_25', 'spo2_75', 'tem_max',
       'tem_min', 'tem_avg', 'tem_std', 'tem_mid', 'tem_25', 'tem_75',
       'uo_max', 'uo_min', 'uo_avg', 'uo_std', 'uo_mid', 'uo_25', 'uo_75',
       'uosum', 'gcs_max', 'gcs_min', 'gcs_avg', 'gcs_std', 'gcs_mid',
       'gcs_25', 'gcs_75', 'height', 'weight', 'age', 'lostime'],
      dtype='object')

## 比较插值方法

In [47]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler
# RobustScaler instance created once so that later cells can reuse it.
scale = RobustScaler()

### 直接去掉缺失值后建模

In [48]:
# Baseline: discard every row that contains at least one missing value.
# dropna() already returns a new frame, so `data` itself is left untouched.
data_drop = data.dropna()
labelmat_drop = data_drop['classlabel']
datamat_drop = data_drop.drop(['classlabel'], axis=1)
# Replace height and weight by a single BMI feature: weight / (height^2 * 0.0001).
# NOTE(review): the 0.0001 factor presumably converts cm^2 to m^2 — confirm units.
datamat_drop['BMI'] = (datamat_drop['weight']) / (datamat_drop['height'] * datamat_drop['height'] * 0.0001)
datamat_drop = datamat_drop.drop(['height', 'weight'], axis=1)
# Estimate the score on the dropna dataset.
# The scaler lives inside the pipeline so that cross_val_score re-fits it on the
# training part of every fold; fitting it on the full matrix beforehand would
# let the held-out folds influence the preprocessing. This also matches the
# pipeline layout used by the imputation cells.
# NOTE(review): a regressor is scored against 'classlabel', so the reported
# number is the regressor's default score (R^2) — confirm this is intended.
estimator = Pipeline([("scale", RobustScaler()),
                      ("forest", RandomForestRegressor(random_state=0,
                                                       n_estimators=100,
                                                       n_jobs=-1))])
score_drop = cross_val_score(estimator, datamat_drop, labelmat_drop).mean()
print("Score with the dropna dataset = %.5f" % score_drop)

Score with the dropna dataset = 0.79644


### 用平均值插值后建模

In [49]:
# Mean imputation: keep every row and let the pipeline fill each missing
# feature value with the mean of its column.
data_mean = data.copy()
labelmat_mean = data_mean['classlabel']
# Feature matrix: everything except the label, with a BMI column appended
# (weight / (height^2 * 0.0001)) and the raw height / weight columns removed.
datamat_mean = (
    data_mean
    .drop(columns=['classlabel'])
    .assign(BMI=lambda frame: frame['weight'] / (frame['height'] * frame['height'] * 0.0001))
    .drop(columns=['height', 'weight'])
)
# Impute -> scale -> random forest; all three steps are re-fitted per CV fold.
estimator_mean = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)),
    ("scale", RobustScaler()),
    ("forest", RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)),
])
score_mean = cross_val_score(
    estimator_mean,
    datamat_mean,
    labelmat_mean,
).mean()
print("Score after imputation of the missing values with mean = %.5f" % score_mean)

Score after imputation of the missing values with mean = 0.77257


### 用中位数插值后建模

In [50]:
# Median imputation: keep every row and let the pipeline fill each missing
# feature value with the median of its column.
data_median = data.copy()
labelmat_median = data_median['classlabel']
datamat_median = data_median.drop(['classlabel'], axis=1)
# Replace height and weight by a single BMI feature: weight / (height^2 * 0.0001).
datamat_median['BMI'] = (datamat_median['weight']) / (datamat_median['height'] * datamat_median['height'] * 0.0001)
datamat_median = datamat_median.drop(['height', 'weight'], axis=1)
# Impute -> scale -> random forest; all three steps are re-fitted per CV fold.
estimator_median = Pipeline([("imputer", SimpleImputer(missing_values=np.nan,
                                                       strategy="median")),
                             ("scale", RobustScaler()),
                             ("forest", RandomForestRegressor(random_state=0,
                                                              n_estimators=100,
                                                              n_jobs=-1))])
score_median = cross_val_score(estimator_median, datamat_median, labelmat_median).mean()
# The label names the strategy actually used (it previously said "mean").
print("Score after imputation of the missing values with median = %.5f" % score_median)

Score after imputation of the missing values with mean = 0.76975


### 众数插值后建模

In [51]:
# Most-frequent imputation: keep every row and let the pipeline fill each
# missing feature value with the most frequent value of its column.
data_most = data.copy()
labelmat_most = data_most['classlabel']
datamat_most = data_most.drop(['classlabel'], axis=1)
# Replace height and weight by a single BMI feature: weight / (height^2 * 0.0001).
datamat_most['BMI'] = (datamat_most['weight']) / (datamat_most['height'] * datamat_most['height'] * 0.0001)
datamat_most = datamat_most.drop(['height', 'weight'], axis=1)
# Impute -> scale -> random forest; all three steps are re-fitted per CV fold.
estimator_most = Pipeline([("imputer", SimpleImputer(missing_values=np.nan,
                                                     strategy="most_frequent")),
                           ("scale", RobustScaler()),
                           ("forest", RandomForestRegressor(random_state=0,
                                                            n_estimators=100,
                                                            n_jobs=-1))])
score_most = cross_val_score(estimator_most, datamat_most, labelmat_most).mean()
# The label names the strategy actually used (it previously said "mean").
print("Score after imputation of the missing values with most frequent value = %.5f" % score_most)

Score after imputation of the missing values with mean = 0.77110
