In [1]:
import numpy as np
import pandas as pd

# 导入包
from featselector.model_feature_selector import ModelFeatSelector
from featselector.stats_feature_selector import StatFeatSelector

In [2]:
# 使用kaggle上的房价预测数据进行测试
df_test = pd.read_csv('./data/house_price/test.csv')
df_train = pd.read_csv('./data/house_price/train.csv')

df_test['SalePrice'] = -1
df_data = pd.concat([df_train, df_test], ignore_index=True)

In [3]:
df_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 81 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
MSZoning         2915 non-null object
LotFrontage      2433 non-null float64
LotArea          2919 non-null int64
Street           2919 non-null object
Alley            198 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
Utilities        2917 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2918 non-

In [5]:
# 得到不同的特征，类别特征和数值特征
drop_feats = ['Id', 'SalePrice']

number_columns = []
cate_columns = []
label = 'SalePrice'
for col in df_data.columns:
    # 得到类别特征和数值特征
    if df_data[col].dtype == 'object':
        if col == 'Id':
            continue
        cate_columns.append(col)
    else:
        if col == 'SalePrice':
            continue
        number_columns.append(col)

## StatFeatSlector
> 使用统计学的方法，选择特征
1. 根据缺失值选择特征
2. 根据单一值选择特征
3. 根据特征的方差
4. 根据特征间的相关性系数
5. 根据特征和target之间的相关性选择



导入StatFeatureSelector类，共有以下的参数:
0. --------------必要参数-------------------------------------------------
    1. data: pandas DataFrame， 必要参数
    2. label: target的column 的名字， 必要参数
    3. category_columns: 所有类别特征的名字，list形式，必要参数
    4. number_columns: 数值特征，list形式，必要参数
5. --------------可选参数-------------------------------------------------
    + 设置的阈值
    + category_threshold: double，default (0.95)
    + missing_threshold: double, default (0.75)
    + correlation_threshold:  double, default (0.95)
    + std_threshold: double, default (0.05)
    + relativity_threshold: double, 去除先关性最低的k个特征，default (0.0005)
5. --------------可选参数-------------------------------------------------



In [6]:
stat_fs = StatFeatSelector(df_data, label, category_columns=cate_columns, number_columns=number_columns)

In [7]:
# 根据特征的缺失比例进行选择，去除缺失比例大于阈值的特征，阈值可以重新指定，或者使用类的默认值
# column, 是可选参数，默认使用类初始化时的所有特征
drop_feat_by_missing = stat_fs.identify_missing(missing_threshold=0.90, columns=cate_columns+number_columns)

drop_feats = drop_feats + drop_feat_by_missing

Drop features by missing value:              missing_fraction
PoolQC               0.996574
MiscFeature          0.964029
Alley                0.932169


In [8]:
# 根据特征中是有有一个数，出现的比例大于比例(0.95), 如果大于就去除这个特征，
# 可以重新指定参数，默认使用类初始化时的参数阈值，columns默认使用所有的类别特征

drop_feat_by_single = stat_fs.identify_single_unique()
drop_feats = drop_feats + drop_feat_by_single

Drop features by single unique:   drop_feature  single_fraction
0       Street         0.995889
1    Utilities         0.998972
2    LandSlope         0.951696
3   Condition2         0.989723
4     RoofMatl         0.985269
5      Heating         0.984584


In [9]:
# 使用方差大的特征，去除方差小的特征
# 默认使用所有的数值特征
drop_feat_by_std = stat_fs.identify_std(std_threshold=0.5,columns=number_columns)
drop_feats = drop_feats + drop_feat_by_std

Drop features by std:                    std
BsmtHalfBath  0.245687
KitchenAbvGr  0.214462


In [10]:
# 根据特征之间的相关性系数，去除相关性很大的特征中的其中一个，
# 例如a和b相关性大于0.96,则去除其中一个
drop_feat_by_corlinear = stat_fs.identify_corlinear(correlation_threshold=0.95)
drop_feats = drop_feats + drop_feat_by_corlinear

Drop features by correlation: Empty DataFrame
Columns: [drop_feat, corr_feat, corr_value]
Index: []


In [11]:
# 根据特征和target的相关性系数，去除相关性很小的特征，
drop_feat_by_corlinearlabel = stat_fs.identify_corlinearlabel(relativity_threshold=0.02)
drop_feats = drop_feats + drop_feat_by_corlinearlabel

Drop feautures by relativity with target:               corlinear
LowQualFinSF   0.011062
YrSold         0.003983
MiscVal       -0.017902
BsmtFinSF2    -0.019288
BsmtHalfBath  -0.019368


In [12]:
print(len(cate_columns))
print(len(number_columns))

43
37


In [13]:
cate_columns = [col for col in cate_columns if col not in drop_feats]
number_columns = [col for col in number_columns if col not in drop_feats]

print(len(cate_columns))
print(len(number_columns))

34
30


In [14]:
df_data = pd.get_dummies(df_data, columns=cate_columns)
feats = [col for col in df_data.columns if col not in drop_feats]


In [15]:
df_data[feats].head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,150.0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,284.0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,434.0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,540.0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,490.0,...,0,0,0,1,0,0,0,0,1,0


## ModelFeatSelector
> 根据提升树模型进行特征选择，

1. data: DataFrame
2. label: label name
3. task:  任务类型， [regression, classification]
4. model: ['gbdt', 'xgb', 'lgb']
5. importance_threshold, 除去小于阈值的特征， 默认是0.001




In [16]:
df_train = df_data[df_data[label] > 0]
df_test = df_data[df_data[label] < 0]

In [17]:
model_fs = ModelFeatSelector(df_train, label='SalePrice', task='regression', model_name='xgb',importance_threshold=0.01)

In [49]:
drop_feat_by_model = model_fs.identify_importance(feats, k=10, importance_threshold=0.0)

Training xgb Model

[0]	validation_0-rmse:189202	validation_1-rmse:182268
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 50 rounds.
[50]	validation_0-rmse:23905.9	validation_1-rmse:27781.1
[100]	validation_0-rmse:11858	validation_1-rmse:22739.9
[150]	validation_0-rmse:9721.25	validation_1-rmse:22243
[200]	validation_0-rmse:8275.63	validation_1-rmse:22106.2
[250]	validation_0-rmse:7085.54	validation_1-rmse:22036.3
Stopping. Best iteration:
[235]	validation_0-rmse:7394.03	validation_1-rmse:22005

[0]	validation_0-rmse:185568	validation_1-rmse:202632
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 50 rounds.
[50]	validation_0-rmse:23233.2	validation_1-rmse:50647.3
[100]	validation_0-rmse:11930.1	validation_1-rmse:52649.1
Stopping. Best iteration:
[56]	validation_0-rmse:19916.1	validation_1-r

In [50]:
pre_y = model_fs.model.predict(df_test[feats])

In [51]:
df_sub = pd.DataFrame(data={'Id':df_test['Id'], 'SalePrice': pre_y})
df_sub.to_csv("./data/house_price/submition.csv", index=False)

In [52]:
# 根据model去除重要性最小的k个特征
newfeats = [col for col in feats if col not in drop_feat_by_model]

In [54]:
# 使用新特征进行测试
xgb_params = {
    'booster': 'gbtree',
    'learning_rate': 0.01,
    'max_depth': 5,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'n_estimators': 1000,
    'min_child_weight': 3,
    'gamma': 0,
    'silent': True,
    'n_jobs': 4,
    'random_state': 2019,
    'verbose': 1
}
import xgboost as xgb

In [55]:
model = xgb.XGBRegressor(**xgb_params)
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(df_train[newfeats], df_train['SalePrice'], test_size=0.15)
model.fit(X_train, y_train, eval_metric='rmse', eval_set=[(X_train, y_train), (X_valid, y_valid)], early_stopping_rounds=50, verbose=50)


In [59]:
# 根据Model特征选择，去除最后k=10个重要度最低的特征， 得分0.13112
# 特征去除之前同样的参数的情况下，得分是0.13296
df_sub1 = pd.DataFrame(data={'Id':df_test['Id'], 'SalePrice': model.predict(df_test[newfeats])})
df_sub1.to_csv("./data/house_price/submition2.csv", index=False)