In [35]:
import pandas as pd
from shapash.data.data_loader import data_loading
from category_encoders import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from shapash.explainer.smart_explainer import SmartExplainer

# 数据读取

In [27]:
house_df, house_dict = data_loading("house_prices")
y_df = house_df["SalePrice"].to_frame()
X_df = house_df[house_df.columns.difference(["SalePrice"])]

In [42]:
y_df.shape

(1460, 1)

In [29]:
X_df.head()

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,856,854,0,3,Single-family Detached,Typical - slight dampness allowed,No Exposure/No Basement,706,0,Good Living Quarters,...,Warranty Deed - Conventional,0,Paved,8,856,"All public Utilities (E,G,W,& S)",0,2003,2003,2008
2,1262,0,0,3,Single-family Detached,Typical - slight dampness allowed,Good Exposure,978,0,Average Living Quarters,...,Warranty Deed - Conventional,0,Paved,6,1262,"All public Utilities (E,G,W,& S)",298,1976,1976,2007
3,920,866,0,3,Single-family Detached,Typical - slight dampness allowed,Mimimum Exposure,486,0,Good Living Quarters,...,Warranty Deed - Conventional,0,Paved,6,920,"All public Utilities (E,G,W,& S)",0,2001,2002,2008
4,961,756,0,3,Single-family Detached,Good,No Exposure/No Basement,216,0,Average Living Quarters,...,Warranty Deed - Conventional,0,Paved,7,756,"All public Utilities (E,G,W,& S)",0,1915,1970,2006
5,1145,1053,0,4,Single-family Detached,Typical - slight dampness allowed,Average Exposure,655,0,Good Living Quarters,...,Warranty Deed - Conventional,0,Paved,9,1145,"All public Utilities (E,G,W,& S)",192,2000,2000,2008


# 数据预处理

In [30]:
categorical_features = [col for col in X_df.columns if X_df[col].dtype == "object"]
encoder = OrdinalEncoder(cols = categorical_features).fit(X_df)
X_df = encoder.transform(X_df)
X_df.head()

  elif pd.api.types.is_categorical(cols):


Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,856,854,0,3,1,1,1,706,0,1,...,1,0,1,8,856,1,0,2003,2003,2008
2,1262,0,0,3,1,1,2,978,0,2,...,1,0,1,6,1262,1,298,1976,1976,2007
3,920,866,0,3,1,1,3,486,0,1,...,1,0,1,6,920,1,0,2001,2002,2008
4,961,756,0,3,1,2,1,216,0,2,...,1,0,1,7,756,1,0,1915,1970,2006
5,1145,1053,0,4,1,1,4,655,0,1,...,1,0,1,9,1145,1,192,2000,2000,2008


# 模型训练

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, train_size = 0.75)
reg = RandomForestRegressor(n_estimators = 200, min_samples_leaf = 2).fit(X_train, y_train)
y_pred = pd.DataFrame(reg.predict(X_test), columns = ["pred"], index = X_test.index)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



# 模型解释

In [40]:
xpl = SmartExplainer(features_dict = house_dict)
xpl.compile(
    x = X_test,
    model = reg,
    preprocessing = encoder,
    y_pred = y_pred
)

Backend: Shap TreeExplainer


In [41]:
app = xpl.run_app()

INFO:numexpr.utils:Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


INFO:root:Your Shapash application run on http://Wangzf:8050/


Dash is running on http://0.0.0.0:8050/



INFO:root:Use the method .kill() to down your app.
INFO:shapash.webapp.smart_app:Dash is running on http://0.0.0.0:8050/



 * Serving Flask app 'shapash.webapp.smart_app' (lazy loading)
 * Environment: production


In [43]:
app.kill()