In [28]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split


from explainerdashboard import ClassifierExplainer, ExplainerDashboard, RegressionExplainer

In [4]:
missing_marks = pd.read_csv('./../../datasets/missingmarks_clean_data.csv')

In [5]:
x = missing_marks.drop(columns=['institution', 'system', 'feature'])
y = missing_marks['feature']

In [9]:
x.columns

Index(['data_loss_gen', 'data_loss_human', 'data_loss_machine',
       'data_loss_hybrid', 'system_error_chance'],
      dtype='object')

In [6]:
x_train_val, x_test, y_train_val, y_test = train_test_split(x, y, test_size=0.2, random_state=30)

In [7]:
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.25, random_state=30)

In [21]:
x_test.shape

(2010, 5)

In [10]:
feature_description ={
    'data_loss_gen': "General reasons for data loss",
    'data_loss_human': "Human related reasons for data loss",
    'data_loss_machine': "Machine related reasons for data loss",
    'data_loss_hybrid': "Human and machine related reasons for data loss",
    'system_error_chance': "General error chance on Digital and Hybrid sytems"
}

In [29]:
regressor_model = RandomForestRegressor().fit(x_train, y_train)

In [33]:
regression_explainer = RegressionExplainer(regressor_model, x_test, y_test, 
                                #cats=['data_loss_gen', 'data_loss_human', 'data_loss_machine','data_loss_hybrid', 'system_error_chance'],
                                descriptions=feature_description
                                )

Changing class type to RandomForestRegressionExplainer...
Generating self.shap_explainer = shap.TreeExplainer(model)


In [34]:
ExplainerDashboard(regression_explainer).run()

Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
Generating layout...
Calculating shap values...
Calculating predictions...
Calculating residuals...
Calculating absolute residuals...



In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i,

Calculating shap interaction values...
Reminder: TreeShap computational complexity is O(TLD^2), where T is the number of trees, L is the maximum number of leaves in any tree and D the maximal depth of any tree. So reducing these will speed up the calculation.
Calculating dependencies...
Calculating importances...
Calculating ShadowDecTree for each individual decision tree...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard on http://192.168.0.103:8050
Dash is running on http://0.0.0.0:8050/

Dash is running on http://0.0.0.0:8050/

Dash is running on http://0.0.0.0:8050/

Dash is running on http://0.0.0.0:8050/

 * Serving Flask app 'explainerdashboard.dashboards'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8050
 * Running on http://192.168.0.103:8050
Press CTRL+C to quit
192.168.0.103 - - [14/Jan/2023 17:22:49] "GET / HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:22:49] "GET /assets/bootstrap.min.css?m=1673416985.5852113 HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:22:49] "GET /_dash-component-suites/dash/deps/polyfill@7.v2_6_2m1667463249.12.1.min.js HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:22:49] "GET /_dash-component-suites/dash/deps/react@16.v2_6_2m1667463249.14.0.min.js HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:22:49] "GET /_dash-component-suites/dash/deps/react-dom@16.v2_6_2m1667463249.14.0.min.js HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:22:50] "GET /_dash-component-suites/dash_bootstrap_components/_components/dash_bootstrap_components.v1_3_0m1673416982.min.js HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:22:50] "GET /_dash-component-suites/dash/deps/prop-types@15.v2_6_2m16674

In [8]:
rf_tuned = RandomForestClassifier(
    random_state=42,
    n_estimators=4,
    max_features='sqrt',
    max_leaf_nodes=4,
    max_depth=4,
    min_impurity_decrease=1e-5,
    bootstrap=True,
    max_samples=0.9,
)

In [11]:
rf_tuned.fit(x_train, y_train)

In [22]:
rf_explainer = ClassifierExplainer(rf_tuned, x_test, 
#cats=['data_loss_gen', 'data_loss_human', 'data_loss_machine', 'data_loss_hybrid', 'system_error_chance'],
                                #cats_notencoded={'Embarked': 'Stowaway'}, # defaults to 'NOT_ENCODED'
                                descriptions=feature_description, # adds a table and hover labels to dashboard
                                labels=['Not Missing', 'Missing'], # defaults to ['0', '1', etc]
                                #idxs = test_names, # defaults to X.index
                                #index_name = "Passenger", # defaults to X.index.name
                                target = "feature")# defaults to y.name)

Detected RandomForestClassifier model: Changing class type to RandomForestClassifierExplainer...
Note: model_output=='probability', so assuming that raw shap output of RandomForestClassifier is in probability space...
Generating self.shap_explainer = shap.TreeExplainer(model)


In [23]:
rf_explained = ExplainerDashboard(rf_explainer, 
                        title="Data Loss & Missing Marks Explainer", # defaults to "Model Explainer"
                        shap_interaction=False, # you can switch off tabs with bools
                        )


Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
No y labels were passed to the Explainer, so setting model_summary=False...
Generating layout...
Calculating shap values...



In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i,

Calculating dependencies...
Calculating ShadowDecTree for each individual decision tree...
Calculating predictions...
Calculating prediction probabilities...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...


In [None]:
rf_explained.run(port=5000)