In [3]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from explainerdashboard import ClassifierExplainer, ExplainerDashboard

In [4]:
missing_marks = pd.read_csv('./../../datasets/missingmarks_clean_data.csv')

In [5]:
x = missing_marks.drop(columns=['institution', 'system', 'feature'])
y = missing_marks['feature']

In [9]:
x.columns

Index(['data_loss_gen', 'data_loss_human', 'data_loss_machine',
       'data_loss_hybrid', 'system_error_chance'],
      dtype='object')

In [6]:
x_train_val, x_test, y_train_val, y_test = train_test_split(x, y, test_size=0.2, random_state=30)

In [7]:
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.25, random_state=30)

In [21]:
x_test.shape

(2010, 5)

In [10]:
feature_description ={
    'data_loss_gen': "General reasons for data loss",
    'data_loss_human': "Human related reasons for data loss",
    'data_loss_machine': "Machine related reasons for data loss",
    'data_loss_hybrid': "Human and machine related reasons for data loss",
    'system_error_chance': "General error chance on Digital and Hybrid sytems"
}

In [8]:
rf_tuned = RandomForestClassifier(
    random_state=42,
    n_estimators=4,
    max_features='sqrt',
    max_leaf_nodes=4,
    max_depth=4,
    min_impurity_decrease=1e-5,
    bootstrap=True,
    max_samples=0.9,
)

In [11]:
rf_tuned.fit(x_train, y_train)

In [22]:
rf_explainer = ClassifierExplainer(rf_tuned, x_test, 
#cats=['data_loss_gen', 'data_loss_human', 'data_loss_machine', 'data_loss_hybrid', 'system_error_chance'],
                                #cats_notencoded={'Embarked': 'Stowaway'}, # defaults to 'NOT_ENCODED'
                                descriptions=feature_description, # adds a table and hover labels to dashboard
                                labels=['Not Missing', 'Missing'], # defaults to ['0', '1', etc]
                                #idxs = test_names, # defaults to X.index
                                #index_name = "Passenger", # defaults to X.index.name
                                target = "feature")# defaults to y.name)

Detected RandomForestClassifier model: Changing class type to RandomForestClassifierExplainer...
Note: model_output=='probability', so assuming that raw shap output of RandomForestClassifier is in probability space...
Generating self.shap_explainer = shap.TreeExplainer(model)


In [23]:
rf_explained = ExplainerDashboard(rf_explainer, 
                        title="Data Loss & Missing Marks Explainer", # defaults to "Model Explainer"
                        shap_interaction=False, # you can switch off tabs with bools
                        )


Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
No y labels were passed to the Explainer, so setting model_summary=False...
Generating layout...
Calculating shap values...



In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i,

Calculating dependencies...
Calculating ShadowDecTree for each individual decision tree...
Calculating predictions...
Calculating prediction probabilities...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...


In [24]:
rf_explained.run(port=5000)

Starting ExplainerDashboard on http://192.168.0.103:5000
Dash is running on http://0.0.0.0:5000/

Dash is running on http://0.0.0.0:5000/

 * Serving Flask app 'explainerdashboard.dashboards'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.0.103:5000
Press CTRL+C to quit
192.168.0.103 - - [14/Jan/2023 17:04:20] "GET / HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:21] "GET /assets/bootstrap.min.css?m=1673416985.5852113 HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:21] "GET /_dash-component-suites/dash/deps/polyfill@7.v2_6_2m1667463249.12.1.min.js HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:21] "GET /_dash-component-suites/dash/deps/react@16.v2_6_2m1667463249.14.0.min.js HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:21] "GET /_dash-component-suites/dash_bootstrap_components/_components/dash_bootstrap_components.v1_3_0m1673416982.min.js HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:21] "GET /_dash-component-suites/dash/deps/react-dom@16.v2_6_2m1667463249.14.0.min.js HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:21] "GET /_dash-component-suites/dash/deps/prop-types@15.v2_6_2m16674

Calculating pred_percentiles...


192.168.0.103 - - [14/Jan/2023 17:04:27] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:27] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:27] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:27] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:27] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:27] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:27] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:27] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:27] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:27] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:27] "GET /assets/favicon.ico?m=1673416985.673553 HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/20

pos_label should either be int or str in self.labels!


192.168.0.103 - - [14/Jan/2023 17:04:32] "POST /_dash-update-component HTTP/1.1" 200 -

In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, 

pos_label should either be int or str in self.labels!


192.168.0.103 - - [14/Jan/2023 17:04:33] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:33] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:33] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:33] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:33] "POST /_dash-update-component HTTP/1.1" 200 -
192.168.0.103 - - [14/Jan/2023 17:04:39] "GET /_dash-component-suites/dash/dcc/async-slider.js HTTP/1.1" 200 -


In [85]:
rf_tuned.score(x_train, y_train), rf_tuned.score(x_val, y_val)

(0.9653227144516343, 0.9701492537313433)

In [87]:
rf_tuned.score(x_test, y_test)

0.9686567164179104

In [88]:
predictions_1 = rf_tuned.predict(x_test)

In [90]:
probability = rf_tuned.predict_proba(x_train)
probability

array([[0.95701408, 0.04298592],
       [0.95701408, 0.04298592],
       [0.95701408, 0.04298592],
       ...,
       [0.97966998, 0.02033002],
       [0.94933739, 0.05066261],
       [0.96278195, 0.03721805]])

In [96]:
probability = rf_tuned.predict_proba(x_test)
probability

array([[9.37781955e-01, 6.22180451e-02],
       [9.52064903e-01, 4.79350974e-02],
       [9.52064903e-01, 4.79350974e-02],
       ...,
       [9.52064903e-01, 4.79350974e-02],
       [7.60650261e-04, 9.99239350e-01],
       [9.57014080e-01, 4.29859196e-02]])

In [91]:


accuracy = accuracy_score(y_test, predictions_1)
print('Accuracy = {:.3f}%' .format(accuracy*100))

Accuracy = 96.866%


In [95]:
print(classification_report(y_test, predictions_1))

              precision    recall  f1-score   support

           1       0.96      1.00      0.98      1689
           2       1.00      0.81      0.89       321

    accuracy                           0.97      2010
   macro avg       0.98      0.90      0.94      2010
weighted avg       0.97      0.97      0.97      2010

