In [2]:
# install dependency
!pip3 install interpret

Collecting interpret
  Downloading interpret-0.6.9-py3-none-any.whl.metadata (1.0 kB)
Collecting interpret-core==0.6.9 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.9->interpret)
  Downloading interpret_core-0.6.9-py3-none-any.whl.metadata (2.8 kB)
Collecting SALib>=1.3.3 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.9->interpret)
  Downloading salib-1.5.1-py3-none-any.whl.metadata (11 kB)
Collecting dill>=0.2.5 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.9->interpret)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting aplr>=10.6.1 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.9->interpret)
  Downloading aplr-10.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting dash>=1.0.0 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.9->interpret)
  Download

In [3]:
# load dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from interpret.blackbox import ShapKernel
from interpret import show
import matplotlib.pyplot as plt


In [4]:
# seed numpy's global (legacy) random generator so repeated runs draw the same numbers
np.random.seed(seed=42)

In [5]:
# load the boston housing dataset from the original source
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# raw string: "\s" is a regex whitespace class, not a python string escape
# (a plain "\s+" triggers an invalid-escape warning on recent python versions);
# the first 22 lines of the file are skipped and no header row is read
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# parse the data: every record spans two consecutive rows of raw_df
#   even rows -> all columns are features
#   odd rows  -> first two columns are features, third column is the target
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# create a dataframe with meaningful feature names
# feature names based on the original boston dataset description
feature_names = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
    'ptratio', 'b', 'lstat'
]
data_df = pd.DataFrame(data, columns=feature_names)
data_df['price'] = target

# last expression of the cell: rich preview of the first rows
data_df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [6]:
# separate the 'price' column (target) from the remaining columns (features)
y = data_df['price']
x = data_df.drop(columns=['price'])

In [7]:
# hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


In [8]:
# build a 100-tree random forest regressor (seeded) and fit it on the training split
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(x_train, y_train)

In [9]:
# predict prices for the held-out test features with the fitted forest
y_pred = rf_model.predict(X=x_test)

In [10]:
# score the test-set predictions against the true prices, then report both metrics
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"r-squared score: {r2:.4f}")
print(f"mean squared error: {mse:.4f}")

r-squared score: 0.8923
mean squared error: 7.9015


In [11]:
# wrap the fitted forest in interpret's ShapKernel black-box explainer;
# the training features are handed over as `data`, and the feature column
# labels are passed so contributions are reported by name
shap = ShapKernel(
    model=rf_model,
    data=x_train,
    feature_names=list(x.columns),
)



In [15]:
# generate local explanations for the first test sample
# select the sample once with explicit positional indexing; the double brackets
# keep a one-row DataFrame / one-element Series instead of collapsing a dimension
first_sample = x_test.iloc[[0]]
first_label = y_test.iloc[[0]]
local_explanation = shap.explain_local(first_sample, first_label, name='shap local explanation')

# extract the explanation data for the first record (index 0)
first_record_explanation = local_explanation.data(0)  # get data for the first instance

# print the numerical values for the first record
print("Local Explanation for First Record:")
# predict on the same one-row frame that was explained, so the two cannot drift apart
print(f"Predicted: {rf_model.predict(first_sample)[0]}")
print(f"True Label/Value: {first_label.iloc[0]}")
print("Feature Contributions:")
# 'names' and 'scores' are read as parallel sequences: one contribution per feature
for feature_name, contribution in zip(first_record_explanation['names'], first_record_explanation['scores']):
    print(f"{feature_name}: {contribution}")


  0%|          | 0/1 [00:00<?, ?it/s]

Local Explanation for First Record:
Predicted: 22.839000000000002
True Label/Value: 23.6
Feature Contributions:
crim: 0.32560780167800674
zn: 0.0
indus: -0.09117549806294976
chas: 0.0
nox: 0.20308621512025044
rm: -1.668225100955519
age: -0.18889139112309894
dis: -0.10270191448195874
rad: 0.0
tax: -0.04131562106328942
ptratio: 0.32133036160536876
b: 0.02945741743850106
lstat: 1.2580034724189473


In [14]:
# visualize local explanations for the first test sample
# (passes the explanation object built in the previous cell to interpret's `show`)
# NOTE(review): the recorded execution counts are out of order (this cell is In [14],
# the cell that creates `local_explanation` is In [15]) -- re-run with
# Restart Kernel -> Run All before sharing so the saved outputs match the code
show(local_explanation)