## Try this Notebook in Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1N9AAGF58Dya2u-EObjucPvgZ3VEoLQZR?usp=sharing)

## Install dependencies

In [None]:
! pip install --quiet "numpy>=1.0.0,<2.0.0" "pandas>=1.0.0,<2.0.0" "matplotlib>=3.5.2,<3.6.0" scikit-learn shap==0.40.0

## Diabetes Progression Prediction as a Regression problem

In [None]:
import shap
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

### Loading data and preprocessing

In [None]:
# Load the diabetes dataset that ships with scikit-learn
data = datasets.load_diabetes()
# List the fields available on the returned object; the cells below read
# `DESCR`, `data`, `feature_names` and `target` from it
print(data.keys())

In [None]:
# Print the description text stored on the dataset object
print(data.DESCR)

In [None]:
# Assemble the feature columns and the target into one frame for inspection:
# features first (named after `data.feature_names`), then a trailing `target` column
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
# Preview the first five rows
df.head()

### Split Dataset into Training and Test Sets

In [None]:
# Model inputs: one column per feature, labelled with the dataset's feature names
X = pd.DataFrame(data=data.data, columns=data.feature_names)
# Regression target, taken directly from the loaded dataset
y = data.target

In [None]:
# Keep 80% of the rows for training and the remainder for testing;
# the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)
print('Train samples:', X_train.shape[0])
print('Test samples:', X_test.shape[0])

### Training model

In [None]:
# First model: a random forest with 50 trees, each limited to depth 15.
# `random_state` is fixed (same seed as the train/test split) so that the
# fitted forest -- and therefore the metrics and SHAP values computed from
# it -- are reproducible across "Restart & Run All".
clf = RandomForestRegressor(n_estimators=50, max_depth=15, random_state=42)
clf.fit(X_train, y_train)
# Show the full hyperparameter set, including defaults that were not set explicitly
print(clf.get_params())

### Computing predictions

In [None]:
# Predict on the training split first, then on the test split, with the fitted model
y_pred_train, y_pred_test = (clf.predict(features) for features in (X_train, X_test))

### Computing metrics

In [None]:
# Collect MAE and MSE for both splits, keyed as '<split>/<metric name>'
metrics = {}
metrics['train/mean_absolute_error'] = mean_absolute_error(y_train, y_pred_train)
metrics['train/mean_squared_error'] = mean_squared_error(y_train, y_pred_train)
metrics['test/mean_absolute_error'] = mean_absolute_error(y_test, y_pred_test)
metrics['test/mean_squared_error'] = mean_squared_error(y_test, y_pred_test)
print('Tree 1 metrics:', metrics)

### Computing SHAP values

In [None]:
# SHAP values for the first model, computed on the test split
explainer = shap.TreeExplainer(model=clf)
shap_values = explainer.shap_values(X=X_test)

## Training another model with different hyperparameters

In [None]:
# Second model: more trees (150) but shallower (max depth 10).
# `random_state` is fixed (same seed as the train/test split) so the fitted
# forest, its metrics and its SHAP values are reproducible across runs.
# NOTE: this cell reuses the names `clf`, `y_pred_train`, `y_pred_test`,
# `metrics`, `explainer` and `shap_values`, so the first model's objects are
# replaced once it has run.
clf = RandomForestRegressor(n_estimators=150, max_depth=10, random_state=42)

clf.fit(X_train, y_train)
# Show the full hyperparameter set, including defaults that were not set explicitly
print(clf.get_params())

# Predictions on both splits
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# MAE and MSE for both splits, keyed as '<split>/<metric name>'
metrics = {
    'train/mean_absolute_error': mean_absolute_error(y_train, y_pred_train),
    'train/mean_squared_error': mean_squared_error(y_train, y_pred_train),
    'test/mean_absolute_error': mean_absolute_error(y_test, y_pred_test),
    'test/mean_squared_error': mean_squared_error(y_test, y_pred_test)
}
print('Tree 2 metrics:', metrics)

# SHAP values for the second model, computed on the test split
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_test)