In [None]:
!pip install --upgrade git+https://github.com/goolig/dsClass.git
from dsClass.path_helper import *

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Explainable AI Exercise


[SHAP documentation](https://shap.readthedocs.io/en/latest/)

## The Dataset

Description of features:

age (numeric)

job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')

marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)

education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')

default: has credit in default? (categorical: 'no','yes','unknown')

housing: has housing loan? (categorical: 'no','yes','unknown')

loan: has personal loan? (categorical: 'no','yes','unknown')

contact: contact communication type (categorical: 'cellular','telephone')

month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')

day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')

campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)

pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)

previous: number of contacts performed before this campaign and for this client (numeric)

poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')

In [None]:
# Resolve the location of the bank CSV and read it into a DataFrame.
file_name = get_file_path("bank.csv")
df = pd.read_csv(file_name)

# Target: the "y" column recoded as 0 ("no") / 1 ("yes").
# Features: every other column.
y = df["y"].map({"no": 0, "yes": 1})
X = df.drop(columns="y")

# Columns handed to the preprocessing step as numeric features.
num_features = ["age", "campaign", "pdays", "previous"]

# Columns handed to the preprocessing step as categorical features.
cat_features = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "poutcome",
]

In [None]:
# Preview the first rows of the feature frame (target column already removed).
X.head()

In [None]:
# Histogram of the 0/1-encoded target, to see how the two classes are balanced.
y.hist()

In [None]:
# Build a profiling report over the full DataFrame (features and target together).
# NOTE(review): this import sits mid-notebook rather than with the other imports,
# and pandas_profiling may not be available under this name in newer
# environments — confirm before running.
from pandas_profiling import ProfileReport
report = ProfileReport(df, title='Bank dataset')
# Leaving `report` as the last expression makes the notebook render it.
report

In [None]:
# Hold out 30% of the rows as a test set. Stratifying on y keeps the class
# proportions alike in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

## Create a preprocessing pipeline

https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

In [None]:
# Build the one-hot encoder so that it works across scikit-learn releases:
# newer versions expect `sparse_output`, older ones only accept `sparse`.
# Either way the encoder returns a dense array and ignores categories that
# were not seen during fitting.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    # Older scikit-learn: the keyword is still called `sparse`.
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

# Numeric columns are passed through untouched; categorical columns are
# one-hot encoded. Output columns follow this order (numeric first).
preprocessor = ColumnTransformer([
    ("numerical", "passthrough", num_features),
    ("categorical", one_hot_encoder, cat_features),
])

In [None]:
# Fit the preprocessing step on the training split only.
preprocessor.fit(X_train)

# Category values learned by the fitted one-hot encoder; paired below with
# cat_features, one entry per categorical column.
ohe_categories = preprocessor.named_transformers_["categorical"].categories_

# Name every one-hot column as "<column>__<category value>".
new_ohe_features = []
for column_name, column_categories in zip(cat_features, ohe_categories):
    new_ohe_features.extend(f"{column_name}__{category}" for category in column_categories)

# Full list of output column names: numeric columns first, then the one-hot columns.
all_features = [*num_features, *new_ohe_features]

# Transform both splits and wrap the results in DataFrames with readable column names.
train_matrix = preprocessor.transform(X_train)
test_matrix = preprocessor.transform(X_test)
X_train_processed = pd.DataFrame(train_matrix, columns=all_features)
X_test_processed = pd.DataFrame(test_matrix, columns=all_features)

## Fit models

Use sklearn (already imported) to create a random forest classifier, and fit it on the training data.

In [None]:
# Random forest with 150 trees, depth capped at 15, and a fractional
# min_samples_split of 0.01. n_jobs=-1 requests all available cores and the
# fixed random_state makes training repeatable.
rf_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=15,
    min_samples_split=0.01,
    n_jobs=-1,
    random_state=42,
)

# Train on the preprocessed training split.
rf_model.fit(X_train_processed, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support

# Hard class predictions for the held-out test rows.
predictions = rf_model.predict(X_test_processed)

# Confusion matrix of true labels against predicted labels.
rf_confusion = confusion_matrix(y_test, predictions)
print(rf_confusion)

# Overall accuracy, printed with three decimals.
rf_accuracy = accuracy_score(y_test, predictions)
print('Random forest model accuracy score: {0:0.3f}'.format(rf_accuracy))

# Per-class precision / recall / F1 summary.
rf_report_text = classification_report(y_test, predictions)
print(f'Random forest model full classification report:\n{rf_report_text}')

## SHAP to interpret local predictions

In [None]:
import shap
# Need to load JS vis in the notebook
# (done once here, before the force plots further down are drawn).
shap.initjs() 

### Create an explainer

The parameters passed to the explainer are the trained random forest model and the training data.

In [None]:
# create a tree explainer
# The second argument hands the preprocessed training data to the explainer
# (presumably used as SHAP's background dataset — TODO confirm in the SHAP docs).
explainer = shap.TreeExplainer(rf_model, X_train_processed)

## Explain a specific observation using SHAP

In [None]:
# Position (not index label) of the test row to explain.
i = 4

# Double brackets keep the selection as a one-row DataFrame.
observation = X_test_processed.iloc[[i]]

# True label of that row, looked up by the same position.
true_label = y_test.iloc[i]
print(f"Observation true label: {true_label}")

# Predicted class probabilities for the single row.
class_probabilities = rf_model.predict_proba(observation)[0]
print(f"Observation predicted label (proba): {class_probabilities}")

# Display the feature values of the selected row.
observation

Calculate the SHAP values and plot the chart


Use SHAP to calculate the SHAP values for the selected observation (see the SHAP documentation).

In [None]:
#calculate shap values
# Uses the explainer built above on the single selected observation.
shap_values = explainer.shap_values(observation)

In [None]:
# Display the raw SHAP values computed for the selected observation.
shap_values

In [None]:
#plot chart
# Force plot for the selected observation, using index 1 of both the expected
# values and the SHAP values (presumably the positive class, y == 1 — TODO confirm).
# NOTE(review): shap_values[1] assumes shap_values is a per-class sequence;
# verify this against the installed SHAP version.
shap.force_plot(explainer.expected_value[1], shap_values[1], observation)

In [None]:
# Repeat the single-row explanation for a different test row (by position).
i = 100

# One-row DataFrame holding the chosen observation.
observation = X_test_processed.iloc[[i]]

true_label = y_test.iloc[i]
print(f"Observation true label: {true_label}")

class_probabilities = rf_model.predict_proba(observation)[0]
print(f"Observation predicted label (proba): {class_probabilities}")

# SHAP values for this observation, from the explainer created earlier.
shap_values = explainer.shap_values(observation)

# Force plot built from index 1 of the expected values and of the SHAP values.
class1_base_value = explainer.expected_value[1]
shap.force_plot(class1_base_value, shap_values[1], observation)

In [None]:
# Sample up to 5000 observations from the test data. Capping the request at the
# number of available rows avoids the ValueError that DataFrame.sample raises
# when asked for more rows than exist (sampling is without replacement).
# With 5000 or more test rows the result is the same as before.
n_observations = min(5000, len(X_test_processed))
observations = X_test_processed.sample(n_observations, random_state=15)


## Explain the model - Summary plot

In [None]:
#calculate shap values
# Computed for the whole sample of test observations; this reassigns
# `shap_values`, replacing the single-observation result from earlier cells.
shap_values = explainer.shap_values(observations)
# Summary plot of the entry at index 1 of shap_values, with the sampled rows
# supplying the feature values.
shap.summary_plot(shap_values[1], features=observations)

### Q1: How are the instances with a low 'pdays' value distributed between the classes?

In [None]:
#Q1


Get only the instances from class 1

In [None]:
# Keep only the test rows whose true label is 1, then run them through the
# already-fitted preprocessor to get the model-ready columns.
X_1 = X_test.loc[y_test == 1]
class1_matrix = preprocessor.transform(X_1)
X_1_test_processed = pd.DataFrame(class1_matrix, columns=all_features)

## Summary plot only for class 1

In [None]:
# Compute SHAP values for the class-1 rows with a newly created TreeExplainer.
# NOTE(review): unlike the explainer created earlier with X_train_processed,
# this one is built from the model alone — confirm the difference is intended.
shap_values = shap.TreeExplainer(rf_model).shap_values(X_1_test_processed)
# Summary plot of the entry at index 1 of shap_values for the class-1 rows.
shap.summary_plot(shap_values[1], features=X_1_test_processed)

Summary plot only for class 0

In [None]:
# Keep only the test rows whose true label is 0 and preprocess them.
X_0 = X_test.loc[y_test == 0]
class0_matrix = preprocessor.transform(X_0)
X_0_test_processed = pd.DataFrame(class0_matrix, columns=all_features)

# SHAP values for those rows, from an explainer built on the model alone.
class0_explainer = shap.TreeExplainer(rf_model)
shap_values = class0_explainer.shap_values(X_0_test_processed)

# Summary plot of the entry at index 1 of shap_values for the class-0 rows.
shap.summary_plot(shap_values[1], features=X_0_test_processed)

## Generate a summary plot only for class 1 that was predicted 0

In [None]:
# False negatives: test rows that are truly class 1 but were predicted as class 0.
y_pred_test_1 = rf_model.predict(X_test_processed)
is_actual_positive = y_test == 1
is_predicted_negative = y_pred_test_1 == 0
X_fn = X_test[is_actual_positive & is_predicted_negative]

# Preprocess the selected rows into the model-ready columns.
fn_matrix = preprocessor.transform(X_fn)
X_fn_test_processed = pd.DataFrame(fn_matrix, columns=all_features)

In [None]:
#Create a summary plot only for false negatives
# SHAP values come from a TreeExplainer built on the model alone; the plot uses
# the entry at index 1 of shap_values for the false-negative rows.
shap_values = shap.TreeExplainer(rf_model).shap_values(X_fn_test_processed)
shap.summary_plot(shap_values[1], features=X_fn_test_processed)

### Q2: Create a summary plot only for the true positives (in class 1). Write some insights about the difference in the top features between the TP and FN

In [None]:
#Q2
# True positives: test rows that are truly class 1 and were predicted as class 1.
y_pred_test_1_1 = rf_model.predict(X_test_processed)
is_actual_positive = y_test == 1
is_predicted_positive = y_pred_test_1_1 == 1
X_tp = X_test[is_actual_positive & is_predicted_positive]

# Preprocess the selected rows into the model-ready columns.
tp_matrix = preprocessor.transform(X_tp)
X_tp_test_processed = pd.DataFrame(tp_matrix, columns=all_features)

# Summary plot for the true positives only, using the entry at index 1 of the
# SHAP values from an explainer built on the model alone.
tp_explainer = shap.TreeExplainer(rf_model)
shap_values = tp_explainer.shap_values(X_tp_test_processed)
shap.summary_plot(shap_values[1], features=X_tp_test_processed)

## Q3 
### a. Train an XGBoost classifier.
### b. Evaluate it with metrics that show how well the classifier predicts instances from class 1.
### c. Explain why you chose these metrics.

In [None]:
#Q3

## Q4: Create a tree explainer and repeat the process from earlier in the notebook, this time with XGBoost (all the force plots and summary plots)

# Q5
Compare the top features in the summary plots between the false negatives and the true positives. Then try changing something in the input features and explain why you made that change. Train the model again and evaluate it. Describe in words how this change affected the model's performance.

Good luck!