In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from xailib.data_loaders.dataframe_loader import prepare_dataframe

from xailib.explainers.lime_explainer import LimeXAITabularExplainer
from xailib.explainers.lore_explainer import LoreTabularExplainer
from xailib.explainers.shap_explainer_tab import ShapXAITabularExplainer

from xailib.models.sklearn_classifier_wrapper import sklearn_classifier_wrapper

# Learning and explaining German Credit Dataset

## Loading and preparation of data

We start by reading the dataset to analyze from a CSV file. The table is loaded by means of the ```DataFrame``` class from the ```pandas``` library.

Among all the attributes of the table, we select the ```class_field``` column that contains the observed class for the corresponding row.

In [2]:
# Path of the CSV file and name of the column holding the class label.
source_file = 'datasets/german_credit.csv'
class_field = 'default'

# Read the raw table: spaces following a delimiter are skipped and '?' cells
# are parsed as missing values, in addition to pandas' default NA markers.
df = pd.read_csv(
    source_file,
    skipinitialspace=True,
    na_values='?',
    keep_default_na=True,
)

After the data is loaded in memory, we need to extract metadata information to automatically handle the content within the table.

The method ```prepare_dataframe``` scans the table and extracts the following information:
 * ```df```: is a transformed version of the original dataframe, where discrete attributes are transformed into numerical attributes by using a one-hot encoding strategy;
 * ```feature_names```: is a list containing the names of the features after the transformation;
 * ```class_values```: the list of all the possible values for the ```class_field``` column;
 * ```numeric_columns```: a list of the original features that contain numeric (i.e. continuous) values;
 * ```rdf```: the original dataframe, before the transformation;
 * ```real_feature_names```: the list of the features of the dataframe before the transformation;
 * ```features_map```: it is a dictionary pointing each feature to the original one before the transformation.

In [3]:
# Unpack, in order: the transformed frame, its feature names, the class values,
# the numeric columns, the original frame, the original feature names and the
# transformed-to-original feature map.
# NOTE: `df` is rebound to the transformed frame here, so re-running this cell
# without re-running the load cell passes the already-transformed frame back in.
(
    df,
    feature_names,
    class_values,
    numeric_columns,
    rdf,
    real_feature_names,
    features_map,
) = prepare_dataframe(df, class_field)

### Learning a Random Forest classifier

We train a RF classifier by using the ```sklearn``` library. We start by splitting the dataset into train and test subsets.

In [4]:
# Fraction of rows held out for testing, and the seed shared by the split and
# the random forest trained afterwards.
test_size = 0.3
random_state = 42

# Stratify on the class column so both subsets keep the class proportions.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[feature_names],
    df[class_field],
    test_size=test_size,
    random_state=random_state,
    stratify=df[class_field],
)



Then we train the model on the training set. 
Once the model has been learned, we use a wrapper class to get access to the model for ```XAI lib```

In [5]:
# Fit a 20-tree random forest on the numpy views of the training split,
# seeded with the same random_state as the split.
bb = RandomForestClassifier(
    n_estimators=20,
    random_state=random_state,
)
bb.fit(X_train.values, Y_train.values)

# Wrap the fitted model to give the XAI lib explainers access to it.
bbox = sklearn_classifier_wrapper(bb)

Select an instance (here taken from the training set) to be classified by the model and print its true and predicted class.

In [6]:
# Position, within the training split, of the record to explain. The same
# position is used for the features and for the label, so the printed true
# class belongs to the printed instance (previously the label was read from
# row 8 while the features came from row 147).
inst_idx = 147
inst = X_train.iloc[inst_idx].values
print('Instance ',inst)
print('True class ',Y_train.iloc[inst_idx])
# predict expects a 2-D array, hence the reshape into a single-row matrix.
print('Predicted class ',bb.predict(inst.reshape(1, -1)))

## Explaining the prediction
We use the explainers of ```XAI lib``` to provide an explanation for the classified instance ```inst```.
Every explainer of ```XAI lib``` takes as input the blackbox to be explained with the corresponding feature names, and a configuration object to initialize the explainer.

### SHAP explainer

In [None]:
# Build a SHAP explainer around the wrapped random forest.
explainer = ShapXAITabularExplainer(bbox, feature_names)

# Select the 'tree' SHAP variant and pass the first 100 training rows as X_train.
config = {
    'explainer': 'tree',
    'X_train': X_train.iloc[:100].values,
}
explainer.fit(config)

In [None]:
# Compute the SHAP explanation for the selected instance.
exp = explainer.explain(inst)

In [None]:
# Render the SHAP explanation through its feature-importance plot helper.
exp.plot_features_importance()

### LORE explainer

In [None]:
# LORE is constructed from the wrapped black box only.
explainer = LoreTabularExplainer(bbox)

# Settings handed to LORE's fit; 'neigh_type' presumably selects the
# neighbourhood generator (see the XAI lib documentation for the other keys).
config = {
    'neigh_type': 'rndgen',
    'size': 1000,
    'ocr': 0.1,
    'ngen': 10,
}
explainer.fit(df, class_field, config)

# Explain the selected instance and show the textual form of the explanation.
exp = explainer.explain(inst)
print(exp)

In [None]:
# Display the rules of the LORE explanation via its plotRules() helper.
exp.plotRules()

In [None]:
# Display the counterfactual rules of the LORE explanation.
exp.plotCounterfactualRules()

### LIME explainer

In [8]:
# LIME is constructed from the wrapped black box only.
limeExplainer = LimeXAITabularExplainer(bbox)

# Use the 'lasso_path' feature-selection option when fitting LIME.
config = {
    'feature_selection': 'lasso_path',
}
limeExplainer.fit(df, class_field, config)

# Explain the selected instance and print the underlying explanation as a list.
lime_exp = limeExplainer.explain(inst)
print(lime_exp.exp.as_list())

In [9]:
# Render the LIME explanation through its feature-importance plot helper.
lime_exp.plot_features_importance()

## Learning a different model

### Learning a Logistic Regressor

We train a Logistic Regression classifier by using the ```sklearn``` library. We first transform the dataset with a ```StandardScaler``` to standardize all the attributes.


In [10]:
# Fit the standardizer on the training features, then apply it to them.
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)

# L2-regularized logistic regression trained on the standardized features.
bb = LogisticRegression(
    C=1,
    penalty='l2',
)
bb.fit(X_scaled, Y_train.values)

# Wrap the model so it can be used by the XAI lib explainers.
bbox = sklearn_classifier_wrapper(bb)

In [11]:
# select a record to explain
inst = X_scaled[182]
print('Instance ',inst)
print('Predicted class ',bb.predict(inst.reshape(1, -1)))

## Explaining the prediction
We use the same explainers as for the previous model. In this case, a few adjustments are necessary for the initialization of the explainers. For example, SHAP needs a specific configuration for the linear model we are using.
### SHAP Explainer

In [12]:
# Build a SHAP explainer around the wrapped logistic regression.
explainer = ShapXAITabularExplainer(bbox, feature_names)

# Select the 'linear' SHAP variant with interventional feature perturbation;
# the first 100 standardized training rows are passed as X_train.
config = {
    'explainer': 'linear',
    'X_train': X_scaled[:100],
    'feature_pert': 'interventional',
}
explainer.fit(config)

In [13]:
# Compute the SHAP explanation for the selected (standardized) instance
# and print its textual representation.
exp = explainer.explain(inst)
print(exp)

In [14]:
# Render the SHAP explanation through its feature-importance plot helper.
exp.plot_features_importance()

### LORE explainer

In [15]:
# LORE is constructed from the wrapped logistic regression only.
explainer = LoreTabularExplainer(bbox)

# Settings handed to LORE's fit; 'neigh_type' presumably selects the
# neighbourhood generator (see the XAI lib documentation for the other keys).
config = {
    'neigh_type': 'geneticp',
    'size': 1000,
    'ocr': 0.1,
    'ngen': 10,
}
# NOTE(review): `bb` was trained on standardized features and `inst` is a
# standardized row, but the explainer is fitted on `df`, which is not scaled.
# Confirm this mismatch is intended.
explainer.fit(df, class_field, config)

# Explain the selected instance and show the textual form of the explanation.
exp = explainer.explain(inst)
print(exp)

In [16]:
# Display the rules of the LORE explanation via its plotRules() helper.
exp.plotRules()

In [17]:
# Display the counterfactual rules of the LORE explanation.
exp.plotCounterfactualRules()

### LIME explainer

In [19]:
# LIME is constructed from the wrapped logistic regression only.
limeExplainer = LimeXAITabularExplainer(bbox)

# Use the 'lasso_path' feature-selection option when fitting LIME.
config = {
    'feature_selection': 'lasso_path',
}
# NOTE(review): `bb` was trained on standardized features and `inst` is a
# standardized row, but the explainer is fitted on `df`, which is not scaled.
# Confirm this mismatch is intended.
limeExplainer.fit(df, class_field, config)

# Explain the selected instance and print the underlying explanation as a list.
lime_exp = limeExplainer.explain(inst)
print(lime_exp.exp.as_list())

In [20]:
# Render the LIME explanation through its feature-importance plot helper.
lime_exp.plot_features_importance()