In [39]:
# Data preprocessing
from pandas import read_csv
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# ML algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# XAI
import eli5

# ELI 5 (Explain Like I'm 5)
## Introduction
- Useful for debugging scikit-learn-like models.
- Provides a global interpretation of "white box" models through a consistent API (how the model works overall).
- Provides local explanations of individual predictions (why the model produced one specific prediction).

## ELI5 - API

https://eli5.readthedocs.io/en/latest/autodocs/eli5.html

1. Explain the model globally (feature importance) <br>
    ```python
    import eli5
    eli5.show_weights(model)
    ```
    
    
2. Explain a single prediction <br>
    ```python
    import eli5
    eli5.show_prediction(model, observation)
    ```   

## Examples

Pima Indians diabetes dataset columns (1–8 are the input features, 9 is the target):
1. Pregnancies
2. Glucose
3. Blood pressure
4. Skin thickness
5. Insulin
6. BMI
7. Diabetes pedigree function
8. Age
9. Outcome

In [40]:
# Load the data used to build the models.
filename = "data/pima-indians-diabetes.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
input_names = names[:-1]  # every column except the trailing 'class' target
df = read_csv(filename, names=names)

# Every input column is numeric, so the preprocessor passes them through unchanged.
# ColumnTransformer expects a list of (name, transformer, columns) tuples, and the
# target column must not be part of the selection. Columns are selected by position
# (0..7) rather than by name so the transformer also accepts a plain NumPy feature
# matrix, not only a DataFrame.
input_columns = list(range(len(input_names)))
preprocessor = ColumnTransformer(
    transformers=[("numerical", "passthrough", input_columns)]
)

In [41]:
# Seed shared by the tree-based estimators: without it every run grows different
# trees, so accuracies and feature importances cannot be reproduced.
MODEL_SEED = 42

# Logistic regression, used as a bare estimator (no preprocessing pipeline).
lr_model = LogisticRegression(solver='liblinear')

# Decision tree classifier, also a bare estimator.
dt_model = DecisionTreeClassifier(random_state=MODEL_SEED)

# Random forest, wrapped in a pipeline together with the pass-through preprocessor.
rf_model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(class_weight="balanced",
                                     n_estimators=100,
                                     n_jobs=-1,
                                     random_state=MODEL_SEED)),
])

In [42]:
# Split the dataset: 33% held out for testing, the remaining 67% used for training.
array = df.values
X, Y = array[:, :8], array[:, 8]  # first eight columns are inputs, the ninth is the class
test_size, seed = 0.33, 7
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

### Logistic Regression

In [43]:
# Train the logistic regression model on the training split.
lr_model.fit(x_train, y_train)

# Measure its accuracy on the held-out test split.
y_predicted = lr_model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_predicted)
print("Precisión: ", 100 * accuracy, "%")

Precisión:  75.59055118110236 %


In [44]:
# Global interpretation: display the weight learned for each input feature.
eli5.show_weights(
    lr_model,
    feature_names=input_names,
)

Weight?,Feature
0.941,pedi
0.112,preg
0.054,mass
0.026,plas
0.008,age
0.003,skin
-0.001,test
-0.02,pres
-5.298,<BIAS>


In [49]:
# Local interpretation: per-feature contributions for one test observation.
i = 10
eli5.show_prediction(
    lr_model,
    x_test[i],
    feature_names=input_names,
    show_feature_values=True,
)

Contribution?,Feature,Value
4.323,plas,167.0
1.748,mass,32.3
0.79,pedi,0.839
0.241,age,30.0
-5.298,<BIAS>,1.0


### Decision tree

In [48]:
# Train the decision tree model on the training split.
dt_model.fit(x_train, y_train)

# Measure its accuracy on the held-out test split.
y_predicted = dt_model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_predicted)
print("Precisión: ", 100 * accuracy, "%")

Precisión:  71.65354330708661 %


In [50]:
# Global interpretation: display the importance the tree assigns to each input feature.
eli5.show_weights(
    dt_model,
    feature_names=input_names,
)

Weight,Feature
0.3315,plas
0.1694,mass
0.1235,age
0.1155,pedi
0.0962,skin
0.066,pres
0.0573,test
0.0405,preg


In [51]:
# Local interpretation: per-feature contributions for one test observation.
i = 10
eli5.show_prediction(
    dt_model,
    x_test[i],
    feature_names=input_names,
    show_feature_values=True,
)

Contribution?,Feature,Value
0.483,plas,167.0
0.342,<BIAS>,1.0
0.086,mass,32.3
0.05,pedi,0.839
0.039,pres,0.0
