# Evaluating and explaining classifiers

Code lightly adapted from [*Practical Machine Learning with Python*, chapter 5](https://github.com/dipanjanS/practical-machine-learning-with-python/tree/master/notebooks/Ch05_Building_Tuning_and_Deploying_Models) by Dipanjan Sarkar.

# Load Wisconsin breast cancer dataset

These are features of diagnostic tissue imaging tests. Note that label coding is: `0 = malignant`, `1 = benign`.

In [2]:
from sklearn.datasets import load_breast_cancer

# Load the bundled breast cancer dataset, keep the full bunch object
# around (it carries the feature names and description), and pull out
# the feature matrix and the 0/1 target vector.
data = load_breast_cancer()
X, y = data.data, data.target
print(data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

## Standardize features

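Standardize each feature to zero mean and unit variance. To see the effect of normalization, rerun the notebook with and without this step and compare the results.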

In [3]:
from sklearn.preprocessing import StandardScaler
# Rescale every feature column with StandardScaler. NOTE: the scaler is
# fit on the complete data set, i.e. before the train/test split made
# later in this notebook, so test rows contribute to the scaling statistics.
X = StandardScaler().fit(X).transform(X)

# Clustering with *k*-Means

In [4]:
from sklearn.cluster import KMeans

# Partition the standardized samples into two clusters.
# n_init is pinned explicitly: scikit-learn's default changed from 10 to
# 'auto' in version 1.4 (with a FutureWarning in 1.2/1.3), so leaving it
# unset makes the clustering depend on the installed version.
km = KMeans(n_clusters=2, n_init=10, random_state=2)
km.fit(X)

# Cluster ids are arbitrary: cluster 0/1 need not correspond to class 0/1.
labels = km.labels_
centers = km.cluster_centers_
print(labels[100:110])

[0 0 0 0 0 1 0 0 1 0]


In [5]:
from sklearn.decomposition import PCA

# Reduce the feature matrix to its first two principal components so the
# samples can be drawn on a 2-D scatter plot.
pca = PCA(n_components=2)
bc_pca = pca.fit_transform(X)

In [None]:
import matplotlib.pyplot as plt

# Side-by-side scatter plots of the 2-D PCA projection: ground-truth
# labels on the left, k-means cluster assignments on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
fig.suptitle('Visualizing breast cancer clusters')
fig.subplots_adjust(top=0.85, wspace=0.5)
ax1.set_title('Actual Labels')
ax2.set_title('k-Means Labels')

# Boolean masks select all samples of a group at once, so each group is
# drawn with a single scatter call instead of one call per sample. This
# also guarantees c1..c4 exist even if a group happens to be empty.
actual_0, actual_1 = (y == 0), (y == 1)
cluster_0, cluster_1 = (labels == 0), (labels == 1)

c1 = ax1.scatter(bc_pca[actual_0, 0], bc_pca[actual_0, 1], c='g', marker='.')
c2 = ax1.scatter(bc_pca[actual_1, 0], bc_pca[actual_1, 1], c='r', marker='.')

# k-means cluster ids are arbitrary, so the colours here need not line
# up with the class colours in the left-hand panel.
c3 = ax2.scatter(bc_pca[cluster_0, 0], bc_pca[cluster_0, 1], c='g', marker='.')
c4 = ax2.scatter(bc_pca[cluster_1, 0], bc_pca[cluster_1, 1], c='r', marker='.')

l1 = ax1.legend([c1, c2], ['0', '1'])
l2 = ax2.legend([c3, c4], ['0', '1'])

# Classification with logistic regression

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as a test set; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print(X_train.shape, X_test.shape)

In [None]:
from sklearn import linear_model

# Train a logistic-regression classifier on the training split (fit
# returns the estimator itself), then predict labels for the test split.
logistic = linear_model.LogisticRegression(max_iter=10000).fit(X_train, y_train)
y_pred = logistic.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Score the classifier on the held-out split. In the confusion matrix,
# rows are the true classes and columns the predicted classes.
test_accuracy = logistic.score(X_test, y_test)
test_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Confusion matrix:")
display(test_confusion)

In [None]:
# Show the fitted coefficient array of the logistic model as the cell output.
logistic.coef_

In [None]:
# Horizontal bar chart of the fitted coefficients, one bar per feature,
# with the feature names as tick labels.
fig, ax = plt.subplots(figsize=(12, 8))
bar_positions = list(range(X.shape[1]))
ax.barh(bar_positions, logistic.coef_[0], tick_label=data.feature_names)
plt.show()

# Feature importance via permutation

In [None]:
from sklearn.inspection import permutation_importance

# Estimate each feature's importance by shuffling it on the test split
# (30 repeats per feature) and recording the resulting change in score.
r = permutation_importance(
    logistic, X_test, y_test,
    n_repeats=30, random_state=0
)

print("Permutation feature importance:\n")

# Pad names to the longest feature name so the numeric columns line up;
# a fixed width of 8 is shorter than names such as 'mean perimeter'.
name_width = max(len(name) for name in data.feature_names)

# Walk the features from most to least important.
for i in r.importances_mean.argsort()[::-1]:
    mean_i = r.importances_mean[i]
    std_i = r.importances_std[i]
    # Report only features whose mean importance stays above zero after
    # subtracting two standard deviations.
    if mean_i - 2 * std_i > 0:
        print(f"{data.feature_names[i]:<{name_width}}"
              f"  {mean_i:.3f}"
              f" +/- {std_i:.3f}")

# Model interpretation with `skater`

Requires the `skater` library. To install from the command line:

```
conda install -c conda-forge skater
```

In [None]:
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel

# Build the two skater objects used by the cells below: an Interpretation
# over the test split, and an in-memory wrapper around the classifier's
# probability function with the training split as example data.
interpreter = Interpretation(
    X_test,
    feature_names=data.feature_names,
)
model = InMemoryModel(
    logistic.predict_proba,
    examples=X_train,
    target_names=logistic.classes_,
)

## Visualize feature importances

In [None]:
# Draw skater's feature-importance chart for the wrapped model on a
# pre-sized matplotlib axis.
fig, ax = plt.subplots(figsize=(12, 8))
plots = interpreter.feature_importance.plot_feature_importance(
    model,
    ascending=False,
    ax=ax,
)

## One-way partial dependence plot

In [None]:
# One-way partial dependence plots (with variance bands) for four
# selected features, evaluated on 200 samples.
pdp_features = ['mean perimeter', 'worst area', 'worst texture', 'worst symmetry']
p = interpreter.partial_dependence.plot_partial_dependence(
    pdp_features,
    model,
    n_samples=200,
    with_variance=True,
    figsize=(12, 8),
)

## Explaining individual predictions

In [None]:
from skater.core.local_interpretation.lime.lime_tabular import LimeTabularExplainer
# Create a LIME tabular explainer from the training split; continuous
# features are discretized and the classes are labelled '0' and '1'.
exp = LimeTabularExplainer(
    X_train,
    feature_names=data.feature_names,
    discretize_continuous=True,
    class_names=['0', '1'],
)

In [None]:
# Explain the classifier's prediction for the first test sample and
# render the explanation inline.
explanation_0 = exp.explain_instance(X_test[0], logistic.predict_proba)
explanation_0.show_in_notebook()

In [None]:
# Explain the classifier's prediction for the second test sample and
# render the explanation inline.
explanation_1 = exp.explain_instance(X_test[1], logistic.predict_proba)
explanation_1.show_in_notebook()