In [1]:
import pandas as pd
import numpy as np
import sklearn.ensemble
import os.path
import sklearn.neural_network
import sklearn.svm
import sklearn.naive_bayes
import sklearn.tree
import lime.lime_tabular_mod
import lime.lime_tabular_multiregressor
import lime.lime_tabular_multiclassifier
import lime.lime_tabular_singleclassifier
import matplotlib.pyplot as plt
import doc.mod.utils.DatasetRepository as dr
import doc.mod.utils.ResultsProcessing as rp
from tqdm import tqdm
import random

repo = dr.DatasetRepository("../data")

pd.set_option("display.max_columns", None)
pd.set_option("mode.chained_assignment", None)
plt.style.use({"figure.facecolor": "white"})

In [2]:
version_str = "v1"
def perform_explain_test(labels_count,
                         version_str,
                         dataset_name,
                         dataset,
                         test_instance,
                         models):
    precision_for_classifier = {}
    recall_for_classifier = {}
    f1_for_classifier = {}
    for (classifier_name, model) in models:
        x = dataset.train_data.to_numpy()
        y = dataset.train_labels.to_numpy()
        model.fit(x, y)

        x = dataset.test_data.to_numpy()
        y_true = dataset.test_labels.to_numpy()
        y_predicted = model.predict(x)

        precision_for_classifier[classifier_name] = sklearn.metrics.precision_score(y_true=y_true, y_pred=y_predicted, average="macro")
        recall_for_classifier[classifier_name] = sklearn.metrics.recall_score(y_true=y_true, y_pred=y_predicted, average="macro")
        f1_for_classifier[classifier_name] = sklearn.metrics.f1_score(y_true=y_true, y_pred=y_predicted, average="macro")

        results_file = f"saved_results/{dataset_name}/explanation/{classifier_name}_{version_str}.png"
        if not os.path.isfile(results_file):

            explainer_multiregressor = lime.lime_tabular_multiregressor.LTEMultiRegressionTree(
                dataset.train_data.to_numpy(),
                feature_names = dataset.train_data.columns.to_list(),
                class_names = model.classes_,
                discretize_continuous=False,
                sample_around_instance=True,
                categorical_features=dataset.categorical_features,
                with_kfold=5,
                use_inversed_data_for_training=True
            )
            explanation = explainer_multiregressor.explain_instance(
                test_instance,
                model.predict_proba,
                num_features = 4,
                top_labels = labels_count,
                distance_metric="minkowski",
                minkowski_norm=100.
            )
            explanation.render_explanation_tree(results_file)

In [3]:
for labels_count, dataset_name, dataset in [
    (3, "FetalHealth", repo.get_fetal_health_dataset(random_state=42)),
    (2, "HeartDisease", repo.get_heart_disease_dataset(random_state=42)),
    (2, "Mushrooms", repo.get_mushrooms_dataset(random_state=42)),
    (2, "Titanic", repo.get_titanic_dataset(random_state=42)),
    (6, "Wine", repo.get_wines_dataset(random_state=42)),
 ]:
    test_data_subset = dataset.test_data.to_numpy()
    test_instance = random.choice(test_data_subset)
    models = [
        ("Random Forest", sklearn.ensemble.RandomForestClassifier(n_jobs=-1)),
        ("AdaBoost", sklearn.ensemble.AdaBoostClassifier()),
        ("DecisionTree", sklearn.tree.DecisionTreeClassifier()),
        ("Neural Network", sklearn.neural_network.MLPClassifier()),
        # ("SVM", sklearn.svm.LinearSVC()), do not provide predict_proba
        ("Naive Bayes", sklearn.naive_bayes.GaussianNB()),
        ("kNN", sklearn.neighbors.KNeighborsClassifier())
    ]
    perform_explain_test(
        labels_count,
        version_str,
        dataset_name,
        dataset,
        test_instance,
        models
    )


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
