In [1]:
# Mount Google Drive at /content/drive so the notebook can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install lime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!pip install smace

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import json
import logging
import os
import pickle
import random
import sys
import warnings

import lime.lime_tabular
import numpy as np
import pandas as pd
import shap
import sklearn
import xgboost as xgb
from smace import utils
from smace.decisions import DM
from smace.explainer import Smace
from smace.models import Model

# NOTE(review): this silences every warning, including pandas assignment
# warnings that are relevant to the code below; consider narrowing the filter.
warnings.filterwarnings("ignore")
SEED = 0
# Seed both generators: NumPy drives the sampling, while the random ranking
# in `evaluate` is drawn with the standard-library `random` module.
random.seed(SEED)
np.random.seed(seed=SEED)

In [6]:
#Utils module
import random
from scipy.stats import truncnorm
# result
def evaluate(to, xi, sample, rank, dm, N_sample, df):
    """Evaluate the decision while features of ``xi`` are replaced one by one.

    ``xi`` is replicated ``N_sample`` times. Following the order given by
    ``rank``, one more feature per step is overwritten with the matching
    column of ``sample`` and the decision system is re-evaluated.

    Parameters
    ----------
    - to: reference value; column 0 of the result is filled with ``1 - to``.
    - xi: instance to start from, one value per column of ``df``.
    - sample: frame with ``N_sample`` rows providing the replacement values.
    - rank: ordered feature names to replace; ``None`` draws a random subset
      in random order using the standard-library ``random`` module.
    - dm: object exposing ``make_decision_eval(frame)``.
    - N_sample: number of replicated rows.
    - df: reference frame; only its column names are used.

    Returns
    -------
    Float array of shape ``(N_sample, len(xi) + 1)``: column 0 holds
    ``1 - to``, column ``j + 1`` the decision evaluation after the first
    ``j + 1`` ranked features (or all of them, once the ranking is exhausted)
    have been replaced.
    """
    D = len(xi)
    # Float dtype: an integer ``to`` would otherwise give an integer array and
    # silently truncate fractional decision values written into it below.
    evals = np.full((N_sample, D + 1), 1 - to, dtype=float)
    # Replicate the row explicitly instead of relying on pandas broadcasting a
    # single row to the requested index length.
    xi = pd.DataFrame(
        [list(xi)] * N_sample, index=range(N_sample), columns=df.columns
    )
    if rank is None:  # random ranking
        random_rank = list(df.columns)
        n = random.randint(0, len(random_rank))
        random.shuffle(random_rank)
        rank = random_rank[:n]
    for j in range(D):
        if j < len(rank):
            # Direct column assignment; the chained form ``xi[:][col] = ...``
            # writes to a temporary object and may leave ``xi`` untouched.
            xi[rank[j]] = sample[rank[j]].to_numpy()
        evals[:, j + 1] = dm.make_decision_eval(xi)
    return evals


def perturb(xi, df, N_sample, dm, to, local=True, categorical_names=None):
    """Draw ``N_sample`` perturbed rows around the instance ``xi``.

    Parameters
    ----------
    - xi: instance to perturb, one value per column of ``df`` (assumed to
      follow the column order of ``df``).
    - df: reference data; supplies the column names, the per-column
      min/max/std and, for global sampling, the rows themselves.
    - N_sample: number of rows to return.
    - dm, to: not used by this function; kept so existing calls keep working.
    - local: if True, each column is drawn from a normal centred on ``xi``
      with the column's standard deviation, truncated to the column's
      observed [min, max]; otherwise rows of ``df`` are resampled with
      replacement.
    - categorical_names: optional columns that, in local mode, are overwritten
      with values resampled from ``df``.

    Returns
    -------
    DataFrame with ``N_sample`` rows and the columns of ``df``.
    """
    if not local:
        return df.sample(N_sample, replace=True).reset_index(drop=True)

    scale = df.std()
    # Columns with zero spread would give 0/0 bounds and make truncnorm reject
    # its arguments; sample them with dummy bounds and pin them to xi below.
    zero_spread = np.asarray(scale == 0)
    safe_scale = np.where(zero_spread, 1.0, np.asarray(scale, dtype=float))
    lower = np.asarray((df.min() - xi) / safe_scale, dtype=float)
    upper = np.asarray((df.max() - xi) / safe_scale, dtype=float)
    lower = np.where(zero_spread, -1.0, lower)
    upper = np.where(zero_spread, 1.0, upper)
    center = np.asarray(xi, dtype=float)

    draws = truncnorm.rvs(
        lower, upper, loc=center, scale=safe_scale, size=[N_sample, len(xi)]
    )
    draws[:, zero_spread] = center[zero_spread]
    sample = pd.DataFrame(draws, columns=df.columns)
    if categorical_names:
        sample[categorical_names] = (
            df[categorical_names]
            .sample(N_sample, replace=True)
            .reset_index(drop=True)
        )
    return sample

In [7]:
class SmaceExplanation:
    """Object returned by explainers.

    Holds the explained instance together with three sets of contributions
    (overall, rule-level and per-model) and exposes each of them as a sorted
    table or as a horizontal bar plot.
    """

    def __init__(self, example, exp, r, phi):
        """Create a new Explanation.
        Parameters
        ----------
        - example: instance to be explained.
        - exp: overall contributions.
        - r: rule contributions
        - phi: contributions for models
        """

        self.example = example
        self.exp = exp
        self.rule = r
        self.models = phi

    @staticmethod
    def _ranked_table(contributions, num_features, leading=None):
        """Return ``contributions`` as a frame sorted by decreasing |value|.

        ``contributions`` maps a name to its contribution; ``leading`` is an
        optional mapping of extra columns placed before ``Contribution``.
        Only the first ``num_features`` rows are kept.
        """
        table = pd.DataFrame(index=list(contributions.keys()))
        for column, values in (leading or {}).items():
            table[column] = values
        table["Contribution"] = list(contributions.values())
        order = table.Contribution.abs().sort_values(ascending=False).index
        return table.reindex(order)[:num_features]

    @staticmethod
    def _barh(table):
        """Plot the ``Contribution`` column of ``table``, ordered by increasing |value|."""
        order = table.Contribution.abs().sort_values(ascending=True).index
        return table.Contribution.reindex(order).plot.barh()

    def table(self, num_features=5):
        """Return a dataframe with the ``num_features`` largest overall contributions."""
        return self._ranked_table(
            self.exp, num_features, leading={"Example": list(self.example)}
        )

    def bar(self, num_features=5):
        """Return a bar plot of the ``num_features`` largest overall contributions."""
        # Build the table with num_features so values above the table default
        # are honoured.
        return self._barh(self.table(num_features))

    def rule_table(self, num_features=5):
        """Return a dataframe with the ``num_features`` largest rule contributions."""
        return self._ranked_table(self.rule, num_features)

    def rule_bar(self, num_features=5):
        """Return a bar plot of the ``num_features`` largest rule contributions."""
        # Order by the rule table itself, not by the overall-contribution table.
        return self._barh(self.rule_table(num_features))

    def model_table(self, model_name, num_features=5):
        """Return a dataframe with the largest contributions for ``model_name``."""
        return self._ranked_table(self.models[model_name], num_features)

    def model_bar(self, model_name, num_features=15):
        """Return a bar plot of the largest contributions for ``model_name``."""
        # Order by the model table itself, not by the overall-contribution table.
        return self._barh(self.model_table(model_name, num_features))

In [8]:
# Experiment settings read by the cells below.
N_example, N_sample = 100, 1000  # examples to explain / perturbed rows per example
to = 1
local = True  # forwarded to perturb() as its `local` flag
rule_name = "paper"
what = f"telco_{rule_name}"  # label used in progress output and output file names

In [9]:
# Folder on the mounted drive holding the rule definition and the dataset.
root_dir = '/content/drive/MyDrive/TFM'

# Rule definition, parsed from JSON.
rule_file = os.path.join(root_dir, "telco_rule.json")
with open(rule_file, "r") as fp:
    rules_json = json.load(fp)

# Dataset, read from CSV.
data = pd.read_csv(os.path.join(root_dir, 'cell2cell.csv'))

In [10]:
# Preparing dataset: relabel the target, then split it from the features.
data['Churn'] = data['Churn'].replace({1:'Yes', 0: 'No'})
y = data.Churn

features = data.drop(columns=['Churn'])
data_cr = features.copy()  # DataFrame form of the features
X = features.values        # plain array form of the same features

feature_names = data.columns.to_list()
feature_names.remove('Churn')
target_names = y.unique()

In [11]:
# models
# Hyper-parameters are collected in one mapping and unpacked into the classifier.
xgb_params = dict(
    objective="binary:logistic",
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=500,
)
model = xgb.XGBClassifier(**xgb_params)
xgb_cr = model.fit(X, y)

# Wrap the fitted classifier for SMACE under the name "cr".
cr_mod = Model(xgb_cr, "cr", data_cr, mode="classification")
models_list = [cr_mod]



In [12]:
# decision system: built from the parsed rules, the list of wrapped models
# and the feature frame.
dm = DM(rules_json, models_list, data_cr)

In [13]:
# Initialize the explainers
# SMACE explainer on top of the decision system.
explainer = Smace(dm)
# 100 rows drawn from the features, handed to the SHAP kernel explainer as its data.
data_summary = shap.sample(data_cr, 100)
shap_explainer = shap.KernelExplainer(dm.make_decision_eval, data_summary)
# LIME tabular explainer over the same features, set up in classification mode.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(data_cr.values, feature_names=data_cr.columns, 
                                                        discretize_continuous=True, verbose=True, mode="classification")


In [14]:
# Mean decision evaluation over the whole feature frame.
dec_avg = dm.make_decision_eval(data_cr).mean()
print("Decision avg: ", dec_avg)

# D: number of feature columns, N: number of wrapped models.
D, N = len(data_cr.columns), len(models_list)
print("D: ", D)
print("N: ", N)

Decision avg:  0.05183407206703088
D:  29
N:  1


In [None]:
# examples to explain
random_example = data_cr.copy()
# Keep only the rows whose decision evaluation equals 1 - to.
example = random_example[dm.make_decision_eval(random_example) == 1 - to]
full_example = dm.__run_models__(example)
scale = dm.full_data.max() - dm.full_data.min()

rule = dm.rules[rule_name]
# Restrict the scale to the rule's variables before dividing. Assumes
# dm.full_data is a DataFrame, so `scale` is a label-indexed Series (TODO
# confirm). Dividing by the full Series would then align on labels and put
# NaN in every other column, turning each norm into NaN.
rule_scale = scale[rule.variables]
# Build the whole column at once; the chained form
# `full_example.dist.loc[i] = ...` is not guaranteed to write into the frame.
full_example["dist"] = [
    np.linalg.norm((row[rule.variables] - rule.values) / rule_scale, 2)
    for _, row in full_example.iterrows()
]
# Keep the N_example rows with the smallest distance.
example = example.loc[full_example.sort_values("dist")[:N_example].index].reset_index(
    drop=True
)

In [16]:
# Save the selected examples next to the other project files, reusing root_dir
# instead of repeating the absolute folder path.
example.to_csv(os.path.join(root_dir, 'example.csv'), index=True, header=True)

In [17]:
# SHAP values for a single row of the feature matrix (row index 1).
shap_values = shap_explainer.shap_values(X[1])
shap_values

array([ 0.        ,  0.01441061, -0.00653492, -0.05380828, -0.00360929,
        0.        , -0.00610505,  0.00243549, -0.00214914, -0.0017351 ,
       -0.00081631,  0.        , -0.00155806, -0.00157717, -0.00089696,
        0.00189835,  0.0020846 ,  0.        , -0.00057989, -0.00085618,
       -0.00484237,  0.        , -0.00058096, -0.00125158,  0.00482185,
       -0.00110555,  0.        , -0.0043799 , -0.0032642 ])

In [18]:
# LIME explanation of the same row over all D features, converted with smace's lime_mapper.
lime_values = utils.lime_mapper(lime_explainer.explain_instance(X[1], dm.make_decision_class, num_features=D))
lime_values

Intercept 0.13866842497140772
Prediction_local [0.01578662]
Right: 0


array([ 0.00114974,  0.00339345,  0.00907367, -0.08601052,  0.00317162,
        0.00467748,  0.00203826, -0.00134677,  0.00195194, -0.00333792,
       -0.0027671 , -0.00259445, -0.01401697,  0.00893939, -0.00691062,
       -0.0045008 , -0.00657048, -0.00433703,  0.00359863, -0.00060026,
        0.00336694, -0.11936356,  0.00237381, -0.00632068,  0.00763876,
        0.00218589,  0.00088219, -0.00208813,  0.08344171])

In [24]:
# evaluation
# For every selected example: explain it with SMACE, SHAP and LIME, rank the
# features with negative contribution (most negative first), and record how
# the decision changes when features are replaced in that order. A random
# ranking serves as baseline.
if len(example) == 0:
    # Without this guard the accumulators stay empty and the failure shows up
    # later as an unrelated-looking error.
    raise ValueError(
        "No examples to explain: `example` is empty, so there is nothing to evaluate."
    )

# One list of per-example result arrays per method; concatenated once after the loop.
eval_chunks = {"SMACE": [], "LIME": [], "SHAP": [], "random": []}
for i, xi in example.iterrows():
    print("\n", what, " > i: ", i)
    print(xi)
    print(cr_mod.predict(xi))
    smace_exp = explainer.explain(xi, rule_name)
    explanation = smace_exp.exp
    # SHAP and LIME get a plain NumPy row, as in the single-row cells above;
    # LIME documents `data_row` as a 1d numpy array.
    xi_values = xi.to_numpy()
    shap_values = shap_explainer.shap_values(xi_values)
    lime_values = utils.lime_mapper(
        lime_explainer.explain_instance(
            xi_values, dm.make_decision_class, num_features=D
        )
    )
    e_rule = smace_exp.rule_table(D + N)
    exp = pd.DataFrame(index=list(explanation.keys()))
    exp["SMACE"] = list(explanation.values())
    exp["SHAP"] = shap_values
    exp["LIME"] = lime_values
    print(exp)
    print(e_rule)
    print(smace_exp.model_table("cr"))
    smace_rank = exp.SMACE[exp.SMACE < 0].sort_values(ascending=True).index
    shap_rank = exp.SHAP[exp.SHAP < 0].sort_values(ascending=True).index
    lime_rank = exp.LIME[exp.LIME < 0].sort_values(ascending=True).index
    sample = perturb(xi, data_cr, N_sample, dm, to, local=local)

    # Same call order as before (SMACE, LIME, SHAP, random); a rank of None
    # makes evaluate() draw a random ranking.
    ranks = {"SMACE": smace_rank, "LIME": lime_rank, "SHAP": shap_rank, "random": None}
    for method, rank in ranks.items():
        eval_chunks[method].append(
            evaluate(to, xi, sample, rank, dm, N_sample, data_cr)
        )

smace_eval = np.concatenate(eval_chunks["SMACE"])
lime_eval = np.concatenate(eval_chunks["LIME"])
shap_eval = np.concatenate(eval_chunks["SHAP"])
random_eval = np.concatenate(eval_chunks["random"])

eval_ = pd.DataFrame()
eval_["SMACE"] = smace_eval.mean(0)
eval_["SHAP"] = shap_eval.mean(0)
eval_["LIME"] = lime_eval.mean(0)
eval_["random"] = random_eval.mean(0)
print(eval_)

# Trapezoidal sum of each curve with unit spacing between steps.
auc = 1 / 2 * (eval_.iloc[0] + 2 * eval_.iloc[1:-1].sum() + eval_.iloc[-1])
print(auc)

file = os.path.join(root_dir, what)
# Create (or truncate) an empty log file next to the results.
with open(file + ".log", "w"):
    pass

eval_std = pd.DataFrame()
print(smace_eval)
# NOTE(review): the spread is divided by sqrt(N_sample) although each array
# stacks N_sample rows per example; confirm this is the intended normalisation.
eval_std["SMACE"] = smace_eval.std(0) / np.sqrt(N_sample)
eval_std["SHAP"] = shap_eval.std(0) / np.sqrt(N_sample)
eval_std["LIME"] = lime_eval.std(0) / np.sqrt(N_sample)
eval_std["random"] = random_eval.std(0) / np.sqrt(N_sample)

res = {"eval": eval_, "error": eval_std}
# Context manager so the results file is flushed and closed.
with open(file + ".p", "wb") as results_file:
    pickle.dump(res, results_file)

AttributeError: ignored