In [1]:
!pip install lingam

Collecting lingam
  Downloading lingam-1.8.3-py3-none-any.whl.metadata (8.3 kB)
Collecting graphviz (from lingam)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting pygam (from lingam)
  Downloading pygam-0.9.1-py3-none-any.whl.metadata (7.1 kB)
Collecting psy (from lingam)
  Downloading psy-0.0.1-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting semopy (from lingam)
  Downloading semopy-2.3.11.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting progressbar2 (from psy->lingam)
  Downloading progressbar2-4.4.2-py3-none-any.whl.metadata (17 kB)
Collecting numpy (from lingam)
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy (fr

In [34]:
#German
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split

from csse import CSSE
from prepare_dataset import *

import pickle

import numpy as np
import pandas as pd
import graphviz
import lingam
from lingam.utils import print_causal_directions, print_dagc, make_dot

from tqdm import tqdm
import random as rnd

from IPython.display import display

import warnings

warnings.filterwarnings('ignore')

# Prepare CSSE

In [35]:
# Read Dataset German
df_main = prepare_german_dataset("german_credit.csv", "data/")

# Input features = every column except the target
columns = df_main.columns
class_name = 'default' # default = 0 = "Good class" / default = 1 = "Bad class"
columns_tmp = list(columns)
columns_tmp.remove(class_name)

# One seed for both the split and the forest. With an unseeded split, x_test
# (and therefore p and every run(i) further down) changed on each execution
# even though the classifier itself was seeded.
RANDOM_STATE = 0

x_train, x_test, y_train, y_test = train_test_split(
    df_main[columns_tmp],
    df_main[class_name],
    test_size=0.1,
    random_state=RANDOM_STATE,
)

model = RandomForestClassifier(n_estimators=120, n_jobs=-1, random_state=RANDOM_STATE)
model.fit(x_train, y_train)

# Predictions on the held-out rows; p[i] is later used as the current class of x_test.iloc[i]
p = model.predict(x_test)

print(classification_report(y_test, p))

K = 5 #Number of counterfactual explanations to be obtained

# NOTE(review): num_gen is passed here, but get_causal_explain() drives its own
# generation loop and does not read explainerCSSE.num_gen.
explainerCSSE = CSSE(df_main[columns_tmp], model, K = K, num_gen = 3)

              precision    recall  f1-score   support

           0       0.77      0.87      0.81        68
           1       0.61      0.44      0.51        32

    accuracy                           0.73       100
   macro avg       0.69      0.65      0.66       100
weighted avg       0.72      0.73      0.72       100



In [127]:
import pickle
from pathlib import Path

# Save the model
model_path = Path('model/rf.pkl')
# open(..., 'wb') does not create missing directories, so make sure model/ exists first
model_path.parent.mkdir(parents=True, exist_ok=True)
with open(model_path, 'wb') as file:
    pickle.dump(model, file)

# Prepare LiNGAM - Bootstrap

In [36]:
model_lingam = lingam.DirectLiNGAM()

# Bootstrap the causal discovery to obtain, for every directed pair of features,
# a total effect and the fraction of resamples in which it appeared.
result_lingam_bt = model_lingam.bootstrap(df_main[columns_tmp], n_sampling=100)

causal_effects = result_lingam_bt.get_total_causal_effects(min_causal_effect=0.01)

df_causal_effects = pd.DataFrame(causal_effects)
labels = [f'{i}' for i in df_main[columns_tmp].columns]

# get_total_causal_effects() reports variables by position; translate to column names
df_causal_effects['from'] = df_causal_effects['from'].map(lambda idx: labels[idx])
df_causal_effects['to'] = df_causal_effects['to'].map(lambda idx: labels[idx])

# bootstrap() fits the estimator on resampled data, so the causal_order_ it
# leaves behind belongs to the last resample only. Refit on the full data so the
# order used by apply_causality() is estimated from the actual dataset.
model_lingam.fit(df_main[columns_tmp])
causal_order = [labels[x] for x in model_lingam.causal_order_]

# Functions

In [37]:
def apply_causality(df):
    """Propagate each candidate's feature changes along the learned causal effects.

    Row 0 of ``df`` is taken as the reference instance and is copied unchanged.
    For every other row, features are visited in the order given by the
    module-level ``causal_order``. Whenever a feature differs from the
    reference, each effect listed for that feature in the module-level
    ``df_causal_effects`` is applied when its ``probability`` is at least a
    fresh ``rnd.random()`` draw: ``difference * effect`` is added to the
    target feature of the candidate.

    Args:
        df: Population DataFrame whose first row is the reference instance.

    Returns:
        A new DataFrame with the same columns, labelled 0..n-1, holding the
        reference row followed by the adjusted candidates.
    """
    adjusted = pd.DataFrame(columns = df.columns)
    reference = df.iloc[0]
    adjusted.loc[0] = reference

    for _, candidate in df.iloc[1:].iterrows():
        individual = candidate.copy()
        for feature in causal_order:
            # Measured on the working copy, so adjustments made by earlier
            # features in the causal order are taken into account.
            delta = individual[feature] - reference[feature]
            if delta == 0:
                continue
            outgoing = df_causal_effects[df_causal_effects['from'] == feature]
            for _, effect_row in outgoing.iterrows():
                # Exactly one random draw per outgoing effect
                draw = rnd.random()
                if effect_row['probability'] >= draw:
                    target = effect_row['to']
                    individual[target] = individual[target] + (delta * effect_row['effect'])
        adjusted.loc[len(adjusted)] = individual

    return adjusted
    
def euclidean_distance(series1, series2):
    """Return the Euclidean (L2) distance between two equally long sequences.

    Args:
        series1: First operand; must support ``len`` and element-wise ``-``.
        series2: Second operand, same length as ``series1``.

    Returns:
        The square root of the summed squared element-wise differences.

    Raises:
        ValueError: If the two inputs do not have the same length.
    """
    same_length = len(series1) == len(series2)
    if not same_length:
        raise ValueError("As séries devem ter o mesmo comprimento.")

    delta = series1 - series2
    return np.sqrt(np.sum(delta ** 2))

def get_contrafac_df_causal(solution_list_causal):
    """Collect the causal effects that link features changed together in a counterfactual.

    For every counterfactual that changes more than one feature, each pair of
    distinct changed features is looked up in the module-level
    ``df_causal_effects`` and the rows whose ``from`` and ``to`` both belong to
    that pair are kept.

    Args:
        solution_list_causal: Iterable of counterfactuals, each an iterable of
            change objects exposing a ``column`` attribute.

    Returns:
        DataFrame with the columns of ``df_causal_effects`` holding the matching
        rows, de-duplicated and sorted by ascending ``probability``; an empty
        DataFrame with those columns when nothing matches.
    """
    changed_per_solution = [[change.column for change in solution] for solution in solution_list_causal]

    pair_matches = []
    for changed in changed_per_solution:
        # A single changed feature cannot form a cause/effect pair
        if len(changed) < 2:
            continue
        for position, first in enumerate(changed):
            # The isin() filter below is symmetric in (first, second), so each
            # unordered pair is looked up once instead of in both orders.
            for second in changed[position + 1:]:
                if first == second:
                    continue
                pair = [first, second]
                linked = (df_causal_effects['to'].isin(pair)) & (df_causal_effects['from'].isin(pair))
                pair_matches.append(df_causal_effects[linked])

    if not pair_matches:
        return pd.DataFrame(columns = df_causal_effects.columns)

    return pd.concat(pair_matches).drop_duplicates().sort_values(by='probability')

def analyse_contrafac(contrafac, df, original_ind):
    """Bundle a counterfactual with the causal rules and original values it touches.

    Args:
        contrafac: Iterable of change objects exposing a ``column`` attribute.
        df: DataFrame of causal rules with ``from`` and ``to`` columns.
        original_ind: Series holding the original instance, indexed by feature name.

    Returns:
        list: ``[contrafac, rules, original_values]`` where ``rules`` are the
        rows of ``df`` whose ``from`` and ``to`` are both changed features and
        ``original_values`` is ``original_ind`` restricted to the changed features.
    """
    changed = []
    for change in contrafac:
        changed.append(change.column)

    target_changed = df['to'].isin(changed)
    source_changed = df['from'].isin(changed)
    return [contrafac, df[target_changed & source_changed], original_ind[changed]]

def verificar_condicoes(row):
    """Tell whether a causal rule agrees with the directions of the two changes.

    ``row['from']`` and ``row['to']`` hold ``'mais'`` or ``'menos'``. A rule is
    satisfied when both ends carry the same label and the effect is positive,
    or when the labels differ and the effect is negative.

    Args:
        row: Mapping (e.g. a DataFrame row) with ``from``, ``to`` and ``effect``.

    Returns:
        bool: ``True`` if the rule is satisfied, ``False`` otherwise, including
        when either end holds any other value.
    """
    direcoes = ('mais', 'menos')

    origem = row['from']
    if origem not in direcoes:
        return False

    destino = row['to']
    if destino not in direcoes:
        return False

    # Same direction needs a positive effect; opposite directions a negative one
    if origem == destino:
        return bool(row['effect'] > 0)
    return bool(row['effect'] < 0)

# CSSE Explanation with Causality

In [38]:
def get_causal_explain(X, num_generations=30):
    """Evolve a plain and a causality-adjusted CSSE population side by side.

    Both populations start from the same initial population produced by
    ``explainerCSSE`` and go through the same fitness / elitism / crossover /
    mutation cycle. For the causal population, fitness is computed on the
    parents after ``apply_causality`` has been applied to them, while elitism
    and crossover still draw from the unadjusted causal parents.

    Args:
        X: Positional index into ``x_test`` (and the predictions ``p``) of the
            instance to explain.
        num_generations: Number of generations to evolve. Defaults to 30, the
            value that used to be hard-coded; ``explainerCSSE.num_gen`` is not
            consulted.

    Returns:
        list: ``[solution_list_causal, solution_list, df_contrafac_causal,
        original_ind]`` - the counterfactual change lists from the causal and
        the plain population, the causal effects linking features changed
        together in the causal counterfactuals, and the original instance.
    """
    original_ind = x_test.iloc[X].copy() #Original instance
    explainerCSSE.current_class = p[X] #Original instance class
    explainerCSSE.original_ind = original_ind

    ind_cur_class = explainerCSSE.getBadClass()

    #Gets the valid values range of each feature
    features_range = explainerCSSE.getFeaturesRange()

    population_columns = explainerCSSE.input_dataset.columns

    #The DataFrame df will have the current population
    df = pd.DataFrame(columns=population_columns)

    #Generates the initial population (presumably filled into df in place - confirm in CSSE)
    explainerCSSE.getPopInicial(df, features_range)
    df_causal = df.copy()

    for g in tqdm(range(num_generations), desc= "Processing..."):
        #The current populations become the parents of this generation
        parents = df.copy()
        parents_causal = df_causal.copy()

        #Causality is applied before any fitness call, keeping the order in which
        #random numbers are consumed
        causal_parents_adjusted = apply_causality(parents_causal)

        #df / df_causal will contain the new populations
        df = pd.DataFrame(columns=population_columns)
        df_causal = pd.DataFrame(columns=population_columns)
        evaluation = []
        evaluation_causal = []

        #Assessing generation counterfactuals
        explainerCSSE.fitness(parents, evaluation, ind_cur_class)
        explainerCSSE.fitness(causal_parents_adjusted, evaluation_causal, ind_cur_class)

        #The original individual will always be in the 0 position of the df - So that it is normalized too (it will be used later in the distance function)
        df.loc[0] = original_ind.copy()
        df_causal.loc[0] = original_ind.copy()

        #Copies to the next generation the per_elit best individuals
        explainerCSSE.elitism(evaluation, df, parents)
        explainerCSSE.elitism(evaluation_causal, df_causal, parents_causal)

        #Each population keeps its own repetition counter. Previously the causal
        #crossover was handed the plain population's counter and its own return
        #value was never read.
        number_cross_repetitions = 0
        number_cross_repetitions_causal = 0
        #NOTE(review): the loop is driven by len(df) only; df_causal is assumed to grow in step - confirm
        while len(df) < explainerCSSE.pop_size + 1: #+1, as the 1st position is used to store the reference individual
            number_cross_repetitions = explainerCSSE.crossover(df, parents, evaluation, number_cross_repetitions)
            number_cross_repetitions_causal = explainerCSSE.crossover(df_causal, parents_causal, evaluation_causal, number_cross_repetitions_causal)

            #A single draw decides mutation for both populations
            mutation_op = rnd.random()
            if mutation_op <= explainerCSSE.mutation_proba:
                explainerCSSE.mutation(df, len(df) - 1, features_range)
                explainerCSSE.mutation(df_causal, len(df_causal) - 1, features_range)

    evaluation = []
    evaluation_causal = []

    #Evaluating the latest generation
    #NOTE(review): unlike inside the loop, df_causal is scored here without apply_causality - confirm this is intended
    explainerCSSE.fitness(df, evaluation, ind_cur_class)
    explainerCSSE.fitness(df_causal, evaluation_causal, ind_cur_class)

    #Order the last generation by aval_norm, ascending
    evaluation.sort(key=lambda individual: individual.aval_norm)
    evaluation_causal.sort(key=lambda individual: individual.aval_norm)

    #Getting the counterfactual set
    contrafactual_set, solution_list = explainerCSSE.getContrafactual(df, evaluation)

    #Getting the counterfactual CAUSAL set
    contrafactual_set_causal, solution_list_causal = explainerCSSE.getContrafactual(df_causal, evaluation_causal)

    df_contrafac_causal = get_contrafac_df_causal(solution_list_causal)

    return [solution_list_causal, solution_list, df_contrafac_causal, original_ind]

In [39]:
def run(original_row):
    """Explain one test instance and analyse each causal counterfactual found.

    Args:
        original_row: Positional index of the instance, forwarded to
            ``get_causal_explain``.

    Returns:
        list: One ``analyse_contrafac`` result per causal counterfactual.
    """
    solutions_causal, _solutions_plain, effects_between_changes, original_ind = get_causal_explain(original_row)
    list_analyse = [
        analyse_contrafac(solution, effects_between_changes, original_ind)
        for solution in solutions_causal
    ]
    print(f"tamanho da list_analyse = {len(list_analyse)}")
    return list_analyse

In [76]:
# Explain the test-set row at position 0 and keep the analysis list returned by run()
run0 = run(0)

Processing...: 100%|██████████| 30/30 [01:09<00:00,  2.31s/it]

tamanho da list_analyse = 5





In [119]:
%time
global_quant_changes = 0
global_quant_causal_changes = 0
global_quant_causal_rules = 0
global_quant_zeros_causal = 0
global_quant_full_causal = 0
global_quant_causal_contrafac = 0

quant_original_instance = 10
for x in range(quant_original_instance):
    print(f"run {x}")
    runs = run(x)
    for content in runs:
        controle = {}
        causal = content[0]
        df = content[1]
        ori = content[2]
        
        num_changes = len(causal)
        global_quant_changes += num_changes
        
        num_causal_rules = len(df)
        global_quant_causal_rules += num_causal_rules
        
        for attr in causal:
            key = attr.column
            if attr.value > ori[key]:
                controle[key] = 'mais'
            else:
                controle[key] = 'menos'

        df_temp = df.copy()
        df_temp['from'] = df['from'].map(controle)
        df_temp['to'] = df['to'].map(controle)
        df_temp['causal'] = df_temp.apply(verificar_condicoes, axis = 1)
        
        causal_finds = df_temp['causal'].sum()
        global_quant_causal_changes += causal_finds

        if causal_finds > 0:
            global_quant_causal_contrafac += 1
        else:
            global_quant_zeros_causal += 1
            display(df_temp)
            print(f"original = {ori}")
            print(f"causal = {causal}")
        
        if causal_finds == num_causal_rules:
            global_quant_full_causal += 1
            if causal_finds > 2:
                display(df_temp)
                print(f"original = {ori}")
                print(f"causal = {causal}")

                
quant_contrafac = quant_original_instance*K

print()
print(f"quantas instancias contrafactuais encontradas = {quant_contrafac}")
print(f"quantidade de mudanças totais = {global_quant_changes}")

print(f"quantas instancias tiveram pelo menos uma relação causal satisfeita = {global_quant_causal_contrafac}/{quant_contrafac}")

print(f"quantas relações causais foram encontradas = {global_quant_causal_rules}")

print(f"quantas relações causais foram satisfeitas = {global_quant_causal_changes}/{global_quant_causal_rules}")

print(f"quantas instâncias não tiveram nenhuma relação causal satisfeita = {global_quant_zeros_causal}/{quant_contrafac}")
print(f"quantas instâncias tiveram TODAS as relaçoes causais satisfeitas = {global_quant_full_causal}/{global_quant_causal_contrafac}")

run 0


Processing...: 100%|██████████| 30/30 [01:00<00:00,  2.03s/it]


tamanho da list_analyse = 5
run 1


Processing...: 100%|██████████| 30/30 [01:03<00:00,  2.10s/it]


tamanho da list_analyse = 5
run 2


Processing...: 100%|██████████| 30/30 [01:01<00:00,  2.06s/it]


tamanho da list_analyse = 4


Unnamed: 0,from,to,effect,probability,causal


original = credit_amount    2171
Name: 66, dtype: int64
causal = [('credit_amount', 10013)]
run 3


Processing...: 100%|██████████| 30/30 [01:04<00:00,  2.14s/it]


tamanho da list_analyse = 5


Unnamed: 0,from,to,effect,probability,causal


original = account_check_status    1
Name: 504, dtype: int64
causal = [('account_check_status', 3)]


Unnamed: 0,from,to,effect,probability,causal


original = credit_history    2
Name: 504, dtype: int64
causal = [('credit_history', 4)]


Unnamed: 0,from,to,effect,probability,causal
278,mais,mais,0.002397,0.01,True
237,menos,mais,-0.008693,0.04,True
230,mais,mais,0.013293,0.05,True
176,menos,mais,-0.053418,0.21,True
161,mais,mais,0.130606,0.29,True
122,mais,mais,0.153827,0.48,True
87,mais,menos,-0.263119,0.7,True
89,mais,menos,-0.096716,0.7,True
86,mais,mais,0.202645,0.71,True
74,mais,menos,-0.19494,0.77,True


original = account_check_status     1
duration_in_month       24
credit_history           2
age                     24
Name: 504, dtype: int64
causal = [('account_check_status', 2), ('duration_in_month', 15), ('credit_history', 3), ('age', 36)]


Unnamed: 0,from,to,effect,probability,causal
231,mais,mais,0.095473,0.04,True
230,mais,mais,0.013293,0.05,True
192,mais,mais,0.027329,0.13,True
71,mais,mais,0.112467,0.78,True
65,mais,mais,0.739591,0.8,True
47,mais,mais,1.793295,0.87,True


original = credit_history        2
present_emp_since     2
age                  24
Name: 504, dtype: int64
causal = [('credit_history', 3), ('present_emp_since', 3), ('age', 66)]
run 4


Processing...: 100%|██████████| 30/30 [01:01<00:00,  2.05s/it]

tamanho da list_analyse = 5





Unnamed: 0,from,to,effect,probability,causal


original = account_check_status    4
Name: 304, dtype: int64
causal = [('account_check_status', 1)]
run 5


Processing...: 100%|██████████| 30/30 [01:01<00:00,  2.06s/it]

tamanho da list_analyse = 5





Unnamed: 0,from,to,effect,probability,causal
142,menos,menos,-40.122427,0.39,False


original = account_check_status       4
credit_amount           3079
Name: 370, dtype: int64
causal = [('account_check_status', 1), ('credit_amount', 438)]


Unnamed: 0,from,to,effect,probability,causal
260,menos,menos,0.160987,0.02,True
241,menos,menos,0.010367,0.03,True
247,menos,menos,0.281349,0.03,True
244,menos,menos,0.000497,0.03,True
180,menos,menos,0.011562,0.19,True
8,menos,menos,0.169039,0.97,True


original = account_check_status    4
savings                 5
personal_status_sex     3
Name: 370, dtype: int64
causal = [('account_check_status', 1), ('savings', 4), ('personal_status_sex', 2)]
run 6


Processing...: 100%|██████████| 30/30 [01:07<00:00,  2.27s/it]


tamanho da list_analyse = 4


Unnamed: 0,from,to,effect,probability,causal


original = savings    5
Name: 50, dtype: int64
causal = [('savings', 2)]
run 7


Processing...: 100%|██████████| 30/30 [00:57<00:00,  1.93s/it]

tamanho da list_analyse = 5





Unnamed: 0,from,to,effect,probability,causal


original = credit_amount    1559
Name: 843, dtype: int64
causal = [('credit_amount', 14600)]


Unnamed: 0,from,to,effect,probability,causal


original = account_check_status    4
Name: 843, dtype: int64
causal = [('account_check_status', 2)]
run 8


Processing...: 100%|██████████| 30/30 [01:02<00:00,  2.08s/it]


tamanho da list_analyse = 3
run 9


Processing...: 100%|██████████| 30/30 [00:59<00:00,  1.98s/it]

tamanho da list_analyse = 5





Unnamed: 0,from,to,effect,probability,causal


original = age    29
Name: 692, dtype: int64
causal = [('age', 24)]


Unnamed: 0,from,to,effect,probability,causal


original = credit_history    2
Name: 692, dtype: int64
causal = [('credit_history', 1)]


Unnamed: 0,from,to,effect,probability,causal


original = credit_amount    2896
Name: 692, dtype: int64
causal = [('credit_amount', 10656)]
quantas instancias contrafactuais encontradas = 50
quantidade de mudanças totais = 111
quantas instancias tiveram pelo menos uma relação causal satisfeita = 35/50
quantas relações causais foram encontradas = 181
quantas relações causais foram satisfeitas = 115.0/181
quantas instâncias não tiveram nenhuma relação causal satisfeita = 11/50
quantas instâncias não tiveram TODAS as relaçoes causais satisfeitas = 30/50


In [None]:
%time
global_quant_changes = 0
global_quant_causal_changes = 0
global_quant_causal_rules = 0
global_quant_zeros_causal = 0
global_quant_full_causal = 0
global_quant_causal_contrafac = 0

quant_original_instance = 10
for x in range(quant_original_instance):
    print(f"run {x}")
    runs = run(x)
    for content in runs:
        controle = {}
        causal = content[0]
        df = content[1]
        ori = content[2]
        
        num_changes = len(causal)
        global_quant_changes += num_changes
        
        num_causal_rules = len(df)
        global_quant_causal_rules += num_causal_rules
        
        for attr in causal:
            key = attr.column
            if attr.value > ori[key]:
                controle[key] = 'mais'
            else:
                controle[key] = 'menos'

        df_temp = df.copy()
        df_temp['from'] = df['from'].map(controle)
        df_temp['to'] = df['to'].map(controle)
        df_temp['causal'] = df_temp.apply(verificar_condicoes, axis = 1)
        
        causal_finds = df_temp['causal'].sum()
        global_quant_causal_changes += causal_finds

        if causal_finds > 0:
            global_quant_causal_contrafac += 1
        else:
            global_quant_zeros_causal += 1
            display(df_temp)
            print(f"original = {ori}")
            print(f"causal = {causal}")
        
        if causal_finds == num_causal_rules:
            global_quant_full_causal += 1
            if causal_finds > 2:
                display(df_temp)
                print(f"original = {ori}")
                print(f"causal = {causal}")

                
quant_contrafac = quant_original_instance*K

print()
print(f"quantas instancias contrafactuais encontradas = {quant_contrafac}")
print(f"quantidade de mudanças totais = {global_quant_changes}")

print(f"quantas instancias tiveram pelo menos uma relação causal satisfeita = {global_quant_causal_contrafac}/{quant_contrafac}")

print(f"quantas relações causais foram encontradas = {global_quant_causal_rules}")

print(f"quantas relações causais foram satisfeitas = {global_quant_causal_changes}/{global_quant_causal_rules}")

print(f"quantas instâncias não tiveram nenhuma relação causal satisfeita = {global_quant_zeros_causal}/{quant_contrafac}")
print(f"quantas instâncias tiveram TODAS as relaçoes causais satisfeitas = {global_quant_full_causal}/{global_quant_causal_contrafac}")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs
run 0


Processing...: 100%|██████████| 30/30 [01:44<00:00,  3.48s/it]


tamanho da list_analyse = 5


Unnamed: 0,from,to,effect,probability,causal
282,mais,menos,-0.006532,0.01,True
233,mais,menos,-0.007936,0.04,True
180,menos,menos,0.1521,0.19,True
120,menos,mais,-0.375009,0.52,True
81,menos,mais,-0.247582,0.7,True
52,menos,menos,0.203373,0.81,True


original = account_check_status     2
duration_in_month       24
credit_history           4
Name: 153, dtype: int64
causal = [('account_check_status', 1), ('duration_in_month', 35), ('credit_history', 1)]
run 1


Processing...:  97%|█████████▋| 29/30 [01:36<00:03,  3.56s/it]