# 2. Genetic Algorithm

Here, we show how DiCE can be used to generate counterfactuals (CFs) for any ML model by using a genetic algorithm to find the best counterfactuals close to the query point. The genetic algorithm converges quickly and promotes diverse counterfactuals.

In [1]:
from utils import helpers  # helper functions

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

from model_interfaces import base_model
import data, dice
import model as dice_model

C:\Users\diddy\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
C:\Users\diddy\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:
# Load the adult income data from the CSV next to the notebook and
# preview the first rows as the cell output.
csv_path = 'adultdataset.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,28,Private,Bachelors,Single,White-Collar,White,Female,60,0
1,30,Self-Employed,Assoc,Married,Professional,White,Male,65,1
2,32,Private,Some-college,Married,White-Collar,White,Male,50,0
3,20,Private,Some-college,Single,Service,White,Female,35,0
4,41,Self-Employed,Some-college,Married,White-Collar,White,Male,50,0


In [3]:
# Separate the prediction target from the feature columns.
target = dataset["income"]
datasetX = dataset.drop(columns="income")

# Hold out 20% of the rows for testing; the split is stratified on the
# target and uses a fixed seed so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    datasetX, target, test_size=0.2, random_state=0, stratify=target
)

# Every feature column not listed as numerical is treated as categorical.
numerical = ["age", "hours_per_week"]
categorical = x_train.columns.difference(numerical)

# Per-type preprocessing: standard-scale the numeric columns and one-hot
# encode the categorical ones (handle_unknown='ignore' on the encoder).
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)
transformations = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical),
        ("cat", categorical_transformer, categorical),
    ]
)

# Full prediction pipeline: preprocessing followed by a LightGBM regressor,
# fitted on the training partition.
clf = Pipeline(
    steps=[
        ("preprocessor", transformations),
        ("regressor", LGBMRegressor()),
    ]
)
model = clf.fit(x_train, y_train)

In [4]:
# Set up the DiCE genetic explainer on top of the fitted sklearn pipeline.

# Data interface: the full dataframe, its continuous columns, and the outcome column.
d = data.Data(
    dataframe=dataset,
    continuous_features=['age', 'hours_per_week'],
    outcome_name='income',
)

# Model interface: the trained pipeline, declared as a regressor on the sklearn backend.
m = dice_model.Model(
    model=model,
    backend="sklearn",
    model_type='regressor',
)

# Explainer configured to use the genetic method.
exp_genetic = dice.Dice(d, m, method='genetic')

In [7]:
# Generate counterfactuals for the first held-out row, asking for
# predictions that land inside the desired range.
query_instances = x_test[0:1]
dice_exp_genetic = exp_genetic.generate_counterfactuals(
    query_instances,
    desired_range=[0.6, 0.8],
    total_CFs=1000,  # large request: the recorded run of this cell took about 10 minutes
    proximity_weight=0.2,
    sparsity_weight=0.2,
    # NOTE(review): a class-style option passed together with a regressor and
    # desired_range; presumably unused in that mode, confirm before removing.
    desired_class="opposite",
    verbose=True,
)

# Show the query instance and the counterfactuals, displaying only changed features.
dice_exp_genetic.visualize_as_dataframe(show_only_changes=True)

  0%|          | 0/1 [00:00<?, ?it/s]

Initializing initial parameters to the genetic algorithm...
Initialization complete! Generating counterfactuals...


100%|██████████| 1/1 [10:15<00:00, 615.64s/it]

Diverse Counterfactuals found! total time taken: 10 min 15 sec
Query instance (original outcome: 0)





Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,29,Private,HS-grad,Married,Blue-Collar,White,Female,38,0.159359



Diverse Counterfactual set (new outcome: [0.6, 0.8])


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,30.0,-,Masters,-,Professional,-,-,-,0.6851444602955848
1,30.0,-,Bachelors,-,Professional,-,-,-,0.6207997774580682
2,-,-,Bachelors,-,White-Collar,-,-,42.0,0.6948654799822137
3,-,-,Bachelors,-,White-Collar,-,-,43.0,0.6948654799822137
4,36.0,-,-,-,White-Collar,-,-,40.0,0.6464906263702314
...,...,...,...,...,...,...,...,...,...
995,35.0,-,Bachelors,-,Professional,-,Male,50.0,0.7110486767888322
996,35.0,-,Some-college,-,White-Collar,-,Male,50.0,0.642915306584574
997,35.0,-,Masters,-,Professional,-,Male,50.0,0.6867993999286414
998,35.0,-,Bachelors,-,Sales,-,Male,50.0,0.7576374895719927


In [8]:
# Display the average loss values recorded on the explainer during the last
# generate_counterfactuals call (presumably one entry per generation; confirm
# against the explainer implementation).
exp_genetic.average_loss_list

[1248.0997634842583,
 1205.2303340084175,
 1181.2890793040635,
 1134.586941797585,
 1081.366506905614,
 1079.8701550952087]

In [9]:
# Optional export, left disabled: uncomment the line below to write the
# explainer's final counterfactual dataframe to a CSV file.
#exp_genetic.final_cfs_df.to_csv("exp_genetic_1000.csv", index=False)