# Regression Counterfactual explanation using DiCE

Dataset: California Housing \
Model: Random Forest Regressor

In [17]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

import dice_ml
from dice_ml import Dice

# Fetch the California Housing dataset and wrap its arrays in labelled
# DataFrames: one for the features, one for the target.
housing_data = fetch_california_housing()
X = pd.DataFrame(data=housing_data.data,
                 columns=housing_data.feature_names)
y = pd.DataFrame(data=housing_data.target,
                 columns=housing_data.target_names)

# Display the (rows, columns) shape of the feature frame.
X.shape

(20640, 8)

In [20]:
# Split data into train and test (the fixed seed keeps the split reproducible).
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Every dataset column is listed as continuous, so the set difference below
# yields an empty categorical list; the categorical branch is kept so the
# pipeline has the same shape as one used for mixed-type data.
continuous_features_housing = housing_data.feature_names
categorical_features = x_train.columns.difference(continuous_features_housing)

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_features_housing),
        ('cat', categorical_transformer, categorical_features)])

# Append the regressor to the preprocessing pipeline.
# Now we have a full prediction pipeline.
# random_state is fixed so the forest (and therefore the counterfactuals
# derived from it) is the same on every Restart & Run All.
regr_housing = Pipeline(steps=[('preprocessor', transformations),
                               ('regressor', RandomForestRegressor(random_state=0))])

# scikit-learn expects a 1-D target of shape (n_samples,); passing the
# single-column DataFrame directly triggers a DataConversionWarning, so the
# target is flattened here. y_train itself is left untouched.
model_housing = regr_housing.fit(x_train, y_train.to_numpy().ravel())

  return fit_method(estimator, *args, **kwargs)


In [24]:
# DiCE data interface: features and target joined column-wise into one frame,
# with the continuous feature names and the outcome column passed explicitly.
d_housing = dice_ml.Data(
    dataframe=pd.concat([X, y], axis=1),
    continuous_features=continuous_features_housing,
    outcome_name=housing_data.target_names[0],
)

# DiCE model interface around the fitted pipeline; model_type declares the
# kind of model being explained.
m_housing = dice_ml.Model(
    model=model_housing,
    backend="sklearn",
    model_type='regressor',
)

In [26]:
# Build the counterfactual explainer from the data and model interfaces,
# selecting the "genetic" generation method.
exp_genetic_housing = dice_ml.Dice(
    d_housing,
    m_housing,
    method="genetic",
)

In [32]:
# Several query rows can be explained in one call; here the rows at
# positions 3 and 4 of the test set are used.
query_instances_housing = x_test.iloc[3:5]

# Request two counterfactuals per query whose predicted outcome falls in the
# desired range.
genetic_housing = exp_genetic_housing.generate_counterfactuals(
    query_instances_housing,
    total_CFs=2,
    desired_range=[3.0, 5.0],
)

# Show each query next to its counterfactuals, displaying only changed values.
genetic_housing.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 2/2 [00:01<00:00,  1.57it/s]

Query instance (original outcome : 0.8795899748802185)





Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,2.4511,37.0,4.992958,1.316901,390.0,2.746479,33.200001,-115.599998,0.87959



Diverse Counterfactual set (new outcome: [3.0, 5.0])


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,10.3682,-,8.1,1.1,-,2.6,33.61,-117.92,4.986429699999989
0,2.9167,43.0,4.6,1.2,-,1.6,34.01,-118.47,4.175560399999996


Query instance (original outcome : 4.084482669830322)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,5.0049,25.0,4.319261,1.039578,649.0,1.712401,37.790001,-122.43,4.084483



Diverse Counterfactual set (new outcome: [3.0, 5.0])


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,-,-,4.3,1.0,-,1.7,-,-,-
0,4.7159,-,4.3,1.0,-,0.7,32.54,-124.35,3.7156517999999994


## Explanation
For this example, we restrict the counterfactual outcomes to the [3.0, 5.0] range. \
The counterfactuals generated from the two samples (x_test[3:5]) do not give a clear picture of whether any single feature increases the median house value. However, Population is left unchanged in every counterfactual, which suggests that it has little impact on the predicted median house value. 