# Environment setup

In [1]:
import sys
from pprint import pprint

# Add the parent directory to the module search path.
# NOTE(review): presumably needed for the local `utils` / `KnowledgeBase` imports below — confirm.
sys.path.append("../")
from aif360.datasets import AdultDataset, GermanDataset, CompasDataset

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from fairlearn.metrics import demographic_parity_ratio
import wandb
from tqdm import tqdm

import numpy as np
import os
import logging
import warnings
# Only show log records of level ERROR and above.
logging.basicConfig(level=logging.ERROR)
# NOTE(review): duplicate of the sys.path.append("../") above; the path ends up listed twice.
sys.path.append("../")
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")
# Must be set before TensorFlow is imported so that it applies to TensorFlow's native logging.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
import tensorflow as tf

# Project-local modules (resolved through the sys.path entries added above).
from utils import StandardScaleData_ExcludingFeature, LTNOps
import ltn
import KnowledgeBase

# NOTE(review): repeats the `import warnings` / filterwarnings("ignore") pair from earlier in this cell.
import warnings
warnings.filterwarnings("ignore")

`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df

In [2]:
# Pin every source of randomness so that results are repeatable.
SEED = 42

# Export the hash seed. The running interpreter reads PYTHONHASHSEED at start-up,
# so this assignment only reaches processes launched from this kernel.
os.environ["PYTHONHASHSEED"] = f"{SEED}"

# A single call seeds Python's built-in `random`, NumPy (and therefore
# scikit-learn's default RNG) and TensorFlow.
tf.keras.utils.set_random_seed(SEED)

# Request deterministic TensorFlow ops (available from TF 2.8 onwards, see
# https://www.tensorflow.org/api_docs/python/tf/config/experimental/enable_op_determinism).
tf.config.experimental.enable_op_determinism()

In [3]:
# Experiment-tracking project name.
project = "LTN - Demographic parity cross validation"

# Hyperparameters / run description; later cells add further keys to this dict.
wandb_hp = {
    'dataset': 'compas',
    'sensitive_feature': 'sex',
    'hidden_layer_sizes': (128, 256, 128),
}

# Dataset

In [4]:
# Load the COMPAS dataset through AIF360, using the constructor's default arguments.
dataset_orig = CompasDataset()



In [5]:
# DataFrame view of the dataset plus a dict of its attributes; later cells read
# the feature names and the protected-attribute descriptions from `attributes`.
df, attributes = dataset_orig.convert_to_dataframe()
# Dataset metadata; later cells read 'protected_attribute_maps' and 'label_maps' from it.
metadata = dataset_orig.metadata

In [6]:
# Protected attributes
#
# Lookup keyed by protected-attribute name. Each entry holds:
#   'name'         - the attribute name,
#   'index'        - its column position in the feature matrix,
#   'privileged'   - the first privileged value listed for THIS attribute,
#   'unprivileged' - the first unprivileged value listed for THIS attribute,
#   'maps'         - the value -> human-readable label mapping from the metadata.
# The privileged/unprivileged lists are parallel to 'protected_attribute_names',
# so they are indexed by the attribute position k (not by a fixed 0, which would
# give every attribute the values of the first one).
protected_attributes = {
    name: {
        'name': name,
        'index': attributes['feature_names'].index(name),
        'privileged': attributes['privileged_protected_attributes'][k][0],
        'unprivileged': attributes['unprivileged_protected_attributes'][k][0],
        'maps': metadata['protected_attribute_maps'][k],
    }
    for k, name in enumerate(attributes['protected_attribute_names'])
}

pprint(protected_attributes)

{'race': {'index': 2,
          'maps': {0.0: 'Not Caucasian', 1.0: 'Caucasian'},
          'name': 'race',
          'privileged': 1.0,
          'unprivileged': 0.0},
 'sex': {'index': 0,
         'maps': {0.0: 'Male', 1.0: 'Female'},
         'name': 'sex',
         'privileged': 1.0,
         'unprivileged': 0.0}}


In [7]:
# Raw label mapping (label value -> description) taken from the dataset metadata.
raw_label_map = metadata['label_maps'][0]
print("Metadata label_map:", raw_label_map)

# The first key of the raw mapping is used as the positive label and the second
# as the negative one, i.e. this relies on the key order of the metadata dict.
label_keys = list(raw_label_map)
label_map = {"positive": label_keys[0], "negative": label_keys[1]}
print("Custom label_map:", label_map)

Metadata label_map: {1.0: 'Did recid.', 0.0: 'No recid.'}
Custom label_map: {'positive': 1.0, 'negative': 0.0}


In [8]:
# Five stratified shuffle splits (70% train / 30% test), stratified on the label.
# random_state is pinned to SEED so the splits do not depend on how much of the
# global NumPy random stream was consumed before this cell, nor change when the
# cell is re-run on its own.
labels_flat = np.squeeze(dataset_orig.labels)  # squeezed once, reused for every split
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=SEED)

# Each entry is (Xtrain, Xtest, Ytrain, Ytest) for one split.
sss_splits = []
for train_index, test_index in sss.split(dataset_orig.features, labels_flat):
    Xtrain = dataset_orig.features[train_index]
    Xtest = dataset_orig.features[test_index]
    Ytrain = labels_flat[train_index]
    Ytest = labels_flat[test_index]
    sss_splits.append((Xtrain, Xtest, Ytrain, Ytest))

In [9]:
import statistics

# Column position of the chosen sensitive feature in the feature matrix.
idx = protected_attributes[wandb_hp['sensitive_feature']]['index']

# The ground-truth labels are passed as both y_true and y_pred, so each ratio
# describes the labels of the split itself rather than any model's predictions.
train_ratios = [
    demographic_parity_ratio(Ytrain, Ytrain, sensitive_features=Xtrain[:, idx])
    for Xtrain, _, Ytrain, _ in sss_splits
]
test_ratios = [
    demographic_parity_ratio(Ytest, Ytest, sensitive_features=Xtest[:, idx])
    for _, Xtest, _, Ytest in sss_splits
]

# Average over the splits and record the baselines alongside the hyperparameters.
train_dp = statistics.mean(train_ratios)
test_dp = statistics.mean(test_ratios)
wandb_hp['trainset_demographic_parity'] = train_dp
wandb_hp['testset_demographic_parity'] = test_dp

print(f"Trainset demographic parity: {train_dp}")
print(f"Testset demographic parity: {test_dp}")

Trainset demographic parity: 0.7306092680142173
Testset demographic parity: 0.7414106228307491


In [10]:
# Display the feature-column index of the selected sensitive attribute.
idx

0

## Knowledge base definition and training

In [11]:
# Train one knowledge base per stratified split and accumulate its metrics.
ltnOps = LTNOps(ltn.fuzzy_ops.Implies_Reichenbach(), 1, 2)

# Running sums over the splits; the next cell turns them into averages.
train_accuracy = 0
train_di = 0
test_accuracy = 0
test_di = 0

# Settings that do not change between splits are resolved once, outside the loop.
sensitive_attr = protected_attributes[wandb_hp['sensitive_feature']]
wandb_hp['learning_rate'] = 0.001
wandb_hp['epochs'] = 1000

for i, (Xtrain, Xtest, Ytrain, Ytest) in enumerate(sss_splits):

    print(f'Split #{i+1}')

    # Scale the features of this split; the sensitive-feature column index is
    # passed so the helper can leave that column out of the scaling.
    Xtrain, Xtest, scaler = StandardScaleData_ExcludingFeature(
        Xtrain, Xtest, sensitive_attr['index'])

    # Fresh knowledge base (and therefore fresh model) for every split.
    kb = KnowledgeBase.KnowledgeBase(
        Xtrain, Xtest,
        Ytrain, Ytest,
        label_map,
        sensitive_attr['privileged'],
        sensitive_attr['unprivileged'],
        hidden_layer_sizes=wandb_hp['hidden_layer_sizes'],
        fuzzy_ops=ltnOps,
        sensitive_feature_index=sensitive_attr['index'],
        config_file='./KnowledgeBaseAxioms.json'
    )

    # New optimizer per split so no optimizer state carries over between splits.
    optimizer = tf.keras.optimizers.Adam(learning_rate=wandb_hp['learning_rate'])
    wandb_hp['optimizer'] = optimizer.__class__

    # Run description for Weights & Biases. It is only assembled here (and its
    # name printed); this cell does not pass it to wandb.init.
    wandb_init = dict(
        project=project,
        name=f"{' | '.join([ax['name'] for ax in kb.config if ax['infos']['training']]) }",
        entity="albezjelt",
        config={
            **wandb_hp,
            **kb.axioms,
            'weights': kb.weights,
            'config.json': kb.config},
        reinit=True
    )

    print(wandb_init['name'])

    for epoch in tqdm(range(wandb_hp['epochs'])):
        with tf.GradientTape() as tape:
            # NOTE(review): train_step() presumably returns the aggregated axiom
            # satisfaction in [0, 1], so the loss is its complement — confirm in KnowledgeBase.
            loss = 1. - kb.train_step()  # type: ignore
        grads = tape.gradient(loss, kb.trainable_variables)
        optimizer.apply_gradients(zip(grads, kb.trainable_variables))

    # Query the logs once per split so that all four metrics come from the same
    # evaluation instead of recomputing it for every metric.
    logs = kb.get_logs()
    train_accuracy += logs['train_classification_metrics']['accuracy']
    test_accuracy += logs['test_classification_metrics']['accuracy']
    train_di += logs['fairness_metrics']['train_demographic_parity_ratio']
    test_di += logs['fairness_metrics']['test_demographic_parity_ratio']
    

Split #1
axiom_positive_class | axiom_negative_class


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
100%|██████████| 1000/1000 [00:08<00:00, 113.05it/s]


Split #2
axiom_positive_class | axiom_negative_class


100%|██████████| 1000/1000 [00:07<00:00, 141.24it/s]


Split #3
axiom_positive_class | axiom_negative_class


100%|██████████| 1000/1000 [00:07<00:00, 139.68it/s]


Split #4
axiom_positive_class | axiom_negative_class


100%|██████████| 1000/1000 [00:07<00:00, 139.90it/s]


Split #5
axiom_positive_class | axiom_negative_class


100%|██████████| 1000/1000 [00:07<00:00, 140.13it/s]


In [12]:
# Average the accumulated per-split metrics. The divisor is taken from the
# actual number of splits so it stays correct if n_splits is changed.
n_splits = len(sss_splits)
print('train acc', train_accuracy / n_splits)
print('train di', train_di / n_splits)
print('test acc', test_accuracy / n_splits)
print('test di', test_di / n_splits)

train acc 0.7436978683966636
train di 0.5285093942903112
test acc 0.6549972987574284
test di 0.5266596732358493
