In [1]:
import pandas as pd
sampled_data = pd.read_csv('sex_dataset.csv')

In [2]:
import numpy as np

def calculate_confusion_matrix(y_true, y_pred, group):
    group = group.astype(bool)

    y_true_group = y_true[group]
    y_pred_group = y_pred[group]

    # Calculate true positives, false positives, true negatives, and false negatives
    tp = np.sum((y_pred_group == 1) & (y_true_group == 1))
    tn = np.sum((y_pred_group == 0) & (y_true_group == 0))
    fp = np.sum((y_pred_group == 1) & (y_true_group == 0))
    fn = np.sum((y_pred_group == 0) & (y_true_group == 1))

    return tp, tn, fp, fn

def EqualOpportunityDifference(y, pred, group_a, group_b):
    tp_a, _, _, fn_a = calculate_confusion_matrix(y, pred, group_a)
    tp_b, _, _, fn_b = calculate_confusion_matrix(y, pred, group_b)
    tpr_a = tp_a / (tp_a + fn_a) if (tp_a + fn_a) > 0 else 0
    tpr_b = tp_b / (tp_b + fn_b) if (tp_b + fn_b) > 0 else 0
    return tpr_b - tpr_a

def FalsePositiveRateBalance(y, pred, group_a, group_b):
    _, tn_a, fp_a, _ = calculate_confusion_matrix(y, pred, group_a)
    _, tn_b, fp_b, _ = calculate_confusion_matrix(y, pred, group_b)
    fpr_a = fp_a / (fp_a + tn_a) if (fp_a + tn_a) > 0 else 0
    fpr_b = fp_b / (fp_b + tn_b) if (fp_b + tn_b) > 0 else 0
    return fpr_b - fpr_a

def EqualisedOdds(y, pred, group_a, group_b):
    return (EqualOpportunityDifference(y, pred, group_a, group_b) +
            FalsePositiveRateBalance(y, pred, group_a, group_b)) / 2

def PredictiveParityDifference(y, pred, group_a, group_b):
    tp_a, _, fp_a, _ = calculate_confusion_matrix(y, pred, group_a)
    tp_b, _, fp_b, _ = calculate_confusion_matrix(y, pred, group_b)
    precision_a = tp_a / (tp_a + fp_a) if (tp_a + fp_a) > 0 else 0
    precision_b = tp_b / (tp_b + fp_b) if (tp_b + fp_b) > 0 else 0
    return precision_b - precision_a

def StatisticalParityDifference(y, pred, group_a, group_b):
    positive_rate_a = pred[group_a].mean()
    positive_rate_b = pred[group_b].mean()
    return positive_rate_b - positive_rate_a


In [3]:
group_e = (sampled_data['derived_sex'] == 'Male').astype(int)
group_f = (sampled_data['derived_sex'] == 'Female').astype(int)

bias_metrics = {
    "Equal Opportunity Difference": EqualOpportunityDifference,
    "False Positive Rate Balance": FalsePositiveRateBalance,
    "Equalised Odds": EqualisedOdds,
    "Predictive Parity Difference": PredictiveParityDifference,
    "Statistical Parity Difference": StatisticalParityDifference
}

from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, precision_score, roc_auc_score

# setup the metrics to be computed
from sklearn import metrics
perf_metrics = {"Accuracy": metrics.accuracy_score, 
                "Precision": metrics.precision_score, 
                "Recall": metrics.recall_score,
                "AUC": metrics.roc_auc_score, 
                "F1-Score": metrics.f1_score,
                }
                


In [4]:
from sklearn.model_selection import train_test_split

y = sampled_data['action_taken']

X = sampled_data.drop(['Unnamed: 0','action_taken', 'derived_race','derived_sex'], axis=1)

test_set = 0.2
seed = 123

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set, random_state=seed, stratify=y)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import pandas as pd

lr = LogisticRegression(random_state=10, solver="lbfgs", penalty="none", max_iter=1000)
mv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

sex_metrics_all = pd.DataFrame()
k, i = True, 1
i = 1
for (train, test) in mv.split(X, y):
    lr.fit(X.iloc[train], y.iloc[train].values.ravel())
    ypred_prob = lr.predict_proba(X.iloc[test]).ravel()[1::2] # get probabilities
    ypred_class = lr.predict(X.iloc[test])

    # compute performance metrics
    metrics = []
    for pf in perf_metrics.keys():
        if pf == "AUC":
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_prob)]]
        else:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_class)]]

    # concatenate results
    df_m = pd.DataFrame(metrics, columns=["Metric", "Value"])
    df_m["Fold"] = i
    i += 1
    if k:
        df_metrics = df_m.copy()
        k=0
    else:
        df_metrics = pd.concat([df_metrics, df_m.copy()], axis=0, ignore_index=True)

    # Reset these lists inside the loop for each fold
    sex_metrics = []

    for bias in bias_metrics.keys():                                   
        sex_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_e[test], group_f[test])])

        # Convert lists to DataFrames before concatenation
        sex_df = pd.DataFrame(sex_metrics, columns=["Metric", "Value"]).assign(Fold=i)



    sex_metrics_all = pd.concat([sex_metrics_all, sex_df], axis=0)

    i += 1


sex_summary = sex_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [8]:
if not pd.api.types.is_numeric_dtype(df_metrics['Value']):
    df_metrics['Value'] = pd.to_numeric(df_metrics['Value'], errors='coerce')

pivot_table = df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])
df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
AUC,0.984604,0.000522
Accuracy,0.951635,0.000738
F1-Score,0.971444,0.000475
Precision,0.982941,0.001102
Recall,0.960217,0.001894


In [None]:
df_metrics.boxplot(column='Value', by='Metric')

In [11]:
print("\nMALE/FEMALE:")
sex_summary


MALE/FEMALE:


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
Equal Opportunity Difference,0.00264,0.001852
Equalised Odds,-0.011654,0.005981
False Positive Rate Balance,-0.025949,0.011082
Predictive Parity Difference,0.004097,0.002337
Statistical Parity Difference,0.031263,0.069906


In [None]:

explainer = shap.LinearExplainer(lr, X_train)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")



In [80]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(use_label_encoder=False)

mv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

import pandas as pd
k, i = True, 1

for (train, test) in mv.split(X, y):
    # fit model
    xgb_clf = xgb_clf.fit(X.iloc[train], y.iloc[train].values.ravel())
    
    # get predictions in the test set
    ypred_prob = xgb_clf.predict_proba(X.iloc[test]).ravel()[1::2] # get probabilities
    ypred_class = xgb_clf.predict(X.iloc[test])
    # compute performance metrics
    metrics = []
    for pf in perf_metrics.keys():
        if pf in ["AUC", "Brier"]:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_prob)]]
        else:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_class)]]

    # concatenate results
    df_m = pd.DataFrame(metrics, columns=["Metric", "Value"])
    df_m["Fold"] = i
    i += 1
    if k:
        df_metrics = df_m.copy()
        k=0
    else:
        df_metrics = pd.concat([df_metrics, df_m.copy()], axis=0, ignore_index=True)

    sex_metrics = []
    for bias in bias_metrics.keys():                                  
        sex_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_e[test], group_f[test])])

        # Convert lists to DataFrames before concatenation
        sex_df = pd.DataFrame(sex_metrics, columns=["Metric", "Value"]).assign(Fold=i)

    # Concatenate the new DataFrames with the all metrics DataFrames
    sex_metrics_all = pd.concat([sex_metrics_all, sex_df], axis=0)

    i += 1

sex_summary = sex_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])

In [None]:
if not pd.api.types.is_numeric_dtype(df_metrics['Value']):
    df_metrics['Value'] = pd.to_numeric(df_metrics['Value'], errors='coerce')

pivot_table = df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])
df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])


In [None]:
df_metrics.boxplot(column='Value', by='Metric')

In [83]:
print("\nMALE/FEMALE:")
sex_summary


MALE/FEMALE:


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
Equal Opportunity Difference,0.002508,0.001623
Equalised Odds,-0.011883,0.006916
False Positive Rate Balance,-0.026274,0.012728
Predictive Parity Difference,0.00412,0.002642
Statistical Parity Difference,0.031263,0.065908


In [None]:

explainer = shap.LinearExplainer(xgb_clf, X_train)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")



In [None]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

mv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

import pandas as pd
k, i = True, 1

for (train, test) in mv.split(X, y):
    # fit model
    random_forest = random_forest.fit(X.iloc[train], y.iloc[train].values.ravel())
    
    # get predictions in the test set
    ypred_prob = random_forest.predict_proba(X.iloc[test]).ravel()[1::2] # get probabilities
    ypred_class = random_forest.predict(X.iloc[test])
    # compute performance metrics
    metrics = []
    for pf in perf_metrics.keys():
        if pf == "AUC":
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_prob)]]
        else:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_class)]]

    # concatenate results
    df_m = pd.DataFrame(metrics, columns=["Metric", "Value"])
    df_m["Fold"] = i
    i += 1
    if k:
        df_metrics = df_m.copy()
        k=0
    else:
        df_metrics = pd.concat([df_metrics, df_m.copy()], axis=0, ignore_index=True)

    # compute performance metrics
    sex_metrics = []
    for bias in bias_metrics.keys():                                
        sex_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_e[test], group_f[test])])
        # Convert lists to DataFrames before concatenation
        sex_df = pd.DataFrame(sex_metrics, columns=["Metric", "Value"]).assign(Fold=i)
 

    # Concatenate the new DataFrames with the all metrics DataFrames
    sex_metrics_all = pd.concat([sex_metrics_all, sex_df], axis=0)

    i += 1

sex_summary = sex_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])

In [None]:

if not pd.api.types.is_numeric_dtype(df_metrics['Value']):
    df_metrics['Value'] = pd.to_numeric(df_metrics['Value'], errors='coerce')

pivot_table = df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])
df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])


In [None]:
df_metrics.boxplot(column='Value', by='Metric')

In [None]:
print("\nMALE/FEMALE:")
sex_summary

In [None]:

explainer = shap.LinearExplainer(random_forest, X_train)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")



In [87]:
import pandas as pd
k, i = True, 1

from sklearn import svm

svm = svm.SVC(random_state=10, probability=True)

for (train, test) in mv.split(X, y):
    svm = svm.fit(X.iloc[train], y.iloc[train].values.ravel())
    
    ypred_prob = svm.predict_proba(X.iloc[test]).ravel()[1::2] # get probabilities
    ypred_class = svm.predict(X.iloc[test])
    # compute performance metrics
    metrics = []
    for pf in perf_metrics.keys():
        if pf in ["AUC", "Brier"]:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_prob)]]
        else:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_class)]]

    # concatenate results
    df_m = pd.DataFrame(metrics, columns=["Metric", "Value"])
    df_m["Fold"] = i
    i += 1
    if k:
        df_metrics = df_m.copy()
        k=0
    else:
        df_metrics = pd.concat([df_metrics, df_m.copy()], axis=0, ignore_index=True)

    sex_metrics = []
    for bias in bias_metrics.keys():                                   
        sex_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_e[test], group_f[test])])
        # Convert lists to DataFrames before concatenation
        sex_df = pd.DataFrame(sex_metrics, columns=["Metric", "Value"]).assign(Fold=i)


    # Concatenate the new DataFrames with the all metrics DataFrames
    sex_metrics_all = pd.concat([sex_metrics_all, sex_df], axis=0)

    i += 1

sex_summary = sex_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])

In [None]:
if not pd.api.types.is_numeric_dtype(df_metrics['Value']):
    df_metrics['Value'] = pd.to_numeric(df_metrics['Value'], errors='coerce')

pivot_table = df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])
df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])


In [None]:
df_metrics.boxplot(column='Value', by='Metric')

In [None]:
print("\nMALE/FEMALE:")
sex_summary

In [None]:

explainer = shap.LinearExplainer(svm, X_train)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")



In [None]:
import lightgbm as lgb
import numpy as np

parameters = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

lgb_model = lgb.train(parameters,
                  train_data,
                  valid_sets=[valid_data],
                  num_boost_round=5000)

mv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

import pandas as pd
k, i = True, 1

for (train, test) in mv.split(X, y):
    # fit model
    lgb_model = lgb_model.fit(X.iloc[train], y.iloc[train].values.ravel())
    
    # get predictions in the test set
    ypred_prob = lgb_model.predict_proba(X.iloc[test]).ravel()[1::2] # get probabilities
    ypred_class = lgb_model.predict(X.iloc[test])
    # compute performance metrics
    metrics = []
    for pf in perf_metrics.keys():
        if pf in ["AUC", "Brier"]:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_prob)]]
        else:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_class)]]

    # concatenate results
    df_m = pd.DataFrame(metrics, columns=["Metric", "Value"])
    df_m["Fold"] = i
    i += 1
    if k:
        df_metrics = df_m.copy()
        k=0
    else:
        df_metrics = pd.concat([df_metrics, df_m.copy()], axis=0, ignore_index=True)

    # compute performance metrics
    sex_metrics = []
    for bias in bias_metrics.keys():                             
        sex_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_e[test], group_f[test])])
        # Convert lists to DataFrames before concatenation
        sex_df = pd.DataFrame(sex_metrics, columns=["Metric", "Value"]).assign(Fold=i)


    # Concatenate the new DataFrames with the all metrics DataFrames
    sex_metrics_all = pd.concat([sex_metrics_all, sex_df], axis=0)
    
    i += 1

sex_summary = sex_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])

In [None]:
if not pd.api.types.is_numeric_dtype(df_metrics['Value']):
    df_metrics['Value'] = pd.to_numeric(df_metrics['Value'], errors='coerce')

pivot_table = df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])
df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])


In [None]:
df_metrics.boxplot(column='Value', by='Metric')

In [None]:
print("\nMALE/FEMALE:")
sex_summary

In [None]:

explainer = shap.LinearExplainer(lgb_model, X_train)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")

