In [1]:
import pandas as pd
sampled_data = pd.read_csv('added_sex_bias.csv')

In [2]:
import numpy as np

def calculate_confusion_matrix(y_true, y_pred, group):
    # Ensure that group is a boolean array for indexing
    group = group.astype(bool)

    # Filter the predictions and true values based on the group
    y_true_group = y_true[group]
    y_pred_group = y_pred[group]

    # Calculate true positives, false positives, true negatives, and false negatives
    tp = np.sum((y_pred_group == 1) & (y_true_group == 1))
    tn = np.sum((y_pred_group == 0) & (y_true_group == 0))
    fp = np.sum((y_pred_group == 1) & (y_true_group == 0))
    fn = np.sum((y_pred_group == 0) & (y_true_group == 1))

    return tp, tn, fp, fn

def EqualOpportunityDifference(y, pred, group_a, group_b):
    tp_a, _, _, fn_a = calculate_confusion_matrix(y, pred, group_a)
    tp_b, _, _, fn_b = calculate_confusion_matrix(y, pred, group_b)
    tpr_a = tp_a / (tp_a + fn_a) if (tp_a + fn_a) > 0 else 0
    tpr_b = tp_b / (tp_b + fn_b) if (tp_b + fn_b) > 0 else 0
    return tpr_b - tpr_a

def FalsePositiveRateBalance(y, pred, group_a, group_b):
    _, tn_a, fp_a, _ = calculate_confusion_matrix(y, pred, group_a)
    _, tn_b, fp_b, _ = calculate_confusion_matrix(y, pred, group_b)
    fpr_a = fp_a / (fp_a + tn_a) if (fp_a + tn_a) > 0 else 0
    fpr_b = fp_b / (fp_b + tn_b) if (fp_b + tn_b) > 0 else 0
    return fpr_b - fpr_a

def EqualisedOdds(y, pred, group_a, group_b):
    return (EqualOpportunityDifference(y, pred, group_a, group_b) +
            FalsePositiveRateBalance(y, pred, group_a, group_b)) / 2

def PredictiveParityDifference(y, pred, group_a, group_b):
    tp_a, _, fp_a, _ = calculate_confusion_matrix(y, pred, group_a)
    tp_b, _, fp_b, _ = calculate_confusion_matrix(y, pred, group_b)
    precision_a = tp_a / (tp_a + fp_a) if (tp_a + fp_a) > 0 else 0
    precision_b = tp_b / (tp_b + fp_b) if (tp_b + fp_b) > 0 else 0
    return precision_b - precision_a

def StatisticalParityDifference(y, pred, group_a, group_b):
    positive_rate_a = pred[group_a].mean()
    positive_rate_b = pred[group_b].mean()
    return positive_rate_b - positive_rate_a


In [3]:
sampled_data['group_a'] = (sampled_data['derived_race'] == 'White').astype(int)
sampled_data['group_b'] = (sampled_data['derived_race'] != 'White').astype(int)
sampled_data['group_c'] = (sampled_data['derived_race'] != 'Black or African American').astype(int)
sampled_data['group_d'] = (sampled_data['derived_race'] == 'Black or African American').astype(int)
sampled_data['group_e'] = (sampled_data['derived_sex'] == 'Male').astype(int)
sampled_data['group_f'] = (sampled_data['derived_sex'] == 'Female').astype(int)
sampled_data['group_g'] = (sampled_data['derived_race'] == 'Asian').astype(int)
sampled_data['group_h'] = (sampled_data['derived_race'] == 'Black').astype(int)
sampled_data['group_i'] = (sampled_data['derived_race'] == 'White').astype(int)
sampled_data['group_j'] = (sampled_data['derived_race'] == 'Asian').astype(int)

group_a = sampled_data['group_a'].copy() 
group_b = sampled_data['group_b'].copy()
group_c = sampled_data['group_c'].copy() 
group_d = sampled_data['group_d'].copy()
group_e = sampled_data['group_e'].copy() 
group_f = sampled_data['group_f'].copy()
group_g = sampled_data['group_g'].copy() 
group_h = sampled_data['group_h'].copy()
group_i = sampled_data['group_i'].copy() 
group_j = sampled_data['group_j'].copy()

bias_metrics = {
    "Equal Opportunity Difference": EqualOpportunityDifference,
    "False Positive Rate Balance": FalsePositiveRateBalance,
    "Equalised Odds": EqualisedOdds,
    "Predictive Parity Difference": PredictiveParityDifference,
    "Statistical Parity Difference": StatisticalParityDifference
}

from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, precision_score, roc_auc_score

# setup the metrics to be computed
from sklearn import metrics
perf_metrics = {"Accuracy": metrics.accuracy_score, 
                "Precision": metrics.precision_score, 
                "Recall": metrics.recall_score,
                "AUC": metrics.roc_auc_score, 
                "F1-Score": metrics.f1_score,
                }
                


In [4]:
from sklearn.model_selection import train_test_split

# Assuming sampled_data is your DataFrame
y = sampled_data['action_taken']

# Exclude both 'action_taken' and 'applicant_race_1' from the features
X = sampled_data.drop(['Unnamed: 0','action_taken', 'derived_race','derived_sex'], axis=1)

test_set = 0.2
seed = 123

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set, random_state=seed, stratify=y)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import pandas as pd

lr = LogisticRegression(random_state=10, solver="lbfgs", penalty="none", max_iter=1000)
mv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

white_metrics_all = pd.DataFrame()
black_metrics_all = pd.DataFrame()
sex_metrics_all = pd.DataFrame()
asianblack_metrics_all = pd.DataFrame()
asianwhite_metrics_all = pd.DataFrame()
k, i = True, 1
i = 1
for (train, test) in mv.split(X, y):
    lr.fit(X.iloc[train], y.iloc[train].values.ravel())
    ypred_prob = lr.predict_proba(X.iloc[test]).ravel()[1::2] # get probabilities
    ypred_class = lr.predict(X.iloc[test])

    # compute performance metrics
    metrics = []
    for pf in perf_metrics.keys():
        if pf in ["AUC", "Brier"]:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_prob)]]
        else:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_class)]]

    # concatenate results
    df_m = pd.DataFrame(metrics, columns=["Metric", "Value"])
    df_m["Fold"] = i
    i += 1
    if k:
        df_metrics = df_m.copy()
        k=0
    else:
        df_metrics = pd.concat([df_metrics, df_m.copy()], axis=0, ignore_index=True)

    # Reset these lists inside the loop for each fold
    white_metrics = []
    black_metrics = []
    sex_metrics = []
    asianblack_metrics = []
    asianwhite_metrics = []

    for bias in bias_metrics.keys():
        white_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_a[test], group_b[test])])
        black_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_c[test], group_d[test])])                                    
        sex_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_e[test], group_f[test])])
        asianblack_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_g[test], group_h[test])])
        asianwhite_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_i[test], group_j[test])])

        # Convert lists to DataFrames before concatenation
        white_df = pd.DataFrame(white_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        black_df = pd.DataFrame(black_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        sex_df = pd.DataFrame(sex_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        asianwhite_df = pd.DataFrame(asianwhite_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        asianblack_df = pd.DataFrame(asianblack_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        

    # Concatenate the new DataFrames with the all metrics DataFrames
    white_metrics_all = pd.concat([white_metrics_all, white_df], axis=0)
    black_metrics_all = pd.concat([black_metrics_all, black_df], axis=0)
    sex_metrics_all = pd.concat([sex_metrics_all, sex_df], axis=0)
    asianblack_metrics_all = pd.concat([asianblack_metrics_all, asianblack_df], axis=0)
    asianwhite_metrics_all = pd.concat([asianwhite_metrics_all, asianwhite_df], axis=0)
    
    i += 1

white_summary = white_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
black_summary = black_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
sex_summary = sex_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
asianblack_summary = asianblack_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
asianwhite_summary = asianwhite_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [8]:
# If 'Value' is not a numeric type, you may need to convert it
if not pd.api.types.is_numeric_dtype(df_metrics['Value']):
    df_metrics['Value'] = pd.to_numeric(df_metrics['Value'], errors='coerce')

# Now try creating the pivot table again
pivot_table = df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])
df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
AUC,0.984604,0.000522
Accuracy,0.951635,0.000738
F1-Score,0.971444,0.000475
Precision,0.982941,0.001102
Recall,0.960217,0.001894


In [None]:
df_metrics.boxplot(column='Value', by='Metric')

In [9]:
print("WHITE/NOT WHITE:")
white_summary

WHITE/NOT WHITE:


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
Equal Opportunity Difference,-0.004201,0.001708
Equalised Odds,0.025488,0.010231
False Positive Rate Balance,0.055177,0.021563
Predictive Parity Difference,-0.017592,0.004013
Statistical Parity Difference,0.129906,0.29048


In [10]:
print("\nBLACK/NOT BLACK:")
black_summary


BLACK/NOT BLACK:


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
Equal Opportunity Difference,-0.021904,0.004508
Equalised Odds,-0.059337,0.003517
False Positive Rate Balance,-0.096769,0.008748
Predictive Parity Difference,0.012413,0.002079
Statistical Parity Difference,0.168685,0.37719


In [11]:
print("\nMALE/FEMALE:")
sex_summary


MALE/FEMALE:


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
Equal Opportunity Difference,0.00264,0.001852
Equalised Odds,-0.011654,0.005981
False Positive Rate Balance,-0.025949,0.011082
Predictive Parity Difference,0.004097,0.002337
Statistical Parity Difference,0.031263,0.069906


In [12]:
print("\nASIAN/WHITE:")
asianwhite_summary


ASIAN/WHITE:


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
Equal Opportunity Difference,0.00852,0.000932
Equalised Odds,0.15505,0.024389
False Positive Rate Balance,0.30158,0.048213
Predictive Parity Difference,-0.03731,0.006416
Statistical Parity Difference,0.147573,0.329984


In [13]:
print("\ASIAN/BLACK:")
asianblack_summary

\ASIAN/BLACK:


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
Equal Opportunity Difference,-0.969442,0.001513
Equalised Odds,-0.679288,0.020921
False Positive Rate Balance,-0.389134,0.041498
Predictive Parity Difference,-0.948622,0.00528
Statistical Parity Difference,0.01738,0.038863


In [None]:

explainer = shap.LinearExplainer(lr, X_train)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")



In [80]:
from xgboost import XGBClassifier
# Initialize the XGBClassifier
xgb_clf = XGBClassifier(use_label_encoder=False)

mv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

import pandas as pd
k, i = True, 1

for (train, test) in mv.split(X, y):
    # fit model
    xgb_clf = xgb_clf.fit(X.iloc[train], y.iloc[train].values.ravel())
    
    # get predictions in the test set
    ypred_prob = xgb_clf.predict_proba(X.iloc[test]).ravel()[1::2] # get probabilities
    ypred_class = xgb_clf.predict(X.iloc[test])
    # compute performance metrics
    metrics = []
    for pf in perf_metrics.keys():
        if pf in ["AUC", "Brier"]:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_prob)]]
        else:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_class)]]

    # concatenate results
    df_m = pd.DataFrame(metrics, columns=["Metric", "Value"])
    df_m["Fold"] = i
    i += 1
    if k:
        df_metrics = df_m.copy()
        k=0
    else:
        df_metrics = pd.concat([df_metrics, df_m.copy()], axis=0, ignore_index=True)

    # compute performance metrics
    white_metrics = []
    black_metrics = []
    sex_metrics = []
    for bias in bias_metrics.keys():
        white_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_a[test], group_b[test])])
        black_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_c[test], group_d[test])])                                    
        sex_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_e[test], group_f[test])])
        asianblack_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_g[test], group_h[test])])
        asianwhite_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_i[test], group_j[test])])

        # Convert lists to DataFrames before concatenation
        white_df = pd.DataFrame(white_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        black_df = pd.DataFrame(black_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        sex_df = pd.DataFrame(sex_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        asianwhite_df = pd.DataFrame(asianwhite_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        asianblack_df = pd.DataFrame(asianblack_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        

    # Concatenate the new DataFrames with the all metrics DataFrames
    white_metrics_all = pd.concat([white_metrics_all, white_df], axis=0)
    black_metrics_all = pd.concat([black_metrics_all, black_df], axis=0)
    sex_metrics_all = pd.concat([sex_metrics_all, sex_df], axis=0)
    asianblack_metrics_all = pd.concat([asianblack_metrics_all, asianblack_df], axis=0)
    asianwhite_metrics_all = pd.concat([asianwhite_metrics_all, asianwhite_df], axis=0)
    
    i += 1

white_summary = white_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
black_summary = black_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
sex_summary = sex_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
asianblack_summary = asianblack_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
asianwhite_summary = asianwhite_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])

In [None]:
# If 'Value' is not a numeric type, you may need to convert it
if not pd.api.types.is_numeric_dtype(df_metrics['Value']):
    df_metrics['Value'] = pd.to_numeric(df_metrics['Value'], errors='coerce')

# Now try creating the pivot table again
pivot_table = df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])
df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])


In [None]:
df_metrics.boxplot(column='Value', by='Metric')

In [81]:
print("WHITE/NOT WHITE:")
white_summary

WHITE/NOT WHITE:


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
Equal Opportunity Difference,-0.001523,0.003754
Equalised Odds,0.015755,0.012902
False Positive Rate Balance,0.033034,0.028244
Predictive Parity Difference,-0.013065,0.005638
Statistical Parity Difference,0.129906,0.273867


In [82]:
print("\nBLACK/NOT BLACK:")
black_summary


BLACK/NOT BLACK:


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
Equal Opportunity Difference,-0.021651,0.00432
Equalised Odds,-0.052048,0.009157
False Positive Rate Balance,-0.082444,0.018537
Predictive Parity Difference,0.007608,0.005758
Statistical Parity Difference,0.168685,0.355619


In [83]:
print("\nMALE/FEMALE:")
sex_summary


MALE/FEMALE:


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
Equal Opportunity Difference,0.002508,0.001623
Equalised Odds,-0.011883,0.006916
False Positive Rate Balance,-0.026274,0.012728
Predictive Parity Difference,0.00412,0.002642
Statistical Parity Difference,0.031263,0.065908


In [84]:
print("\nASIAN/WHITE:")
asianwhite_summary


ASIAN/WHITE:


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
Equal Opportunity Difference,0.014022,0.005265
Equalised Odds,0.116182,0.043858
False Positive Rate Balance,0.218343,0.091746
Predictive Parity Difference,-0.026775,0.013399
Statistical Parity Difference,0.118059,0.276085


In [85]:
print("\ASIAN/BLACK:")
asianblack_summary

\ASIAN/BLACK:


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,Value,Value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2
Equal Opportunity Difference,-0.980034,0.009663
Equalised Odds,-0.644589,0.035926
False Positive Rate Balance,-0.309143,0.080852
Predictive Parity Difference,-0.958727,0.011735
Statistical Parity Difference,0.013904,0.032515


In [None]:

explainer = shap.LinearExplainer(xgb_clf, X_train)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")



In [None]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

mv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

import pandas as pd
k, i = True, 1

for (train, test) in mv.split(X, y):
    # fit model
    random_forest = random_forest.fit(X.iloc[train], y.iloc[train].values.ravel())
    
    # get predictions in the test set
    ypred_prob = random_forest.predict_proba(X.iloc[test]).ravel()[1::2] # get probabilities
    ypred_class = random_forest.predict(X.iloc[test])
    # compute performance metrics
    metrics = []
    for pf in perf_metrics.keys():
        if pf in ["AUC", "Brier"]:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_prob)]]
        else:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_class)]]

    # concatenate results
    df_m = pd.DataFrame(metrics, columns=["Metric", "Value"])
    df_m["Fold"] = i
    i += 1
    if k:
        df_metrics = df_m.copy()
        k=0
    else:
        df_metrics = pd.concat([df_metrics, df_m.copy()], axis=0, ignore_index=True)

    # compute performance metrics
    white_metrics = []
    black_metrics = []
    sex_metrics = []
    for bias in bias_metrics.keys():
        white_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_a[test], group_b[test])])
        black_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_c[test], group_d[test])])                                    
        sex_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_e[test], group_f[test])])
        asianblack_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_g[test], group_h[test])])
        asianwhite_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_i[test], group_j[test])])

        # Convert lists to DataFrames before concatenation
        white_df = pd.DataFrame(white_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        black_df = pd.DataFrame(black_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        sex_df = pd.DataFrame(sex_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        asianwhite_df = pd.DataFrame(asianwhite_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        asianblack_df = pd.DataFrame(asianblack_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        

    # Concatenate the new DataFrames with the all metrics DataFrames
    white_metrics_all = pd.concat([white_metrics_all, white_df], axis=0)
    black_metrics_all = pd.concat([black_metrics_all, black_df], axis=0)
    sex_metrics_all = pd.concat([sex_metrics_all, sex_df], axis=0)
    asianblack_metrics_all = pd.concat([asianblack_metrics_all, asianblack_df], axis=0)
    asianwhite_metrics_all = pd.concat([asianwhite_metrics_all, asianwhite_df], axis=0)
    
    i += 1

white_summary = white_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
black_summary = black_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
sex_summary = sex_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
asianblack_summary = asianblack_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
asianwhite_summary = asianwhite_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])

In [None]:
# If 'Value' is not a numeric type, you may need to convert it
if not pd.api.types.is_numeric_dtype(df_metrics['Value']):
    df_metrics['Value'] = pd.to_numeric(df_metrics['Value'], errors='coerce')

# Now try creating the pivot table again
pivot_table = df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])
df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])


In [None]:
df_metrics.boxplot(column='Value', by='Metric')

In [None]:
print("WHITE/NOT WHITE:")
white_summary

In [None]:
print("\nBLACK/NOT BLACK:")
black_summary

In [None]:
print("\nMALE/FEMALE:")
sex_summary

In [None]:
print("\nASIAN/WHITE:")
asianwhite_summary

In [None]:
print("\ASIAN/BLACK:")
asianblack_summary

In [None]:

explainer = shap.LinearExplainer(random_forest, X_train)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")



In [87]:
# from sklearn.svm import SVC

# # Create an SVM classifier
# svm = SVC(random_state=42)

# mv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

import pandas as pd
k, i = True, 1

from sklearn import svm

svm = svm.SVC(random_state=10, probability=True)

for (train, test) in mv.split(X, y):
    svm = svm.fit(X.iloc[train], y.iloc[train].values.ravel())
    
    ypred_prob = svm.predict_proba(X.iloc[test]).ravel()[1::2] # get probabilities
    ypred_class = svm.predict(X.iloc[test])
    # compute performance metrics
    metrics = []
    for pf in perf_metrics.keys():
        if pf in ["AUC", "Brier"]:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_prob)]]
        else:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_class)]]

    # concatenate results
    df_m = pd.DataFrame(metrics, columns=["Metric", "Value"])
    df_m["Fold"] = i
    i += 1
    if k:
        df_metrics = df_m.copy()
        k=0
    else:
        df_metrics = pd.concat([df_metrics, df_m.copy()], axis=0, ignore_index=True)

    white_metrics = []
    black_metrics = []
    sex_metrics = []
    for bias in bias_metrics.keys():
        white_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_a[test], group_b[test])])
        black_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_c[test], group_d[test])])                                    
        sex_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_e[test], group_f[test])])
        asianblack_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_g[test], group_h[test])])
        asianwhite_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_i[test], group_j[test])])

        # Convert lists to DataFrames before concatenation
        white_df = pd.DataFrame(white_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        black_df = pd.DataFrame(black_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        sex_df = pd.DataFrame(sex_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        asianwhite_df = pd.DataFrame(asianwhite_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        asianblack_df = pd.DataFrame(asianblack_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        

    # Concatenate the new DataFrames with the all metrics DataFrames
    white_metrics_all = pd.concat([white_metrics_all, white_df], axis=0)
    black_metrics_all = pd.concat([black_metrics_all, black_df], axis=0)
    sex_metrics_all = pd.concat([sex_metrics_all, sex_df], axis=0)
    asianblack_metrics_all = pd.concat([asianblack_metrics_all, asianblack_df], axis=0)
    asianwhite_metrics_all = pd.concat([asianwhite_metrics_all, asianwhite_df], axis=0)
    
    i += 1

white_summary = white_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
black_summary = black_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
sex_summary = sex_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
asianblack_summary = asianblack_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
asianwhite_summary = asianwhite_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])

In [None]:
# If 'Value' is not a numeric type, you may need to convert it
if not pd.api.types.is_numeric_dtype(df_metrics['Value']):
    df_metrics['Value'] = pd.to_numeric(df_metrics['Value'], errors='coerce')

# Now try creating the pivot table again
pivot_table = df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])
df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])


In [None]:
df_metrics.boxplot(column='Value', by='Metric')

In [None]:
print("WHITE/NOT WHITE:")
white_summary

In [None]:
print("\nBLACK/NOT BLACK:")
black_summary

In [None]:
print("\nMALE/FEMALE:")
sex_summary

In [None]:
print("\nASIAN/WHITE:")
asianwhite_summary

In [None]:
print("\ASIAN/BLACK:")
asianblack_summary

In [None]:

explainer = shap.LinearExplainer(svm, X_train)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")



In [None]:
import lightgbm as lgb
import numpy as np

parameters = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

lgb_model = lgb.train(parameters,
                  train_data,
                  valid_sets=[valid_data],
                  num_boost_round=5000)

mv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

import pandas as pd
k, i = True, 1

for (train, test) in mv.split(X, y):
    # fit model
    lgb_model = lgb_model.fit(X.iloc[train], y.iloc[train].values.ravel())
    
    # get predictions in the test set
    ypred_prob = lgb_model.predict_proba(X.iloc[test]).ravel()[1::2] # get probabilities
    ypred_class = lgb_model.predict(X.iloc[test])
    # compute performance metrics
    metrics = []
    for pf in perf_metrics.keys():
        if pf in ["AUC", "Brier"]:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_prob)]]
        else:
            metrics += [[pf, perf_metrics[pf](y.iloc[test].values.ravel(), ypred_class)]]

    # concatenate results
    df_m = pd.DataFrame(metrics, columns=["Metric", "Value"])
    df_m["Fold"] = i
    i += 1
    if k:
        df_metrics = df_m.copy()
        k=0
    else:
        df_metrics = pd.concat([df_metrics, df_m.copy()], axis=0, ignore_index=True)

    # compute performance metrics
    white_metrics = []
    black_metrics = []
    sex_metrics = []
    for bias in bias_metrics.keys():
        white_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_a[test], group_b[test])])
        black_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_c[test], group_d[test])])                                    
        sex_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_e[test], group_f[test])])
        asianblack_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_g[test], group_h[test])])
        asianwhite_metrics.append([bias, bias_metrics[bias](y.iloc[test].values.ravel(), ypred_class,
                                        group_i[test], group_j[test])])

        # Convert lists to DataFrames before concatenation
        white_df = pd.DataFrame(white_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        black_df = pd.DataFrame(black_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        sex_df = pd.DataFrame(sex_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        asianwhite_df = pd.DataFrame(asianwhite_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        asianblack_df = pd.DataFrame(asianblack_metrics, columns=["Metric", "Value"]).assign(Fold=i)
        

    # Concatenate the new DataFrames with the all metrics DataFrames
    white_metrics_all = pd.concat([white_metrics_all, white_df], axis=0)
    black_metrics_all = pd.concat([black_metrics_all, black_df], axis=0)
    sex_metrics_all = pd.concat([sex_metrics_all, sex_df], axis=0)
    asianblack_metrics_all = pd.concat([asianblack_metrics_all, asianblack_df], axis=0)
    asianwhite_metrics_all = pd.concat([asianwhite_metrics_all, asianwhite_df], axis=0)
    
    i += 1

white_summary = white_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
black_summary = black_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
sex_summary = sex_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
asianblack_summary = asianblack_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])
asianwhite_summary = asianwhite_metrics_all.pivot_table(index='Metric', values='Value', aggfunc=['mean', 'std'])

In [None]:
# If 'Value' is not a numeric type, you may need to convert it
if not pd.api.types.is_numeric_dtype(df_metrics['Value']):
    df_metrics['Value'] = pd.to_numeric(df_metrics['Value'], errors='coerce')

# Now try creating the pivot table again
pivot_table = df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])
df_metrics.pivot_table(index="Metric", values="Value", aggfunc=["mean", "std"])


In [None]:
df_metrics.boxplot(column='Value', by='Metric')

In [None]:
print("WHITE/NOT WHITE:")
white_summary

In [None]:
print("\nBLACK/NOT BLACK:")
black_summary

In [None]:
print("\nMALE/FEMALE:")
sex_summary

In [None]:
print("\nASIAN/WHITE:")
asianwhite_summary

In [None]:
print("\ASIAN/BLACK:")
asianblack_summary

In [None]:

explainer = shap.LinearExplainer(lgb_model, X_train)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type="bar")

