# Veritas Fairness Assessment - Life Insurance Underwriting Study (sample code)



This notebook includes samples of code used in the analysis conducted during the life insurance underwriting case study.

It is applicable to insurance underwriting datasets including a life insurance dataset available on
[kaggle](https://www.kaggle.com/c/prudential-life-insurance-assessment/data)

## License

Written by Sankarshan Mridha (Swiss Re) and Laura Alvarez (Accenture) as an extension to Phase 1 Credit Scoring Use Case code https://github.com/veritas-project/phase1/tree/main/credit_scoring 

Contact email: Veritas@mas.gov.sg


Copyright © 2021 Monetary Authority of Singapore

Licensed under the Apache License, Version 2.0 (the "License"); you may not use
this file except in compliance with the License. You may obtain a copy of the
License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed
under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

## Imports

In [None]:
# Core Packages
import os
import sys

import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, brier_score_loss, precision_score,\
recall_score, balanced_accuracy_score
import joblib
import seaborn as sns

# Single random seed reused by the stochastic steps (e.g. the train/test split) for reproducibility
SEED = 123

In [None]:
# Our code (autoreload)
%load_ext autoreload
%autoreload 2
sys.path.append("../utils")
import utility as utils

In [None]:
# High-res plots
%config InlineBackend.figure_format = 'retina'

In [None]:
import warnings
warnings.filterwarnings('ignore') 

## Load Data

Please modify the following cell so that it points to the path of your dataset file.

In [None]:
# Load the full underwriting dataset; the relative path is a placeholder to adapt to your environment
all_data = pd.read_csv('../dataset.csv')

## Feature Engineering and Pipeline

In [None]:
# Interaction feature: product of BMI and age at policy inception
all_data['BMI_Age'] = all_data['BMI'] * all_data['Ins_Age']

# Row-wise sum over every Medical_Keyword_* column
# (a keyword count if these columns are 0/1 flags - TODO confirm against the dataset)
med_keyword_columns = all_data.columns[all_data.columns.str.startswith('Medical_Keyword_')]
all_data['Med_Keywords_Count'] = all_data[med_keyword_columns].sum(axis=1)

# Anonymised source column -> business meaning assumed for this case study
mapper = {
    'Id': 'Insured ID',
    'InsuredInfo_6': 'Gender',
    'InsuredInfo_1': 'Race',
    'InsuredInfo_4': 'Nationality',
    'Family_Hist_1': 'Marital Status',
    'InsuredInfo_3': 'Occupation Type',
    'Employment_Info_2': 'Occupation Industry',
    'Wt': 'Weight',
    'Ht': 'Height',
    'Medical_History_4': 'Smoker Status',
    'Ins_Age': 'Age at Policy Inception',
    'Insurance_History_3': 'No. of Life Policies',
    'Insurance_History_2': 'No. of Accident Policies',
    'Insurance_History_7': 'No. of CI Policies',
    'Product_Info_3': 'Duration in force for Medical Plan'
}

# Rename first: the prefix-based drop below must only hit columns that were NOT mapped
all_data = all_data.rename(columns=mapper)
# Drop columns we do not have confidence in mapping to (each prefix listed once)
drop_columns = ('Medical', 'Family', 'Insurance', 'Product', 'Employment', 'InsuredInfo')
mask = all_data.columns.str.startswith(drop_columns)
all_data = all_data.loc[:, ~mask]
all_data.head()

### Create binary labels

In [None]:
# Collapse the ordinal Response into the binary Risk target:
#   Response in {1, 2}     -> Risk = 0
#   Response in {7, 8}     -> Risk = 1
#   Response in {3, ..., 6} -> Risk = -1 (neither class)
risk_bins = [0, 2, 6, 8]   # pd.cut intervals are right-closed: (0, 2], (2, 6], (6, 8]
risk_labels = [0, -1, 1]
all_data['Risk'] = pd.cut(all_data.Response, bins=risk_bins, labels=risk_labels)
# pd.cut yields a categorical column; cast it to plain integers
all_data = all_data.astype({"Risk": int})
all_data.Risk.value_counts()

In [None]:
# Remove rows with Risk == -1 (Response 3 to 6) so that only the two binary classes remain
df = all_data.loc[all_data['Risk']!= -1].reset_index(drop=True)

### Train/test split

In [None]:
# prepare train & test datasets
# Identifier, target and unused attribute columns are removed from the features.
# Race stays in X for now (the fairness mask is built from it) and is dropped right before pre-processing.
columns_to_drop = ['Insured ID','Response','Risk', 'Nationality', 'Marital Status']
categorical_columns = ["Occupation Industry", "Occupation Type", "Smoker Status", "Gender"]
# Cast the coded categorical columns to object so the pre-processor one-hot encodes them
X = df.drop(columns=columns_to_drop).astype(dict.fromkeys(categorical_columns, object))
y = df['Risk']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
print(f"X_train.shape: {X_train.shape}, X_test.shape: {X_test.shape}")
print(f"y_train.shape: {y_train.shape}, y_test.shape: {y_test.shape}")

### Create masks for Fairness analysis

In [None]:
# Boolean mask over the test set: True where Gender == 1 (assuming 1: Male, 2: Female)
gender_mask = (X_test["Gender"] == 1)
# Shares are printed as fractions of the test set, rounded to 5 decimals
male_share = round(np.mean(gender_mask), 5)
female_share = round(np.mean(~gender_mask), 5)
print('Percent Male:', male_share, 'Percent Female:', female_share)

In [None]:
# Boolean mask over the test set: True where Race == 1 (assuming 1: Majority, 2: Other)
race_mask = (X_test["Race"] == 1)
# Shares are printed as fractions of the test set, rounded to 5 decimals
major_share = round(np.mean(race_mask), 5)
minor_share = round(np.mean(~race_mask), 5)
print('Percent Major:', major_share, 'Percent Minor:', minor_share)

### Pre-processing

In [None]:
# Standardise numeric columns and one-hot encode object/category columns
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
categorical_dtypes = ["object", "category"]
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, selector(dtype_exclude=categorical_dtypes)),
        ('cat', categorical_transformer, selector(dtype_include=categorical_dtypes)),
    ],
    remainder='passthrough',
)

# Race is a protected attribute: it is removed before fitting so the model never sees it
X_train_without_race = X_train.drop(columns='Race')
X_test_without_race = X_test.drop(columns='Race')
X_train_transformed = preprocessor.fit_transform(X_train_without_race)
X_test_transformed = preprocessor.transform(X_test_without_race)
print(f"X_train_transformed.shape: {X_train_transformed.shape}, X_test_transformed.shape: {X_test_transformed.shape}")

print(f"Class distribution: {np.unique(y_train, return_counts=True)}")

## Load Model

In [None]:
# load the pre-trained baseline model from disk
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code - only load model files from a trusted source
# presumably a logistic regression, judging by the file name - confirm
model_baseline = joblib.load('model/model_baseline_lr.pkl')

In [None]:
# predict probabilities: keep the second column of predict_proba (the class at index 1 of the model's classes)
y_prob = model_baseline.predict_proba(X_test_transformed)[:,1]


# compute AUC on the test set
print(roc_auc_score(y_test, y_prob))

In [None]:
# classification metrics at the default cutoff: scores strictly above 0.5 are labelled 1
y_pred = (y_prob > 0.5).astype(int)
print(classification_report(y_test, y_pred))

In [None]:
# compute ROC curve: false positive rates, true positive rates and the score thresholds they were measured at
fpr, tpr, th = roc_curve(y_test, y_prob)

In [None]:
# find the ROC threshold that maximises balanced accuracy = (TPR + TNR) / 2
tnr = 1 - fpr
ba = (tpr + tnr)/2
best_idx = np.argmax(ba)
best_ba = ba[best_idx]
best_th = th[best_idx]
best_th

In [None]:
# plot balanced accuracy and approval rate vs threshold
# Same expression as the one best_ba / best_th were derived from, so the marker sits exactly on the curve
ba = (tpr + (1 - fpr))/2
# Share of applicants predicted positive at each threshold:
# P(pred = 1) = P(y = 1) * TPR + P(y = 0) * FPR
base_ar = np.mean(y_test.astype(int))
ar = base_ar*tpr + (1-base_ar)*fpr
plt.plot(th, ba, label='balanced accuracy')
plt.plot(th, ar, label='approval rate')
plt.scatter(best_th, best_ba, c='r', marker='x', s=100, label='max bal acc')
plt.xlabel('Underwriting Threshold')
plt.title('Life Insurance Underwriting')
# Restrict the x-axis to the range of scores actually produced by the model
plt.xlim((y_prob.min(), y_prob.max()))
plt.legend(framealpha=0.3, facecolor='white', fontsize=12, loc='lower left')

In [None]:
# compute classification metrics by optimal cutoff
# roc_curve measures each (fpr, tpr) point for predictions with score >= threshold, so >= is
# required here to reproduce exactly the operating point at which best_ba was obtained
y_pred_ba = np.where(y_prob >= best_th, 1, 0)
print(classification_report(y_test, y_pred_ba))

In [None]:
# precision of the positive class at the balanced-accuracy-optimal cutoff
precision_score(y_test, y_pred_ba)

## Test Performance
Here we quantify the model's performance.

####  Code corresponding to section 2.7.3 Step 3: Build and Validate in Veritas Document 4 FEAT Principles Assessment Case Studies

In [None]:
# balanced accuracy of the predictions made at the selected threshold
test_bal_acc = metrics.balanced_accuracy_score(y_test, y_pred_ba)
print(f"Balanced accuracy on test set {test_bal_acc:.4f} at threshold {best_th:.4f}")

In [None]:
# confusion matrix at the selected threshold, drawn as an annotated heatmap
cf_matrix = confusion_matrix(y_test, y_pred_ba)

# Cell names in row-major order of the 2x2 matrix (rows: true label, columns: predicted)
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = [f"{value:0.0f}" for value in cf_matrix.ravel()]

# One "count\nname" annotation per cell
labels = np.array(
    [f"{count}\n{cell_name}" for count, cell_name in zip(group_counts, group_names)]
).reshape(2, 2)
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')
plt.xlabel("Predicted Class")
plt.ylabel("True Label")
plt.show()

In [None]:
# plot ROC curve
def plot_roc(model, X, y):
    """
    Plot the ROC curve of a fitted classifier together with the random-chance diagonal.

    Parameters:
    ----------
        model: fitted classifier accepted by scikit-learn's ROC plotting helpers
        X: feature matrix the model is evaluated on
        y: ground truth labels for X

    Returns:
    --------
        None; the figure is displayed with plt.show()
    """
    plt.figure(figsize=(5,5))
    ax = plt.gca()
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Random Chance', alpha=.8)
    # metrics.plot_roc_curve was removed in scikit-learn 1.2. Prefer its replacement,
    # RocCurveDisplay.from_estimator (scikit-learn >= 1.0), and fall back on older versions.
    draw_roc = getattr(metrics.RocCurveDisplay, 'from_estimator', None)
    if draw_roc is None:
        draw_roc = metrics.plot_roc_curve
    draw_roc(model, X, y, name='model', alpha=0.3, lw=2, ax=ax)
    plt.title('Receiver Operating Characteristic (ROC) Curve', fontsize=13)
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.show()

plot_roc(model_baseline, X_test_transformed, y_test)

In [None]:
# plot ROC curve with lines for gender subgroups
def plot_roc(model, X, y):
    """
    Plot the ROC curve of a fitted classifier overall and for each gender subgroup.

    The subgroups come from the notebook-level `gender_mask`, so X and y must be the
    test-set objects that this mask was built from (same length, same row order).

    Parameters:
    ----------
        model: fitted classifier accepted by scikit-learn's ROC plotting helpers
        X: feature matrix the model is evaluated on
        y: ground truth labels for X

    Returns:
    --------
        None; the figure is displayed with plt.show()
    """
    plt.figure(figsize=(5,5))
    ax = plt.gca()
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Random Chance', alpha=.8)
    # metrics.plot_roc_curve was removed in scikit-learn 1.2. Prefer its replacement,
    # RocCurveDisplay.from_estimator (scikit-learn >= 1.0), and fall back on older versions.
    draw_roc = getattr(metrics.RocCurveDisplay, 'from_estimator', None)
    if draw_roc is None:
        draw_roc = metrics.plot_roc_curve
    # Plain boolean array: selects rows positionally in both the transformed matrix and y
    is_male = np.asarray(gender_mask, dtype=bool)
    draw_roc(model, X, y, name='model', alpha=0.3, lw=2, ax=ax)
    draw_roc(model, X[is_male], y[is_male], name='Male', alpha=0.3, lw=2, ax=ax)
    draw_roc(model, X[~is_male], y[~is_male], name='Female', alpha=0.3, lw=2, ax=ax)
    plt.title('Receiver Operating Characteristic (ROC) Curve', fontsize=13)
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.show()

plot_roc(model_baseline, X_test_transformed, y_test)

In [None]:
# AUC on the whole test set, then on the gender_mask == True and gender_mask == False subsets
auc = metrics.roc_auc_score(y_test, y_prob)
auc_male = metrics.roc_auc_score(y_test[gender_mask], y_prob[gender_mask])
auc_female = metrics.roc_auc_score(y_test[~gender_mask], y_prob[~gender_mask])
for auc_value in (auc, auc_male, auc_female):
    print(auc_value)

In [None]:
# Bootstrap uncertainty analysis of the test-set performance metrics

# Metrics evaluated on hard predictions (at the balanced-accuracy-optimal threshold)
prediction_metrics = {'True Positive Rate (i.e. sensitivity, or recall)': metrics.recall_score,
                      'True Negative Rate (i.e. specificity)': lambda x, y: metrics.recall_score(x, y, pos_label=0),
                      'Balanced Accuracy': metrics.balanced_accuracy_score,
                      'Positive Predictive Value (precision)': metrics.precision_score}

# Metrics evaluated on predicted probabilities
probability_metrics = {'Area Under ROC': metrics.roc_auc_score}

# Each metric table is paired with the model output it must be scored against
for model_output, metric_table in ((y_pred_ba, prediction_metrics), (y_prob, probability_metrics)):
    for name, metric_func in metric_table.items():
        estimate = utils.bootstrap_conf_int(y_test.values, model_output, metric_func, k=25)
        print(name, ":", utils.format_uncertainty(*estimate))



In [None]:
# Calibration curve
def plot_calibration(bin_true_prob, bin_pred_prob):
    """
    Draw a reliability curve with a stacked histogram of the model scores underneath.

    Note that the histogram reads the notebook-level `y_prob` and `y_test`,
    not the arguments of this function.

    Parameters:
    ----------
        bin_true_prob: per-bin fraction of positives (y-axis of the reliability curve)
        bin_pred_prob: per-bin mean predicted probability (x-axis of the reliability curve)

    Returns:
    --------
        None; the figure is displayed with plt.show()
    """
    plt.figure(figsize=(7, 7))
    grid_shape = (3, 1)
    # Upper two thirds: reliability curve; lower third: score histogram
    reliability_ax = plt.subplot2grid(grid_shape, (0, 0), rowspan=2)
    histogram_ax = plt.subplot2grid(grid_shape, (2, 0))

    reliability_ax.plot([0, 1], [0, 1], "k:", label="perfectly calibrated")
    reliability_ax.plot(bin_pred_prob, bin_true_prob, "s-", label="model")
    reliability_ax.set_title('Model Calibration (reliability curve)', fontsize=16)
    reliability_ax.set_ylabel("Fraction of healthy applicants", fontsize=14)
    reliability_ax.set_ylim([-0.05, 1.05])
    reliability_ax.legend(loc="lower right", fontsize=12)

    # Scores split by true class: label 1 is plotted as "healthy", label 0 as "risky"
    scores_by_class = [y_prob[y_test == 1], y_prob[y_test == 0]]
    histogram_ax.hist(scores_by_class, label=["healthy", "risky"],
                      histtype='bar', stacked=True)
    histogram_ax.set_xlabel("Model Output Probabilities (binned)", fontsize=14)
    histogram_ax.set_ylabel("Count", fontsize=14)
    histogram_ax.legend(loc="upper left", ncol=2, fontsize=12)

    plt.tight_layout()
    plt.show()

bin_true_prob, bin_pred_prob = calibration_curve(y_test, y_prob, n_bins=10)
plot_calibration(bin_true_prob, bin_pred_prob)

In [None]:
# run Isotonic calibration
# Wrap the baseline model in a 3-fold isotonic calibrator fitted on the training data
clf_isotonic = CalibratedClassifierCV(model_baseline, cv=3, method='isotonic').fit(X_train_transformed, y_train)
# Despite its name this holds the calibrated test-set probabilities, not a model
model_baseline_iso = clf_isotonic.predict_proba(X_test_transformed)[:, 1]
# Brier score of the raw and of the calibrated probabilities (lower is better)
model_baseline_score, clf_isotonic_score = (
    brier_score_loss(y_test, scores) for scores in (y_prob, model_baseline_iso)
)
print(model_baseline_score, clf_isotonic_score)

As the calibrated model has a Brier score similar to that of the baseline model, calibration is not required.

## Fairness
Here we compute some fairness metrics with respect to race (gender is examined in the last section of the notebook).

####  Code corresponding to section 2.7.4 Part C – Measuring Disadvantage in Veritas Document 4 FEAT Principles Assessment Case Studies

In [None]:
def group_thresholds(y_prob, threshold_mask, th_A=0.5, th_B=0.5):
    """
    Helper function to apply group-specific classification thresholds based on a mask

    Parameters:
    ----------
        y_prob::np.array or similar: Predicted probabilities
        threshold_mask::pd.Series or similar: Boolean array, True for members of Group A and
            False for members of Group B; must have the same shape as y_prob
        th_A::float: custom threshold for Group A, defaults to 0.5
        th_B::float: custom threshold for Group B, defaults to 0.5

    Returns:
    --------
        y_pred::np.array: integer array of predicted classes (1 where the probability is
            greater than or equal to the threshold of the sample's group, else 0)

    Raises:
    -------
        ValueError: if y_prob and threshold_mask do not have the same shape
    """
    # Work on plain arrays so lists and pandas objects (whatever their index) are handled positionally
    scores = np.asarray(y_prob)
    in_group_a = np.asarray(threshold_mask, dtype=bool)
    if scores.shape != in_group_a.shape:
        raise ValueError(
            f"y_prob and threshold_mask must have the same shape, got {scores.shape} and {in_group_a.shape}"
        )
    # Per-sample cutoff: th_A inside the mask, th_B outside
    cutoffs = np.where(in_group_a, th_A, th_B)
    return (scores >= cutoffs).astype(int)

In [None]:
def group_fairness(y_test, y_probs, variable_mask, th_A=0.5, th_B=0.5, verbose=True):
    """
    Helper function to evaluate group fairness given custom thresholds

    Parameters:
    ----------
        y_test::pd.DataFrame or similar: Ground truth Labels
        y_probs::np.array or similar: Predicted test probabilities
        variable_mask::pd.Series or similar: Boolean array of the variable to evaluate group fairness on
        th_A::float: custom threshold for Group A, defaults to 0.5
        th_B::float: custom threshold for Group B, defaults to 0.5
        verbose::bool: when True, print every metric as it is computed

    Returns:
    --------
        fairness_metrics::dict: computed fairness metrics rounded to 3 decimals,
            keyed by the display names in utils.FairnessAnalysis.metric_names
    """
    analysis = utils.FairnessAnalysis(y_test.astype(int), y_probs, variable_mask)
    computed = analysis.compute(th_A, th_B)

    fairness_metrics = {}
    # metric_names maps attribute names of the result to human-readable names
    for attr, name in utils.FairnessAnalysis.metric_names.items():
        value = round(getattr(computed, attr), 3)
        if verbose:
            print(f"{name}: {value}")
        fairness_metrics[name] = value
    return fairness_metrics

In [None]:
def group_performance_metrics(y_true, y_probs, variable_mask, th_A=0.5, th_B=0.5,Group_1mask='Group_a',Group_0mask='Group_b'):
    """
    Helper function to evaluate per-group performance rates given custom thresholds

    Parameters:
    ----------
        y_true::pd.DataFrame or similar: Ground truth Labels
        y_probs::np.array or similar: Predicted test probabilities
        variable_mask::pd.Series or similar: Boolean array of the variable to evaluate group fairness on
        th_A::float: custom threshold for Group A, defaults to 0.5
        th_B::float: custom threshold for Group B, defaults to 0.5
        Group_1mask::str: display name replacing the 'Group_a' column, defaults to 'Group_a'
        Group_0mask::str: display name replacing the 'Group_b' column, defaults to 'Group_b'

    Returns:
    --------
        performance_metrics::df: dataframe of computed performance metrics per group
    """
    display_names = {'Group_a': Group_1mask, 'Group_b': Group_0mask}

    # Compute the per-group rates, then relabel the two group columns for presentation
    group_rates = utils.FairnessAnalysis(
        y_true.astype(int), y_probs, variable_mask
    ).compute_performance_rates(th_A,th_B)
    group_rates.rename(columns=display_names, inplace=True)
    return group_rates

In [None]:
# Run fairness analysis for race at the single balanced-accuracy-optimal threshold
# (presumably compute() applies the same threshold to both groups when given one value - confirm in utils)
race_analysis = utils.FairnessAnalysis(y_test.astype(int), y_prob, race_mask)
race_metrics = race_analysis.compute(best_th)
# Print every metric under its display name, rounded to 3 decimals
for attr, name in utils.FairnessAnalysis.metric_names.items():
    print(name, ":", round(getattr(race_metrics, attr), 3))

In [None]:
# Bootstrap Uncertainty: recompute the race fairness metrics on 25 resamples of the test set
np.random.seed(0)  # fixes the global NumPy RNG so the resamples are reproducible
labels_array = y_test.astype(int).values
race_array = race_mask.values

bs_metrics = []
for _ in range(25):
    # Draw row positions with replacement, same size as the test set
    idx = np.random.choice(len(y_test), len(y_test), replace=True)
    resampled = utils.FairnessAnalysis(labels_array[idx], y_prob[idx], race_array[idx])
    bs_metrics.append(resampled.compute(best_th))

# One row per resample, one column per metric
bs_metrics = np.array(bs_metrics)

In [None]:
# Report each metric as bootstrap mean with 2 standard deviations as the uncertainty
for i, attr in enumerate(race_metrics._fields):
    resampled_values = bs_metrics[:, i]
    print(utils.FairnessAnalysis.metric_names[attr], ":",
          utils.format_uncertainty(resampled_values.mean(), 2 * resampled_values.std()))

In [None]:
# Per-group performance rates for race, both groups at the balanced-accuracy-optimal threshold
perform_rates_race_best_th = group_performance_metrics(y_test, y_prob, race_mask, th_A=best_th, th_B=best_th,Group_1mask='Chinese', Group_0mask='Others')
perform_rates_race_best_th

In [None]:
# Per-group performance rates for race, both groups at the default 0.5 threshold
perform_rates_race_default_th = group_performance_metrics(y_test, y_prob, race_mask, th_A=0.5, th_B=0.5,Group_1mask='Chinese', Group_0mask='Others')
perform_rates_race_default_th

## Performance-Fairness Tradeoffs
Here we explore fairness-performance tradeoffs stemming from our choice of underwriting threshold.

####  Code corresponding to section 2.7.4 Part C – Measuring Disadvantage in Veritas Document 4 FEAT Principles Assessment Case Studies

For the protected feature of race, the performance-fairness tradeoff analysis below was run for illustrative purposes only, as no mitigation is required (FNR ratio observed is 0.987 +/- 0.172)

In [None]:
# Reference point for the sweep below: best balanced accuracy reachable with one threshold shared by both groups
print(f"Best Balanced Accuracy (single): {best_ba:.5f} with TH: {best_th:.3f}")

In [None]:
def postprocess_mitigation(y_test, y_probs, variable_mask, variable_metrics, verbose=True):
    """
    Helper function to sweep group-specific thresholds and pick the best pairs,
    with and without a fairness constraint on the FNR ratio.

    Group A refers to the samples selected by `variable_mask`; Group B refers to the rest.

    Parameters:
    -----------
        y_test::pd.DataFrame or similar: Ground truth Labels
        y_probs::np.array or similar: Predicted test probabilities
        variable_mask::pd.Series or similar: Boolean array of the variable to evaluate group fairness on
        variable_metrics: baseline fairness metrics; only its `fnr_ratio` attribute is read,
            to decide which side of the fairness band is enforced
        verbose::bool: when True, print the best unconstrained and constrained results

    Returns:
    --------
        Tuple of:
            metrics_split_sweep::named tuple: Group Metrics computed for a grid of th_a, th_b
            th_a::np.array: grid of thresholds swept for group A
            th_b::np.array: grid of thresholds swept for group B
            best_th_a::float: threshold for group A at the unconstrained maximum balanced accuracy
            best_th_b::float: threshold for group B at the unconstrained maximum balanced accuracy
            best_con_th_a::float: best fairness-constrained threshold for group A
            best_con_th_b::float: best fairness-constrained threshold for group B
    """
    def _thresholds_at_max(grid):
        # Grids are laid out as (rows: th_b, columns: th_a), following np.meshgrid's default indexing
        idx = np.unravel_index(grid.argmax(), grid.shape)
        return th_a[idx[1]], th_b[idx[0]]

    # Evaluate the metrics on a 500 x 500 grid of (th_a, th_b) pairs between 0.3 and 0.7
    analysis = utils.FairnessAnalysis(y_test.astype(int), y_probs, variable_mask)
    th_a = np.linspace(0.3, 0.7, 500)
    th_b = np.linspace(0.3, 0.7, 500)
    grid_th_a, grid_th_b = np.meshgrid(th_a, th_b, sparse=True)
    metrics_split_sweep = analysis.compute(grid_th_a, grid_th_b)

    # Label used in the printed messages
    name = variable_mask.name if isinstance(variable_mask, pd.Series) else 'A'

    # Unconstrained optimum
    bal_acc_grid = metrics_split_sweep.bal_acc
    best_th_a, best_th_b = _thresholds_at_max(bal_acc_grid)
    if verbose:
        print(f"Best Balanced Accuracy (split): {bal_acc_grid.max():.5f} with {name}-majority TH: {best_th_a:.3f}, {name}-minority TH: {best_th_b:.3f}")

    # Fairness constraint (4/5th rule): only the side of the band on which the baseline
    # FNR ratio lies is enforced - below 0.8 if the baseline is under 1, above 1.2 otherwise
    fnr_ratio_grid = np.absolute(metrics_split_sweep.fnr_ratio)
    if variable_metrics.fnr_ratio<1:
        violates_constraint = fnr_ratio_grid < 0.8
    else:
        violates_constraint = fnr_ratio_grid > 1.2

    # Zero out the balanced accuracy of violating pairs so they cannot win the argmax
    constrained_bal_acc = np.copy(bal_acc_grid)
    constrained_bal_acc[violates_constraint] = 0

    best_con_th_a, best_con_th_b = _thresholds_at_max(constrained_bal_acc)
    if verbose:
        print(f"Best Fairness-Constrained Balanced Accuracy: {constrained_bal_acc.max():.5f} with {name}-majority TH: {best_con_th_a:.3f}, {name}-minority TH: {best_con_th_b:.3f}")

    return (metrics_split_sweep, th_a, th_b, best_th_a, best_th_b, best_con_th_a, best_con_th_b)

In [None]:
# Store the sweep output under its own name: assigning it to `race_metrics` would overwrite the
# baseline fairness metrics that postprocess_mitigation takes as input, so this cell (and the
# bootstrap report that reads race_metrics._fields) could not be re-run afterwards.
race_mitigation = postprocess_mitigation(y_test, y_prob, race_mask, race_metrics)
race_split_sweep, th_a, th_b, best_th_a, best_th_b, best_con_th_a, best_con_th_b = race_mitigation

In [None]:
# Plot balanced accuracy (filled contours) and FNR ratio (white contour lines) over the threshold grid
plt.figure(figsize=(9,8))
plt.title('Fairness vs. Performance Tradeoffs', fontsize=18)
# The sweep is over race groups: th_a (x-axis) applies where race_mask is True, th_b (y-axis) elsewhere
plt.xlabel('Approval Threshold Chinese', fontsize=16)
plt.ylabel('Approval Threshold Others', fontsize=16)
plt.xlim(np.min(th_a), np.max(th_a))
plt.ylim(np.min(th_b), np.max(th_b))

bal_acc_lns = plt.contourf(th_a, th_b, race_split_sweep.bal_acc, levels=20)

eo_lns = plt.contour(th_a, th_b, race_split_sweep.fnr_ratio, colors='white', levels=[0.4,0.8, 1,1.2, 2,  4, 6, 10,12])
# Legend entry for the contour lines via an empty proxy line: ContourSet.collections is
# deprecated in recent Matplotlib releases, so it is not used to attach the label
plt.plot([], [], color='white', label='FNR Ratio')

cbar = plt.colorbar(bal_acc_lns)
cbar.set_label('Model Performance (balanced accuracy)', fontsize=14)
plt.clabel(eo_lns, inline=1,fmt='%1.2f', fontsize=14)

# Mark maximums: unconstrained split optimum, single shared threshold, fairness-constrained optimum
plt.scatter(best_th_a, best_th_b, c='b', marker='d', s=100, label= 'max bal acc', zorder=2)
plt.scatter(best_th, best_th, c='r', marker='x', s=100, label= 'single TH bal acc', zorder=2)
plt.scatter(best_con_th_a, best_con_th_b, c='purple', marker='*', s=100, label= 'FNR bal acc', zorder=2)
lgnd = plt.legend(framealpha=0.3, facecolor='black', fontsize=12, loc='lower right')
# White text stays readable on the dark legend background
for text in lgnd.get_texts():
    text.set_color("white")
plt.show()

### Confusion matrices, classification report and other metrics after mitigation

#### Split threshold

In [None]:
# Fairness metrics for race at the unconstrained best pair of group thresholds
fairness_split_th = group_fairness(y_test, y_prob, race_mask, th_A=best_th_a, th_B=best_th_b)

In [None]:
# Predictions with the unconstrained best thresholds: best_th_a where race_mask is True, best_th_b elsewhere
y_pred_split_th = group_thresholds(y_prob, race_mask, th_A=best_th_a, th_B=best_th_b)

In [None]:
# Classification report for the split-threshold predictions
print(classification_report(y_test, y_pred_split_th))

In [None]:
# Per-group performance rates for race at the unconstrained best pair of group thresholds
perform_rates_split_th = group_performance_metrics(y_test, y_prob, race_mask, th_A=best_th_a, th_B=best_th_b,Group_1mask='Chinese', Group_0mask='Others')
perform_rates_split_th

#### Post-processing

In [None]:
# Predictions with the fairness-constrained thresholds: best_con_th_a where race_mask is True, best_con_th_b elsewhere
y_pred_mit = group_thresholds(y_prob, race_mask, th_A=best_con_th_a, th_B=best_con_th_b)

In [None]:
# Classification report for the predictions made with the fairness-constrained thresholds
print(classification_report(y_test, y_pred_mit))

In [None]:
# confusion matrix after mitigation - overall, drawn as an annotated heatmap
cf_matrix_mit = confusion_matrix(y_test, y_pred_mit)
# Cell names in row-major order of the 2x2 matrix (rows: true label, columns: predicted)
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = [f"{value:0.0f}" for value in cf_matrix_mit.ravel()]

# One "count\nname" annotation per cell
labels = np.array(
    [f"{count}\n{cell_name}" for count, cell_name in zip(group_counts, group_names)]
).reshape(2, 2)
sns.heatmap(cf_matrix_mit, annot=labels, fmt='', cmap='Blues')
plt.xlabel("Predicted Class")
plt.ylabel("True Label")
plt.show()

In [None]:
# confusion matrix after mitigation - Chinese (rows where race_mask is True)
cf_matrix_mit_chinese = confusion_matrix(y_test[race_mask], y_pred_mit[race_mask])
# Cell names in row-major order of the 2x2 matrix (rows: true label, columns: predicted)
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = [f"{value:0.0f}" for value in cf_matrix_mit_chinese.ravel()]

# One "count\nname" annotation per cell
labels = np.array(
    [f"{count}\n{cell_name}" for count, cell_name in zip(group_counts, group_names)]
).reshape(2, 2)
sns.heatmap(cf_matrix_mit_chinese, annot=labels, fmt='', cmap='Blues')
plt.xlabel("Predicted Class")
plt.ylabel("True Label")
plt.show()

In [None]:
# confusion matrix after mitigation - Others (rows where race_mask is False)
cf_matrix_mit_others = confusion_matrix(y_test[~race_mask], y_pred_mit[~race_mask])
# Cell names in row-major order of the 2x2 matrix (rows: true label, columns: predicted)
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = [f"{value:0.0f}" for value in cf_matrix_mit_others.ravel()]

# One "count\nname" annotation per cell
labels = np.array(
    [f"{count}\n{cell_name}" for count, cell_name in zip(group_counts, group_names)]
).reshape(2, 2)
sns.heatmap(cf_matrix_mit_others, annot=labels, fmt='', cmap='Blues')
plt.xlabel("Predicted Class")
plt.ylabel("True Label")
plt.show()

In [None]:
# Fairness metrics for race at the fairness-constrained pair of group thresholds
fairness_mit = group_fairness(y_test, y_prob, race_mask, th_A=best_con_th_a, th_B=best_con_th_b)

In [None]:
# Per-group performance rates for race at the fairness-constrained pair of group thresholds
perform_rates_mit = group_performance_metrics(y_test, y_prob, race_mask, th_A=best_con_th_a, th_B=best_con_th_b,Group_1mask='Chinese', Group_0mask='Others')
perform_rates_mit

### Effect of post-processing mitigation for race on gender

In [None]:
# Using the thresholds optimised for fairness with respect to one protected feature (i.e. race),
# investigate the effect of applying these thresholds on a different protected feature (i.e. gender):
    #(1) Apply thresholds optimised for fairness for race - using race mask
    #(2) Calculate fairness metrics - using gender mask

#### Fairness Analysis for gender before mitigation for race

In [None]:
# Run fairness analysis for gender at the single balanced-accuracy-optimal threshold,
# i.e. before the race-specific thresholds are applied
gender_analysis = utils.FairnessAnalysis(y_test.astype(int), y_prob, gender_mask)
gender_metrics = gender_analysis.compute(best_th)
# Print every metric under its display name, rounded to 3 decimals
for attr, name in utils.FairnessAnalysis.metric_names.items():
    print(name, ":", round(getattr(gender_metrics, attr), 3))

#### Fairness Analysis for gender after mitigation for race

In [None]:
# Build the analysis from the race mask and the fairness-constrained race thresholds,
# then evaluate the fairness metrics on the gender mask
# (presumably predictions are thresholded per race group before being scored by gender - confirm in utils)
race_mitigation_gender_analysis = utils.FairnessAnalysisSecondary(y_test.astype(int), y_prob, race_mask, best_con_th_a,best_con_th_b)
race_mitigation_gender_metrics = race_mitigation_gender_analysis.compute_secondary(gender_mask)
# Print every metric under its display name, rounded to 3 decimals
for attr, name in utils.FairnessAnalysisSecondary.metric_names.items():
    print(name, ":", round(getattr(race_mitigation_gender_metrics, attr), 3))