In [None]:
import pandas as pd
from google.colab import auth, drive
# Authenticate this Colab session before touching Drive.
auth.authenticate_user()

# Mount Google Drive at /content/drive; the data path used below lives under it.
drive.mount('/content/drive')


# Process Data

In [None]:
# Drive folder that holds the Gemini summary/prediction exports.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept because other cells may still refer to it.
dir = '/content/drive/MyDrive/HSPH/Courses/MIT6.7930/AI Bias for AD/AI Bias AD/Gemini Prediction Model/summaries_predictions/'
predictions_path = dir + "15_gemini_predictions-gemini_summaries_tabular.csv"

# Load the prediction table and preview its first rows.
final_df = pd.read_csv(predictions_path)
final_df.head()

In [None]:
# "Likely" view: drop the POSSIBLE_AD rows, then binarise the Gemini label
# (UNLIKELY_AD -> 0, every other remaining label -> 1).
not_possible = final_df['gemini_pred'] != 'POSSIBLE_AD'
likely = final_df.loc[not_possible].copy()
likely['pred'] = [int(label != 'UNLIKELY_AD') for label in likely['gemini_pred']]

for column in ('gemini_pred', 'pred'):
    print(likely[column].value_counts())

# Accuracy = share of rows whose binarised prediction equals case_status.
true_pred = sum(likely['pred'] == likely['case_status'])
acc = true_pred / len(likely)
print('Accuracy: %s' % round(acc, 4))

In [None]:
# "Possible" view: drop the LIKELY_AD rows, then binarise the Gemini label
# (UNLIKELY_AD -> 0, every other remaining label -> 1).
not_likely = final_df['gemini_pred'] != 'LIKELY_AD'
possible = final_df.loc[not_likely].copy()
print(possible['gemini_pred'].value_counts())
possible['pred'] = [int(label != 'UNLIKELY_AD') for label in possible['gemini_pred']]

for column in ('gemini_pred', 'pred'):
    print(possible[column].value_counts())

# Accuracy = share of rows whose binarised prediction equals case_status.
true_pred = sum(possible['pred'] == possible['case_status'])
acc = true_pred / len(possible)
print('Accuracy: %s' % round(acc, 4))

# Test Performance

⚖️ Why These Metrics?

In a classification task (e.g., predicting case_status), fairness means the model's decisions should not unfairly disadvantage people from certain groups (like gender, race, age, etc.).

To assess this, we compare how well or how consistently the model performs across different subgroups. Each of the three metrics measures a specific kind of fairness.

🔹 1. Equal Opportunity (TPR)

The True Positive Rate (TP / [TP + FN]) should be equal across groups.

Why it matters:

This ensures that qualified individuals (i.e., people who truly belong to the positive class) are equally likely to be correctly identified, no matter their demographic. Equal Opportunity looks only at the positive class: the goal is that no group's true positives are missed more often than another group's, so that neither group benefits or suffers disproportionately from the model's decisions.

Example: In healthcare, all patients who actually need treatment (true positives) should have an equal chance of being correctly identified — regardless of race or gender.

Bias implication:

If Group A has TPR = 0.9 and Group B has TPR = 0.6 → the model is less likely to help Group B, even if they qualify. That’s discriminatory.

🔹 2. Predictive Parity (Precision)
The Precision (TP / [TP + FP]) should be equal across groups.

Why it matters:

This ensures that those who are predicted to be positive actually are, at equal rates across groups.

In practical terms, it means the model is equally trustworthy in its positive predictions for all groups.

Bias implication:

If Group A has 80% precision and Group B has 50% → Group B suffers more false alarms or unjustified interventions, which is unfair.

🔹 3. Equalized Odds (TPR & FPR)

Both TPR and FPR (False Positive Rate = FP / [FP + TN]) should be equal across groups.

Why it matters:

This is the most comprehensive fairness metric.

It ensures the model:

✅ correctly identifies positives at the same rate (Equal Opportunity)

❌ doesn’t falsely accuse negatives at different rates (False alarms)

Bias implication:

If Group A has FPR = 0.1 and Group B has FPR = 0.3 → Group B receives more false accusations, which can be harmful or costly (e.g., wrongful arrests, misdiagnoses).

🧠 Summary Table

Metric	Measures	Prevents Bias Where...

Equal Opportunity	TPR	Qualified individuals are denied more often in some groups

Predictive Parity	Precision	Some groups receive more incorrect positive predictions

Equalized Odds	TPR & FPR	Some groups are unfairly over-predicted (more false alarms) or under-predicted (more missed positives)


#### Overall

In [None]:
from sklearn.metrics import confusion_matrix

# Labels and binarised predictions for the "likely" subset
y_true, y_pred = likely['case_status'], likely['pred']

# Confusion-matrix cells with 0 as the negative and 1 as the positive class
try:
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
except ValueError:
    # confusion_matrix rejected the input: tally the four cells by hand,
    # ignoring any pair that is not made of 0/1 values.
    tally = {(0, 0): 0, (0, 1): 0, (1, 0): 0, (1, 1): 0}
    for actual, pred in zip(y_true, y_pred):
        if (actual, pred) in tally:
            tally[(actual, pred)] += 1
    tn, fp = tally[(0, 0)], tally[(0, 1)]
    fn, tp = tally[(1, 0)], tally[(1, 1)]

# Rates fall back to 0.0 when their denominator is empty
actual_pos = tp + fn
actual_neg = fp + tn
predicted_pos = tp + fp
tpr = tp / actual_pos if actual_pos > 0 else 0.0  # Sensitivity / Equal Opportunity
fpr = fp / actual_neg if actual_neg > 0 else 0.0  # False Positive Rate / Equalized Odds
precision = tp / predicted_pos if predicted_pos > 0 else 0.0  # Precision / Predictive Parity
sample_size = len(likely)

# Report
print("📊 Overall Model Performance on Data")
print("-" * 40)
for line in (
    f"TPR:       {tpr:.4f}",
    f"FPR:       {fpr:.4f}",
    f"Precision: {precision:.4f}",
    f"Sample Size:  {sample_size}",
):
    print(line)

In [None]:
# Labels and binarised predictions for the "possible" subset
y_true, y_pred = possible['case_status'], possible['pred']

# Confusion-matrix cells with 0 as the negative and 1 as the positive class
try:
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
except ValueError:
    # confusion_matrix rejected the input: tally the four cells by hand,
    # ignoring any pair that is not made of 0/1 values.
    tally = {(0, 0): 0, (0, 1): 0, (1, 0): 0, (1, 1): 0}
    for actual, pred in zip(y_true, y_pred):
        if (actual, pred) in tally:
            tally[(actual, pred)] += 1
    tn, fp = tally[(0, 0)], tally[(0, 1)]
    fn, tp = tally[(1, 0)], tally[(1, 1)]

# Rates fall back to 0.0 when their denominator is empty
actual_pos = tp + fn
actual_neg = fp + tn
predicted_pos = tp + fp
tpr = tp / actual_pos if actual_pos > 0 else 0.0  # Sensitivity / Equal Opportunity
fpr = fp / actual_neg if actual_neg > 0 else 0.0  # False Positive Rate / Equalized Odds
precision = tp / predicted_pos if predicted_pos > 0 else 0.0  # Precision / Predictive Parity
sample_size = len(possible)

# Report
print("📊 Overall Model Performance on Data")
print("-" * 40)
for line in (
    f"TPR:       {tpr:.4f}",
    f"FPR:       {fpr:.4f}",
    f"Precision: {precision:.4f}",
    f"Sample Size:  {sample_size}",
):
    print(line)

# Bias Validation - Likely

## Sample performance

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from tabulate import tabulate

# === Age binning ===
def bin_age(df):
    """Add an ``age_group`` column to ``df`` (in place) and return ``df``.

    Ages are cut into left-closed bands: [0, 70) -> '<70', [70, 80) -> '70-80',
    [80, 90) -> '80-90' and [90, inf) -> '>90'. Values outside every band
    (negative or missing ages) become NaN.
    """
    # Map each band label to the lower edge of its interval.
    band_starts = {'<70': 0, '70-80': 70, '80-90': 80, '>90': 90}
    edges = [*band_starts.values(), float('inf')]
    df['age_group'] = pd.cut(df['age'], bins=edges, labels=list(band_starts), right=False)
    return df

# === Bootstrap metrics: TPR, FPR, Precision only ===
def bootstrap_metrics(y_true, y_pred, n_iterations=100):
    """Bootstrap 95% percentile intervals for TPR, FPR and precision.

    The (y_true, y_pred) pairs are resampled with replacement
    ``n_iterations`` times. Each resample contributes one TPR, FPR and
    precision value (NaN when its denominator is zero); the 2.5th and 97.5th
    percentiles of the non-NaN values form each interval.

    Args:
        y_true: Ground-truth labels; 1 is counted as positive, 0 as negative.
        y_pred: Predicted labels paired with ``y_true``.
        n_iterations: Number of bootstrap resamples.

    Returns:
        dict mapping 'TPR', 'FPR' and 'Precision' to a ``(lower, upper)``
        tuple; ``(nan, nan)`` when no resample produced a defined value.
    """
    # Build the paired frame once instead of rebuilding it on every iteration.
    paired = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
    stats = {'TPR': [], 'FPR': [], 'Precision': []}

    for _ in range(n_iterations):
        try:
            sample = paired.sample(frac=1.0, replace=True)
            tn, fp, fn, tp = confusion_matrix(
                sample['y_true'], sample['y_pred'], labels=[0, 1]
            ).ravel()
        except Exception:
            # Best-effort: a resample that cannot be scored is skipped.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop the loop instead of being swallowed.
            continue

        stats['TPR'].append(tp / (tp + fn) if (tp + fn) > 0 else np.nan)
        stats['FPR'].append(fp / (fp + tn) if (fp + tn) > 0 else np.nan)
        stats['Precision'].append(tp / (tp + fp) if (tp + fp) > 0 else np.nan)

    def _interval(values):
        # Empty or all-NaN input has no percentile; return NaN bounds directly
        # rather than letting nanpercentile emit a RuntimeWarning.
        if not values or np.isnan(values).all():
            return (np.nan, np.nan)
        return (np.nanpercentile(values, 2.5), np.nanpercentile(values, 97.5))

    return {name: _interval(values) for name, values in stats.items()}

# === Add the age_group column ===
likely = bin_age(likely)

# === Demographic columns to break the metrics down by ===
demographic_groups = [
    'age_group', 'gender', 'insurance_group', 'language_group',
    'race_group1', 'race_group2', 'race_group3', 'race_group4',
]

# === Per-subgroup bootstrap intervals on the "likely" subset ===
data = likely
records = []

for group_col in demographic_groups:
    if group_col not in data.columns:
        continue

    for value in data[group_col].dropna().unique():
        subset = data.loc[data[group_col] == value]
        if len(subset) < 10:
            continue  # too few rows for a meaningful interval

        y_true, y_pred = subset['case_status'], subset['pred']
        ci = bootstrap_metrics(y_true, y_pred)
        tpr_low, tpr_high = ci['TPR']
        fpr_low, fpr_high = ci['FPR']
        prec_low, prec_high = ci['Precision']

        records.append({
            'Group': group_col,
            'Subgroup': value,
            'Equal Opportunity (TPR)': f"{tpr_low:.3f}–{tpr_high:.3f}",
            'False Positive Rate (FPR)': f"{fpr_low:.3f}–{fpr_high:.3f}",
            'Precision (Predictive Parity)': f"{prec_low:.3f}–{prec_high:.3f}",
            'Sample Size': len(subset),
        })

df_metrics = pd.DataFrame(records)
print("\n=== Fairness Metrics Per Subgroup ===")
print(tabulate(df_metrics, headers="keys", tablefmt="fancy_grid", showindex=False))


## Check Equalized Opportunity Violations -  pairwise

`proportions_ztest`: Compares proportions between two independent groups to see if they're different

In [None]:
from statsmodels.stats.proportion import proportions_ztest
from sklearn.metrics import confusion_matrix
import pandas as pd

def compute_tpr_components(y_true, y_pred):
    """Return ``(TP, FN)``, the two counts that make up TPR = TP / (TP + FN)."""
    # With labels=[0, 1], row 1 of the matrix is the actual-positive row.
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    false_neg = matrix[1, 0]
    true_pos = matrix[1, 1]
    return true_pos, false_neg

def test_tpr_difference(df, group_col, y_true_col='case_status', y_pred_col='pred'):
    """Find subgroup pairs whose TPRs differ significantly (two-proportion z-test).

    Every unordered pair of non-null values in ``df[group_col]`` is compared.
    A pair is skipped when either subgroup has fewer than 10 rows, when its
    confusion matrix cannot be computed, or when either subgroup has no
    actual positives (TPR undefined).

    Args:
        df: Frame holding the group, label and prediction columns.
        group_col: Column whose values define the subgroups.
        y_true_col: Ground-truth column (1 = positive).
        y_pred_col: Prediction column.

    Returns:
        DataFrame with one row per pair whose p-value is below 0.05
        (columns: Demographic, Group 1, Group 2, TPR 1, TPR 2, p-value);
        empty when no pair qualifies. No multiple-comparison correction is
        applied.
    """
    from itertools import combinations

    values = df[group_col].dropna().unique()
    results = []

    for g1, g2 in combinations(values, 2):
        df1 = df[df[group_col] == g1]
        df2 = df[df[group_col] == g2]

        if len(df1) < 10 or len(df2) < 10:
            continue  # skip small groups

        try:
            tp1, fn1 = compute_tpr_components(df1[y_true_col], df1[y_pred_col])
            tp2, fn2 = compute_tpr_components(df2[y_true_col], df2[y_pred_col])
        except Exception:
            # Best-effort skip of pairs that cannot be scored; `Exception`
            # (not a bare except) keeps KeyboardInterrupt/SystemExit working.
            continue

        count = [tp1, tp2]
        nobs = [tp1 + fn1, tp2 + fn2]

        if min(nobs) == 0:
            continue  # a subgroup without positives has no TPR

        stat, pval = proportions_ztest(count, nobs)
        if pval < 0.05:
            results.append({
                'Demographic': group_col,
                'Group 1': g1,
                'Group 2': g2,
                'TPR 1': f"{tp1 / nobs[0]:.3f}",
                'TPR 2': f"{tp2 / nobs[1]:.3f}",
                'p-value': round(pval, 4)
            })

    return pd.DataFrame(results)

# === Pairwise TPR tests, one demographic column at a time ("likely" subset) ===
demographic_groups = [
    'age_group', 'gender', 'insurance_group', 'language_group',
    'race_group1', 'race_group2', 'race_group3', 'race_group4',
]

for group_col in demographic_groups:
    print(f"\n=== TPR differences in {group_col} ===")
    df_tpr_diff = test_tpr_difference(likely, group_col)
    # Show the significant pairs, or a one-line note when there are none.
    print("No significant TPR differences." if df_tpr_diff.empty else df_tpr_diff)


## Check Equalized Opportunity Violations - group

`chi2_contingency`

In [None]:
from sklearn.metrics import confusion_matrix
from scipy.stats import chi2_contingency
import pandas as pd

def chi2_test_tpr(df, group_col, y_true_col='case_status', y_pred_col='pred'):
    """Chi-squared test of whether TPR differs across the subgroups of a column.

    Builds a (subgroup x [TP, FN]) contingency table from subgroups with at
    least 10 rows and at least one actual positive, then runs
    ``chi2_contingency`` on it.

    Args:
        df: Frame holding the group, label and prediction columns.
        group_col: Column whose values define the subgroups.
        y_true_col: Ground-truth column (1 = positive).
        y_pred_col: Prediction column.

    Returns:
        ``(table, pval, stat)`` where ``table`` is the contingency DataFrame.
        ``(None, None, None)`` when fewer than two subgroups qualify.
        ``(table, nan, nan)`` when the TP or FN column is all zero, i.e. every
        subgroup has the same TPR (all 1.0 or all 0.0) and the test is
        undefined.
    """
    table = []
    labels = []

    for val in df[group_col].dropna().unique():
        subset = df[df[group_col] == val]
        if len(subset) < 10:
            continue

        try:
            tn, fp, fn, tp = confusion_matrix(subset[y_true_col], subset[y_pred_col], labels=[0, 1]).ravel()
        except Exception:
            # Best-effort skip; `Exception` (not a bare except) keeps
            # KeyboardInterrupt/SystemExit working.
            continue

        if tp + fn == 0:
            # No actual positives: TPR is undefined and an all-zero row would
            # make chi2_contingency raise on a zero expected frequency.
            continue

        table.append([tp, fn])  # TPR = TP / (TP + FN)
        labels.append(val)

    if len(table) < 2:
        return None, None, None

    contingency = pd.DataFrame(table, index=labels, columns=['TP', 'FN'])

    if (contingency.sum(axis=0) == 0).any():
        # An all-zero column also yields zero expected frequencies.
        return contingency, float('nan'), float('nan')

    stat, pval, _, _ = chi2_contingency(table)

    return contingency, pval, stat

# Chi-squared TPR test for every demographic column ("likely" subset)
for group in demographic_groups:
    print(f"\n=== TPR difference test across {group} ===")
    contingency_df, pval, stat = chi2_test_tpr(likely, group)

    if contingency_df is None:
        print("Not enough valid groups.")
        continue

    print(contingency_df)
    print(f"Chi-squared p-value: {pval:.4f}")
    verdict = (
        "✅ Significant TPR difference across subgroups."
        if pval < 0.05
        else "❌ No significant TPR difference across subgroups."
    )
    print(verdict)


## Check Predictive Parity Violations -  Pairwise

In [None]:
from statsmodels.stats.proportion import proportions_ztest
from sklearn.metrics import confusion_matrix
import pandas as pd

def compute_precision_components(y_true, y_pred):
    """Return ``(TP, FP)``, the two counts that make up precision = TP / (TP + FP)."""
    # With labels=[0, 1], column 1 of the matrix is the predicted-positive column.
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    false_pos = matrix[0, 1]
    true_pos = matrix[1, 1]
    return true_pos, false_pos

def test_precision_difference(df, group_col, y_true_col='case_status', y_pred_col='pred'):
    """Find subgroup pairs whose precisions differ significantly (two-proportion z-test).

    Every unordered pair of non-null values in ``df[group_col]`` is compared.
    A pair is skipped when either subgroup has fewer than 10 rows, when its
    confusion matrix cannot be computed, or when either subgroup has no
    positive predictions (precision undefined).

    Args:
        df: Frame holding the group, label and prediction columns.
        group_col: Column whose values define the subgroups.
        y_true_col: Ground-truth column (1 = positive).
        y_pred_col: Prediction column.

    Returns:
        DataFrame with one row per pair whose p-value is below 0.05
        (columns: Demographic, Group 1, Group 2, Precision 1, Precision 2,
        p-value); empty when no pair qualifies. No multiple-comparison
        correction is applied.
    """
    from itertools import combinations

    values = df[group_col].dropna().unique()
    results = []

    for g1, g2 in combinations(values, 2):
        df1 = df[df[group_col] == g1]
        df2 = df[df[group_col] == g2]

        if len(df1) < 10 or len(df2) < 10:
            continue  # skip small groups

        try:
            tp1, fp1 = compute_precision_components(df1[y_true_col], df1[y_pred_col])
            tp2, fp2 = compute_precision_components(df2[y_true_col], df2[y_pred_col])
        except Exception:
            # Best-effort skip of pairs that cannot be scored; `Exception`
            # (not a bare except) keeps KeyboardInterrupt/SystemExit working.
            continue

        count = [tp1, tp2]
        nobs = [tp1 + fp1, tp2 + fp2]

        if min(nobs) == 0:
            continue  # a subgroup without positive predictions has no precision

        stat, pval = proportions_ztest(count, nobs)
        if pval < 0.05:
            results.append({
                'Demographic': group_col,
                'Group 1': g1,
                'Group 2': g2,
                'Precision 1': f"{tp1 / nobs[0]:.3f}",
                'Precision 2': f"{tp2 / nobs[1]:.3f}",
                'p-value': round(pval, 4)
            })

    return pd.DataFrame(results)
# Pairwise precision tests, one demographic column at a time ("likely" subset)
for group_col in demographic_groups:
    print(f"\n=== Precision differences in {group_col} ===")
    df_precision_diff = test_precision_difference(likely, group_col)
    # Show the significant pairs, or a one-line note when there are none.
    print("No significant Precision differences." if df_precision_diff.empty else df_precision_diff)

## Check Predictive Parity Violations -  group

In [None]:
from sklearn.metrics import confusion_matrix
from scipy.stats import chi2_contingency
import pandas as pd

def chi2_test_precision(df, group_col, y_true_col='case_status', y_pred_col='pred'):
    """Chi-squared test of whether precision differs across the subgroups of a column.

    Builds a (subgroup x [TP, FP]) contingency table from subgroups with at
    least 10 rows and at least one positive prediction, then runs
    ``chi2_contingency`` on it.

    Args:
        df: Frame holding the group, label and prediction columns.
        group_col: Column whose values define the subgroups.
        y_true_col: Ground-truth column (1 = positive).
        y_pred_col: Prediction column.

    Returns:
        ``(table, pval, stat)`` where ``table`` is the contingency DataFrame.
        ``(None, None, None)`` when fewer than two subgroups qualify.
        ``(table, nan, nan)`` when the TP or FP column is all zero, i.e. every
        subgroup has the same precision (all 1.0 or all 0.0) and the test is
        undefined.
    """
    table = []
    labels = []

    for val in df[group_col].dropna().unique():
        subset = df[df[group_col] == val]
        if len(subset) < 10:
            continue

        try:
            tn, fp, fn, tp = confusion_matrix(subset[y_true_col], subset[y_pred_col], labels=[0, 1]).ravel()
        except Exception:
            # Best-effort skip; `Exception` (not a bare except) keeps
            # KeyboardInterrupt/SystemExit working.
            continue

        if tp + fp == 0:
            # No positive predictions: precision is undefined and an all-zero
            # row would make chi2_contingency raise on a zero expected frequency.
            continue

        table.append([tp, fp])  # Precision = TP / (TP + FP)
        labels.append(val)

    if len(table) < 2:
        return None, None, None

    contingency = pd.DataFrame(table, index=labels, columns=['TP', 'FP'])

    if (contingency.sum(axis=0) == 0).any():
        # An all-zero column also yields zero expected frequencies.
        return contingency, float('nan'), float('nan')

    stat, pval, _, _ = chi2_contingency(table)

    return contingency, pval, stat

# Chi-squared precision test for every demographic column ("likely" subset)
for group in demographic_groups:
    print(f"\n=== Precision difference test across {group} ===")
    contingency_df, pval, stat = chi2_test_precision(likely, group)

    if contingency_df is None:
        print("Not enough valid groups.")
        continue

    print(contingency_df)
    print(f"Chi-squared p-value: {pval:.4f}")
    verdict = (
        "✅ Significant Precision difference across subgroups."
        if pval < 0.05
        else "❌ No significant Precision difference across subgroups."
    )
    print(verdict)

## Check Equalized odds Violations -  pairwise


In [None]:
# pip install fairlearn

In [None]:
import pandas as pd
import numpy as np
from fairlearn.metrics import equalized_odds_difference
from itertools import combinations
from tabulate import tabulate

# === Age binning ===
def bin_age(df):
    """Add an ``age_group`` column to ``df`` (in place) and return ``df``.

    Ages are cut into left-closed bands: [0, 70) -> '<70', [70, 80) -> '70-80',
    [80, 90) -> '80-90' and [90, inf) -> '>90'. Values outside every band
    (negative or missing ages) become NaN.
    """
    # Map each band label to the lower edge of its interval.
    band_starts = {'<70': 0, '70-80': 70, '80-90': 80, '>90': 90}
    edges = [*band_starts.values(), float('inf')]
    df['age_group'] = pd.cut(df['age'], bins=edges, labels=list(band_starts), right=False)
    return df

# === Bootstrap EOD for two subgroups only ===
def bootstrap_pairwise_eod(y_true, y_pred, sensitive_features, n_iterations=100):
    """Bootstrap a 95% percentile interval for the equalized odds difference.

    Rows are resampled with replacement ``n_iterations`` times and
    ``equalized_odds_difference`` is evaluated on each resample.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels paired with ``y_true``.
        sensitive_features: Subgroup membership for each row (the caller
            passes rows belonging to two subgroups).
        n_iterations: Number of bootstrap resamples.

    Returns:
        ``(lower, upper)`` — the 2.5th and 97.5th percentiles of the resampled
        values, or ``(nan, nan)`` when no resample could be scored.
    """
    eods = []
    data = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred, 'sensitive': sensitive_features})

    for _ in range(n_iterations):
        sample = data.sample(frac=1.0, replace=True)
        try:
            eod = equalized_odds_difference(
                sample['y_true'], sample['y_pred'], sensitive_features=sample['sensitive']
            )
        except Exception:
            # Best-effort: skip resamples the metric rejects. `Exception`
            # (not a bare except) lets KeyboardInterrupt/SystemExit stop the
            # loop instead of being swallowed.
            continue
        eods.append(eod)

    if not eods:
        return (np.nan, np.nan)

    return (
        np.nanpercentile(eods, 2.5),
        np.nanpercentile(eods, 97.5)
    )

# === Add the age_group column ===
likely = bin_age(likely)

# === Frame under analysis: the "likely" subset ===
test_data = likely

# === Demographic columns to compare within ===
demographic_groups = [
    'age_group', 'gender', 'insurance_group', 'language_group',
    'race_group1', 'race_group2', 'race_group3', 'race_group4',
]

# === Bootstrapped EOD interval for every pair of subgroups ===
records = []

for group in demographic_groups:
    if group not in test_data.columns:
        continue

    group_values = test_data[group].dropna().unique()

    for g1, g2 in combinations(group_values, 2):
        in_pair = test_data[group].isin([g1, g2])
        pair_df = test_data[in_pair]
        if len(pair_df) < 20:
            continue  # pair too small to bootstrap

        y_true, y_pred = pair_df['case_status'], pair_df['pred']
        sensitive_pair = pair_df[group]

        eod_low, eod_high = bootstrap_pairwise_eod(y_true, y_pred, sensitive_pair)

        # Flag only when the whole interval sits above 0.1.
        flag = '⚠️' if eod_low > 0.1 else ''
        records.append({
            'Group': group,
            'Subgroup 1': g1,
            'Subgroup 2': g2,
            'EOD (95% CI)': f"{eod_low:.3f}–{eod_high:.3f}",
            'Sample Size': len(pair_df),
            'Flag (EOD > 0.1)': flag,
        })

# === Render the report ===
df_pairwise = pd.DataFrame(records)
print("\n=== Pairwise Equalized Odds Report ===")
print(tabulate(df_pairwise, headers="keys", tablefmt="fancy_grid", showindex=False))


## Check Equalized odds Violations -  group

https://fairlearn.org/v0.7.0/api_reference/fairlearn.metrics.html

fairlearn.metrics.equalized_odds_difference(y_true, y_pred, *, sensitive_features, method='between_groups', sample_weight=None)[source]
Calculate the equalized odds difference.

The greater of two metrics: true_positive_rate_difference and false_positive_rate_difference. The former is the difference between the largest and smallest of $P[h(X)=1 \mid A=a, Y=1]$, across all values $a$ of the sensitive feature(s). The latter is defined similarly, but for $P[h(X)=1 \mid A=a, Y=0]$. The equalized odds difference of 0 means that all groups have the same true positive, true negative, false positive, and false negative rates.

In [None]:
import pandas as pd
import numpy as np
from fairlearn.metrics import equalized_odds_difference

# === Age binning ===
def bin_age(df):
    """Add an ``age_group`` column to ``df`` (in place) and return ``df``.

    Ages are cut into left-closed bands: [0, 70) -> '<70', [70, 80) -> '70-80',
    [80, 90) -> '80-90' and [90, inf) -> '>90'. Values outside every band
    (negative or missing ages) become NaN.
    """
    # Map each band label to the lower edge of its interval.
    band_starts = {'<70': 0, '70-80': 70, '80-90': 80, '>90': 90}
    edges = [*band_starts.values(), float('inf')]
    df['age_group'] = pd.cut(df['age'], bins=edges, labels=list(band_starts), right=False)
    return df

# === Bootstrap EOD CI only ===
def bootstrap_eod(y_true, y_pred, sensitive_features, n_iterations=100):
    """Bootstrap a 95% percentile interval for the equalized odds difference.

    Rows are resampled with replacement ``n_iterations`` times and
    ``equalized_odds_difference`` is evaluated on each resample across all
    values of the sensitive feature.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels paired with ``y_true``.
        sensitive_features: Subgroup membership for each row.
        n_iterations: Number of bootstrap resamples.

    Returns:
        ``(lower, upper)`` — the 2.5th and 97.5th percentiles of the resampled
        values, or ``(nan, nan)`` when no resample could be scored.
    """
    eods = []
    data = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred, 'sensitive': sensitive_features})

    for _ in range(n_iterations):
        sample = data.sample(frac=1.0, replace=True)
        try:
            eod = equalized_odds_difference(
                sample['y_true'], sample['y_pred'], sensitive_features=sample['sensitive']
            )
        except Exception:
            # Best-effort: skip resamples the metric rejects. `Exception`
            # (not a bare except) lets KeyboardInterrupt/SystemExit stop the
            # loop instead of being swallowed.
            continue
        eods.append(eod)

    if not eods:
        return (np.nan, np.nan)

    return (
        np.nanpercentile(eods, 2.5),
        np.nanpercentile(eods, 97.5)
    )

# === Add the age_group column ===
likely = bin_age(likely)

# === Demographic columns to evaluate ===
demographic_groups = [
    'age_group', 'gender', 'insurance_group', 'language_group',
    'race_group1', 'race_group2', 'race_group3', 'race_group4',
]

# === One bootstrapped EOD interval per demographic column ("likely" subset) ===
data = likely
records = []

for group in demographic_groups:
    if group not in data.columns:
        continue

    # Only rows with a known value for this column take part.
    observed = data.dropna(subset=[group])
    y_true = observed['case_status']
    y_pred = observed['pred']
    sensitive_group = observed[group]

    if sensitive_group.nunique() < 2:
        continue  # nothing to compare with a single subgroup

    eod_low, eod_high = bootstrap_eod(y_true, y_pred, sensitive_group)

    # Flag only when the whole interval sits above 0.1.
    flag = '⚠️' if eod_low > 0.1 else ''
    records.append({
        'Group': group,
        'Equalized Odds Difference (95% CI)': f"{eod_low:.3f}–{eod_high:.3f}",
        'Sample Size': len(sensitive_group),
        'Flag (EOD > 0.1)': flag,
    })

# Build the report indexed by demographic column
df_metrics = pd.DataFrame(records).set_index('Group')

print("\n=== Equalized Odds Report ===")
print(df_metrics)


# Bias Validation - Possible

## Sample performance

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from tabulate import tabulate

# === Age binning ===
def bin_age(df):
    """Add an ``age_group`` column to ``df`` (in place) and return ``df``.

    Ages are cut into left-closed bands: [0, 70) -> '<70', [70, 80) -> '70-80',
    [80, 90) -> '80-90' and [90, inf) -> '>90'. Values outside every band
    (negative or missing ages) become NaN.
    """
    # Map each band label to the lower edge of its interval.
    band_starts = {'<70': 0, '70-80': 70, '80-90': 80, '>90': 90}
    edges = [*band_starts.values(), float('inf')]
    df['age_group'] = pd.cut(df['age'], bins=edges, labels=list(band_starts), right=False)
    return df

# === Bootstrap metrics: TPR, FPR, Precision only ===
def bootstrap_metrics(y_true, y_pred, n_iterations=100):
    """Bootstrap 95% percentile intervals for TPR, FPR and precision.

    The (y_true, y_pred) pairs are resampled with replacement
    ``n_iterations`` times. Each resample contributes one TPR, FPR and
    precision value (NaN when its denominator is zero); the 2.5th and 97.5th
    percentiles of the non-NaN values form each interval.

    Args:
        y_true: Ground-truth labels; 1 is counted as positive, 0 as negative.
        y_pred: Predicted labels paired with ``y_true``.
        n_iterations: Number of bootstrap resamples.

    Returns:
        dict mapping 'TPR', 'FPR' and 'Precision' to a ``(lower, upper)``
        tuple; ``(nan, nan)`` when no resample produced a defined value.
    """
    # Build the paired frame once instead of rebuilding it on every iteration.
    paired = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
    stats = {'TPR': [], 'FPR': [], 'Precision': []}

    for _ in range(n_iterations):
        try:
            sample = paired.sample(frac=1.0, replace=True)
            tn, fp, fn, tp = confusion_matrix(
                sample['y_true'], sample['y_pred'], labels=[0, 1]
            ).ravel()
        except Exception:
            # Best-effort: a resample that cannot be scored is skipped.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop the loop instead of being swallowed.
            continue

        stats['TPR'].append(tp / (tp + fn) if (tp + fn) > 0 else np.nan)
        stats['FPR'].append(fp / (fp + tn) if (fp + tn) > 0 else np.nan)
        stats['Precision'].append(tp / (tp + fp) if (tp + fp) > 0 else np.nan)

    def _interval(values):
        # Empty or all-NaN input has no percentile; return NaN bounds directly
        # rather than letting nanpercentile emit a RuntimeWarning.
        if not values or np.isnan(values).all():
            return (np.nan, np.nan)
        return (np.nanpercentile(values, 2.5), np.nanpercentile(values, 97.5))

    return {name: _interval(values) for name, values in stats.items()}

# === Add the age_group column ===
possible = bin_age(possible)

# === Demographic columns to break the metrics down by ===
demographic_groups = [
    'age_group', 'gender', 'insurance_group', 'language_group',
    'race_group1', 'race_group2', 'race_group3', 'race_group4',
]

# === Per-subgroup bootstrap intervals on the "possible" subset ===
data = possible
records = []

for group_col in demographic_groups:
    if group_col not in data.columns:
        continue

    for value in data[group_col].dropna().unique():
        subset = data.loc[data[group_col] == value]
        if len(subset) < 10:
            continue  # too few rows for a meaningful interval

        y_true, y_pred = subset['case_status'], subset['pred']
        ci = bootstrap_metrics(y_true, y_pred)
        tpr_low, tpr_high = ci['TPR']
        fpr_low, fpr_high = ci['FPR']
        prec_low, prec_high = ci['Precision']

        records.append({
            'Group': group_col,
            'Subgroup': value,
            'Equal Opportunity (TPR)': f"{tpr_low:.3f}–{tpr_high:.3f}",
            'False Positive Rate (FPR)': f"{fpr_low:.3f}–{fpr_high:.3f}",
            'Precision (Predictive Parity)': f"{prec_low:.3f}–{prec_high:.3f}",
            'Sample Size': len(subset),
        })

df_metrics = pd.DataFrame(records)
print("\n=== Fairness Metrics Per Subgroup ===")
print(tabulate(df_metrics, headers="keys", tablefmt="fancy_grid", showindex=False))


## Check Equalized Opportunity Violations -  pairwise

`proportions_ztest`: Compares proportions between two independent groups to see if they're different

In [None]:
from statsmodels.stats.proportion import proportions_ztest
from sklearn.metrics import confusion_matrix
import pandas as pd

def compute_tpr_components(y_true, y_pred):
    """Return ``(TP, FN)``, the two counts that make up TPR = TP / (TP + FN)."""
    # With labels=[0, 1], row 1 of the matrix is the actual-positive row.
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    false_neg = matrix[1, 0]
    true_pos = matrix[1, 1]
    return true_pos, false_neg

def test_tpr_difference(df, group_col, y_true_col='case_status', y_pred_col='pred'):
    """Find subgroup pairs whose TPRs differ significantly (two-proportion z-test).

    Every unordered pair of non-null values in ``df[group_col]`` is compared.
    A pair is skipped when either subgroup has fewer than 10 rows, when its
    confusion matrix cannot be computed, or when either subgroup has no
    actual positives (TPR undefined).

    Args:
        df: Frame holding the group, label and prediction columns.
        group_col: Column whose values define the subgroups.
        y_true_col: Ground-truth column (1 = positive).
        y_pred_col: Prediction column.

    Returns:
        DataFrame with one row per pair whose p-value is below 0.05
        (columns: Demographic, Group 1, Group 2, TPR 1, TPR 2, p-value);
        empty when no pair qualifies. No multiple-comparison correction is
        applied.
    """
    from itertools import combinations

    values = df[group_col].dropna().unique()
    results = []

    for g1, g2 in combinations(values, 2):
        df1 = df[df[group_col] == g1]
        df2 = df[df[group_col] == g2]

        if len(df1) < 10 or len(df2) < 10:
            continue  # skip small groups

        try:
            tp1, fn1 = compute_tpr_components(df1[y_true_col], df1[y_pred_col])
            tp2, fn2 = compute_tpr_components(df2[y_true_col], df2[y_pred_col])
        except Exception:
            # Best-effort skip of pairs that cannot be scored; `Exception`
            # (not a bare except) keeps KeyboardInterrupt/SystemExit working.
            continue

        count = [tp1, tp2]
        nobs = [tp1 + fn1, tp2 + fn2]

        if min(nobs) == 0:
            continue  # a subgroup without positives has no TPR

        stat, pval = proportions_ztest(count, nobs)
        if pval < 0.05:
            results.append({
                'Demographic': group_col,
                'Group 1': g1,
                'Group 2': g2,
                'TPR 1': f"{tp1 / nobs[0]:.3f}",
                'TPR 2': f"{tp2 / nobs[1]:.3f}",
                'p-value': round(pval, 4)
            })

    return pd.DataFrame(results)

# === Pairwise TPR tests, one demographic column at a time ("possible" subset) ===
demographic_groups = [
    'age_group', 'gender', 'insurance_group', 'language_group',
    'race_group1', 'race_group2', 'race_group3', 'race_group4',
]

for group_col in demographic_groups:
    print(f"\n=== TPR differences in {group_col} ===")
    df_tpr_diff = test_tpr_difference(possible, group_col)
    # Show the significant pairs, or a one-line note when there are none.
    print("No significant TPR differences." if df_tpr_diff.empty else df_tpr_diff)


## Check Equalized Opportunity Violations - group

`chi2_contingency`

In [None]:
from sklearn.metrics import confusion_matrix
from scipy.stats import chi2_contingency
import pandas as pd

def chi2_test_tpr(df, group_col, y_true_col='case_status', y_pred_col='pred'):
    """Chi-squared test of whether TPR differs across the subgroups of a column.

    Builds a (subgroup x [TP, FN]) contingency table from subgroups with at
    least 10 rows and at least one actual positive, then runs
    ``chi2_contingency`` on it.

    Args:
        df: Frame holding the group, label and prediction columns.
        group_col: Column whose values define the subgroups.
        y_true_col: Ground-truth column (1 = positive).
        y_pred_col: Prediction column.

    Returns:
        ``(table, pval, stat)`` where ``table`` is the contingency DataFrame.
        ``(None, None, None)`` when fewer than two subgroups qualify.
        ``(table, nan, nan)`` when the TP or FN column is all zero, i.e. every
        subgroup has the same TPR (all 1.0 or all 0.0) and the test is
        undefined.
    """
    table = []
    labels = []

    for val in df[group_col].dropna().unique():
        subset = df[df[group_col] == val]
        if len(subset) < 10:
            continue

        try:
            tn, fp, fn, tp = confusion_matrix(subset[y_true_col], subset[y_pred_col], labels=[0, 1]).ravel()
        except Exception:
            # Best-effort skip; `Exception` (not a bare except) keeps
            # KeyboardInterrupt/SystemExit working.
            continue

        if tp + fn == 0:
            # No actual positives: TPR is undefined and an all-zero row would
            # make chi2_contingency raise on a zero expected frequency.
            continue

        table.append([tp, fn])  # TPR = TP / (TP + FN)
        labels.append(val)

    if len(table) < 2:
        return None, None, None

    contingency = pd.DataFrame(table, index=labels, columns=['TP', 'FN'])

    if (contingency.sum(axis=0) == 0).any():
        # An all-zero column also yields zero expected frequencies.
        return contingency, float('nan'), float('nan')

    stat, pval, _, _ = chi2_contingency(table)

    return contingency, pval, stat

# Chi-squared TPR test for every demographic column ("possible" subset).
# This cell sits in the "Bias Validation - Possible" section, so it must test
# the `possible` frame; it previously passed `likely` and therefore repeated
# the results of the "Likely" section.
for group in demographic_groups:
    print(f"\n=== TPR difference test across {group} ===")
    contingency_df, pval, stat = chi2_test_tpr(possible, group)

    if contingency_df is None:
        print("Not enough valid groups.")
    else:
        print(contingency_df)
        print(f"Chi-squared p-value: {pval:.4f}")
        if pval < 0.05:
            print("✅ Significant TPR difference across subgroups.")
        else:
            print("❌ No significant TPR difference across subgroups.")


## Check Predictive Parity Violations -  Pairwise

In [None]:
from statsmodels.stats.proportion import proportions_ztest
from sklearn.metrics import confusion_matrix
import pandas as pd

def compute_precision_components(y_true, y_pred):
    """Return ``(TP, FP)``, the two counts that make up precision = TP / (TP + FP)."""
    # With labels=[0, 1], column 1 of the matrix is the predicted-positive column.
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    false_pos = matrix[0, 1]
    true_pos = matrix[1, 1]
    return true_pos, false_pos

def test_precision_difference(df, group_col, y_true_col='case_status', y_pred_col='pred'):
    """Find subgroup pairs whose precisions differ significantly (two-proportion z-test).

    Every unordered pair of non-null values in ``df[group_col]`` is compared.
    A pair is skipped when either subgroup has fewer than 10 rows, when its
    confusion matrix cannot be computed, or when either subgroup has no
    positive predictions (precision undefined).

    Args:
        df: Frame holding the group, label and prediction columns.
        group_col: Column whose values define the subgroups.
        y_true_col: Ground-truth column (1 = positive).
        y_pred_col: Prediction column.

    Returns:
        DataFrame with one row per pair whose p-value is below 0.05
        (columns: Demographic, Group 1, Group 2, Precision 1, Precision 2,
        p-value); empty when no pair qualifies. No multiple-comparison
        correction is applied.
    """
    from itertools import combinations

    values = df[group_col].dropna().unique()
    results = []

    for g1, g2 in combinations(values, 2):
        df1 = df[df[group_col] == g1]
        df2 = df[df[group_col] == g2]

        if len(df1) < 10 or len(df2) < 10:
            continue  # skip small groups

        try:
            tp1, fp1 = compute_precision_components(df1[y_true_col], df1[y_pred_col])
            tp2, fp2 = compute_precision_components(df2[y_true_col], df2[y_pred_col])
        except Exception:
            # Best-effort skip of pairs that cannot be scored; `Exception`
            # (not a bare except) keeps KeyboardInterrupt/SystemExit working.
            continue

        count = [tp1, tp2]
        nobs = [tp1 + fp1, tp2 + fp2]

        if min(nobs) == 0:
            continue  # a subgroup without positive predictions has no precision

        stat, pval = proportions_ztest(count, nobs)
        if pval < 0.05:
            results.append({
                'Demographic': group_col,
                'Group 1': g1,
                'Group 2': g2,
                'Precision 1': f"{tp1 / nobs[0]:.3f}",
                'Precision 2': f"{tp2 / nobs[1]:.3f}",
                'p-value': round(pval, 4)
            })

    return pd.DataFrame(results)
# Pairwise precision tests, one demographic column at a time ("possible" subset)
for group_col in demographic_groups:
    print(f"\n=== Precision differences in {group_col} ===")
    df_precision_diff = test_precision_difference(possible, group_col)
    # Show the significant pairs, or a one-line note when there are none.
    print("No significant Precision differences." if df_precision_diff.empty else df_precision_diff)

## Check Predictive Parity Violations -  group

In [None]:
from sklearn.metrics import confusion_matrix
from scipy.stats import chi2_contingency
import pandas as pd

def chi2_test_precision(df, group_col, y_true_col='case_status', y_pred_col='pred'):
    """Chi-squared test of whether precision differs across the subgroups of a column.

    Builds a (subgroup x [TP, FP]) contingency table from subgroups with at
    least 10 rows and at least one positive prediction, then runs
    ``chi2_contingency`` on it.

    Args:
        df: Frame holding the group, label and prediction columns.
        group_col: Column whose values define the subgroups.
        y_true_col: Ground-truth column (1 = positive).
        y_pred_col: Prediction column.

    Returns:
        ``(table, pval, stat)`` where ``table`` is the contingency DataFrame.
        ``(None, None, None)`` when fewer than two subgroups qualify.
        ``(table, nan, nan)`` when the TP or FP column is all zero, i.e. every
        subgroup has the same precision (all 1.0 or all 0.0) and the test is
        undefined.
    """
    table = []
    labels = []

    for val in df[group_col].dropna().unique():
        subset = df[df[group_col] == val]
        if len(subset) < 10:
            continue

        try:
            tn, fp, fn, tp = confusion_matrix(subset[y_true_col], subset[y_pred_col], labels=[0, 1]).ravel()
        except Exception:
            # Best-effort skip; `Exception` (not a bare except) keeps
            # KeyboardInterrupt/SystemExit working.
            continue

        if tp + fp == 0:
            # No positive predictions: precision is undefined and an all-zero
            # row would make chi2_contingency raise on a zero expected frequency.
            continue

        table.append([tp, fp])  # Precision = TP / (TP + FP)
        labels.append(val)

    if len(table) < 2:
        return None, None, None

    contingency = pd.DataFrame(table, index=labels, columns=['TP', 'FP'])

    if (contingency.sum(axis=0) == 0).any():
        # An all-zero column also yields zero expected frequencies.
        return contingency, float('nan'), float('nan')

    stat, pval, _, _ = chi2_contingency(table)

    return contingency, pval, stat

# Chi-squared precision test for every demographic column ("possible" subset)
for group in demographic_groups:
    print(f"\n=== Precision difference test across {group} ===")
    contingency_df, pval, stat = chi2_test_precision(possible, group)

    if contingency_df is None:
        print("Not enough valid groups.")
        continue

    print(contingency_df)
    print(f"Chi-squared p-value: {pval:.4f}")
    verdict = (
        "✅ Significant Precision difference across subgroups."
        if pval < 0.05
        else "❌ No significant Precision difference across subgroups."
    )
    print(verdict)

## Check Equalized odds Violations -  pairwise


In [None]:
import pandas as pd
import numpy as np
from fairlearn.metrics import equalized_odds_difference
from itertools import combinations
from tabulate import tabulate

# === Age binning ===
def bin_age(df):
    """Add an ``age_group`` column to ``df`` (in place) and return ``df``.

    Ages are cut into left-closed bands: [0, 70) -> '<70', [70, 80) -> '70-80',
    [80, 90) -> '80-90' and [90, inf) -> '>90'. Values outside every band
    (negative or missing ages) become NaN.
    """
    # Map each band label to the lower edge of its interval.
    band_starts = {'<70': 0, '70-80': 70, '80-90': 80, '>90': 90}
    edges = [*band_starts.values(), float('inf')]
    df['age_group'] = pd.cut(df['age'], bins=edges, labels=list(band_starts), right=False)
    return df

# === Bootstrap EOD for two subgroups only ===
def bootstrap_pairwise_eod(y_true, y_pred, sensitive_features, n_iterations=100,
                           random_state=None):
    """Bootstrap a 95% percentile interval for the equalized odds difference.

    The three inputs are combined into one frame, which is resampled with
    replacement (same number of rows) ``n_iterations`` times. The equalized
    odds difference is computed on every resample, and the 2.5th and 97.5th
    percentiles of those values are returned.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        sensitive_features: Subgroup membership for each row.
        n_iterations: Number of bootstrap resamples.
        random_state: Optional seed that makes the resampling reproducible.
            ``None`` (the default) keeps the previous unseeded behaviour.

    Returns:
        ``(low, high)`` bounds of the interval, or ``(nan, nan)`` when the
        metric could not be computed on any resample.
    """
    data = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred, 'sensitive': sensitive_features})

    # One generator shared by all iterations, so each resample differs while
    # the whole sequence is fixed by the seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    eods = []
    for _ in range(n_iterations):
        sample = data.sample(frac=1.0, replace=True, random_state=rng)
        try:
            eod = equalized_odds_difference(
                sample['y_true'], sample['y_pred'], sensitive_features=sample['sensitive']
            )
        except Exception:
            # Best effort: skip a resample the metric cannot handle, but let
            # KeyboardInterrupt / SystemExit through so the loop can be stopped.
            continue
        eods.append(eod)

    if not eods:
        return (np.nan, np.nan)

    return (
        np.nanpercentile(eods, 2.5),
        np.nanpercentile(eods, 97.5)
    )

# === Add the age_group column ===
possible = bin_age(possible)

# === Evaluation frame: every row of `possible`, no further filtering ===
test_data = possible

# === Demographic attributes to compare ===
demographic_groups = [
    'age_group', 'gender', 'insurance_group', 'language_group',
    'race_group1', 'race_group2', 'race_group3', 'race_group4',
]

# === One record per pair of subgroups within each attribute ===
records = []

for group in demographic_groups:
    if group not in test_data.columns:
        continue

    group_values = test_data[group].dropna().unique()

    for g1, g2 in combinations(group_values, 2):
        in_pair = test_data[group].isin([g1, g2])
        pair_df = test_data[in_pair]
        if len(pair_df) < 20:
            continue  # skip small pair

        # Bootstrapped 95% interval of the EOD between the two subgroups.
        eod_low, eod_high = bootstrap_pairwise_eod(
            pair_df['case_status'],
            pair_df['pred'],
            pair_df[group],
        )

        # Flagged only when the lower bound of the interval exceeds 0.1.
        flag = '⚠️' if eod_low > 0.1 else ''
        interval_text = f"{eod_low:.3f}–{eod_high:.3f}"

        records.append({
            'Group': group,
            'Subgroup 1': g1,
            'Subgroup 2': g2,
            'EOD (95% CI)': interval_text,
            'Sample Size': len(pair_df),
            'Flag (EOD > 0.1)': flag,
        })

# === Render the collected records as a table ===
df_pairwise = pd.DataFrame(records)
print("\n=== Pairwise Equalized Odds Report ===")
report_table = tabulate(df_pairwise, headers="keys", tablefmt="fancy_grid", showindex=False)
print(report_table)


## Check Equalized Odds Violations - group

https://fairlearn.org/v0.7.0/api_reference/fairlearn.metrics.html

`fairlearn.metrics.equalized_odds_difference(y_true, y_pred, *, sensitive_features, method='between_groups', sample_weight=None)`
Calculate the equalized odds difference.

The greater of two metrics: true_positive_rate_difference and false_positive_rate_difference. The former is the difference between the largest and smallest of $P[h(X)=1 \mid A=a, Y=1]$, across all values $a$ of the sensitive feature(s). The latter is defined similarly, but for $P[h(X)=1 \mid A=a, Y=0]$. The equalized odds difference of 0 means that all groups have the same true positive, true negative, false positive, and false negative rates.

In [None]:
import pandas as pd
import numpy as np
from fairlearn.metrics import equalized_odds_difference

# === Age binning ===
def bin_age(df):
    """Bucket ``age`` into four ordered bands stored in a new ``age_group`` column.

    ``df`` is modified in place and also returned. Each band includes its
    lower edge and excludes its upper edge, so age 80 falls in ``'80-90'``
    and age 90 falls in ``'>90'``. Missing or negative ages are left without
    a band.
    """
    # (label, exclusive upper edge) for each band, in ascending order.
    bands = [('<70', 70), ('70-80', 80), ('80-90', 90), ('>90', float('inf'))]
    names, uppers = zip(*bands)

    banded = pd.cut(df['age'], bins=[0, *uppers], labels=list(names), right=False)
    df['age_group'] = banded
    return df

# === Bootstrap EOD CI only ===
def bootstrap_eod(y_true, y_pred, sensitive_features, n_iterations=100, random_state=None):
    """Bootstrap a 95% percentile interval for the equalized odds difference.

    Rows are resampled with replacement (same number of rows as the input)
    ``n_iterations`` times. The equalized odds difference across the values
    of ``sensitive_features`` is computed on each resample, and the 2.5th
    and 97.5th percentiles of the collected values are returned.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        sensitive_features: Subgroup membership for each row.
        n_iterations: Number of bootstrap resamples.
        random_state: Optional seed that makes the resampling reproducible.
            ``None`` (the default) keeps the previous unseeded behaviour.

    Returns:
        ``(low, high)`` bounds of the interval, or ``(nan, nan)`` when the
        metric could not be computed on any resample.
    """
    data = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred, 'sensitive': sensitive_features})

    # A single generator is reused across iterations so the resamples differ
    # from each other while remaining reproducible for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    eods = []
    for _ in range(n_iterations):
        sample = data.sample(frac=1.0, replace=True, random_state=rng)
        try:
            eod = equalized_odds_difference(
                sample['y_true'], sample['y_pred'], sensitive_features=sample['sensitive']
            )
        except Exception:
            # Best effort: drop a resample the metric cannot handle, without
            # trapping KeyboardInterrupt / SystemExit as a bare except would.
            continue
        eods.append(eod)

    if not eods:
        return (np.nan, np.nan)

    return (
        np.nanpercentile(eods, 2.5),
        np.nanpercentile(eods, 97.5)
    )

# === Apply age binning ===
possible = bin_age(possible)

# === Demographic groups ===
demographic_groups = ['age_group', 'gender', 'insurance_group', 'language_group',
                      'race_group1', 'race_group2', 'race_group3', 'race_group4']

# === Run analysis on the full `possible` frame ===
data = possible
records = []

# Declared up front so the report frame has these columns even when no
# attribute yields a record; otherwise set_index('Group') raises KeyError.
report_columns = [
    'Group',
    'Equalized Odds Difference (95% CI)',
    'Sample Size',
    'Flag (EOD > 0.1)',
]

for group in demographic_groups:
    if group not in data.columns:
        continue

    # Rows with a missing value for this attribute are left out.
    sensitive = data[group]
    valid = sensitive.notna()
    y_true = data.loc[valid, 'case_status']
    y_pred = data.loc[valid, 'pred']
    sensitive_group = sensitive[valid]

    if sensitive_group.nunique() < 2:
        continue  # skip if only one group value

    # Bootstrapped 95% interval of the EOD across all values of the attribute
    eod_low, eod_high = bootstrap_eod(y_true, y_pred, sensitive_group)

    records.append({
        'Group': group,
        'Equalized Odds Difference (95% CI)': f"{eod_low:.3f}–{eod_high:.3f}",
        'Sample Size': len(sensitive_group),
        # Flagged only when the lower bound of the interval exceeds 0.1
        'Flag (EOD > 0.1)': '⚠️' if eod_low > 0.1 else ''
    })

# Create and display results
df_metrics = pd.DataFrame(records, columns=report_columns)
df_metrics.set_index('Group', inplace=True)

print("\n=== Equalized Odds Report ===")
print(df_metrics)
