In [None]:
import pandas as pd
import numpy as np
import os
import sys

# Put the parent of the current working directory on the import search path
# (presumably the repository root, so the project-local imports below resolve).
sys.path.append(os.path.abspath(os.pardir))

from eda_toolkit import ensure_directory

from tqdm import tqdm
from aequitas.audit import Audit

from py_scripts.functions import perform_bootstrapped_audit, plot_metrics

## Set Paths

In [None]:
# Directory (relative to the working directory's parent) that the notebook
# reads its input CSV from and writes its output CSV to.
data_path = os.path.join(
    os.pardir,
    "public_data/",
)

In [None]:
# Base path: the parent of the current working directory
base_path = os.pardir

# Output directories for exported figures, one per image format
image_path_png, image_path_svg = (
    os.path.join(base_path, "images", subdir)
    for subdir in ("png_images", "svg_images")
)

# Make sure both image directories are available before anything is saved
# (presumably `ensure_directory` creates them when missing — confirm in eda_toolkit).
ensure_directory(image_path_png)
ensure_directory(image_path_svg)

In [None]:
# Load the prediction table from the data directory and index it by "Adult_ID"
df = (
    pd.read_csv(os.path.join(data_path, "adult_predictions.csv"))
    .set_index("Adult_ID")
)

In [None]:
df.head()  # preview the first 5 rows (the default for head) to sanity-check the load

## Apply the Audit Method

In [None]:
# Configure the Aequitas audit on the prediction table: "predicted" is used as
# the score column and "income" as the label column.
audit = Audit(
    df=df,
    score_column="predicted",
    label_column="income",
)

# Run the audit; the resulting disparity table is read from `audit.disparity_df`.
audit.audit()

## Inspect Disparity DataFrame

In [None]:
# Display the disparity table populated by the audit run above
audit.disparity_df

In [None]:
# Persist the disparity table as "disparity_metrics.csv" inside the data directory
audit.disparity_df.to_csv(
    os.path.join(data_path, "disparity_metrics.csv"),
)

In [None]:
# --- Bootstrap configuration -------------------------------------------------
n_iterations = 2000  # Number of bootstrapping iterations
sample_size = 5000  # Sample size for each iteration

# Derive the seeds from n_iterations instead of repeating the literal 2000, so
# the two values cannot drift apart (presumably one seed is used per iteration).
seeds = list(range(n_iterations))

# Columns used for stratified sampling and for the categorical breakdown
stratify_columns = ["race"]
categorical_columns = [
    "sex",
    "race",
]

# Column names holding the model output and the reference label
score_column = "predicted"
label_column = "income"

# NOTE(review): the accepted values appear to be 'stratified' or 'balanced', and
# this setting is not passed to perform_bootstrapped_audit in this notebook —
# confirm whether it should be.
bootstrap_method = "stratified"

## Run Bootstrapped Disparity Metrics

In [None]:
# Run the bootstrapped audit with the configuration defined above and ask for
# the disparity metrics to be returned as well.
results_dict = perform_bootstrapped_audit(
    # data and the columns to evaluate
    df=df,
    score_column=score_column,
    label_column=label_column,
    categorical_columns=categorical_columns,
    stratify_columns=stratify_columns,
    # resampling settings
    seeds=seeds,
    n_iterations=n_iterations,
    sample_size=sample_size,
    # output options
    return_disparity_metrics=True,
)

# Keep the collected metrics under a dedicated name for the cells that follow
all_metrics_stratified = results_dict["all_metrics"]

In [None]:
# Display the bootstrapped metrics taken from results_dict["all_metrics"]
all_metrics_stratified

In [None]:
# Disparity columns to plot (names follow the "<metric>_disparity" pattern)
metric_cols = [
    "pprev_disparity",
    "fpr_disparity",
    "tnr_disparity",
    "tpr_disparity",
    "fnr_disparity",
    "precision_disparity",
]

# The same metric names with the "_disparity" marker stripped out
nondisparity_columns = list(
    map(lambda column: column.replace("_disparity", ""), metric_cols)
)

In [None]:
# Count how many rows of the bootstrapped metrics fall under each "attribute_name" value
all_metrics_stratified["attribute_name"].value_counts()

## Plot Disparity Metrics

In [None]:
# Plot the selected disparity metrics for all categories and save the figures.
# NOTE(review): only the PNG directory is passed here; `image_path_svg` is
# prepared earlier in the notebook but not used — confirm whether plot_metrics
# accepts it.
plot_metrics(
    all_metrics_stratified,
    categories="all",
    metric_cols=metric_cols,
    include_legend=True,
    save_plots=True,
    cmap="tab20c",
    # Pass the directory variable itself; the original passed the literal
    # string "image_path_png", which is not the prepared images directory.
    image_path_png=image_path_png,
)