In [None]:
import pathlib as pl

import glob, os
import pandas as pd
import numpy as np
import itertools, warnings

from collections import Counter, defaultdict
from typing import List, Dict, Tuple


# -----------------------------------------------------------------------
# CAMERA-READY PLOTTING (thanks Alex Boyd!)
# -----------------------------------------------------------------------
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib.ticker import MultipleLocator, PercentFormatter
# The following code is borrowed from material provided by Alex!
# Figure widths passed to ``figsize`` below (matplotlib interprets figsize in inches).
# NOTE(review): presumably the text/column widths of the paper template -- confirm.
FULL_WIDTH = 5.50107
COL_WIDTH  = 4.50461

# Accessibility: use seaborn's "colorblind" palette both for seaborn plots and
# for matplotlib's default color cycle.
sns.set_palette(sns.color_palette("colorblind"))
matplotlib.rcParams["axes.prop_cycle"] = matplotlib.cycler(color=sns.color_palette("colorblind"))

# Put at top of plotting script (requires tex be installed though)
# ``usetex=True`` makes matplotlib render all text through LaTeX.
matplotlib.rc('font', family='serif', size=20)
matplotlib.rc('text', usetex=True)


def adjust(fig, left=0.0, right=1.0, bottom=0.0, top=1.0, wspace=0.0, hspace=0.0):
    """Forward the subplot layout parameters to ``fig.subplots_adjust``.

    With the defaults the subplots fill the whole figure and no blank space
    is reserved between them.

    Args:
        fig: object exposing ``subplots_adjust`` (a matplotlib figure).
        left, right, bottom, top: positions of the subplot edges.
        wspace, hspace: width / height reserved for space between subplots.
    """
    layout = {
        "left": left,
        "right": right,
        "bottom": bottom,
        "top": top,
        "wspace": wspace,
        "hspace": hspace,
    }
    fig.subplots_adjust(**layout)
    
def save_fig(fig, name, **kwargs):
    """Save ``fig`` as ``./camera_ready/images/<name>.pdf``.

    The output directory is created if it does not exist yet.

    Args:
        fig: object exposing ``savefig`` (a matplotlib figure).
        name: file name without extension.
        **kwargs: forwarded to ``fig.savefig`` (e.g. ``dpi``).
    """
    # os.makedirs returns None, so there is nothing useful to keep from it.
    os.makedirs("./camera_ready/images", exist_ok=True)
    fig.savefig(f"./camera_ready/images/{name}.pdf", bbox_inches="tight", **kwargs)

def disable_axis(ax):
    """Blank out ``ax``: no ticks, no tick labels, and spine color set to None."""
    # Pushing the axis to the back avoids a visual rendering bug
    ax.set_zorder(-100)

    # Clear ticks first and labels second, for x and then for y
    for which in ("x", "y"):
        getattr(ax, f"set_{which}ticks")([])
        getattr(ax, f"set_{which}ticklabels")([])

    spines = ax.spines.values()
    plt.setp(spines, color=None)

## 1. Load model files

Run `post-process-results.ipynb` first to generate a compiled version of the results.

In [None]:
# Directory holding the compiled result files.
RESULTS_DIR = "../results"

# list all the score files per dataset
# NOTE: the insertion order of this dict matters. DATASET_NAMES is derived from
# it and later cells slice DATASET_NAMES positionally
# (e.g. DATASET_NAMES[1:3], DATASET_NAMES[-2:]).
DATASET_2_FILEPATHS = {
    "USE-5": f"{RESULTS_DIR}/USE-5-no-maxpmi-constraint.csv.gz",
    # Baselines below ----
    "Winobias": f"{RESULTS_DIR}/Winobias-no-maxpmi-constraint.csv.gz",
    "Winogender": f"{RESULTS_DIR}/Winogender-no-maxpmi-constraint.csv.gz",
    # We define this ordering so that we can automatically obtain the same coloring scheme as
    # the one used for word analysis
    "USE-10": f"{RESULTS_DIR}/USE-10-no-maxpmi-constraint.csv.gz",
    "USE-20": f"{RESULTS_DIR}/USE-20-no-maxpmi-constraint.csv.gz",
}

DATASET_NAMES = list(DATASET_2_FILEPATHS.keys())
print(" Dataset names:\n  ->", DATASET_NAMES, "\n")

# Read each individual filepath, creating an association <str, dataframe>.
# every str should have a list of the same size.
DATASET_2_FILES = {name: pd.read_csv(fp) for name, fp in DATASET_2_FILEPATHS.items()}
# Sort every dataframe by (model, orig_index) and renumber the rows, so that the
# per-model slices taken later appear in a deterministic order.
DATASET_2_FILES = {name: df.sort_values(["model", "orig_index"]).reset_index(drop=True) for name, df in DATASET_2_FILES.items()}

# ------------------------------------------------------------------
# Determine whether the number of evaluated models are the same
# ------------------------------------------------------------------

# MODELS collects the model names seen in any dataset; NUM_EVAL_MODELS first
# collects the per-dataset model counts and is then collapsed to a single int.
MODELS, NUM_EVAL_MODELS = [], []

for dataset, df in DATASET_2_FILES.items():
    print("Number of evaluated models for dataset", dataset, "is", df["model"].nunique())
    MODELS.extend(df["model"].unique())
    NUM_EVAL_MODELS.append(df["model"].nunique())

# We expect the number of models to be the same across all datasets. A mismatch
# only triggers a warning; the count of the first dataset is used afterwards.
if len(set(NUM_EVAL_MODELS)) != 1:
    warnings.warn(f"Inconsistent number of models across the different evaluation datasets: {NUM_EVAL_MODELS}")

NUM_EVAL_MODELS = NUM_EVAL_MODELS[0]
print("Evaluating", NUM_EVAL_MODELS, "models:")
# De-duplicate and sort, so that MODELS has a stable order across runs.
MODELS = list(sorted(set(MODELS)))
print(" -", "\n - ".join(MODELS))

In [None]:
# ------------------------------------------------------------------------
# Validation (!sanity check)
# ------------------------------------------------------------------------
# When selecting a data slice from the big dataframe
# we must guarantee that the sentences match to one another
# (that is necessary because the remaining of the code is relying
# on ordering of the dataframes)
def check_slices(dataset: str, data2files: Dict[str, pd.DataFrame], models: List[str]):
    """Assert that every model's rows in a dataset list the same templates in the same order.

    Downstream code relies on the row ordering of the per-model slices, so the
    ``template`` column of each model's slice is compared with the slice of the
    previous model in ``models`` (which makes all of them equal by transitivity).

    Args:
        dataset: key of the dataframe to check in ``data2files``.
        data2files: mapping from dataset name to its dataframe; the dataframe
            must have ``model`` and ``template`` columns.
        models: model names whose slices are compared.

    Raises:
        AssertionError: if two consecutive models disagree on the templates
            (this includes a model that has no rows in the dataset while the
            previous one does).
    """
    df = data2files[dataset]

    previous = None  # (model name, templates) of the previously visited model
    for model in models:
        templates = df.loc[df["model"] == model, "template"].values
        # Compare from the second model onwards, so the first pair is checked too.
        if previous is not None:
            prev_model, prev_templates = previous
            assert np.array_equal(prev_templates, templates), (
                f"Templates of '{model}' do not match those of '{prev_model}' in dataset '{dataset}'"
            )
        previous = (model, templates)
        
    
# Run the ordering sanity check on every loaded dataset, across all models.
for dataset in DATASET_NAMES:
    print("Checking slices for dataset:", dataset)
    check_slices(dataset=dataset, data2files=DATASET_2_FILES, models=MODELS)

## Data Analysis - Filtering using $\eta$

In this section, we observe how the number of templates changes as we vary $\eta$, the maximum allowed gender PMI difference $\mathrm{MaxPMI(s)}$. Few to no evaluation examples remain when we enforce small values of $\eta$. Conversely, as we relax the constraint, more and more examples are included.

In [None]:
from metrics import filter_eta_and_count_examples


# Column holding the per-sentence max gender PMI value used for filtering.
MAXGENDER_COL = "max_gender_pmi"
# Candidate eta values, from 2.5 down to 0.0 in steps of 0.025.
FILTERING_ETA = np.linspace(0.0, 2.5, 101)[::-1]
print("Processing column", MAXGENDER_COL, "for values", FILTERING_ETA)

# NOTE(review): ``constant`` is presumably used to turn row counts into
# per-model example counts -- confirm against ``metrics``.
FILTER_CURVES_RESULTS = filter_eta_and_count_examples(
    name_and_dataset=DATASET_2_FILES,
    etas=FILTERING_ETA,
    col=MAXGENDER_COL,
    constant=NUM_EVAL_MODELS, 
)


fig, ax = plt.subplots(1,1, figsize=(FULL_WIDTH, FULL_WIDTH*2/3))
# Draw explicitly on ``ax`` instead of relying on the current-axes state.
sns.lineplot(FILTER_CURVES_RESULTS, x="filter", y="freq", hue="dataset", lw=2, ax=ax) #set y="counts" to plot absolute values instead
ax.spines[['right', 'top']].set_visible(False)

# Raw string: "\e" is not a valid Python escape sequence.
ax.set_xlabel(r"$\eta$")
ax.set_ylabel("Percentage of Dataset")
ax.legend(title="Dataset", loc="upper left", bbox_to_anchor=(0.56, 0.70))

ax.xaxis.set_major_locator(MultipleLocator(0.5))
ax.xaxis.set_minor_locator(MultipleLocator(0.25))

ax.yaxis.set_major_locator(MultipleLocator(0.20))
ax.yaxis.set_major_formatter(PercentFormatter(1.0))  # 1.0 is to be treated as 100%
# Add grid
ax.grid(axis='x', which='major', linewidth=1, linestyle=":", color="lightgray")
ax.grid(axis='y', which="major", linewidth=1, linestyle=':', color="lightgray")

# Set axis limits
ax.set_xlim((0, 2))
ax.set_ylim((0, 1))
adjust(fig)
save_fig(fig, "lineplot__datasetpct_vs_maxpmi", dpi=100)

## Fairness metrics - Fixed threshold & AUC


In [None]:
from metrics import *

# fairness col in natural log space
# (name of the dataframe column handed to the ``compute_*`` helpers below)
FAIRNESS_COL = "FM_logprob"

# probability space threshold
# (converted with ``np.log10`` into FAIRNESS_THRESHOLD in a later cell)
_FAIRNESS_THRESHOLD = 1.65

**Natural logarithm base**: To report the results in natural logarithm, use the following cell. 
While earlier versions of the paper included the natural logarithm results, in the camera ready version of the paper, we decided to use the **base 10** since it is more intuitive and easy to reason about.

**Base 10 logarithm**: To report the results for the camera ready version of the paper, we use the base 10, since it makes it easier to think about the meaning of the value in the plots. We stick to the default value of 1.65, such that the results found in earlier versions of the paper (eg, [paper at the NeurIPS SOLAR workshop in 2023](https://scholar.google.com/citations?view_op=view_citation&hl=en&user=nMwgV2UAAAAJ&sortby=pubdate&citation_for_view=nMwgV2UAAAAJ:_kc_bZDykSQC)) can be replicated.

In [None]:
# Fairness threshold expressed in base-10 log space.
FAIRNESS_THRESHOLD = np.log10(_FAIRNESS_THRESHOLD)
print(FAIRNESS_THRESHOLD)
# Upper end of the threshold range swept below; also drawn as the "max auc" line.
MAX_AUC = 6
# 101 evenly spaced thresholds in [0, MAX_AUC].
FAIRNESS_EPSILONS = np.linspace(0, MAX_AUC, 101)

# NOTE(review): ``use_log_10_base`` is not defined in any visible cell of this
# notebook; presumably it comes from ``from metrics import *`` or from a cell
# that was removed -- confirm, otherwise Restart & Run All raises a NameError here.
FAIR_THRESHOLDS, FAIR_AUC = compute_neutralpct(
    DATASET_2_FILES,
    MODELS,
    DATASET_NAMES,
    FAIRNESS_EPSILONS,
    FAIRNESS_COL,
    use_log10=use_log_10_base,
)

In [None]:
# Distribution of the per-model AuFC for every dataset (one box per dataset).
fig, ax = plt.subplots(1, 1, figsize=(FULL_WIDTH, FULL_WIDTH))
sns.boxplot(FAIR_AUC, y="dataset", x="auc", ax=ax)
# Dashed reference line at MAX_AUC. Its label is only shown if a legend is drawn,
# which this cell does not do.
ax.axvline(MAX_AUC, ls="--", color="black", label="max auc")
ax.set_ylabel("Dataset")
ax.set_xlabel("Area under the fairness curve")
ax.spines[['right', 'top']].set_visible(False)

### Fairness AUC (discriminated by the different fairness thresholds)


The following table reports the AuFC of every model (rows) on every dataset (columns). 


In [None]:
# Transform the long table into a wide table, by extending it with the dataset names
# "USE-5" is renamed to "USE-05" in a helper column so that it sorts before
# "USE-10" and "USE-20" in the pivoted table. This adds a column to FAIR_AUC in place.
FAIR_AUC["dataset_"] = FAIR_AUC["dataset"].apply(lambda x: x if x != "USE-5" else "USE-05")
pd.pivot_table(FAIR_AUC, index="model", values=["auc"], columns=["dataset_"]).style.format('{:.2f}')

In [None]:
def fairness_threshold_plots(fairthresholds, fairauc, datasetnames, models, use_exp=None):
    """Plot, save and show one fairness-vs-threshold line plot per dataset.

    Args:
        fairthresholds: long dataframe with columns ``dataset``, ``model``,
            ``fairness_eps`` and ``pct_examples``.
        fairauc: unused; kept so that existing callers keep working.
        datasetnames: datasets to plot (one figure each).
        models: ``(model_names, tag)`` pair; ``tag`` is used in the file name.
        use_exp: optional callable applied to ``fairness_eps`` before plotting
            (e.g. ``lambda x: 10**x`` to plot in probability space).
    """
    models, tag = models[0], models[1]
    only_pythia = all("pythia" in m for m in models)

    # For every dataset create a plot
    for dataset in datasetnames:
        # Rows of the desired dataset and models. Copy *after* filtering so that
        # the column assignments below operate on an independent frame.
        selected = (fairthresholds["dataset"] == dataset) & (fairthresholds["model"].isin(models))
        ft_df = fairthresholds[selected].copy()

        ft_df["Deduplicated"] = ft_df["model"].apply(lambda x: "(D)" in x)
        ft_df["Model"] = ft_df["model"].apply(lambda x: x.replace(" (D)", ""))

        # When every model is a pythia model, the shared prefix is just noise in the legend
        if only_pythia:
            ft_df["Model"] = ft_df["Model"].apply(lambda x: x.replace("pythia-", ""))

        # All datasets share the same figure size and x range
        fig, ax = plt.subplots(1, 1, figsize=(FULL_WIDTH/2, 2))
        ax.set_xlim((0, 5))

        adjust(fig)
        ax.spines[['right', 'top']].set_visible(False)

        if use_exp is not None:
            ft_df["fairness_eps"] = ft_df["fairness_eps"].apply(use_exp)

        # Plot one line per model; deduplicated variants share the color of the
        # original model and differ in line style (only if both kinds are present).
        if ft_df["Deduplicated"].nunique() > 1:
            kwargs = dict(style="Deduplicated")
        else:
            kwargs = dict()

        sns.lineplot(ft_df, x="fairness_eps", y="pct_examples", hue="Model", lw=2, ax=ax, **kwargs)
        # ax.axvline(FAIRNESS_THRESHOLD, color="black", alpha=0.5)
        ax.set_title(dataset, fontsize=12)
        ax.set_xlabel("threshold", fontsize=12)
        ax.set_ylabel("fairness metric", fontsize=12)
        ax.set_ylim((0, 1))

        ax.xaxis.set_major_locator(MultipleLocator(1))
        ax.xaxis.set_minor_locator(MultipleLocator(0.5))

        ax.yaxis.set_major_locator(MultipleLocator(0.20))

        # Add axis formatting
        ax.yaxis.set_major_formatter(PercentFormatter(1.0))  # 1.0 is to be treated as 100%

        ax.grid(axis='x', which="major", linewidth=1, linestyle='--', color="lightgray")
        ax.grid(axis='x', which="minor", linewidth=1, linestyle=':', color="lightgray")

        ax.tick_params(axis='both', which='major', labelsize=12)
        ax.tick_params(axis='both', which='minor', labelsize=8)

        # Legend
        ax.legend(loc="upper left", bbox_to_anchor=(0.5, 0.9), fontsize=12)
        save_fig(fig, f"lineplot__{dataset}_{tag}_in_func_eps", dpi=100)
        plt.show()
        # Release the figure; this function creates one per dataset.
        plt.close(fig)

### AuFC: Pythia models

In [None]:
# (model names, tag) pair: the plotting helpers unpack the first element as the
# list of models to draw and the second as the tag used in output file names.
# "(D)" marks the deduplicated variant of a model.
pythia_models = [
    'pythia-70m',
    'pythia-70m (D)',
    # 'pythia-2.8b',
    # 'pythia-2.8b (D)',
    'pythia-6.9b',
    'pythia-6.9b (D)',
    'pythia-12b',
    'pythia-12b (D)',
    # 'gpt-j-6b'
], "pythia"

In [None]:
# One figure per dataset for the pythia family (saved and shown by the helper).
fairness_threshold_plots(FAIR_THRESHOLDS, FAIR_AUC, DATASET_NAMES, pythia_models)

In [None]:
## uncomment expression below if you want to plot the x axis in the probability space
# (it assumes that fair thresholds and fair auc were previously computed in the log 10.)
# fairness_threshold_plots(FAIR_THRESHOLDS, FAIR_AUC, DATASET_NAMES, pythia_models, use_exp=lambda x: 10**x)

### AuFC: OPT models

In [None]:
# (model names, tag) pair for the OPT family; see ``fairness_threshold_plots``
# for how the pair is unpacked.
opt_models = [
    'opt-125m',
    'opt-2.7b',
    'opt-350m',
    'opt-6.7b',
], "opt"

fairness_threshold_plots(FAIR_THRESHOLDS, FAIR_AUC, DATASET_NAMES, opt_models)

### AuFC: other models (Llama 2, MPT, OLMo, Mistral/Mixtral)

In [None]:
# (model names, tag) pair for the remaining model families; see
# ``fairness_threshold_plots`` for how the pair is unpacked.
misc_models = [
    'llama-2-13b',
    'llama-2-7b',
    'llama-2-70b',
    'mpt-30b',
    'mpt-7b',
    "OLMo-1B",
    "OLMo-7B",
    "Mistral-7B-v0.1",
    "Mixtral-8x7B-v0.1",
], "others"

fairness_threshold_plots(FAIR_THRESHOLDS, FAIR_AUC, DATASET_NAMES, misc_models)

Let us create the grid for the fairness threshold picture in the paper.

In [None]:
def individual_fairness_threshold_plot(fairthresholds, fairauc, dataset, models, max_auc, ax, use_exp=None, simplify=True):
    """Draw the fairness-vs-threshold curves of ``models`` for one dataset on ``ax``.

    Args:
        fairthresholds: long dataframe with columns ``dataset``, ``model``,
            ``fairness_eps`` and ``pct_examples``.
        fairauc: unused; kept so that existing callers keep working.
        dataset: dataset to plot; also used as the axis title.
        models: model names to include.
        max_auc: right limit of the x axis.
        ax: matplotlib axis to draw on.
        use_exp: optional callable applied to ``fairness_eps`` before plotting.
        simplify: if True and all models are pythia models, drop the
            ``pythia-`` prefix from the legend entries.
    """
    # Rows of the desired dataset and models. Copy *after* filtering so that the
    # column assignments below operate on an independent frame.
    selected = (fairthresholds["dataset"] == dataset) & (fairthresholds["model"].isin(models))
    ft_df = fairthresholds[selected].copy()

    # "(D)" marks the deduplicated variant of a model
    ft_df["Original"] = ft_df["model"].apply(lambda x: "No" if "(D)" in x else "Yes")
    ft_df["Model"] = ft_df["model"].apply(lambda x: x.replace(" (D)", ""))

    if simplify and all("pythia" in m for m in models):
        ft_df["Model"] = ft_df["Model"].apply(lambda x: x.replace("pythia-", ""))

    if use_exp is not None:
        ft_df["fairness_eps"] = ft_df["fairness_eps"].apply(use_exp)

    # Only vary the line style when both original and deduplicated models are present
    kwargs = {"style": "Original"} if ft_df["Original"].nunique() > 1 else {}
    sns.lineplot(ft_df, x="fairness_eps", y="pct_examples", hue="Model", lw=1, ax=ax, alpha=0.8, **kwargs)
    # ax.axvline(FAIRNESS_THRESHOLD, color="black", alpha=0.5)
    ax.set_title(dataset, fontsize=15)
    ax.set_xlabel("threshold")
    ax.set_ylabel("fairness metric")
    ax.set_xlim((0, max_auc))
    ax.set_ylim((0, 1))

    ax.xaxis.set_major_locator(MultipleLocator(2))
    ax.xaxis.set_minor_locator(MultipleLocator(1))
    ax.yaxis.set_major_locator(MultipleLocator(0.25))

    # Add axis formatting
    # ax.yaxis.set_major_formatter(PercentFormatter(1.0))  # 1.0 is to be treated as 100%

    ax.grid(axis='x', which="major", linewidth=1, linestyle='--', color="lightgray")
    # ax.grid(axis='x', which="minor", linewidth=1, linestyle=':', color="lightgray")

    # Legend
    ax.legend(loc="upper left", bbox_to_anchor=(0.40, 0.75), fontsize=12)
    
    
# Separate plotting the data from formatting the figure
def plot_results_fairness(ax, name, **kwargs):
    """Draw the fairness curves of dataset ``name`` on ``ax``.

    The baselines (Winobias, Winogender) use an x range of ``[0, 3]``; the USE
    datasets use ``[0, MAX_AUC]``.

    Args:
        ax: matplotlib axis to draw on.
        name: dataset name; one of USE-5, USE-10, USE-20, Winobias, Winogender.
        **kwargs: forwarded to ``individual_fairness_threshold_plot``.

    Raises:
        NotImplementedError: if ``name`` is not a known dataset.
    """
    if name in ("Winobias", "Winogender"):
        max_auc = 3
    elif name in ("USE-5", "USE-10", "USE-20"):
        max_auc = MAX_AUC
    else:
        # ``NotImplemented`` is a constant, not an exception class: raise the error type.
        raise NotImplementedError(f"Unexpected plot: {name}")

    individual_fairness_threshold_plot(dataset=name, ax=ax, max_auc=max_auc, **kwargs)

    
def make_figure(is_horizontal, plot_results, dataset_names, models, **kwargs):
    """Create a grid with one panel per dataset and draw every panel with ``plot_results``.

    Args:
        is_horizontal: lay the panels out in a row (True) or in a column (False).
        plot_results: callable ``plot_results(ax, name, models=..., **kwargs)``
            that draws the panel of dataset ``name`` on ``ax``.
        dataset_names: names of the panels, in display order. Only the last
            panel keeps its legend.
        models: ``(model_names, tag)`` pair.
        **kwargs: forwarded to ``plot_results``.

    Returns:
        ``(fig, tag)``: the created figure and the tag of the model group.
    """
    models, tag = models
    gap = 0.2  # relative size of the blank cell between two panels

    # Interleave panels with blank cells ("."); the trailing blank is dropped below.
    cells, ratios = [], []
    for name in dataset_names:
        cells.append(name); ratios.append(1)
        cells.append("."); ratios.append(gap)

    if is_horizontal:
        if len(dataset_names) == 5:
            # Hand-tuned layout for the five-panel figure: narrower 2nd and 3rd panels
            ratios = [1, 0.2, 0.75, 0.2, 0.75, 0.2, 1, 0.2, 1, 0.2]

        fig, axd = plt.subplot_mosaic(
            mosaic=[cells[:-1]],
            gridspec_kw={"width_ratios": ratios[:-1]},
            figsize=(FULL_WIDTH, 2),
            sharey=True,
        )

    else:
        # One row per cell. The panels are named after the datasets (instead of
        # fixed "A"/"B"/"C" labels) so that the lookups below work in this layout too.
        fig, axd = plt.subplot_mosaic(
            mosaic=[[cell] for cell in cells[:-1]],
            gridspec_kw={"height_ratios": ratios[:-1]},
            figsize=(2, FULL_WIDTH),
            sharey=True,
        )

    adjust(fig)

    for name, ax in axd.items():
        plot_results(ax, name, models=models,**kwargs)
        ax.spines[['right', 'top']].set_visible(False)
        ax.tick_params(axis='both', which='major', labelsize=14)
        ax.set_xlabel("threshold", fontsize=14)
        ax.set_ylabel("fairness metric", fontsize=14)

        # Keep a single legend, on the last panel
        if name != dataset_names[-1]:
            ax.legend([],[], frameon=False)
        else:
            ax.legend(loc="upper center", ncol=1, bbox_to_anchor=(0.8, 0.95), fontsize=12)

    return fig, tag


# One five-panel figure (all datasets) per model group, saved as
# "lineplots5__<tag>__fairness_metric_in_func_eps.pdf".
for models in [pythia_models, opt_models, misc_models]:
    fig, tag = make_figure(is_horizontal=True,
                plot_results=plot_results_fairness,
                dataset_names=DATASET_NAMES,#
                #dataset_names=DATASET_NAMES[0:1] + DATASET_NAMES[-2:], 
                **dict(fairthresholds=FAIR_THRESHOLDS, fairauc=FAIR_AUC, models=models, simplify=False),
    )
    save_fig(fig, f"lineplots5__{tag}__fairness_metric_in_func_eps", dpi=150)

In [None]:
# One four-panel figure per model group.
# NOTE(review): with the dataset order defined in DATASET_2_FILEPATHS,
# DATASET_NAMES[0:1] + DATASET_NAMES[-3:] is [USE-5, Winogender, USE-10, USE-20],
# i.e. it includes the Winogender baseline although the file name says "ours" -- confirm.
for models in [pythia_models, opt_models, misc_models]:

    fig, tag = make_figure(is_horizontal=True,
                plot_results=plot_results_fairness,
                dataset_names=DATASET_NAMES[0:1] + DATASET_NAMES[-3:], 
                **dict(fairthresholds=FAIR_THRESHOLDS, fairauc=FAIR_AUC, models=models, simplify=False),
    )
    save_fig(fig, f"lineplots4__ours__{tag}__fairness_metric_in_func_eps", dpi=100)

In [None]:
# One three-panel figure per model group: the first and the last two entries of
# DATASET_NAMES (USE-5, USE-10, USE-20 with the current ordering).
# Unlike the sibling cells, ``simplify`` is not passed here, so the helper's
# default (True) applies and the "pythia-" prefix is dropped from the legend.
for models in [pythia_models, opt_models, misc_models]:
    fig, tag = make_figure(is_horizontal=True,
                plot_results=plot_results_fairness,
                dataset_names=DATASET_NAMES[0:1] + DATASET_NAMES[-2:], 
                **dict(fairthresholds=FAIR_THRESHOLDS, fairauc=FAIR_AUC, models=models),
    )
    save_fig(fig, f"lineplots3_ours__{tag}__fairness_metric_in_func_eps", dpi=100)

In [None]:
# One two-panel figure per model group: the second and third entries of
# DATASET_NAMES (the Winobias and Winogender baselines with the current ordering).
for models in [pythia_models, opt_models, misc_models]:
    fig, tag = make_figure(is_horizontal=True,
                plot_results=plot_results_fairness,
                dataset_names=DATASET_NAMES[1:3], 
                **dict(fairthresholds=FAIR_THRESHOLDS, fairauc=FAIR_AUC, models=models, simplify=False),
    )
    save_fig(fig, f"lineplots2_others__{tag}__fairness_metric_in_func_eps", dpi=100)

## Fairness Neutrality, Unstereo Score (US)

In this section, we aim to compute the different skews of the models for various constrained settings. 
In particular, we will compute:

1. **Fairness metric**: focus on the computation of the neutral examples, i.e., the examples whose test sentence pair likelihoods are within a factor of $\exp(\epsilon_f)$ of each other
2. Difference in predicted female vs predicted male: if the sentences are not being predicted neutral, how is the model assigning the probability? 

In [None]:
# Display the threshold and column used by the cells below.
FAIRNESS_THRESHOLD, FAIRNESS_COL

In [None]:
print("-"*80)
print(f"Using threshold: {FAIRNESS_THRESHOLD:.4f} to compute fairness metric")
print("-"*80)

# Original dataset (before applying any of the max pmi constraints)
# Copies are taken so that the loaded dataframes in DATASET_2_FILES are not touched.
BEFORE_FILTER = {dataset: df.copy() for dataset, df in DATASET_2_FILES.items()}

# Use this version to use the natural logarithm
# BEFORE_FILTER = compute_skews_(BEFORE_FILTER, FAIRNESS_COL, 0.5)
# use this version to use the base 10 results
# NOTE(review): ``use_log_10_base`` is not defined in any visible cell -- confirm
# where it comes from.
BEFORE_FILTER = compute_skews_(BEFORE_FILTER, FAIRNESS_COL, FAIRNESS_THRESHOLD, use_base_10=use_log_10_base)

In [None]:
# Peek at the first two rows of the unfiltered USE-5 dataframe.
BEFORE_FILTER["USE-5"].head(2)

### Neutrality and AuFC (per constrained setting)

While we propose a pipeline to create benchmarks that satisfy the gender co-occurrence constraints, in our experiments we do not immediately restrict our benchmarks, because we want to study the effect of stricter PMI constraints. For that reason, in the following setting, we compute the value of Neutrality and AuFC for $\eta \in \{0.3, 0.5, 0.65, 0.8, 1\}$. The strictest setting is $\eta = 0.3$ and the least strict is $\eta = 1$. The original unconstrained version of the dataset (stored in variable `BEFORE_FILTER[<dataset>]`) is denoted $\eta = \infty$ in the paper.

In [None]:
# Values of eta (max gender PMI constraint) to evaluate, from strictest to loosest.
PMI_THRESHOLDS = [0.3, 0.5, 0.65, 0.8, 1.0]

print(f"Fairness col: '{FAIRNESS_COL}' and threshold: '{FAIRNESS_THRESHOLD}'")
# Maps eta -> {dataset name -> filtered dataframe}
AFTER_FILTER = {}
# Filter out the dataset_w_constraints according to the different PMI thresholds (or \epsilon_k)
for pmi_threshold in PMI_THRESHOLDS:
    # Create the different filters for each dataset
    print("eta =", pmi_threshold)
    # NOTE(review): presumably keeps the rows whose MAXGENDER_COL value is within
    # the threshold -- confirm against ``metrics.filter_data_by_col_val``.
    AFTER_FILTER[pmi_threshold] = {
        dataset: filter_data_by_col_val(df.copy(), col=MAXGENDER_COL, thres=pmi_threshold).copy()
        for dataset, df in BEFORE_FILTER.items()
    } 

# For each filtered version of the dataset, compute the corresponding skews and metrics
AFTER_FILTER = {
    filt: compute_skews_(bias_files, FAIRNESS_COL, FAIRNESS_THRESHOLD, use_base_10=use_log_10_base) for filt, bias_files in AFTER_FILTER.items()
}

In [None]:
def merge_results(data2files) -> pd.DataFrame:
    """Combine the unstereo score and the predictive disparity metric in one table.

    Both metrics are computed from ``data2files`` and inner-joined on the
    ``dataset`` and ``model`` columns.
    """
    # Unstereo score
    unstereo = compute_neutral_pct_w_std(data2files)
    # Predictive disparity metric
    disparity = compute_female_male_skews(data2files, MODELS)

    join_keys = ["dataset", "model"]
    return pd.merge(unstereo, disparity, on=join_keys, how="inner")

# Metrics of the unfiltered datasets, and of every filtered version keyed by eta.
METRICS_BEFORE_FILTER = merge_results(BEFORE_FILTER)
METRICS_AFTER_FILTER = {eta: merge_results(AFTER_FILTER[eta]) for eta in AFTER_FILTER.keys()}

#### Number of examples before and after the filters

In [None]:
# Rows are divided by the number of models to report examples per dataset
# (assumes every model contributes one row per example -- see the slice check above).
print("All examples:")
print({dataset: len(df) / NUM_EVAL_MODELS for dataset, df in BEFORE_FILTER.items()})


# Same counts for every filtered version; ``eps`` is the eta value of the filter.
for eps, eps_values in AFTER_FILTER.items():
    print()
    print("Number of examples after filter", eps)
    print({dataset: len(df) / NUM_EVAL_MODELS for dataset, df in eps_values.items()})

### Create tables

In [None]:
import re

def model2latex(model: str):
    r"""Translate a model name into the LaTeX macro call used in the paper tables.

    For instance ``pythia-70m`` becomes ``\pyths{70m}`` and any ``gpt-j`` model
    becomes ``\gptj``. Names of unknown families are returned unchanged.
    """
    # (substring identifying the family, pattern to rewrite, macro call), in priority order
    family_rules = (
        ("pythia", r"pythia-(.+)", r"pyths{\1}"),
        ("opt", r"opt-(.+)", r"opts{\1}"),
        ("mpt", r"mpt-(.+)", r"mpts{\1}"),
        ("llama-2", r"llama-2-(.+)", r"llamas{\1}"),
    )
    for marker, pattern, macro in family_rules:
        if marker in model:
            return "\\" + re.sub(pattern, macro, model)

    if "gpt-j" in model:
        return "\\" + "gptj"

    return model
        

def print_results(data, value):
    """Print the LaTeX source of the model x dataset table of ``value``.

    Thin wrapper around ``get_results``, so that both functions always build
    the same table.
    """
    print(get_results(data, value).to_latex())


def get_results(data, value):
    """Pivot the long ``data`` table into a model x dataset table of ``value``.

    Args:
        data: long dataframe with ``model`` and ``dataset`` columns and a
            ``value`` column; each (model, dataset) pair must appear once.
        value: name of the column to place in the table cells.

    Returns:
        Dataframe indexed by the LaTeX-formatted model name (see
        ``model2latex``), with one column per dataset.
    """
    table = pd.pivot(data, values=[value], index="model", columns=["dataset"])
    # Drop the outer (unnamed) column level introduced by ``values=[value]`` and
    # the "dataset" label of the columns axis, leaving plain dataset-name columns.
    table = table.droplevel(None, axis=1).rename_axis(None, axis=1).reset_index() 
    table["model"] = table["model"].apply(model2latex)
    return table.set_index("model")

### Neutral fairness

In [None]:
# LaTeX table of the "neutral_final" column: first for the unfiltered datasets...
print("-" * 80, "\n")
print("NO FILTER")
print("\n", "-" * 80, "\n\n")
print_results(METRICS_BEFORE_FILTER, "neutral_final")


# ...then one table per eta value (``eps``). The loop variable ``df`` is not
# used; the table is looked up by key instead.
for eps, df in METRICS_AFTER_FILTER.items():
    print("-" * 80, "\n")
    print(f"FILTER = {eps}")
    print_results(METRICS_AFTER_FILTER[eps], "neutral_final")
    print("-" * 80, "\n\n")

### Create tables w/ fairness gap


#### Table 1.

In [None]:
# The CSV files below are written to camera_ready/table: make sure it exists
# (``save_fig`` only creates camera_ready/images).
os.makedirs("camera_ready/table", exist_ok=True)

print("NO FILTER")
r = get_results(METRICS_BEFORE_FILTER, "neutral_avg")
r.to_csv("camera_ready/table/neutral_avg__unfiltered.csv")
# Maps "unfiltered" / eta -> model x dataset table of the "neutral_avg" column
fairness_gap_tables = {"unfiltered": r}

for eps in METRICS_AFTER_FILTER:
    print(f"FILTER = {eps}")
    r = get_results(METRICS_AFTER_FILTER[eps], "neutral_avg")
    r.to_csv(f"camera_ready/table/neutral_avg__filtered__{eps}.csv")
    fairness_gap_tables[eps] = r
    
    
# Fairness gap: change of the score w.r.t. the unfiltered dataset at eta = 0.8 and 0.65
orig = fairness_gap_tables["unfiltered"]
delta_08 = fairness_gap_tables[0.8] - orig
delta_065 = fairness_gap_tables[0.65] - orig

df = pd.DataFrame()
df.index = orig.index
assert all(df.index == delta_08.index)
assert all(df.index == delta_065.index)

# NOTE(review): the tables built above use the values of the "dataset" column as
# column names. In this notebook "USE-5" is only renamed to "USE-05" in
# FAIR_AUC["dataset_"], so "USE-05" may be missing here -- confirm.
for dataset in ["USE-05", "Winobias", "Winogender"]:
    df.insert(len(df.columns), f"{dataset}__Orig", orig[dataset])
    # Raw strings: "\d" is not a valid Python escape sequence.
    df.insert(len(df.columns), rf"{dataset}__\delta_" + "{0.8}", delta_08[dataset])
    df.insert(len(df.columns), rf"{dataset}__\delta_" + "{0.65}", delta_065[dataset])
    
print(df.style.format('{:.2f}').to_latex())

In [None]:
# Same table layout as the previous cell, for the USE-10 and USE-20 datasets.
df = pd.DataFrame()
df.index = orig.index
assert all(df.index == delta_08.index)
assert all(df.index == delta_065.index)

for dataset in ["USE-10", "USE-20"]:
    df.insert(len(df.columns), f"{dataset}__Orig", orig[dataset])
    # Raw strings: "\d" is not a valid Python escape sequence.
    df.insert(len(df.columns), rf"{dataset}__\delta_" + "{0.8}", delta_08[dataset])
    df.insert(len(df.columns), rf"{dataset}__\delta_" + "{0.65}", delta_065[dataset])
    
print(df.style.format('{:.2f}').to_latex())

#### Table 2. Impact of training data deduplication at $\eta = 0.65$

In [None]:
eta = 0.65
tab2 = fairness_gap_tables[eta].reset_index().copy()
# Keep the pythia models only. Model names were already converted by
# ``model2latex``, hence the "\pyths" prefix (raw string: "\p" is not a valid
# Python escape sequence).
tab2 = tab2[tab2["model"].apply(lambda s: s.startswith(r"\pyths"))]

tab2_dedup_mask = tab2["model"].apply(lambda s: '(D)' in s)
# original models
tab2_orig = tab2[~tab2_dedup_mask].sort_values("model")
tab2_orig = tab2_orig.set_index("model")

# deduplicate models
tab2_dedup = tab2[tab2_dedup_mask].sort_values("model")
# Strip the " (D)" marker so that both tables share the same index
tab2_dedup["model"] = tab2_dedup["model"].apply(lambda s: s.replace(" (D)", ""))
tab2_dedup = tab2_dedup.set_index("model")

assert all(tab2_dedup.index == tab2_orig.index)

# Effect of deduplication: deduplicated minus original, per model and dataset
print((tab2_dedup - tab2_orig).style.format('{:.2f}').to_latex())

### AuFC

In [None]:
AUFC_BASE_DIR = "./camera_ready/table/aufc"

def print_results_aufc(data_auc, filepath):
    """Write the model x dataset AuFC table to ``filepath`` and print it.

    The file receives the LaTeX table with the plain model names; the printed
    version uses the LaTeX macros produced by ``model2latex`` instead.

    Args:
        data_auc: long dataframe with ``model``, ``dataset_`` and ``auc`` columns.
        filepath: destination of the LaTeX file; its parent directory is
            created if needed.
    """
    table = pd.pivot(data_auc, values=["auc"], index="model", columns=["dataset_"])
    table = table.droplevel(None, axis=1).rename_axis(None, axis=1).reset_index() 

    # To latex file, with the plain model names
    table_str = table.set_index("model").style.format('{:.2f}').to_latex()
    out_dir = os.path.dirname(filepath)
    if out_dir:
        # ``open`` does not create missing directories
        os.makedirs(out_dir, exist_ok=True)
    with open(filepath, "w") as f:
        f.write(table_str)
    
    # To stdout, leveraging rendering commands for model names
    table["model"] = table["model"].apply(model2latex)
    table_str = table.set_index("model").style.format('{:.2f}').to_latex()
    print(table_str)

In [None]:
# We need to create a file for landing page with the different metrics.
# Ideally, we create a different json file for every dataset
# where json file contains for every filter/max pmi constraint the models'
# values for a given metric.
# -----------------------
# Example for dataset X
# -----------------------
# {
#   none: {
#     neutral__avg: {
#        model1: 98.32,
#        ...
#        modeln: ...
#     }, 
#     neutral__std: {
#
#     },   
#     aufc: {
#
#     },  
#     male_rel_ratio: {
#
#     },      
#   },
#   0.5: {
#     ...
#   },
#   ...
# }
# ---------------------------------------------------------------
# NOTE(review): this dict is created but not filled in by any visible cell.
METRICS_FOR_LANDING_PAGE = {name: {} for name in DATASET_NAMES}

# Metrics keyed by filter: None is the unfiltered dataset, otherwise the eta value.
# NOTE(review): despite their names, ``neutral__avg`` holds the output of
# ``compute_female_male_skews`` and ``neutral__std`` the output of
# ``compute_neutral_pct_w_std`` -- confirm this is intended.
neutral__avg = {None: compute_female_male_skews(BEFORE_FILTER, MODELS)}
neutral__std = {None: compute_neutral_pct_w_std(BEFORE_FILTER)}

for eps in AFTER_FILTER.keys():
    neutral__avg[eps] = compute_female_male_skews(AFTER_FILTER[eps], MODELS)
    neutral__std[eps] = compute_neutral_pct_w_std(AFTER_FILTER[eps])

    
fair_auc_landing_page = {None: compute_neutralpct(
    DATASET_2_FILES,
    MODELS,
    DATASET_NAMES,
    FAIRNESS_EPSILONS,
    FAIRNESS_COL,
    use_log10=use_log_10_base,
)[1]}

# ``df`` is the {dataset name -> dataframe} mapping of the filter ``eps``.
for eps, df in AFTER_FILTER.items():
    # Use the same logarithm base as for the unfiltered datasets above, so that
    # the AuFC values are comparable across filters.
    _, fair_auc = compute_neutralpct(
        df, MODELS, DATASET_NAMES, FAIRNESS_EPSILONS, FAIRNESS_COL, use_log10=use_log_10_base
    )
    fair_auc_landing_page[eps] = fair_auc

In [None]:
# Recompute the unfiltered AuFC with the same settings as at the top of the
# notebook, since FAIR_THRESHOLDS / FAIR_AUC are overwritten by the loop below.
FAIRNESS_THRESHOLD = np.log10(_FAIRNESS_THRESHOLD)
print(FAIRNESS_THRESHOLD)
MAX_AUC = 6
FAIRNESS_EPSILONS = np.linspace(0, MAX_AUC, 101)

FAIR_THRESHOLDS, FAIR_AUC = compute_neutralpct(
    DATASET_2_FILES,
    MODELS,
    DATASET_NAMES,
    FAIRNESS_EPSILONS,
    FAIRNESS_COL,
    use_log10=use_log_10_base,
)

print("-" * 80, "\n")
print("-" * 80, "\n")
# "USE-5" -> "USE-05" so that it sorts before "USE-10" and "USE-20" in the table
FAIR_AUC["dataset_"] = FAIR_AUC["dataset"].apply(lambda x: x if x != "USE-5" else "USE-05")
print_results_aufc(FAIR_AUC, f"{AUFC_BASE_DIR}/unfiltered.tex")


# ``df`` is the {dataset name -> dataframe} mapping of the filter ``eps``.
# NOTE: after this loop FAIR_THRESHOLDS and FAIR_AUC hold the results of the
# last filter, not the unfiltered ones.
for eps, df in AFTER_FILTER.items():
    print("-" * 80, "\n")
    print(f"FILTER = {eps}")
    print("-" * 80, "\n")
    # Use the same logarithm base as for the unfiltered table above, so that the
    # AuFC values are comparable across filters.
    FAIR_THRESHOLDS, FAIR_AUC = compute_neutralpct(
        df, MODELS, DATASET_NAMES, FAIRNESS_EPSILONS, FAIRNESS_COL, use_log10=use_log_10_base
    )
    FAIR_AUC["dataset_"] = FAIR_AUC["dataset"].apply(lambda x: x if x != "USE-5" else "USE-05")
    print_results_aufc(FAIR_AUC, f"{AUFC_BASE_DIR}/filter_{str(eps).replace('.', '')}.tex")
    fair_auc_landing_page[eps] = FAIR_AUC