In [1]:
import pathlib as pl

import glob
import pandas as pd
import numpy as np
import itertools, warnings

from collections import Counter, defaultdict
from typing import List, Dict, Tuple


# -----------------------------------------------------------------------
# CAMERA-READY PLOTTING (thanks Alex Boyd!)
# -----------------------------------------------------------------------
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib.ticker import MultipleLocator, PercentFormatter
# The following code is borrowed from material provided by Alex!
# Figure widths used for camera-ready plots
# (presumably inches matching the paper layout -- TODO confirm).
FULL_WIDTH, COL_WIDTH = 5.50107, 4.50461

# Accessibility: use seaborn's "colorblind" palette both for seaborn itself
# and for matplotlib's default color cycle.
sns.set_palette(sns.color_palette("colorblind"))
matplotlib.rcParams["axes.prop_cycle"] = matplotlib.cycler(color=sns.color_palette("colorblind"))

# Put at top of plotting script (requires tex be installed though)
matplotlib.rcParams.update(
    {
        "font.family": "serif",
        "font.size": 20,
        "text.usetex": True,
    }
)


def adjust(fig, left=0.0, right=1.0, bottom=0.0, top=1.0, wspace=0.0, hspace=0.0):
    """Forward the subplot layout parameters to ``fig.subplots_adjust``.

    Parameters
    ----------
    fig : object exposing ``subplots_adjust`` (e.g. a matplotlib figure).
    left, right : the left / right side of the subplots of the figure.
    bottom, top : the bottom / top of the subplots of the figure.
    wspace : the amount of width reserved for blank space between subplots.
    hspace : the amount of height reserved for white space between subplots.

    Returns
    -------
    None
    """
    layout = {
        "left": left,
        "right": right,
        "bottom": bottom,
        "top": top,
        "wspace": wspace,
        "hspace": hspace,
    }
    # Always pass by keyword so the call does not depend on parameter order.
    fig.subplots_adjust(**layout)
    
def save_fig(fig, name, **kwargs):
    """Save ``fig`` as ``./camera_ready/images/<name>.pdf``.

    Parameters
    ----------
    fig : object exposing ``savefig`` (e.g. a matplotlib figure).
    name : file stem (without extension) of the PDF to write.
    **kwargs : forwarded to ``fig.savefig``. ``bbox_inches`` defaults to
        ``"tight"`` but may be overridden by the caller.

    Notes
    -----
    The target directory is not created here; it must already exist.
    """
    # Treat bbox_inches as an overridable default: passing it explicitly used to
    # raise "TypeError: got multiple values for keyword argument 'bbox_inches'".
    kwargs.setdefault("bbox_inches", "tight")
    fig.savefig(f"./camera_ready/images/{name}.pdf", **kwargs)

def disable_axis(ax):
    """Strip ticks and tick labels from ``ax`` and reset its spine colors.

    The axis is pushed to a very low z-order, both tick lists and tick-label
    lists are emptied, and ``color=None`` is applied to every spine.
    """
    ax.set_zorder(-100)  # Avoids a visual rendering bug

    # Same call order as before: x ticks, x labels, y ticks, y labels.
    tick_setters = (
        (ax.set_xticks, ax.set_xticklabels),
        (ax.set_yticks, ax.set_yticklabels),
    )
    for set_ticks, set_labels in tick_setters:
        set_ticks([])
        set_labels([])

    plt.setp(ax.spines.values(), color=None)

## 1. Load model files

Run `post-process-results.ipynb` first to generate a compiled version of the results.

In [2]:
RESULTS_DIR = "../results"

# Score files per dataset (one compressed CSV each).
DATASET_2_FILEPATHS = {
    "USE-5": f"{RESULTS_DIR}/USE-5-no-maxpmi-constraint.csv.gz",
    # Baselines below ----
    "Winobias": f"{RESULTS_DIR}/Winobias-no-maxpmi-constraint.csv.gz",
    "Winogender": f"{RESULTS_DIR}/Winogender-no-maxpmi-constraint.csv.gz",
    # We define this ordering so that we can automatically obtain the same coloring scheme as
    # the one used for word analysis
    "USE-10": f"{RESULTS_DIR}/USE-10-no-maxpmi-constraint.csv.gz",
    "USE-20": f"{RESULTS_DIR}/USE-20-no-maxpmi-constraint.csv.gz",
}

DATASET_NAMES = list(DATASET_2_FILEPATHS.keys())
print(" Dataset names:\n  ->", DATASET_NAMES, "\n")

# Read each file into a dataframe keyed by dataset name. Rows are sorted by
# (model, orig_index) so that every per-model slice lists its examples in the
# same order; later cells rely on that ordering.
DATASET_2_FILES = {
    name: pd.read_csv(fp).sort_values(["model", "orig_index"]).reset_index(drop=True)
    for name, fp in DATASET_2_FILEPATHS.items()
}

# ------------------------------------------------------------------
# Determine whether the number of evaluated models are the same
# ------------------------------------------------------------------
model_names = set()
models_per_dataset = {}

for dataset, df in DATASET_2_FILES.items():
    models_per_dataset[dataset] = df["model"].nunique()
    print("Number of evaluated models for dataset", dataset, "is", models_per_dataset[dataset])
    model_names.update(df["model"].unique())

# All datasets are expected to cover the same number of models. A mismatch only
# triggers a warning (execution continues), but NUM_EVAL_MODELS below is then
# taken from the first dataset and may be wrong for the others.
if len(set(models_per_dataset.values())) != 1:
    warnings.warn(f"Inconsistent number of models across the evaluated datasets: {models_per_dataset}")

NUM_EVAL_MODELS = next(iter(models_per_dataset.values()))
print("Evaluating", NUM_EVAL_MODELS, "models:")
MODELS = sorted(model_names)
print(" -", "\n - ".join(MODELS))

 Dataset names:
  -> ['USE-5', 'Winobias', 'Winogender', 'USE-10', 'USE-20'] 

Number of evaluated models for dataset USE-5 is 28
Number of evaluated models for dataset Winobias is 28
Number of evaluated models for dataset Winogender is 28
Number of evaluated models for dataset USE-10 is 28
Number of evaluated models for dataset USE-20 is 28
Evaluating 28 models:
 - Mistral-7B-v0.1
 - Mixtral-8x7B-v0.1
 - OLMo-1B
 - OLMo-7B
 - gpt-j-6b
 - llama-2-13b
 - llama-2-70b
 - llama-2-7b
 - mpt-30b
 - mpt-7b
 - opt-125m
 - opt-2.7b
 - opt-350m
 - opt-6.7b
 - pythia-1.4b
 - pythia-1.4b (D)
 - pythia-12b
 - pythia-12b (D)
 - pythia-160m
 - pythia-160m (D)
 - pythia-2.8b
 - pythia-2.8b (D)
 - pythia-410m
 - pythia-410m (D)
 - pythia-6.9b
 - pythia-6.9b (D)
 - pythia-70m
 - pythia-70m (D)


In [3]:
# ------------------------------------------------------------------------
# Validation (!sanity check)
# ------------------------------------------------------------------------
# When selecting a per-model slice from the combined dataframe,
# we must guarantee that the sentences line up with one another across models
# (this is necessary because the rest of the code relies
# on the row ordering of the dataframes).
def check_slices(dataset: str, data2files: dict, models: List[str]):
    """Check that every model's rows list the templates in the same order.

    Parameters
    ----------
    dataset : key into ``data2files`` selecting the dataframe to validate.
    data2files : mapping from dataset name to a dataframe with at least the
        columns ``"model"`` and ``"template"``.
    models : model names whose slices are compared, in the given order.

    Raises
    ------
    AssertionError
        If two consecutive models' ``"template"`` columns differ (in length,
        content, or order).
    """
    df = data2files[dataset]

    prev_model, prev_templates = None, None
    for model in models:
        templates = df.loc[df["model"] == model, "template"].values
        # Compare from the second model onwards. The previous `len(slices) > 1`
        # guard skipped the first-vs-second comparison, so the first model's
        # ordering was never validated.
        if prev_templates is not None and not np.array_equal(prev_templates, templates):
            raise AssertionError(
                f"Template ordering mismatch in dataset {dataset!r} "
                f"between models {prev_model!r} and {model!r}"
            )
        prev_model, prev_templates = model, templates
        
    
# Validate the per-model row ordering of every loaded dataset.
for dataset in DATASET_NAMES:
    print("Checking slices for dataset:", dataset)
    check_slices(dataset, DATASET_2_FILES, MODELS)

Checking slices for dataset: USE-5
Checking slices for dataset: Winobias
Checking slices for dataset: Winogender
Checking slices for dataset: USE-10
Checking slices for dataset: USE-20


In [4]:
from metrics import filter_eta_and_count_examples

# Column whose value is thresholded by the eta grid below.
MAXGENDER_COL = "max_gender_pmi"

# 101 evenly spaced eta values on [0, 2.5], ordered from largest to smallest.
FILTERING_ETA = np.flip(np.linspace(0.0, 2.5, 101))
print("Processing column", MAXGENDER_COL, "for values", FILTERING_ETA)

FILTER_CURVES_RESULTS = filter_eta_and_count_examples(
    col=MAXGENDER_COL,
    etas=FILTERING_ETA,
    name_and_dataset=DATASET_2_FILES,
    constant=NUM_EVAL_MODELS,
)

Processing column max_gender_pmi for values [2.5   2.475 2.45  2.425 2.4   2.375 2.35  2.325 2.3   2.275 2.25  2.225
 2.2   2.175 2.15  2.125 2.1   2.075 2.05  2.025 2.    1.975 1.95  1.925
 1.9   1.875 1.85  1.825 1.8   1.775 1.75  1.725 1.7   1.675 1.65  1.625
 1.6   1.575 1.55  1.525 1.5   1.475 1.45  1.425 1.4   1.375 1.35  1.325
 1.3   1.275 1.25  1.225 1.2   1.175 1.15  1.125 1.1   1.075 1.05  1.025
 1.    0.975 0.95  0.925 0.9   0.875 0.85  0.825 0.8   0.775 0.75  0.725
 0.7   0.675 0.65  0.625 0.6   0.575 0.55  0.525 0.5   0.475 0.45  0.425
 0.4   0.375 0.35  0.325 0.3   0.275 0.25  0.225 0.2   0.175 0.15  0.125
 0.1   0.075 0.05  0.025 0.   ]


## Fairness metrics - Fixed threshold & AUC


In [5]:
from metrics import *

# Name of the column holding the fairness score; per the original note its
# values are in natural-log space.
FAIRNESS_COL = "FM_logprob"

# Fairness threshold expressed in probability space. It is converted with
# np.log10 (see FAIRNESS_THRESHOLD) before being used by the metric helpers.
_FAIRNESS_THRESHOLD = 1.65

In [6]:
# Probability-space threshold mapped to log10 space.
FAIRNESS_THRESHOLD = np.log10(_FAIRNESS_THRESHOLD)
print(FAIRNESS_THRESHOLD)
# Upper end of the epsilon grid below.
MAX_AUC = 6
# 101 evenly spaced epsilon values on [0, MAX_AUC].
FAIRNESS_EPSILONS = np.linspace(0, MAX_AUC, 101)

# NOTE(review): `use_log_10_base` is not assigned in any cell of this notebook
# shown so far; presumably it is brought in by `from metrics import *` --
# confirm, otherwise this cell fails on a fresh kernel.
# NOTE(review): the exact contents of the two returned objects are defined by
# `metrics.compute_neutralpct`; verify against that module.
FAIR_THRESHOLDS, FAIR_AUC = compute_neutralpct(
    DATASET_2_FILES,
    MODELS,
    DATASET_NAMES,
    FAIRNESS_EPSILONS,
    FAIRNESS_COL,
    use_log10=use_log_10_base,
)

0.21748394421390624


In [7]:
# Banner reporting the (log10-space) threshold used for the fairness metric.
print("-"*80)
print(f"Using threshold: {FAIRNESS_THRESHOLD:.4f} to compute fairness metric")
print("-"*80)

# Original dataset (before applying any of the max pmi constraints).
# Each dataframe is copied, so DATASET_2_FILES itself is not rebound here.
BEFORE_FILTER = {dataset: df.copy() for dataset, df in DATASET_2_FILES.items()}

# Alternative kept for reference: natural-logarithm version with a 0.5 threshold.
# BEFORE_FILTER = compute_skews_(BEFORE_FILTER, FAIRNESS_COL, 0.5)
# Active version: base-10 results with the log10 threshold.
# NOTE(review): `use_log_10_base` is not defined in the visible cells;
# presumably it comes from `from metrics import *` -- confirm.
BEFORE_FILTER = compute_skews_(BEFORE_FILTER, FAIRNESS_COL, FAIRNESS_THRESHOLD, use_base_10=use_log_10_base)

--------------------------------------------------------------------------------
Using threshold: 0.2175 to compute fairness metric
--------------------------------------------------------------------------------
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624


### Neutrality and AuFC (per constrained setting)

While we propose a pipeline to create benchmarks that satisfy the gender co-occurrence constraints, in our experiments we do not immediately restrict our benchmarks, because we would like to study the effect of increasingly strict PMI constraints. For that reason, in the following setting, we compute the value of Neutrality and AuFC for $\eta \in \{0.3, 0.5, 0.65, 0.8, 1\}$, where $\eta = 0.3$ is the strictest setup and $\eta = 1$ is the least strict. The original unconstrained version of the dataset (stored in variable `BEFORE_FILTER[<dataset>]`) is denoted $\eta = \infty$ in the paper.

In [8]:
# PMI constraints (eta) to evaluate, from strictest to least strict.
PMI_THRESHOLDS = [0.3, 0.5, 0.65, 0.8, 1.0]

print(f"Fairness col: '{FAIRNESS_COL}' and threshold: '{FAIRNESS_THRESHOLD}'")

# Step 1: for every eta, filter each unconstrained dataset on MAXGENDER_COL.
AFTER_FILTER = {}
for pmi_threshold in PMI_THRESHOLDS:
    print("eta =", pmi_threshold)
    AFTER_FILTER[pmi_threshold] = {
        name: filter_data_by_col_val(frame.copy(), col=MAXGENDER_COL, thres=pmi_threshold).copy()
        for name, frame in BEFORE_FILTER.items()
    }

# Step 2: for each filtered version of the datasets, compute the corresponding
# skews and metrics (done after all filters so the printed output stays grouped).
AFTER_FILTER = {
    eta: compute_skews_(filtered_files, FAIRNESS_COL, FAIRNESS_THRESHOLD, use_base_10=use_log_10_base)
    for eta, filtered_files in AFTER_FILTER.items()
}

Fairness col: 'FM_logprob' and threshold: '0.21748394421390624'
eta = 0.3
eta = 0.5
eta = 0.65
eta = 0.8
eta = 1.0
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_base10 0.21748394421390624
FM_logprob_

In [9]:
def merge_results(data2files) -> pd.DataFrame:
    """Join the two per-(dataset, model) metric tables into a single dataframe.

    ``data2files`` is handed unchanged to ``compute_neutral_pct_w_std`` (the
    unstereo score) and to ``compute_female_male_skews`` (the predictive
    disparity metric, computed for the global ``MODELS``). The two results are
    inner-joined on the ``"dataset"`` and ``"model"`` columns.
    """
    unstereo_scores = compute_neutral_pct_w_std(data2files)
    disparity_scores = compute_female_male_skews(data2files, MODELS)
    return pd.merge(
        unstereo_scores,
        disparity_scores,
        how="inner",
        on=["dataset", "model"],
    )

# Metrics on the unconstrained datasets, then one metrics table per eta.
METRICS_BEFORE_FILTER = merge_results(BEFORE_FILTER)
METRICS_AFTER_FILTER = {
    eta: merge_results(filtered_files) for eta, filtered_files in AFTER_FILTER.items()
}

#### Number of examples before and after the filters

In [10]:
def _examples_per_dataset(files):
    """Map each dataset name to its row count divided by NUM_EVAL_MODELS."""
    return {name: len(frame) / NUM_EVAL_MODELS for name, frame in files.items()}


print("All examples:")
print(_examples_per_dataset(BEFORE_FILTER))


for eps, eps_values in AFTER_FILTER.items():
    print()
    print("Number of examples after filter", eps)
    print(_examples_per_dataset(eps_values))

All examples:
{'USE-5': 4405.0, 'Winobias': 1586.0, 'Winogender': 240.0, 'USE-10': 4740.0, 'USE-20': 4839.0}

Number of examples after filter 0.3
{'USE-5': 1556.0, 'Winobias': 22.0, 'Winogender': 16.0, 'USE-10': 601.0, 'USE-20': 133.0}

Number of examples after filter 0.5
{'USE-5': 3069.0, 'Winobias': 186.0, 'Winogender': 69.0, 'USE-10': 2397.0, 'USE-20': 1456.0}

Number of examples after filter 0.65
{'USE-5': 3698.0, 'Winobias': 409.0, 'Winogender': 107.0, 'USE-10': 3401.0, 'USE-20': 2828.0}

Number of examples after filter 0.8
{'USE-5': 3978.0, 'Winobias': 675.0, 'Winogender': 150.0, 'USE-10': 3916.0, 'USE-20': 3561.0}

Number of examples after filter 1.0
{'USE-5': 4263.0, 'Winobias': 879.0, 'Winogender': 188.0, 'USE-10': 4396.0, 'USE-20': 4296.0}


### Create tables

In [11]:
import re

def model2latex(model: str):
    """Translate a model name into the LaTeX macro used in the tables.

    Rules are tried in order and the first family marker found in ``model``
    wins: the ``<family>-<size>`` part is rewritten to ``\\<macro>{<size>}``.
    ``gpt-j`` models map to the fixed macro ``\\gptj``; any other name is
    returned unchanged.
    """
    # (substring marker, regex to rewrite, replacement) -- order matters.
    family_rules = (
        ("pythia", r"pythia-(.+)", r"pyths{\1}"),
        ("opt", r"opt-(.+)", r"opts{\1}"),
        ("mpt", r"mpt-(.+)", r"mpts{\1}"),
        ("llama-2", r"llama-2-(.+)", r"llamas{\1}"),
    )
    for marker, pattern, macro in family_rules:
        if marker in model:
            return "\\" + re.sub(pattern, macro, model)

    if "gpt-j" in model:
        return "\\" + "gptj"
    return model
        

def print_results(data, value):
    """Print the model x dataset table of ``value`` as LaTeX.

    The table is built by ``get_results`` (one row per model, one column per
    dataset, model names converted to LaTeX macros), so the printed and the
    returned versions of a table cannot drift apart.

    Parameters
    ----------
    data : long-format dataframe with ``"model"``, ``"dataset"`` and ``value`` columns.
    value : name of the column to tabulate.
    """
    print(get_results(data, value).to_latex())

    
def get_results(data, value):
    """Return the model x dataset table of ``value``.

    ``data`` is pivoted so that each model is a row and each dataset a column;
    the extra column level and the column-axis name are removed, and model
    names are converted with ``model2latex`` before becoming the index.
    """
    return (
        pd.pivot(data, values=[value], index="model", columns=["dataset"])
        .droplevel(None, axis=1)
        .rename_axis(None, axis=1)
        .reset_index()
        .assign(model=lambda table: table["model"].apply(model2latex))
        .set_index("model")
    )

### pred female - pred male

In [12]:
# Build the tables for the unfiltered data and for eta in {0.8, 0.65, 0.5};
# the results are discarded, so nothing is displayed.
get_results(METRICS_BEFORE_FILTER, "pct_fem_min_mal");
for _eta in (0.8, 0.65, 0.5):
    get_results(METRICS_AFTER_FILTER[_eta], "pct_fem_min_mal");

In [13]:
# obtain latex tables

# LaTeX table for the unconstrained datasets.
print("-" * 80, "\n")
print("NO FILTER")
print("\n", "-" * 80, "\n\n")
print_results(METRICS_BEFORE_FILTER, "pct_fem_min_mal")

# One LaTeX table per PMI constraint.
for eps, df in METRICS_AFTER_FILTER.items():
    print("-" * 80, "\n")
    print(f"FILTER = {eps}")
    print_results(df, "pct_fem_min_mal")
    print("-" * 80, "\n\n")

-------------------------------------------------------------------------------- 

NO FILTER

 -------------------------------------------------------------------------------- 


\begin{tabular}{llllll}
\toprule
 & USE-05 & USE-10 & USE-20 & Winobias & Winogender \\
model &  &  &  &  &  \\
\midrule
Mistral-7B-v0.1 & -33.26 & -11.96 & 6.78 & -53.85 & -35.42 \\
Mixtral-8x7B-v0.1 & -48.74 & -25.61 & -4.15 & -53.34 & -42.92 \\
OLMo-1B & 39.27 & 27.26 & 34.24 & -58.39 & -44.58 \\
OLMo-7B & 44.93 & 36.67 & 40.57 & -56.87 & -52.08 \\
\gptj & 12.01 & 18.54 & 22.38 & -44.89 & -41.25 \\
\llamas{13b} & -27.06 & -12.49 & -6.65 & -55.80 & -40.00 \\
\llamas{70b} & -24.97 & -4.81 & 4.38 & -55.74 & -35.42 \\
\llamas{7b} & -20.75 & -12.11 & -3.29 & -56.12 & -40.83 \\
\mpts{30b} & 54.51 & 34.18 & 32.47 & -58.13 & -42.50 \\
\mpts{7b} & -3.52 & 13.97 & 21.49 & -51.01 & -46.67 \\
\opts{125m} & -72.51 & -54.68 & -36.21 & -47.41 & -37.08 \\
\opts{2.7b} & -51.42 & -32.43 & -15.75 & -50.69 & -42.08 \\
\opts{35