# Benchmark Statistics

In this notebook, we assess the evaluated benchmarks in terms of the length, average PMI diff, average Max Gender PMI diff in each sentence, number of gendered words, template length and position of the pronouns. 

In [None]:
import pathlib as pl

import glob
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_palette(sns.color_palette("colorblind"))

import itertools, warnings

from collections import Counter, defaultdict
from typing import List, Dict, Tuple

# -------------------------------------------------------------------
# Utility constants used across evaluation notebooks
from utils import GROUP_PAIRED_WORDLIST, FEMALE_WORDS, MALE_WORDS
# Utility methods used across evaluation notebooks
from utils import get_model_size, canonic_model_name

##### Load the word-level PMI 

The word-level PMI was precomputed from the PILE, based on the co-occurrence counts made available by [Razeghi et al. 2022](https://aclanthology.org/2022.emnlp-demos.39/).
The file consists of precomputed pointwise mutual information (PMI) values between each word (row) and specific gendered words (as indicated in the column names, e.g., "pmi__her" holds the PMI value between every word and the word "her").

In [None]:
# Root folder that every data path in this notebook is built from.
BASE_DIR = ".."

# Word-level PMI table precomputed from the PILE co-occurrence counts.
GENDER_PMI = pd.read_csv(
    f"{BASE_DIR}/word2gender_pmi_PILE.csv",
    index_col=0,
)
print("Length:", len(GENDER_PMI))
# Summary statistics of every numeric column (last expression -> cell output).
GENDER_PMI.describe()

#### PMI difference between gendered words

Compute the PMI difference between col1 and col2.

In [None]:
def get_pmi_diff(
    df: pd.DataFrame, 
    col1: str,
    col2: str,
    clip: int=None,
    missing_val: float=0.0,
    prefix_col: str="pmi__",
) -> pd.DataFrame:
    """Obtains the PMI difference between columns col1 and col2.

    The result holds the "word" column, the two PMI columns and a new column
    named "pmi({col1})-pmi({col2})" with their difference. The number of
    rows kept is printed. The input dataframe is not modified.

    Parameters
    ----------
    df: pandas.DataFrame
        Table with a "word" column and one "{prefix_col}{term}" column per
        gendered term.

    col1: str
        The female word to use for computing the PMI. Should be one of the
        available suffixes in the provided dataframe's columns.

    col2: str
        The male word to use for computing the PMI. Should be one of the
        available suffixes in the provided dataframe's columns.

    clip: int, optional
        Positive integer, specifies the cap. If not specified, the pmi
        difference is only computed for words that co-occur with both
        (col1, col2), i.e. rows with a missing value are dropped. If
        specified, missing PMI values are filled with 'missing_val'
        (ideally it would be a very negative number) and the difference is
        clipped to [-clip, clip].

    missing_val: float, default 0
        Value used to replace missing PMI values when 'clip' is specified.

    prefix_col: str
        The prefix preceding col1 and col2 in the provided dataframe.
        In our files, we prefix all columns of gendered lexicons with
        the "pmi__" prefix.

    Returns
    -------
    pandas.DataFrame
        Columns: "word", "{prefix_col}{col1}", "{prefix_col}{col2}" and
        "pmi({col1})-pmi({col2})".

    Raises
    ------
    AssertionError
        If either PMI column is absent from 'df'.

    Note
    ----
    To replicate the values of the paper you should pass female lexicon words
    as col1 and male lexicon words as col2.
    """
    pmi_col1, pmi_col2 = f"{prefix_col}{col1}", f"{prefix_col}{col2}"
    assert pmi_col1 in df.columns, f"column {col1} is undefined in dataframe"
    assert pmi_col2 in df.columns, f"column {col2} is undefined in dataframe"

    result = df[["word", pmi_col1, pmi_col2]]
    if clip is None:
        # Keep only the words for which both PMI values are defined.
        result = result.dropna()
    else:
        # Fill the PMI columns only: a missing "word" must not be turned
        # into the numeric fill value.
        result = result.fillna({pmi_col1: missing_val, pmi_col2: missing_val})

    print(f"('{col1}', '{col2}') pmi-defined words: {len(result)}")
    diff_col = f"pmi({col1})-pmi({col2})"
    result[diff_col] = result[pmi_col1] - result[pmi_col2]

    if clip is not None:
        # Assign the clipped column back. An in-place clip on `result[diff_col]`
        # acts on a temporary selection and is lost under pandas copy-on-write.
        result[diff_col] = result[diff_col].clip(lower=-clip, upper=clip)
    return result


def get_gender_pairs_matrix(
    gender_pmi_df: pd.DataFrame,
    parallel_terms: list,
    **kwargs,
) -> Tuple[pd.DataFrame, List[Tuple[str, int]]]:
    """Compute the pmi difference between the pairs of parallel terms.

    Examples of parallel terms can be (she, he). In the gendered setting, it
    expects the first term in the pair to refer to feminine and the
    second term in the pair to be referring to masculine.

    Parameters
    ----------
    gender_pmi_df: pandas.DataFrame
        The PMI of every word (row) and a specific word. Must contain a
        "word" column.

    parallel_terms: list of <str, str> pairs
        List of gendered words whose PMI is present in 'gender_pmi_df'.

    **kwargs
        Forwarded unchanged to 'get_pmi_diff' (e.g. 'clip', 'missing_val').

    Returns
    -------
    pairs: pandas.DataFrame
        Table indexed by word with one column per successfully processed
        pair, named '{word1}-{word2}', where word1 and word2 are the first
        and second words in the specified pairs. Words for which a pair's
        PMI difference is undefined hold NaN in that column.
    num_words: list of (str, int)
        For each processed pair, its column name and the number of words
        with a well-defined PMI difference.

    Notes
    -----
    A pair that cannot be processed is reported on stdout (with the reason)
    and skipped; it contributes to neither returned value.
    """
    # dataframe with all the group pairs PMI (per word)
    # (includes words for which no PMI diff is defined)
    pairs = gender_pmi_df[["word"]].copy().set_index("word")
    num_words = []

    for fword, mword in parallel_terms:
        pair_col = f"{fword}-{mword}"
        try:
            # Compute the pmi difference between fword and mword
            d = get_pmi_diff(gender_pmi_df, fword, mword, **kwargs).set_index("word")
            # Rename to be easier to visualize
            d = d.rename({f"pmi({fword})-pmi({mword})": pair_col}, axis=1)
            joined = pairs.join(d[[pair_col]])
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still
            # propagate; the reason is printed rather than silently hidden.
            print(f"Pair ({fword}, {mword}) doesn't exist... ({type(err).__name__}: {err})")
            continue

        # Record the pair only once the join succeeded, so both outputs agree.
        # Number of well-defined words for each of the gender pairs
        num_words.append((pair_col, len(d)))
        pairs = joined

    return pairs, num_words


# PMI difference for every gendered word pair, kept around for optional
# correlation analyses against pairs other than (she, he).
GENDER_PAIRS, GENDER_PAIRS_NUM_WORDS = get_gender_pairs_matrix(GENDER_PMI, GROUP_PAIRED_WORDLIST)

# ----------------------------------------------------------------------------
# PMI diff used in the main paper: pmi(she) - pmi(he)
# ----------------------------------------------------------------------------
# Sorted by the difference, which is then renamed to the shorter "pmi_diff".
PMI_DIFF = (
    get_pmi_diff(GENDER_PMI, "she", "he")
    .sort_values("pmi(she)-pmi(he)")
    .rename(columns={"pmi(she)-pmi(he)": "pmi_diff"})
)
PMI_DIFF.sample(15, random_state=81273)

#### Read files

Read the scores assigned to each test sentence pair for the proposed benchmarks: Ours-05, Ours-10, Ours-20, as well as the scores assigned to the sentence pairs in WinoBias and WinoGender. There should be 23 files for each benchmark (46 for Winobias, since there are dev and test files).

In [None]:
BASE_DIR = ".."

# Score files per dataset; every list is sorted so the order is deterministic.
DATASET_2_FILEPATHS = {
    "Ours-05": sorted(glob.glob(f"{BASE_DIR}/results-words5/final-results/*__scores__*.csv")),
    # Baselines below ----
    "Winobias": sorted(glob.glob(f"{BASE_DIR}/results-baselines/final-results/*Winobias*__scores__*.csv")),
    "Winogender": sorted(glob.glob(f"{BASE_DIR}/results-baselines/final-results/*Winogender*__scores__*.csv")),
    # "StereoSet": glob.glob(f"{BASE_DIR}/results-baselines/final-results/*StereoSet*__scores__*.csv"),
    # The key order is deliberate: it reproduces the coloring scheme
    # used for the word analysis.
    "Ours-10": sorted(glob.glob(f"{BASE_DIR}/results-words10/final-results/*__scores__*.csv")),
    "Ours-20": sorted(glob.glob(f"{BASE_DIR}/results-words20/final-results/*__scores__*.csv")),
}
DATASET_NAMES = list(DATASET_2_FILEPATHS)
print(DATASET_NAMES)

# Sanity check: number of score files found per dataset.
for name, files in DATASET_2_FILEPATHS.items():
    print(name, len(files), "files")

In [None]:
# Read the files paths
# --------------------------------
# When reading the filepaths, there are a few things we'd like to do
# 1. record which model it belongs to
def get_model_name(filepath: str, suffix="__scores__") -> str:
    """Extract the model name that follows the last `suffix` in `filepath`.

    The text after the final dot (the file extension) is discarded. Names
    starting with "__extra__ucinlp1__" additionally have that marker and
    "__hf_models_" stripped.
    """
    _, _, tail = filepath.rpartition(suffix)
    # drop the extension: keep everything before the last dot
    stem, _, _ = tail.rpartition(".")
    if not stem.startswith("__extra__ucinlp1__"):
        return stem
    for marker in ("__extra__ucinlp1__", "__hf_models_"):
        stem = stem.replace(marker, "")
    return stem
    
# -----------------------------------------------------------------
# For datasets containing multiple splits, separated across files
# it will be the case, that we will have multiple model names for
# the same dataset name.
# -----------------------------------------------------------------
# We will send a warning and merge the two files. Assuming
# they are part of the same dataset. Please make sure that
# the listed files are not redundant and that indeed can be
# merged!
# -----------------------------------------------------------------
# Maps dataset name -> list of per-model score dataframes (concatenated below).
DATASET_2_FILES = defaultdict(list)
for name, filepaths in DATASET_2_FILEPATHS.items():
    # filepath -> model name parsed from the file name
    models = {fp: get_model_name(fp) for fp in filepaths}
    # model name -> its filepaths / its loaded dataframes
    models_2_fp, models_2_data = defaultdict(list), defaultdict(list)
    
    for fp, model_name in models.items():
        models_2_data[model_name].append(pd.read_csv(fp, index_col=0))
        models_2_fp[model_name].append(fp)
    
    for model_name, dfs in models_2_data.items():
        if len(dfs) > 1:
            # Several files for the same model (e.g., dataset splits): merge
            # them row-wise and check that no row was lost or duplicated.
            # print()
            # print(f"Dataset '{name}' contains more than one filepath per model. {models_2_fp[model_name]}")
            dfs_lens = [len(d) for d in dfs]
            dfs = pd.concat(dfs).reset_index(drop=True)
            assert len(dfs) == sum(dfs_lens), "Invalid result when merging dataframes"
        else:
            dfs = dfs[0]
                
        # NOTE: from here on `dfs` is a single dataframe, not a list.
        dfs["dataset"] = name
        dfs["is_deduped"] = model_name.endswith("deduped")
        # "__model" keeps the raw model id (minus path-like markers); "model" is
        # overwritten with the output of `canonic_model_name` (imported from utils).
        dfs["__model"] = dfs["model"].apply(lambda x: x.replace("__extra__ucinlp1__", "").replace("__hf_models_", ""))
        dfs["model"] = dfs["__model"].apply(canonic_model_name)
        # presumably " (D)" marks deduped models in the canonic name -- confirm in utils
        dfs["model_basename"] = dfs["model"].apply(lambda x: x.replace(" (D)", ""))

        dfs["__model_size"] = dfs["model"].apply(get_model_size)
        DATASET_2_FILES[name].append(dfs)

# One dataframe per dataset, stacking the results of all models.
DATASET_2_FILES = {k: pd.concat(v) for k, v in DATASET_2_FILES.items()}
# Keep a single model: the statistics below concern the templates only, and
# every model was scored on the same templates.
# NOTE(review): the model id is hard-coded; confirm it exists for every dataset.
DATASET_2_FILES = {k: df[df["__model"] == "EleutherAI__gpt-j-6b"].reset_index(drop=True) for k, df in DATASET_2_FILES.items()}

# comment the section below to obtain results w/o the likely/unlikely filter
# filter the results by the "natural examples"
for dataset in DATASET_2_FILES:
    df = DATASET_2_FILES[dataset]
    if "is_natural" in df.columns:
        DATASET_2_FILES[dataset] = df[df["is_natural"]].reset_index(drop=True)
        # dataset, size before filtering, size after filtering
        print(dataset, len(df), len(DATASET_2_FILES[dataset]))
    else:
        print(dataset, len(df))

In [None]:
# Templates (sentences with pronoun placeholders) of every dataset, as plain lists.
DATANAME_TO_TEMPLATES = {
    dataset_name: scores["template"].values.tolist()
    for dataset_name, scores in DATASET_2_FILES.items()
}

# Names of the datasets covered by the analysis, in insertion order.
DATANAMES = list(DATANAME_TO_TEMPLATES)
print("Considering the following for the analysis", DATANAMES)
DATASET_2_FILES["Ours-05"].head(2)

## Compute statistics of benchmarks

We'd like to compare different properties of the evaluated benchmarks. Namely, we'd like to compare the positions of the pronouns, their length, the diversity of words, etc. To perform this analysis, we will work on a template level (the sentence with the placeholder mask) of each benchmark. Then, we will transform them into their canonic form according to the following rules:

1. Remove pronoun placeholder, since we do not want it to be mapped to any PMI word;
2. Lowercase the templates;
3. Remove stopwords and punctuation.

In [None]:
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string, re
nltk.download("stopwords")

NLTK_STOPWORDS = set(stopwords.words("english"))
print(len(NLTK_STOPWORDS))
# Some sentences refer to other entities through pronouns, so these
# gendered pronouns must survive the stopword removal.
NLTK_STOPWORDS.difference_update({"she", "her", "hers", "he", "his", "him"})
print(len(NLTK_STOPWORDS))

# All ASCII punctuation characters, as one string.
PUNCT = string.punctuation

In [None]:
import string, re
# Column name -> values; one entry per template across all datasets.
TEMPLATE_LENGTH = defaultdict(list)

# Translation table deleting every punctuation character (loop-invariant, so built once).
_STRIP_PUNCT = str.maketrans('', '', string.punctuation)

for dataset in DATANAME_TO_TEMPLATES.keys():
    # lower case
    templates = [t.lower() for t in DATANAME_TO_TEMPLATES[dataset]]
    # rename the reflexive pronoun to be matched by the regex below
    templates = [re.sub(r"\{pronoun2\}self", "{pronoun2}", t) for t in templates]
    # Upper-case marker: it cannot collide with a word of the lower-cased template.
    templates = [re.sub(r"\{pronoun[0-2]{0,1}?\}", " PRONOUN ", t) for t in templates]
    
    # compute number of tokens, excluding the pronoun markers
    # NOTE(review): punctuation is NOT removed before tokenizing, so any
    # punctuation tokens emitted by word_tokenize are included in this count.
    num_words = [len(word_tokenize(re.sub("PRONOUN", "", t))) for t in templates]
    # compute number of pronouns based on the placeholder masks. This is because
    # the non placeholder pronouns will be already counted by the gender lexicon computation
    pronouns = [re.findall(r"PRONOUN", t) for t in templates]
    
    TEMPLATE_LENGTH["dataset"].extend([dataset] * len(templates))
    TEMPLATE_LENGTH["num_words"].extend(num_words)
    TEMPLATE_LENGTH["num_pronouns"].extend([len(p) for p in pronouns])
    
    for t, ps in zip(templates, pronouns):
        # Replace all punctuation with an empty string
        t = t.translate(_STRIP_PUNCT)
        t_words = t.split()
        pronoun_indices = [ix for ix, w in enumerate(t_words) if w in ps]
        TEMPLATE_LENGTH["pronoun_positions"].append(pronoun_indices)

        if len(pronoun_indices) == 0:
            # No placeholder in this template: report it and record missing
            # positions instead of failing on `pronoun_indices[0]`. Every
            # column keeps one entry per template, so the lists stay aligned.
            print(dataset, t_words)
            first_pos, last_pos, avg_pos = np.nan, np.nan, np.nan
        else:
            first_pos, last_pos = pronoun_indices[0], pronoun_indices[-1]
            avg_pos = np.mean(pronoun_indices)

        TEMPLATE_LENGTH["first_pos"].append(first_pos)
        TEMPLATE_LENGTH["last_pos"].append(last_pos)
        TEMPLATE_LENGTH["avg_pronoun_pos_in_sentence"].append(avg_pos)


TEMPLATE_LENGTH = pd.DataFrame(TEMPLATE_LENGTH)
# Columns summarized in the tables below ("pronoun_positions" holds lists, so it is left out).
TEMPLATE_PROPERTIES = ["dataset", "num_words", "num_pronouns", "first_pos", "last_pos", "avg_pronoun_pos_in_sentence"]

In [None]:
# Per-dataset median of every template property; transposed so that
# datasets become columns and properties become rows.
print("Median values for each property by dataset")
TEMPLATE_LENGTH[TEMPLATE_PROPERTIES].groupby("dataset").median().T

In [None]:
# Per-dataset maximum of every template property; transposed so that
# datasets become columns and properties become rows.
print("Max values for each property by dataset")
TEMPLATE_LENGTH[TEMPLATE_PROPERTIES].groupby("dataset").max().T

In [None]:
# Canonic templates: dataset name -> list of templates, each template being
# the list of its remaining tokens (lower-cased, no pronoun placeholders,
# no stopwords, no punctuation).
DATANAME_TO_TEMPLATES_CANONIC: Dict[str, List[List[str]]] = {}

for dataset in DATANAME_TO_TEMPLATES.keys():
    # Lower case
    templates = [t.lower() for t in DATANAME_TO_TEMPLATES[dataset]]
    
    # Remove pronouns first (the reflexive form before the generic placeholders)
    templates = [t.replace("{pronoun2}self", "") for t in templates]
    templates = [re.sub(r"\{pronoun([0-2]{1})?\}", "", t) for t in templates]
    
    # Remove stopwords and punct
    # NOTE(review): PUNCT is a string, so `w not in PUNCT` is a substring test.
    # Single punctuation marks are dropped, but a multi-character token made of
    # punctuation that is not a contiguous substring of PUNCT is kept.
    # Left unchanged to preserve the reported numbers.
    templates = [[w for w in word_tokenize(t) if w not in PUNCT and w not in NLTK_STOPWORDS] for t in templates]
    
    DATANAME_TO_TEMPLATES_CANONIC[dataset] = templates

## Analysis 1: Do generated sentences contain gendered language? 

Gendered language is language whose gender is explicitly marked in the word. For example, the words "mother", "her", "woman", "unwomanly" are all gendered words as they are associated with a specific gender.

In this section, we exploit `DATANAME_TO_TEMPLATES_CANONIC` and adopt a bag of words approach to count the number of gendered expressions (or lexicon) occurring in each benchmark.

In [None]:
# FEMALE_WORDS and MALE_WORDS are the words whose PMI values are precomputed.
# FEMALE_LEXICON and MALE_LEXICON represent a larger set of words encompassing a wider range of gendered expressions.
FEMALE_LEXICON = list(FEMALE_WORDS)
MALE_LEXICON = list(MALE_WORDS)

# Create directory to place the wordlists used in this study
!mkdir gender-wordlist

# ------------------------------------------------
# BIAS BENCH 
# ------------------------------------------------
## Obtain the gender-wordlist used in the BIASBENCH paper: https://arxiv.org/pdf/2110.08527.pdf
!wget -P gender-wordlist https://raw.githubusercontent.com/McGill-NLP/bias-bench/main/data/bias_attribute_words.json

import json
with open("gender-wordlist/bias_attribute_words.json") as f:
    BB_GENDER_PAIR = json.load(f)["gender"]
    BB_GENDER_PAIR_M, BB_GENDER_PAIR_F = zip(*BB_GENDER_PAIR)
print("BIASBench:", len(BB_GENDER_PAIR)) 

FEMALE_LEXICON += BB_GENDER_PAIR_F
MALE_LEXICON += BB_GENDER_PAIR_M

# ------------------------------------------------
# NAMES BENCH 
# ------------------------------------------------
# based on https://github.com/McGill-NLP/bias-bench/blob/main/data/seat/angry_black_woman_stereotype.jsonl
FEMALE_LEXICON += ["Allison","Anne","Carrie","Emily","Jill","Laurie","Kristen","Meredith","Molly","Amy","Claire","Katie","Madeline","Katelyn","Emma","Aisha","Ebony","Keisha","Latonya","Lakisha","Latoya","Tamika","Imani","Shanice","Aaliyah","Precious","Nia","Deja","Latanya","Latisha"]

# Based on https://github.com/McGill-NLP/bias-bench/blob/main/data/seat/weat6.jsonl
FEMALE_LEXICON += ["Amy","Joan","Lisa","Sarah","Diana","Kate","Ann","Donna"]
MALE_LEXICON += [ "John", "Paul","Mike","Kevin","Steve","Greg","Jeff","Bill"]

# Based on https://github.com/McGill-NLP/bias-bench/blob/main/data/seat/weat6b.jsonl
FEMALE_LEXICON += [  "female", "woman", "girl", "sister", "she", "her", "hers", "daughter"]
MALE_LEXICON += [  "male","man","boy","brother","he","him","his","son"]

# Based on https://github.com/McGill-NLP/bias-bench/blob/main/data/seat/weat8.jsonl
FEMALE_LEXICON += ["sister","mother","aunt","grandmother","daughter","she","hers","her"]
MALE_LEXICON += ["brother","father","uncle","grandfather","son","he","his","him"]

# Others
FEMALE_LEXICON += ["granddaughter","granddaughters"]
MALE_LEXICON += ["grandson","grandsons"]

# Names based on wikipedia (?)
# https://en.wikipedia.org/wiki/Category:English_masculine_given_names
MALE_LEXICON += ["brad", "cyrus"]

# UNIQUE
FEMALE_LEXICON = sorted(set([w.lower() for w in FEMALE_LEXICON]))
MALE_LEXICON = sorted(set([w.lower() for w in MALE_LEXICON]))

print("Size of the female word bank:", len(FEMALE_LEXICON))
print("Size of the male word bank:", len(MALE_LEXICON))

Having created the gendered lexicon we can now determine how many of the words are present in each sentence.

In [None]:
def compute_num_gendered_expressions(sentences: List[List[str]], male_wordlist: List[str], female_wordlist: List[str]):
    """Count, per sentence, the tokens found in the male / female wordlists.

    Parameters
    ----------
    sentences: list of token lists
        Every token is matched verbatim (exact string equality).
    male_wordlist, female_wordlist: list of str
        The gendered expressions to look for.

    Returns
    -------
    dict
        "male_counts", "female_counts" and "counts" (number of tokens),
        each a list with one integer per sentence.
    """
    # Sets give O(1) membership tests; every match weighs 1.
    male_vocab, female_vocab = set(male_wordlist), set(female_wordlist)

    return {
        "male_counts": [sum(tok in male_vocab for tok in sent) for sent in sentences],
        "female_counts": [sum(tok in female_vocab for tok in sent) for sent in sentences],
        "counts": [len(sent) for sent in sentences],
    }

# -----------------------------------------------------------
# number of gendered expressions per sentence x benchmark
# -----------------------------------------------------------
results_gendered_lexicon = defaultdict(list)

for dataset, templates in DATANAME_TO_TEMPLATES_CANONIC.items():
    canonic_results = compute_num_gendered_expressions(templates, MALE_LEXICON, FEMALE_LEXICON)

    results_gendered_lexicon["dataset"] += [dataset] * len(templates)
    # Tokens found in MALE_LEXICON / FEMALE_LEXICON, then the total token count
    results_gendered_lexicon["male_counts"] += canonic_results["male_counts"]
    results_gendered_lexicon["female_counts"] += canonic_results["female_counts"]
    results_gendered_lexicon["word_counts"] += canonic_results["counts"]

results_gendered_lexicon = pd.DataFrame(results_gendered_lexicon)
# Total number of gendered tokens, placed right before "word_counts".
results_gendered_lexicon.insert(
    loc=3,
    column="male+female counts",
    value=results_gendered_lexicon["male_counts"] + results_gendered_lexicon["female_counts"],
)
results_gendered_lexicon.head()

In [None]:
# Mean number of male / female / total gendered tokens and of all tokens,
# per sentence, for each dataset.
print("average value per sentence in each dataset") 
results_gendered_lexicon.groupby("dataset").mean()

## Analysis 2 - Distribution of MaxGender across benchmarks

We conjecture that accounting for gender correlations at a sentence level may lead to different evaluation results. In particular, we prompted ChatGPT to generate a benchmark with supposedly gender-invariant sentences. 

In order to obtain a better idea of how "gender-related" our dataset is, we compute the distribution of MaxGender(s) per dataset.

In [None]:
# word -> pmi diff lookup table, for fast access
WORD2PMI = PMI_DIFF.set_index("word")["pmi_diff"].to_dict()

# Per dataset: for every template, the array of PMI diffs of its known words
PMI_PER_TEMPLATES = {name: [] for name in DATANAMES}

# Per dataset: for every template, the (word, pmi diff) pairs of its known words
PMIWORDS_PER_TEMPLATES = {name: [] for name in DATANAMES}


for dataset, templates in DATANAME_TO_TEMPLATES_CANONIC.items():
    for template in templates:
        # Words without a PMI diff are skipped, so both containers stay aligned
        pmiwords = [(w, WORD2PMI[w]) for w in template if WORD2PMI.get(w) is not None]
        pmi = np.array([score for _, score in pmiwords])

        PMI_PER_TEMPLATES[dataset].append(pmi)
        # one-to-one mapping between words and pmi
        PMIWORDS_PER_TEMPLATES[dataset].append(pmiwords)

In [None]:
# Column order used when displaying the per-dataset summary tables below.
DATASET_ORDERING = ["Ours-05", "Ours-10", "Ours-20", "Winobias", "Winogender"]

In [None]:
def sentence_gender_max(sentence_pmis: List[float]) -> float:
    """Return the (signed) score with the largest absolute value.

    On ties the first such score is returned. An empty input yields None.
    """
    if len(sentence_pmis) == 0:
        return None
    strongest = np.argmax(np.abs(sentence_pmis))
    return sentence_pmis[strongest]
    
def compute_measure_per_sentence(
    pmi_per_templates: Dict[str, List[List[float]]],
    measure_fn: callable,
) -> pd.DataFrame:
    """Aggregate the PMI scores of every template with `measure_fn`.

    Parameters
    ----------
    pmi_per_templates: Dict[str, List[List[float]]]
        Dataset name -> list of templates, each template being the list of
        scores of its words (possibly empty).

    measure_fn: callable(List[float]) -> float
        Reduces the scores of one template to a single number, e.g.
        'sentence_gender_max'.

    Returns
    -------
    pandas.DataFrame
        Long table with one row per template and the columns 'dataset',
        'template_idx' (position of the template within its dataset),
        'value' (output of `measure_fn`) and 'is_invalid' (True when the
        template has no scores at all).
    """
    table = defaultdict(list)
    for dataset, sentences in pmi_per_templates.items():
        # `position` only matches the original templates as long as they
        # were not reordered earlier in the notebook.
        for position, scores in enumerate(sentences):
            table["dataset"].append(dataset)
            table["template_idx"].append(position)
            table["value"].append(measure_fn(scores))
            table["is_invalid"].append(not len(scores))

    return pd.DataFrame(table)

In [None]:
# -------------------------------------------------------
# Gender max metric: one value per sentence
# -------------------------------------------------------
RESULTS_GENDER_MAX_PER_SENT = compute_measure_per_sentence(PMI_PER_TEMPLATES, sentence_gender_max)

# -------------------------------------------------------
# Distribution of the gender max per dataset
# -------------------------------------------------------
fig, ax = plt.subplots(figsize=(6, 4), dpi=150)
sns.boxplot(data=RESULTS_GENDER_MAX_PER_SENT, x="dataset", y="value", ax=ax)
ax.set_ylim(-3, 3)
ax.set_ylabel("Gender Max word-PMI (per sentence)")
plt.show()

# Summary statistics per dataset, as a table
(
    RESULTS_GENDER_MAX_PER_SENT[["dataset", "value"]]
    .groupby("dataset")
    .describe()
    .T[DATASET_ORDERING]
    .style.format('{:.2f}')
)

#### Number of templates per dataset (original)

In [None]:
# Templates per dataset that have a defined gender-max value
# (`count` ignores missing values).
num_templates = (
    RESULTS_GENDER_MAX_PER_SENT
    .groupby("dataset")
    .count()[["value"]]
    .rename(columns={"value": "orig_num_templates"})
)
num_templates

## Analysis 3: Impact of $\varepsilon_k$  on benchmark size?

In this section, we assess the impact of different constraints on the strength of the words' gender correlations on the benchmark size. To do this, we first define a set of `CONSTRAINT_EPSILONS` (51 values linearly spaced between 0.2 and 1.5), and sort them in descending order.

In [None]:
def filter_data_by_col_val(
    data: pd.DataFrame, 
    col: str="value",
    thres: float=1.0,
) -> pd.DataFrame:
    """Keep the rows of `data` whose `col` lies in `[-thres, thres]`.

    Both bounds are inclusive and rows with a missing `col` are discarded.
    The result is an independent copy that keeps the original index.
    """
    within_bounds = data[col].between(-thres, thres)
    return data[within_bounds].copy()

# Thresholds to sweep, from the loosest (1.5) down to the strictest (0.2).
CONSTRAINT_EPSILONS = np.linspace(0.2, 1.5, 51)[::-1]

# Key 3 is the reference row: template counts before any filtering.
filter_templates_results = {3: RESULTS_GENDER_MAX_PER_SENT.groupby("dataset")["value"].count()}
for eps in CONSTRAINT_EPSILONS:
    # examples left after removing the ones outside [-eps, eps]
    df_eps = filter_data_by_col_val(RESULTS_GENDER_MAX_PER_SENT, thres=eps)

    # Number of remaining templates per dataset
    templ_diff = df_eps.groupby("dataset")["value"].count()
    filter_templates_results[round(eps, 2)] = templ_diff

# How many templates we lose as the filter gets stricter:
# one row per threshold ("filter"), one column per dataset.
filter_templates_results = (
    pd.DataFrame(filter_templates_results)
    .T
    .reset_index()
    .rename(columns={"index": "filter"})
)
filter_templates_results

In [None]:
# Number of templates kept per dataset as a function of the threshold.
fig, ax = plt.subplots(figsize=(4,3), dpi=200)
for dataset in DATANAMES:
    sns.lineplot(data=filter_templates_results, x="filter", y=dataset, label=dataset, lw=1, ax=ax)

ax.set_xlabel("Max word-level gender correlation allowed")
ax.set_ylabel("Number of templates")
# Legend placed outside the axes, to the right.
ax.legend(loc='upper left', bbox_to_anchor=(1, 1.01))

In [None]:
# Same sweep as above, expressed as a fraction of the original benchmark size:
# every count is divided by the first row (the unfiltered reference).
_filter_templates_results = filter_templates_results.copy() 
_filter_templates_results[DATASET_ORDERING] = (_filter_templates_results[DATASET_ORDERING] / _filter_templates_results[DATASET_ORDERING].iloc[0]).round(2) 

plt.figure(figsize=(4,3), dpi=200)
for dataset in DATANAMES:
    sns.lineplot(_filter_templates_results, x="filter", y=dataset, label=dataset, lw=1)

# Raw string: "\e" is not a valid escape sequence, so a plain literal
# triggers an invalid-escape warning on recent Python versions.
plt.xlabel(r"Maximum word-level correlation ($\epsilon$)")
plt.ylabel("Benchmark size fraction")
#plt.legend( loc='upper left', bbox_to_anchor=(0, -0.2), ncols=3)
plt.tight_layout()
plt.show()

### Distribution of $\mathrm{MaxPMI(s)}$ across benchmarks

In this section, we determine what is the average distribution of $\mathrm{MaxPMI(s)}$ per sentence. Any threshold $\varepsilon_k$ will cap the distribution in a symmetric fashion around $0$. That is, we enforce constraints of the type: $|\mathrm{MaxPMI(s)}| \leq \varepsilon_k$.

In [None]:
# NOTE(review): the name suggests a threshold of 1, but 0.25 is used here;
# confirm which one is intended.
RESULTS_FILTER_BY_1 = filter_data_by_col_val(RESULTS_GENDER_MAX_PER_SENT, thres=0.25)

# Distribution of the gender max after filtering, same scale as the unfiltered plot.
ax = sns.boxplot(data=RESULTS_FILTER_BY_1, x="dataset", y="value")
ax.set_ylim(-3, 3)
ax.set_ylabel("Gender Max word-PMI (per sentence)")
plt.show()

(
    RESULTS_FILTER_BY_1[["dataset", "value"]]
    .groupby("dataset")
    .describe()
    .T[DATASET_ORDERING]
    .style.format('{:.2f}')
)

## Check outlier examples

In this section, you can examine which examples are filtered out using $\epsilon_k$.

In [None]:
def filter_dataframe(original_file: pd.DataFrame, dataset: str, gender_max: pd.DataFrame, col="value", eps=1.0):
    """Annotate a copy of `original_file` with its per-template measure.

    The rows of `gender_max` belonging to `dataset` must be aligned with
    `original_file`: their 'template_idx' has to equal its index, in order.

    Returns a copy of `original_file` with three extra columns: `col` and
    'is_invalid' (both taken from `gender_max`) and 'is_dropped', which is
    False exactly for the templates whose `col` lies within [-eps, eps].
    """
    annotated = original_file.copy()

    # Rows of this dataset only; no ordering other than the template index is expected
    per_dataset = gender_max[gender_max["dataset"] == dataset]
    assert (np.array(annotated.index) == per_dataset["template_idx"].values).all(), "Index mismatch"

    # Templates that survive the [-eps, eps] constraint
    survivors = filter_data_by_col_val(per_dataset, col=col, thres=eps)

    # Copy over the measure and the flag telling whether any PMI value was available
    for c in (col, "is_invalid"):
        annotated[c] = per_dataset[c].values

    # Everything is dropped unless its template index is among the survivors
    annotated["is_dropped"] = True
    is_kept = annotated.index.isin(survivors["template_idx"])
    annotated.loc[is_kept, "is_dropped"] = False
    return annotated

In [None]:
# Per dataset: the score table annotated with whether each example
# violates the constraint |value| <= 1.
DROPPED_EXAMPLES_EPS1 = {
    dataset_name: filter_dataframe(scores, dataset_name, RESULTS_GENDER_MAX_PER_SENT, eps=1)
    for dataset_name, scores in DATASET_2_FILES.items()
}

# Kept vs. dropped counts per dataset
for name, df in DROPPED_EXAMPLES_EPS1.items():
    print("---", name, "---\n", df["is_dropped"].value_counts())
    print()

In [None]:
# Up to 10 dropped "Ours-05" examples. Seeded so the displayed rows are
# reproducible; capped so the cell also works with fewer than 10 dropped rows.
(
    DROPPED_EXAMPLES_EPS1["Ours-05"]
    .loc[lambda d: d["is_dropped"]]
    .pipe(lambda d: d.sample(n=min(10, len(d)), random_state=81273))
)

In [None]:
# Up to 10 dropped "Ours-10" examples. Seeded so the displayed rows are
# reproducible; capped so the cell also works with fewer than 10 dropped rows.
(
    DROPPED_EXAMPLES_EPS1["Ours-10"]
    .loc[lambda d: d["is_dropped"]]
    .pipe(lambda d: d.sample(n=min(10, len(d)), random_state=81273))
)

In [None]:
# Up to 10 dropped "Ours-20" examples. Seeded so the displayed rows are
# reproducible; capped so the cell also works with fewer than 10 dropped rows.
(
    DROPPED_EXAMPLES_EPS1["Ours-20"]
    .loc[lambda d: d["is_dropped"]]
    .pipe(lambda d: d.sample(n=min(10, len(d)), random_state=81273))
)

In [None]:
# Up to 10 dropped "Winobias" examples. Seeded so the displayed rows are
# reproducible; capped so the cell also works with fewer than 10 dropped rows.
(
    DROPPED_EXAMPLES_EPS1["Winobias"]
    .loc[lambda d: d["is_dropped"]]
    .pipe(lambda d: d.sample(n=min(10, len(d)), random_state=81273))
)

In [None]:
# Up to 10 dropped "Winogender" examples. Seeded so the displayed rows are
# reproducible; capped so the cell also works with fewer than 10 dropped rows.
(
    DROPPED_EXAMPLES_EPS1["Winogender"]
    .loc[lambda d: d["is_dropped"]]
    .pipe(lambda d: d.sample(n=min(10, len(d)), random_state=81273))
)

In [None]:
# Spot check: every "Ours-10" example whose sentence contains "captivating".
_d = DROPPED_EXAMPLES_EPS1["Ours-10"]
_d.loc[_d["sentence"].map(lambda sentence: "captivating" in sentence)]