Pairwise distance and dendrogram plots in fairness scorer #134

Open · wants to merge 9 commits into main
src/fairlens/plot/distr.py (11 additions, 2 deletions)

@@ -136,7 +136,7 @@ def attr_distr_plot(
normalize: bool = False,
cmap: Optional[Sequence[Tuple[float, float, float]]] = None,
ax: Optional[Axes] = None,
) -> Optional[Axes]:
) -> Union[Axes, Sequence[Axes]]:
"""Plot the distribution of the target attribute with respect to all the unique values in the column `attr`.

Args:
@@ -217,6 +217,9 @@ def attr_distr_plot(
fig.tight_layout()
plt.subplots_adjust(hspace=0.3)

min_ylim = max_ylim = 0
axes = []

for i, (group, title) in enumerate(zip(groups, labels)):
ax_ = fig.add_subplot(r, c, i + 1)
distr_plot(
@@ -232,8 +235,14 @@
ax=ax_,
)
plt.title(title)
min_ylim = min(min_ylim, ax_.get_ylim()[0])
max_ylim = max(max_ylim, ax_.get_ylim()[1])
axes.append(ax_)

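# Apply a shared y-axis range across all subplots so the group distributions are directly comparable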
for ax_ in axes:
ax_.set_ylim(min_ylim, max_ylim)

return None
return axes

if distr_type == "binary":
_countplot(x=df_[attr], hue=df_[target_attr], palette=cmap, normalize=normalize)
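With this change, `attr_distr_plot` returns its axes instead of `None`: the subplot-grid path now collects every axis, synchronizes their y-limits, and returns the list. A minimal usage sketch; the `separate=True` flag and the dataset path are assumptions for illustration, not taken from this diff:

import matplotlib.pyplot as plt
import pandas as pd

from fairlens.plot.distr import attr_distr_plot

df = pd.read_csv("compas.csv")  # hypothetical dataset path

# Assuming `separate=True` selects the one-subplot-per-group path changed above;
# the returned axes share y-limits, so the group distributions compare directly.
axes = attr_distr_plot(df, "RawScore", "Ethnicity", separate=True)
plt.show()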
src/fairlens/scorer.py (155 additions, 23 deletions)

@@ -6,7 +6,12 @@
from itertools import combinations
from typing import Any, List, Mapping, Optional, Sequence, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.axes import Axes
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering

from . import utils
from .metrics.statistics import sensitive_group_analysis
@@ -86,19 +91,19 @@ def distribution_score(
p_value: bool = False,
max_comb: Optional[int] = None,
) -> pd.DataFrame:
"""Returns a dataframe consisting of all unique sub-groups and their statistical distance to the rest
of the population w.r.t. the target variable.
"""Returns a dataframe consisting of all unique sub-groups and their statistical distances of
the target variable computed based on the `metric` and `method` parameters.

Args:
metric (str, optional):
Choose a metric to use. Defaults to automatically chosen metric depending on
the distribution of the target variable.
The metric to use. If set to "auto", a suitable metric is chosen based on
the distribution of the target variable. Defaults to "auto".
method (str, optional):
The method used to apply the metric to the sub-group. Can take values
["dist_to_all", dist_to_rest"] which correspond to measuring the distance
between the subgroup distribution and the overall distribution, or the
overall distribution without the subgroup, respectively.
Defaults to "dist_to_all".
["dist_to_all", dist_to_rest", "pairwise"] which correspond to measuring
the distance between the subgroup distribution and the overall distribution, or the
overall distribution without the subgroup, or alternatively measuring the distance
between all possible pairs of subgroups, respectively. Defaults to "dist_to_all".
p_value (bool, optional):
Whether or not to compute a p-value for the distances.
max_comb (Optional[int], optional):
@@ -107,7 +112,6 @@
"""

df = self.df[self.sensitive_attrs + [self.target_attr]].copy()
sensitive_attrs = self.sensitive_attrs

# Bin continuous sensitive attributes
for attr, distr_type in zip(self.sensitive_attrs, self.sensitive_distr_types):
@@ -119,25 +123,91 @@
if self.distr_type.is_binary():
df.loc[:, self.target_attr] = pd.factorize(df[self.target_attr])[0]

if len(sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0:
if len(self.sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0:
return 0.0, pd.DataFrame([], columns=["Group", "Distance", "Proportion", "Counts"])

max_comb = min(max_comb, len(sensitive_attrs)) if max_comb is not None else len(sensitive_attrs)
df_dists = []

# Try all combinations of sensitive attributes
for k in range(1, max_comb + 1):
for sensitive_attr in combinations(sensitive_attrs, k):
df_not_nan = df[~(df[list(sensitive_attr)] == "nan").any(axis=1)]
if len(df_not_nan) == 0:
continue
# Find all combinations of sensitive attributes
combs = _all_sensitive_combs(df, self.sensitive_attrs, max_comb=max_comb)

df_dist = _calculate_distance(df, self.target_attr, list(sensitive_attr), metric, method, p_value)
df_dists.append(df_dist)
# Compute the per-subgroup distance scores for each combination of sensitive attributes
if method == "pairwise":
df_dists = [_calculate_distance_pair(df, self.target_attr, comb, metric, p_value) for comb in combs]
else:
df_dists = [_calculate_distance(df, self.target_attr, comb, metric, method, p_value) for comb in combs]

df_dist = pd.concat(df_dists, ignore_index=True)

return df_dist.reset_index(drop=True)
return df_dist

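The new "pairwise" method returns one row per ordered pair of distinct subgroups. A minimal sketch of its use, mirroring the tests below and assuming a COMPAS-style dataframe `df` with a "RawScore" column:

from fairlens import FairnessScorer

fscorer = FairnessScorer(df, "RawScore", ["Ethnicity", "Sex"])
df_dist = fscorer.distribution_score(method="pairwise")
# Each row holds the distance between a "positive" and a "negative" subgroup.
print(df_dist[["Positive Group", "Negative Group", "Distance"]].head())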
def plot_dendrogram(self, threshold: float, metric: str = "auto", ax: Optional[Axes] = None) -> Axes:
"""Hierarchically clusters the sensitive subgroups using the metric and plots
the resulting tree in a dendrogram.

Args:
threshold (float):
The linkage distance threshold, above which clusters will not be merged.
metric (str, optional):
The metric to use. If set to "auto", a suitable metric is chosen based on
the distribution of the target variable. Defaults to "auto".
ax (Optional[matplotlib.axes.Axes], optional):
An axis to plot the figure on. Set to plt.gca() if None. Defaults to None.

Returns:
matplotlib.axes.Axes:
The matplotlib axis containing the plot.
"""

if ax is None:
ax = plt.gca()

df = self.df[self.sensitive_attrs + [self.target_attr]].copy()

# Bin continuous sensitive attributes
for attr, distr_type in zip(self.sensitive_attrs, self.sensitive_distr_types):
if distr_type.is_continuous() or distr_type.is_datetime():
col = utils.infer_dtype(df[attr])
df.loc[:, attr] = utils._bin_as_string(col, distr_type.value, prefix=True)

# Convert binary attributes to 0s and 1s
if self.distr_type.is_binary():
df.loc[:, self.target_attr] = pd.factorize(df[self.target_attr])[0]

if len(self.sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0:
return ax

groups = []
for vs in [[{attr: [val]} for val in df[attr].unique()] for attr in self.sensitive_attrs]:
groups.extend(vs)

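# Pairwise statistical distances between all single-attribute subgroups,
# used as the precomputed affinity for agglomerative clustering.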
dist_matrix = np.zeros((len(groups), len(groups)))
for i, g1 in enumerate(groups):
for j, g2 in enumerate(groups):
dist_matrix[i][j] = abs(stat_distance(df, self.target_attr, g1, g2, mode=metric)[0])

model = AgglomerativeClustering(
n_clusters=None,
distance_threshold=threshold,
affinity="precomputed",
linkage="average",
compute_full_tree=True,
)
model = model.fit(dist_matrix)

# Create Dendrogram
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_counts = [1 if child_idx < n_samples else counts[child_idx - n_samples] for child_idx in merge]
counts[i] = sum(current_counts)

linkage_matrix = np.column_stack([model.children_, model.distances_, counts]).astype(float)

# Plot the corresponding dendrogram
group_names = [list(group.values())[0][0] for group in groups]
_ = dendrogram(linkage_matrix, labels=group_names, ax=ax)
ax.tick_params(axis="x", labelrotation=90)

return ax

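The linkage-matrix construction above follows the standard recipe for converting scikit-learn's `AgglomerativeClustering` output into the format expected by SciPy's `dendrogram`: each merge in `children_` is annotated with its distance and the number of original samples under the merged node. A minimal usage sketch, mirroring the tests below (the 0.1 threshold is only an illustrative linkage cutoff):

fscorer = FairnessScorer(df, "RawScore", ["Ethnicity", "Sex"])
ax = fscorer.plot_dendrogram(0.1)
ax.set_ylabel("Statistical distance")

Note that newer scikit-learn releases rename the `affinity` argument of `AgglomerativeClustering` to `metric`, so the constructor call above may need adjusting depending on the pinned version.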
def plot_distributions(
self,
@@ -301,7 +371,7 @@ def compare_group_statistics(


def calculate_score(df_dist: pd.DataFrame) -> float:
"""Calculate the weighted mean pairwise statistical distance.
"""Calculate the weighted mean of statistical distances.

Args:
df_dist (pd.DataFrame):
@@ -358,3 +428,65 @@ def _calculate_distance(
df_dist.drop(columns=["P-Value"], inplace=True)

return df_dist


def _calculate_distance_pair(
df: pd.DataFrame,
target_attr: str,
sensitive_attrs: Sequence[str],
metric: str = "auto",
p_value: bool = False,
) -> pd.DataFrame:
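"""Compute the statistical distance in the target attribute between every
ordered pair of distinct subgroups formed by the given sensitive attributes.
"""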

unique = df[sensitive_attrs].drop_duplicates()

dist = []

for i_index, i in unique.iterrows():
for j_index, j in unique.iterrows():
if i_index == j_index:
continue

group1 = {attr: [value] for attr, value in i.to_dict().items()}
group2 = {attr: [value] for attr, value in j.to_dict().items()}

preds = utils.get_predicates_mult(df, [group1, group2])
pred1, pred2 = preds[0], preds[1]

dist_res = stat_distance(df, target_attr, pred1, pred2, mode=metric, p_value=p_value)
distance = dist_res[0]
p = dist_res[1] if p_value else 0

dist.append(
{
"Positive Group": ", ".join(map(str, i.to_dict().values())),
"Negative Group": ", ".join(map(str, j.to_dict().values())),
"Distance": distance,
"Positive Counts": len(df[pred1]),
"Negative Counts": len(df[pred2]),
"Counts": len(df[pred1]) + len(df[pred2]),
"P-Value": p,
}
)

df_dist = pd.DataFrame(dist)

if not p_value:
df_dist.drop(columns=["P-Value"], inplace=True)

return df_dist


def _all_sensitive_combs(df: pd.DataFrame, sensitive_attrs: Sequence[str], max_comb: Optional[int] = None) -> List[List[str]]:
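"""Enumerate all combinations of up to `max_comb` sensitive attributes,
skipping combinations with no rows left after removing "nan" values.
"""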
max_comb = min(max_comb, len(sensitive_attrs)) if max_comb is not None else len(sensitive_attrs)

groups = []
for k in range(1, max_comb + 1):
for sensitive_attr in combinations(sensitive_attrs, k):
df_not_nan = df[~(df[list(sensitive_attr)] == "nan").any(axis=1)]
if len(df_not_nan) == 0:
continue

groups.append(list(sensitive_attr))

return groups
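For illustration, with two sensitive attributes the helper enumerates the singleton and joint groupings (a hypothetical call, assuming no combination is entirely "nan"):

combs = _all_sensitive_combs(df, ["Ethnicity", "Sex"])
# -> [["Ethnicity"], ["Sex"], ["Ethnicity", "Sex"]]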
tests/test_scorer.py (34 additions, 2 deletions)

@@ -59,14 +59,46 @@ def test_sensitive_attr_detection():
assert fscorer.sensitive_attrs == ["DateOfBirth", "Ethnicity", "Language", "MaritalStatus", "RawScore", "Sex"]


def test_distribution_score():
def test_distribution_score_all():
fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"])
df_dist = fscorer.distribution_score()
df_dist = fscorer.distribution_score(method="all")
score = calculate_score(df_dist)

assert score * df_dist["Counts"].sum() == (df_dist["Distance"] * df_dist["Counts"]).sum()


def test_distribution_score_rest():
fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"])
df_dist = fscorer.distribution_score(method="rest")
score = calculate_score(df_dist)

assert score * df_dist["Counts"].sum() == (df_dist["Distance"] * df_dist["Counts"]).sum()


def test_pairwise_compas():
fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"])
df_dist = fscorer.distribution_score(method="pairwise")

assert (df_dist["Distance"] > 0).all()


def test_pairwise_adult():
fscorer = FairnessScorer(dfa, "class", ["race", "sex"])
df_dist = fscorer.distribution_score(metric="binomial", method="pairwise")

assert (df_dist["Distance"] != 0).all()


def test_dendrogram_compas():
fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"])
fscorer.plot_dendrogram(0.1)


def test_dendrogram_adult():
fscorer = FairnessScorer(dfa, "class", ["race", "sex"])
fscorer.plot_dendrogram(0.1)


def test_group_statistics_manual():
fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"])
df_stats = fscorer.compare_group_statistics(