Pairwise distance and dendrogram plots in fairness scorer #134

Open · wants to merge 9 commits into main
src/fairlens/plot/distr.py (11 additions, 2 deletions)

@@ -136,7 +136,7 @@ def attr_distr_plot(
normalize: bool = False,
cmap: Optional[Sequence[Tuple[float, float, float]]] = None,
ax: Optional[Axes] = None,
) -> Optional[Axes]:
) -> Union[Axes, Sequence[Axes]]:
"""Plot the distribution of the target attribute with respect to all the unique values in the column `attr`.

Args:
@@ -217,6 +217,9 @@ def attr_distr_plot(
fig.tight_layout()
plt.subplots_adjust(hspace=0.3)

min_ylim = max_ylim = 0
axes = []

for i, (group, title) in enumerate(zip(groups, labels)):
ax_ = fig.add_subplot(r, c, i + 1)
distr_plot(
@@ -232,8 +235,14 @@
ax=ax_,
)
plt.title(title)
min_ylim = min(min_ylim, ax_.get_ylim()[0])
max_ylim = max(max_ylim, ax_.get_ylim()[1])
axes.append(ax_)

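# Apply a shared y-axis range across all subplots so the group distributions are directly comparable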
for ax_ in axes:
ax_.set_ylim(min_ylim, max_ylim)

return None
return axes

if distr_type == "binary":
_countplot(x=df_[attr], hue=df_[target_attr], palette=cmap, normalize=normalize)
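With this change, `attr_distr_plot` returns its axes instead of `None`: the subplot-grid path now collects every axis, synchronizes their y-limits, and returns the list. A minimal usage sketch; the `separate=True` flag and the dataset path are assumptions for illustration, not taken from this diff:

import matplotlib.pyplot as plt
import pandas as pd

from fairlens.plot.distr import attr_distr_plot

df = pd.read_csv("compas.csv")  # hypothetical dataset path

# Assuming `separate=True` selects the one-subplot-per-group path changed above;
# the returned axes share y-limits, so the group distributions compare directly.
axes = attr_distr_plot(df, "RawScore", "Ethnicity", separate=True)
plt.show()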
src/fairlens/scorer.py (155 additions, 23 deletions)

@@ -6,7 +6,12 @@
from itertools import combinations
from typing import Any, List, Mapping, Optional, Sequence, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.axes import Axes
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering

from . import utils
from .metrics.statistics import sensitive_group_analysis
@@ -86,19 +91,19 @@ def distribution_score(
p_value: bool = False,
max_comb: Optional[int] = None,
) -> pd.DataFrame:
"""Returns a dataframe consisting of all unique sub-groups and their statistical distance to the rest
of the population w.r.t. the target variable.
"""Returns a dataframe consisting of all unique sub-groups and their statistical distances of
the target variable computed based on the `metric` and `method` parameters.

Args:
metric (str, optional):
Choose a metric to use. Defaults to automatically chosen metric depending on
the distribution of the target variable.
The metric to use. If set to "auto", a suitable metric is chosen based on
the distribution of the target variable. Defaults to "auto".
method (str, optional):
The method used to apply the metric to the sub-group. Can take values
["dist_to_all", dist_to_rest"] which correspond to measuring the distance
between the subgroup distribution and the overall distribution, or the
overall distribution without the subgroup, respectively.
Defaults to "dist_to_all".
["dist_to_all", dist_to_rest", "pairwise"] which correspond to measuring
the distance between the subgroup distribution and the overall distribution, or the
overall distribution without the subgroup, or alternatively measuring the distance
between all possible pairs of subgroups, respectively. Defaults to "dist_to_all".
p_value (bool, optional):
Whether or not to compute a p-value for the distances.
max_comb (Optional[int], optional):
@@ -107,7 +112,6 @@
"""

df = self.df[self.sensitive_attrs + [self.target_attr]].copy()
sensitive_attrs = self.sensitive_attrs

# Bin continuous sensitive attributes
for attr, distr_type in zip(self.sensitive_attrs, self.sensitive_distr_types):
@@ -119,25 +123,91 @@
if self.distr_type.is_binary():
df.loc[:, self.target_attr] = pd.factorize(df[self.target_attr])[0]

if len(sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0:
if len(self.sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0:
return 0.0, pd.DataFrame([], columns=["Group", "Distance", "Proportion", "Counts"])

max_comb = min(max_comb, len(sensitive_attrs)) if max_comb is not None else len(sensitive_attrs)
df_dists = []

# Try all combinations of sensitive attributes
for k in range(1, max_comb + 1):
for sensitive_attr in combinations(sensitive_attrs, k):
df_not_nan = df[~(df[list(sensitive_attr)] == "nan").any(axis=1)]
if len(df_not_nan) == 0:
continue
# Find all combinations of sensitive attributes
combs = _all_sensitive_combs(df, self.sensitive_attrs, max_comb=max_comb)

df_dist = _calculate_distance(df, self.target_attr, list(sensitive_attr), metric, method, p_value)
df_dists.append(df_dist)
# Compute the per-subgroup distance scores for each combination of sensitive attributes
if method == "pairwise":
df_dists = [_calculate_distance_pair(df, self.target_attr, comb, metric, p_value) for comb in combs]
else:
df_dists = [_calculate_distance(df, self.target_attr, comb, metric, method, p_value) for comb in combs]

df_dist = pd.concat(df_dists, ignore_index=True)

return df_dist.reset_index(drop=True)
return df_dist

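The new "pairwise" method returns one row per ordered pair of distinct subgroups. A minimal sketch of its use, mirroring the tests below and assuming a COMPAS-style dataframe `df` with a "RawScore" column:

from fairlens import FairnessScorer

fscorer = FairnessScorer(df, "RawScore", ["Ethnicity", "Sex"])
df_dist = fscorer.distribution_score(method="pairwise")
# Each row holds the distance between a "positive" and a "negative" subgroup.
print(df_dist[["Positive Group", "Negative Group", "Distance"]].head())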
def plot_dendrogram(self, threshold: float, metric: str = "auto", ax: Optional[Axes] = None) -> Axes:
"""Hierarchically clusters the sensitive subgroups using the metric and plots
the resulting tree in a dendrogram.

Args:
threshold (float):
The linkage distance threshold, above which clusters will not be merged.
metric (str, optional):
The metric to use. If set to "auto", a suitable metric is chosen based on
the distribution of the target variable. Defaults to "auto".
ax (Optional[matplotlib.axes.Axes], optional):
An axis to plot the figure on. Set to plt.gca() if None. Defaults to None.

Returns:
matplotlib.axes.Axes:
The matplotlib axis containing the plot.
"""

if ax is None:
ax = plt.gca()

df = self.df[self.sensitive_attrs + [self.target_attr]].copy()

# Bin continuous sensitive attributes
for attr, distr_type in zip(self.sensitive_attrs, self.sensitive_distr_types):
if distr_type.is_continuous() or distr_type.is_datetime():
col = utils.infer_dtype(df[attr])
df.loc[:, attr] = utils._bin_as_string(col, distr_type.value, prefix=True)

# Convert binary attributes to 0s and 1s
if self.distr_type.is_binary():
df.loc[:, self.target_attr] = pd.factorize(df[self.target_attr])[0]

if len(self.sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0:
return ax

groups = []
for vs in [[{attr: [val]} for val in df[attr].unique()] for attr in self.sensitive_attrs]:
groups.extend(vs)

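# Pairwise statistical distances between all single-attribute subgroups,
# used as the precomputed affinity for agglomerative clustering.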
dist_matrix = np.zeros((len(groups), len(groups)))
for i, g1 in enumerate(groups):
for j, g2 in enumerate(groups):
dist_matrix[i][j] = abs(stat_distance(df, self.target_attr, g1, g2, mode=metric)[0])

model = AgglomerativeClustering(
n_clusters=None,
distance_threshold=threshold,
affinity="precomputed",
linkage="average",
compute_full_tree=True,
)
model = model.fit(dist_matrix)

# Create Dendrogram
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_counts = [1 if child_idx < n_samples else counts[child_idx - n_samples] for child_idx in merge]
counts[i] = sum(current_counts)

linkage_matrix = np.column_stack([model.children_, model.distances_, counts]).astype(float)

# Plot the corresponding dendrogram
group_names = [list(group.values())[0][0] for group in groups]
_ = dendrogram(linkage_matrix, labels=group_names, ax=ax)
ax.tick_params(axis="x", labelrotation=90)

return ax

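The linkage-matrix construction above follows the standard recipe for converting scikit-learn's `AgglomerativeClustering` output into the format expected by SciPy's `dendrogram`: each merge in `children_` is annotated with its distance and the number of original samples under the merged node. A minimal usage sketch, mirroring the tests below (the 0.1 threshold is only an illustrative linkage cutoff):

fscorer = FairnessScorer(df, "RawScore", ["Ethnicity", "Sex"])
ax = fscorer.plot_dendrogram(0.1)
ax.set_ylabel("Statistical distance")

Note that newer scikit-learn releases rename the `affinity` argument of `AgglomerativeClustering` to `metric`, so the constructor call above may need adjusting depending on the pinned version.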
def plot_distributions(
self,
@@ -301,7 +371,7 @@ def compare_group_statistics(


def calculate_score(df_dist: pd.DataFrame) -> float:
"""Calculate the weighted mean pairwise statistical distance.
"""Calculate the weighted mean of statistical distances.

Args:
df_dist (pd.DataFrame):
@@ -358,3 +428,65 @@ def _calculate_distance(
df_dist.drop(columns=["P-Value"], inplace=True)

return df_dist


def _calculate_distance_pair(
df: pd.DataFrame,
target_attr: str,
sensitive_attrs: Sequence[str],
metric: str = "auto",
p_value: bool = False,
) -> pd.DataFrame:
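"""Compute the statistical distance in the target attribute between every
ordered pair of distinct subgroups formed by the given sensitive attributes.
"""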

unique = df[sensitive_attrs].drop_duplicates()

dist = []

for i_index, i in unique.iterrows():
for j_index, j in unique.iterrows():
if i_index == j_index:
continue

group1 = {attr: [value] for attr, value in i.to_dict().items()}
group2 = {attr: [value] for attr, value in j.to_dict().items()}

preds = utils.get_predicates_mult(df, [group1, group2])
pred1, pred2 = preds[0], preds[1]

dist_res = stat_distance(df, target_attr, pred1, pred2, mode=metric, p_value=p_value)
distance = dist_res[0]
p = dist_res[1] if p_value else 0

dist.append(
{
"Positive Group": ", ".join(map(str, i.to_dict().values())),
"Negative Group": ", ".join(map(str, j.to_dict().values())),
"Distance": distance,
"Positive Counts": len(df[pred1]),
"Negative Counts": len(df[pred2]),
"Counts": len(df[pred1]) + len(df[pred2]),
"P-Value": p,
}
)

df_dist = pd.DataFrame(dist)

if not p_value:
df_dist.drop(columns=["P-Value"], inplace=True)

return df_dist


def _all_sensitive_combs(df: pd.DataFrame, sensitive_attrs: Sequence[str], max_comb: Optional[int] = None) -> List[List[str]]:
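"""Enumerate all combinations of up to `max_comb` sensitive attributes,
skipping combinations with no rows left after removing "nan" values.
"""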
max_comb = min(max_comb, len(sensitive_attrs)) if max_comb is not None else len(sensitive_attrs)

groups = []
for k in range(1, max_comb + 1):
for sensitive_attr in combinations(sensitive_attrs, k):
df_not_nan = df[~(df[list(sensitive_attr)] == "nan").any(axis=1)]
if len(df_not_nan) == 0:
continue

groups.append(list(sensitive_attr))

return groups
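For illustration, with two sensitive attributes the helper enumerates the singleton and joint groupings (a hypothetical call, assuming no combination is entirely "nan"):

combs = _all_sensitive_combs(df, ["Ethnicity", "Sex"])
# -> [["Ethnicity"], ["Sex"], ["Ethnicity", "Sex"]]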
tests/test_scorer.py (34 additions, 2 deletions)

@@ -59,14 +59,46 @@ def test_sensitive_attr_detection():
assert fscorer.sensitive_attrs == ["DateOfBirth", "Ethnicity", "Language", "MaritalStatus", "RawScore", "Sex"]


def test_distribution_score():
def test_distribution_score_all():
fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"])
df_dist = fscorer.distribution_score()
df_dist = fscorer.distribution_score(method="all")
score = calculate_score(df_dist)

assert score * df_dist["Counts"].sum() == (df_dist["Distance"] * df_dist["Counts"]).sum()


def test_distribution_score_rest():
fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"])
df_dist = fscorer.distribution_score(method="rest")
score = calculate_score(df_dist)

assert score * df_dist["Counts"].sum() == (df_dist["Distance"] * df_dist["Counts"]).sum()


def test_pairwise_compas():
fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"])
df_dist = fscorer.distribution_score(method="pairwise")

assert (df_dist["Distance"] > 0).all()


def test_pairwise_adult():
fscorer = FairnessScorer(dfa, "class", ["race", "sex"])
df_dist = fscorer.distribution_score(metric="binomial", method="pairwise")

assert (df_dist["Distance"] != 0).all()


def test_dendrogram_compas():
fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"])
fscorer.plot_dendrogram(0.1)


def test_dendrogram_adult():
fscorer = FairnessScorer(dfa, "class", ["race", "sex"])
fscorer.plot_dendrogram(0.1)


def test_group_statistics_manual():
fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"])
df_stats = fscorer.compare_group_statistics(