Commit

Merge branch 'main' into smart-partitioning

simonhkswan authored Sep 24, 2021
2 parents feba5e3 + f68761c commit 9e59e26
Showing 12 changed files with 759 additions and 28 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -46,15 +46,15 @@ jobs:
          SYNTHESIZED_KEY: ${{ secrets.SYNTHESIZED_KEY }}

      - name: Upload Codecov report
        if: ${{ matrix.python-version }} == '3.7'
        if: ${{ matrix.python-version == 3.7 }}
        uses: codecov/codecov-action@v1.5.2
        with:
          files: coverage-reports/cobertura.xml
          flags: unittests
          fail_ci_if_error: false

      - name: SonarCloud Scan
        if: ${{ matrix.python-version }} == '3.7'
        if: ${{ matrix.python-version == 3.7 }}
        uses: SonarSource/sonarcloud-github-action@master
        with:
          projectBaseDir: .
53 changes: 36 additions & 17 deletions docs/index.rst
@@ -59,23 +59,6 @@ FairLens is a tool to help people assess the fairness of datasets and models in
   :text: Go
   :classes: btn-block btn-secondary

---
:fa:`users,text-black fa-4x,style=fa`

Contributing
^^^^^^^^^^^^

Saw a typo in the documentation? Want to improve
existing functionalities? The contributing guidelines will guide
you through the process of improving FairLens.

+++

.. link-button:: contributing
   :type: ref
   :text: Go
   :classes: btn-block btn-secondary

.. toctree::
   :maxdepth: 3
   :hidden:
@@ -84,3 +67,39 @@ FairLens is a tool to help people assess the fairness of datasets and models in
   user_guide/index
   reference/index
   contributing


.. overview panel
.. ---
.. :fa:`balance-scale,text-black fa-4x,style=fa`
.. Fairness and Bias
.. ^^^^^^^^^^^^^^^^^
.. An introduction to fairness and bias in data science. Learn more about how you can assess the fairness of
.. your machine learning pipeline.
.. +++
.. .. link-button:: user_guide/fairness
.. :type: ref
.. :text: Go
.. :classes: btn-block btn-secondary
.. contribution panel
.. ---
.. :fa:`users,text-black fa-4x,style=fa`
.. Contributing
.. ^^^^^^^^^^^^
.. Saw a typo in the documentation? Want to improve
.. existing functionalities? The contributing guidelines will guide
.. you through the process of improving FairLens.
.. +++
.. .. link-button:: contributing
.. :type: ref
.. :text: Go
.. :classes: btn-block btn-secondary
1 change: 0 additions & 1 deletion docs/user_guide/index.rst
@@ -6,7 +6,6 @@ User Guide
   :glob:
   :caption: Getting Started

   fairness
   quickstart

.. toctree::
18 changes: 14 additions & 4 deletions notebooks/compas.ipynb

Large diffs are not rendered by default.

43 changes: 41 additions & 2 deletions src/fairlens/metrics/significance.py
@@ -10,11 +10,13 @@
carry out p-value tests or obtain a confidence interval.
"""

from typing import Callable, Optional, Tuple
from typing import Any, Callable, List, Mapping, Optional, Tuple, Union

import numpy as np
import pandas as pd
from scipy.stats import beta, binom_test, norm
from scipy.stats import beta, binom_test, brunnermunzel, norm

from .. import utils


def binominal_proportion_p_value(p_obs: float, p_null: float, n: int, alternative: str = "two-sided") -> float:
@@ -252,3 +254,40 @@ def resampling_interval(t_obs: float, t_distribution: pd.Series, cl: float = 0.9
    delta = t_distribution - t_obs
    d1, d2 = np.percentile(delta, percentiles)
    return t_obs + d1, t_obs + d2
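
For context, a sketch of how the resampling_interval helper shown above can be called; the bootstrap distribution here is simulated purely for illustration and is not part of this commit:

import numpy as np
import pandas as pd

from fairlens.metrics.significance import resampling_interval

rng = np.random.default_rng(0)

# A simulated bootstrap distribution of a test statistic around an observed value.
t_obs = 0.3
t_distribution = pd.Series(rng.normal(loc=t_obs, scale=0.05, size=1000))

# Returns a confidence interval around the observed statistic.
low, high = resampling_interval(t_obs, t_distribution)
print(low, high)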


def brunner_munzel_test(
    df: pd.DataFrame,
    target_attr: str,
    group1: Union[Mapping[str, List[Any]], pd.Series],
    group2: Union[Mapping[str, List[Any]], pd.Series],
) -> Tuple[float, float]:
    """Compute the non-parametric Brunner-Munzel test of the hypothesis that the probability
    of getting large values in the target attribute distributions (determined by the input
    groups of interest) is equal, without requiring equal variances.

    Args:
        df (pd.DataFrame):
            The input dataframe.
        target_attr (str):
            The target attribute in the dataframe from which the distributions are formed.
        group1 (Union[Mapping[str, List[Any]], pd.Series]):
            The first group of interest. Each group can be a mapping / dict from attribute
            to value or a predicate itself, i.e. a pandas series of bools which can be used
            to index a subgroup from the dataframe.
            Examples: {"Sex": ["Male"]}, df["Sex"] == "Female"
        group2 (Union[Mapping[str, List[Any]], pd.Series]):
            The second group of interest. Each group can be a mapping / dict from attribute
            to value or a predicate itself, i.e. a pandas series of bools which can be used
            to index a subgroup from the dataframe.
            Examples: {"Sex": ["Male"]}, df["Sex"] == "Female"

    Returns:
        Tuple[float, float]:
            A tuple consisting of the Brunner-Munzel statistic and the p-value associated
            with the significance of the observed difference.
    """
    pred_a, pred_b = tuple(utils.get_predicates_mult(df, [group1, group2]))
    sr_a = df[pred_a][target_attr]
    sr_b = df[pred_b][target_attr]
    return brunnermunzel(sr_a, sr_b, nan_policy="omit")
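
A minimal usage sketch for the new test; the CSV path and the "Sex" / "RawScore" column names are illustrative assumptions loosely based on the COMPAS notebook, not part of this commit:

import pandas as pd

from fairlens.metrics.significance import brunner_munzel_test

# Hypothetical COMPAS-style data; any dataframe with a numeric target attribute works.
df = pd.read_csv("compas.csv")

# Groups can be passed as attribute-to-values mappings or as boolean predicate series.
statistic, p_value = brunner_munzel_test(df, "RawScore", {"Sex": ["Male"]}, df["Sex"] == "Female")
print(statistic, p_value)
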
197 changes: 197 additions & 0 deletions src/fairlens/metrics/statistics.py
@@ -0,0 +1,197 @@
"""
This module contains statistical measures for analyzing target variable distributions across sensitive groups.
"""

import functools
import operator
from typing import Any, List, Mapping, Sequence, Union

import pandas as pd
from scipy.stats import describe, entropy

from .. import utils


def _mean_numerical(x: pd.Series) -> float:
    return describe(x).mean


def _variance_numerical(x: pd.Series) -> float:
    return describe(x).variance


def _mean_datetime(x: pd.Series) -> pd.Timestamp:
    # Compute the mean as the earliest date plus the average offset from it.
    nums = pd.to_datetime(x)
    date_min = nums.min()
    diffs = [num - date_min for num in nums]
    date_mean = date_min + functools.reduce(operator.add, diffs) / len(diffs)
    return date_mean


def _variance_datetime(x: pd.Series) -> pd.Timedelta:
    # The spread of the timestamps in nanoseconds, reported as a timedelta rather than
    # a raw variance, which would be in squared nanoseconds.
    nums = pd.to_datetime(x).astype(int)
    res = nums.std()
    std = pd.to_timedelta(res)
    return std


def _mode_categorical(x: pd.Series) -> Any:
    return x.value_counts(sort=True).index[0]


def _variance_square_sum(x: pd.Series) -> float:
    # Sum of squared category probabilities (the Simpson / Herfindahl index).
    return (x.value_counts(normalize=True) ** 2).sum()


def _variance_entropy(x: pd.Series) -> float:
    # Shannon entropy of the category counts; scipy normalizes the counts to probabilities.
    counts = x.value_counts()
    return entropy(counts)


def _means_multinomial(x: pd.Series) -> pd.Series:
    return x.value_counts(normalize=True, sort=False)


def _variances_multinomial(x: pd.Series) -> pd.Series:
    # Per-category Bernoulli variance p * (1 - p) under a multinomial model.
    probs = x.value_counts(normalize=True, sort=False)
    variances = pd.Series([prob * (1 - prob) for prob in probs], index=probs.index)
    return variances
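
For intuition, a small worked example of the multinomial helpers on a toy series (the values are hypothetical, not part of this commit):

import pandas as pd

x = pd.Series(["a", "a", "a", "b"])

# The multinomial means are the category probabilities: a -> 0.75, b -> 0.25.
means = x.value_counts(normalize=True, sort=False)

# The multinomial variances are p * (1 - p) per category: a -> 0.1875, b -> 0.1875.
variances = means * (1 - means)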


def sensitive_group_analysis(
    df: pd.DataFrame,
    target_attr: str,
    groups: Sequence[Union[Mapping[str, List[Any]], pd.Series]],
    categorical_mode: str = "multinomial",
) -> pd.DataFrame:
    """This function produces a summary of the first two central moments of the distributions
    created from the target attribute by applying predicates generated by a list of groups of
    interest. It allows the user to quickly scan how the target varies and how its expected
    value differs across possibly protected attributes.

    Supports binary, date-like, numerical and categorical data for the target column.

    Args:
        df (pd.DataFrame):
            The input dataframe.
        target_attr (str):
            The target attribute in the dataframe from which the distributions are formed.
        groups (Sequence[Union[Mapping[str, List[Any]], pd.Series]]):
            The list of groups of interest. Each group can be a mapping / dict from attribute
            to value or a predicate itself, i.e. a pandas series of bools which can be used
            to index a subgroup from the dataframe.
            Examples of valid groups: {"Sex": ["Male"]}, df["Sex"] == "Female"
        categorical_mode (str):
            Allows the user to choose how categorical (and, implicitly, binary) series are
            summarized. Can be "square" or "entropy", which report the mode as the central
            value, or "multinomial", which returns the probability of each category occurring.
            Defaults to "multinomial".

    Returns:
        pd.DataFrame:
            A dataframe reporting the means and variances across the groups of interest,
            adapted to the type of the underlying data in the target column.
    """
    preds = utils.get_predicates_mult(df, groups)
    distrs = [df[pred][target_attr] for pred in preds]
    target_type = utils.infer_distr_type(df[target_attr])

    if target_type.is_continuous():
        sr_type = "continuous"
    elif target_type.is_datetime():
        sr_type = "datetime"
    else:
        sr_type = "categorical"

    means = [compute_distribution_mean(distr, x_type=sr_type, categorical_mode=categorical_mode) for distr in distrs]
    variances = [
        compute_distribution_variance(distr, x_type=sr_type, categorical_mode=categorical_mode) for distr in distrs
    ]

    # In the multinomial mode of analysis for a categorical target, the mean and variance
    # functions return series instead of floats (one entry per category). We build two
    # dataframes, one for means and one for variances, whose columns are the categories
    # and whose rows correspond to the groups of interest.
    if target_type.is_categorical() and categorical_mode == "multinomial":
        categories = df[target_attr].value_counts(sort=False).index
        means_df = pd.DataFrame(means, columns=categories)
        variances_df = pd.DataFrame(variances, columns=categories)

        return means_df.append(variances_df)

    results = {"Means": means, "Variances": variances}

    return pd.DataFrame(results, index=groups)
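
A usage sketch for sensitive_group_analysis as defined above; the dataset path and the "Sex" / "RawScore" columns are illustrative assumptions, not part of this commit:

import pandas as pd

from fairlens.metrics.statistics import sensitive_group_analysis

df = pd.read_csv("compas.csv")  # hypothetical COMPAS-style dataset

# For a continuous target this returns a dataframe with "Means" and "Variances"
# columns and one row per group of interest.
summary = sensitive_group_analysis(df, "RawScore", [{"Sex": ["Male"]}, {"Sex": ["Female"]}])
print(summary)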


def compute_distribution_mean(
    x: pd.Series, x_type: str, categorical_mode: str = "multinomial"
) -> Union[float, pd.Series]:
    """This function computes the mean (or means) of a given distribution, based on the type
    of its underlying data. Supports binary, date-like, numerical and categorical data for
    the distribution.

    Args:
        x (pd.Series):
            The series representing the distribution for which the mean will be calculated.
        x_type (str):
            The underlying type of the target attribute distribution, passed explicitly to
            avoid errors caused by type inference on very specific groupings.
        categorical_mode (str, optional):
            Allows the user to choose which method will be used for computing the first moment
            of a categorical (and, implicitly, binary) series. Both "square" and "entropy"
            report the mode, while "multinomial" returns the probability of each category
            occurring. Defaults to "multinomial".

    Returns:
        Union[float, pd.Series]:
            The mean (or means, if considering a categorical distribution to be multinomial,
            for example) of the given distribution.
    """
    if x_type == "continuous":
        return _mean_numerical(x)
    elif x_type == "datetime":
        return _mean_datetime(x)
    elif categorical_mode in ("square", "entropy"):
        # Both modes summarize the central value of a categorical series by its mode.
        return _mode_categorical(x)
    elif categorical_mode == "multinomial":
        return _means_multinomial(x)
    else:
        return None


def compute_distribution_variance(
    x: pd.Series, x_type: str, categorical_mode: str = "multinomial"
) -> Union[float, pd.Series]:
    """This function computes the variance (or variances) of a given distribution, based on
    the type of its underlying data. Supports binary, date-like, numerical and categorical
    data for the distribution.

    Args:
        x (pd.Series):
            The series representing the distribution for which the variance will be calculated.
        x_type (str):
            The underlying type of the target attribute distribution, passed explicitly to
            avoid errors caused by type inference on very specific groupings.
        categorical_mode (str, optional):
            Allows the user to choose which method will be used for computing the second moment
            of a categorical (and, implicitly, binary) series. Can be "square", which uses the
            sum of squared category probabilities, "entropy", which uses the entropy of the
            category counts, or "multinomial", which returns the variance of each category's
            probability. Defaults to "multinomial".

    Returns:
        Union[float, pd.Series]:
            The variance (or variances, if considering a categorical distribution to be
            multinomial, for example) of the given distribution.
    """
    if x_type == "continuous":
        return _variance_numerical(x)
    elif x_type == "datetime":
        return _variance_datetime(x)
    elif categorical_mode == "square":
        return _variance_square_sum(x)
    elif categorical_mode == "entropy":
        return _variance_entropy(x)
    elif categorical_mode == "multinomial":
        return _variances_multinomial(x)
    else:
        return None
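
To illustrate the datetime path of the two functions above (toy timestamps, assumed purely for illustration):

import pandas as pd

from fairlens.metrics.statistics import compute_distribution_mean, compute_distribution_variance

dates = pd.Series(["2021-01-01", "2021-01-03", "2021-01-05"])

# Mean timestamp: 2021-01-03, the earliest date plus the average offset.
print(compute_distribution_mean(dates, x_type="datetime"))

# Spread reported as a timedelta (roughly 2 days here).
print(compute_distribution_variance(dates, x_type="datetime"))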