Commit

Merge branch 'main' into smart-partitioning

simonhkswan authored Sep 24, 2021
2 parents feba5e3 + f68761c commit 9e59e26
Showing 12 changed files with 759 additions and 28 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -46,15 +46,15 @@ jobs:
          SYNTHESIZED_KEY: ${{ secrets.SYNTHESIZED_KEY }}

      - name: Upload Codecov report
        if: ${{ matrix.python-version }} == '3.7'
        if: ${{ matrix.python-version == 3.7 }}
        uses: codecov/codecov-action@v1.5.2
        with:
          files: coverage-reports/cobertura.xml
          flags: unittests
          fail_ci_if_error: false

      - name: SonarCloud Scan
        if: ${{ matrix.python-version }} == '3.7'
        if: ${{ matrix.python-version == 3.7 }}
        uses: SonarSource/sonarcloud-github-action@master
        with:
          projectBaseDir: .
53 changes: 36 additions & 17 deletions docs/index.rst
@@ -59,23 +59,6 @@ FairLens is a tool to help people assess the fairness of datasets and models in
   :text: Go
   :classes: btn-block btn-secondary

---
:fa:`users,text-black fa-4x,style=fa`

Contributing
^^^^^^^^^^^^

Saw a typo in the documentation? Want to improve
existing functionalities? The contributing guidelines will guide
you through the process of improving FairLens.

+++

.. link-button:: contributing
   :type: ref
   :text: Go
   :classes: btn-block btn-secondary

.. toctree::
   :maxdepth: 3
   :hidden:
@@ -84,3 +67,39 @@ FairLens is a tool to help people assess the fairness of datasets and models in
   user_guide/index
   reference/index
   contributing


.. overview panel
.. ---
.. :fa:`balance-scale,text-black fa-4x,style=fa`
.. Fairness and Bias
.. ^^^^^^^^^^^^^^^^^
.. An introduction to fairness and bias in data science. Learn more about how you can assess the fairness of
.. your machine learning pipeline.
.. +++
.. .. link-button:: user_guide/fairness
.. :type: ref
.. :text: Go
.. :classes: btn-block btn-secondary
.. contribution panel
.. ---
.. :fa:`users,text-black fa-4x,style=fa`
.. Contributing
.. ^^^^^^^^^^^^
.. Saw a typo in the documentation? Want to improve
.. existing functionalities? The contributing guidelines will guide
.. you through the process of improving FairLens.
.. +++
.. .. link-button:: contributing
.. :type: ref
.. :text: Go
.. :classes: btn-block btn-secondary
1 change: 0 additions & 1 deletion docs/user_guide/index.rst
@@ -6,7 +6,6 @@ User Guide
   :glob:
   :caption: Getting Started

   fairness
   quickstart

.. toctree::
18 changes: 14 additions & 4 deletions notebooks/compas.ipynb

Large diffs are not rendered by default.

43 changes: 41 additions & 2 deletions src/fairlens/metrics/significance.py
@@ -10,11 +10,13 @@
carry out p-value tests or obtain a confidence interval.
"""

from typing import Callable, Optional, Tuple
from typing import Any, Callable, List, Mapping, Optional, Tuple, Union

import numpy as np
import pandas as pd
from scipy.stats import beta, binom_test, norm
from scipy.stats import beta, binom_test, brunnermunzel, norm

from .. import utils


def binominal_proportion_p_value(p_obs: float, p_null: float, n: int, alternative: str = "two-sided") -> float:
@@ -252,3 +254,40 @@ def resampling_interval(t_obs: float, t_distribution: pd.Series, cl: float = 0.9
    delta = t_distribution - t_obs
    d1, d2 = np.percentile(delta, percentiles)
    return t_obs + d1, t_obs + d2
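
For context, a sketch of how the resampling_interval helper shown above can be called; the bootstrap distribution here is simulated purely for illustration and is not part of this commit:

import numpy as np
import pandas as pd

from fairlens.metrics.significance import resampling_interval

rng = np.random.default_rng(0)

# A simulated bootstrap distribution of a test statistic around an observed value.
t_obs = 0.3
t_distribution = pd.Series(rng.normal(loc=t_obs, scale=0.05, size=1000))

# Returns a confidence interval around the observed statistic.
low, high = resampling_interval(t_obs, t_distribution)
print(low, high)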


def brunner_munzel_test(
    df: pd.DataFrame,
    target_attr: str,
    group1: Union[Mapping[str, List[Any]], pd.Series],
    group2: Union[Mapping[str, List[Any]], pd.Series],
) -> Tuple[float, float]:
    """Compute the non-parametric Brunner-Munzel test of the hypothesis that the probability
    of getting large values in the target attribute distributions (determined by the input
    groups of interest) is equal, without requiring equal variances.

    Args:
        df (pd.DataFrame):
            The input dataframe.
        target_attr (str):
            The target attribute in the dataframe from which the distributions are formed.
        group1 (Union[Mapping[str, List[Any]], pd.Series]):
            The first group of interest. Each group can be a mapping / dict from attribute
            to value or a predicate itself, i.e. a pandas series of bools which can be used
            to index a subgroup from the dataframe.
            Examples: {"Sex": ["Male"]}, df["Sex"] == "Female"
        group2 (Union[Mapping[str, List[Any]], pd.Series]):
            The second group of interest. Each group can be a mapping / dict from attribute
            to value or a predicate itself, i.e. a pandas series of bools which can be used
            to index a subgroup from the dataframe.
            Examples: {"Sex": ["Male"]}, df["Sex"] == "Female"

    Returns:
        Tuple[float, float]:
            A tuple consisting of the Brunner-Munzel statistic and the p-value associated
            with the significance of the observed difference.
    """
    pred_a, pred_b = tuple(utils.get_predicates_mult(df, [group1, group2]))
    sr_a = df[pred_a][target_attr]
    sr_b = df[pred_b][target_attr]
    return brunnermunzel(sr_a, sr_b, nan_policy="omit")
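
A minimal usage sketch for the new test; the CSV path and the "Sex" / "RawScore" column names are illustrative assumptions loosely based on the COMPAS notebook, not part of this commit:

import pandas as pd

from fairlens.metrics.significance import brunner_munzel_test

# Hypothetical COMPAS-style data; any dataframe with a numeric target attribute works.
df = pd.read_csv("compas.csv")

# Groups can be passed as attribute-to-values mappings or as boolean predicate series.
statistic, p_value = brunner_munzel_test(df, "RawScore", {"Sex": ["Male"]}, df["Sex"] == "Female")
print(statistic, p_value)
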
197 changes: 197 additions & 0 deletions src/fairlens/metrics/statistics.py
@@ -0,0 +1,197 @@
"""
This module contains statistical measures for analyzing target variable distributions across sensitive groups.
"""

import functools
import operator
from typing import Any, List, Mapping, Sequence, Union

import pandas as pd
from scipy.stats import describe, entropy

from .. import utils


def _mean_numerical(x: pd.Series) -> float:
    return describe(x).mean


def _variance_numerical(x: pd.Series) -> float:
    return describe(x).variance


def _mean_datetime(x: pd.Series) -> pd.Timestamp:
    # Compute the mean as the earliest date plus the average offset from it.
    nums = pd.to_datetime(x)
    date_min = nums.min()
    diffs = [num - date_min for num in nums]
    date_mean = date_min + functools.reduce(operator.add, diffs) / len(diffs)
    return date_mean


def _variance_datetime(x: pd.Series) -> pd.Timedelta:
    # The spread of the timestamps in nanoseconds, reported as a timedelta rather than
    # a raw variance, which would be in squared nanoseconds.
    nums = pd.to_datetime(x).astype(int)
    res = nums.std()
    std = pd.to_timedelta(res)
    return std


def _mode_categorical(x: pd.Series) -> Any:
    return x.value_counts(sort=True).index[0]


def _variance_square_sum(x: pd.Series) -> float:
    # Sum of squared category probabilities (the Simpson / Herfindahl index).
    return (x.value_counts(normalize=True) ** 2).sum()


def _variance_entropy(x: pd.Series) -> float:
    # Shannon entropy of the category counts; scipy normalizes the counts to probabilities.
    counts = x.value_counts()
    return entropy(counts)


def _means_multinomial(x: pd.Series) -> pd.Series:
    return x.value_counts(normalize=True, sort=False)


def _variances_multinomial(x: pd.Series) -> pd.Series:
    # Per-category Bernoulli variance p * (1 - p) under a multinomial model.
    probs = x.value_counts(normalize=True, sort=False)
    variances = pd.Series([prob * (1 - prob) for prob in probs], index=probs.index)
    return variances
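
For intuition, a small worked example of the multinomial helpers on a toy series (the values are hypothetical, not part of this commit):

import pandas as pd

x = pd.Series(["a", "a", "a", "b"])

# The multinomial means are the category probabilities: a -> 0.75, b -> 0.25.
means = x.value_counts(normalize=True, sort=False)

# The multinomial variances are p * (1 - p) per category: a -> 0.1875, b -> 0.1875.
variances = means * (1 - means)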


def sensitive_group_analysis(
    df: pd.DataFrame,
    target_attr: str,
    groups: Sequence[Union[Mapping[str, List[Any]], pd.Series]],
    categorical_mode: str = "multinomial",
) -> pd.DataFrame:
    """This function produces a summary of the first two central moments of the distributions
    created from the target attribute by applying predicates generated by a list of groups of
    interest. It allows the user to quickly scan how the target varies and how its expected
    value differs across possibly protected attributes.

    Supports binary, date-like, numerical and categorical data for the target column.

    Args:
        df (pd.DataFrame):
            The input dataframe.
        target_attr (str):
            The target attribute in the dataframe from which the distributions are formed.
        groups (Sequence[Union[Mapping[str, List[Any]], pd.Series]]):
            The list of groups of interest. Each group can be a mapping / dict from attribute
            to value or a predicate itself, i.e. a pandas series of bools which can be used
            to index a subgroup from the dataframe.
            Examples of valid groups: {"Sex": ["Male"]}, df["Sex"] == "Female"
        categorical_mode (str):
            Allows the user to choose how categorical (and, implicitly, binary) series are
            summarized. Can be "square" or "entropy", which report the mode as the central
            value, or "multinomial", which returns the probability of each category occurring.
            Defaults to "multinomial".

    Returns:
        pd.DataFrame:
            A dataframe reporting the means and variances across the groups of interest,
            adapted to the type of the underlying data in the target column.
    """
    preds = utils.get_predicates_mult(df, groups)
    distrs = [df[pred][target_attr] for pred in preds]
    target_type = utils.infer_distr_type(df[target_attr])

    if target_type.is_continuous():
        sr_type = "continuous"
    elif target_type.is_datetime():
        sr_type = "datetime"
    else:
        sr_type = "categorical"

    means = [compute_distribution_mean(distr, x_type=sr_type, categorical_mode=categorical_mode) for distr in distrs]
    variances = [
        compute_distribution_variance(distr, x_type=sr_type, categorical_mode=categorical_mode) for distr in distrs
    ]

    # In the multinomial mode of analysis for a categorical target, the mean and variance
    # functions return series instead of floats (one entry per category). We build two
    # dataframes, one for means and one for variances, whose columns are the categories
    # and whose rows correspond to the groups of interest.
    if target_type.is_categorical() and categorical_mode == "multinomial":
        categories = df[target_attr].value_counts(sort=False).index
        means_df = pd.DataFrame(means, columns=categories)
        variances_df = pd.DataFrame(variances, columns=categories)

        return means_df.append(variances_df)

    results = {"Means": means, "Variances": variances}

    return pd.DataFrame(results, index=groups)
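
A usage sketch for sensitive_group_analysis as defined above; the dataset path and the "Sex" / "RawScore" columns are illustrative assumptions, not part of this commit:

import pandas as pd

from fairlens.metrics.statistics import sensitive_group_analysis

df = pd.read_csv("compas.csv")  # hypothetical COMPAS-style dataset

# For a continuous target this returns a dataframe with "Means" and "Variances"
# columns and one row per group of interest.
summary = sensitive_group_analysis(df, "RawScore", [{"Sex": ["Male"]}, {"Sex": ["Female"]}])
print(summary)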


def compute_distribution_mean(
    x: pd.Series, x_type: str, categorical_mode: str = "multinomial"
) -> Union[float, pd.Series]:
    """This function computes the mean (or means) of a given distribution, based on the type
    of its underlying data. Supports binary, date-like, numerical and categorical data for
    the distribution.

    Args:
        x (pd.Series):
            The series representing the distribution for which the mean will be calculated.
        x_type (str):
            The underlying type of the target attribute distribution, passed explicitly to
            avoid errors caused by type inference on very specific groupings.
        categorical_mode (str, optional):
            Allows the user to choose which method will be used for computing the first moment
            of a categorical (and, implicitly, binary) series. Both "square" and "entropy"
            report the mode, while "multinomial" returns the probability of each category
            occurring. Defaults to "multinomial".

    Returns:
        Union[float, pd.Series]:
            The mean (or means, if considering a categorical distribution to be multinomial,
            for example) of the given distribution.
    """
    if x_type == "continuous":
        return _mean_numerical(x)
    elif x_type == "datetime":
        return _mean_datetime(x)
    elif categorical_mode in ("square", "entropy"):
        # Both modes summarize the central value of a categorical series by its mode.
        return _mode_categorical(x)
    elif categorical_mode == "multinomial":
        return _means_multinomial(x)
    else:
        return None


def compute_distribution_variance(
    x: pd.Series, x_type: str, categorical_mode: str = "multinomial"
) -> Union[float, pd.Series]:
    """This function computes the variance (or variances) of a given distribution, based on
    the type of its underlying data. Supports binary, date-like, numerical and categorical
    data for the distribution.

    Args:
        x (pd.Series):
            The series representing the distribution for which the variance will be calculated.
        x_type (str):
            The underlying type of the target attribute distribution, passed explicitly to
            avoid errors caused by type inference on very specific groupings.
        categorical_mode (str, optional):
            Allows the user to choose which method will be used for computing the second moment
            of a categorical (and, implicitly, binary) series. Can be "square", which uses the
            sum of squared category probabilities, "entropy", which uses the entropy of the
            category counts, or "multinomial", which returns the variance of each category's
            probability. Defaults to "multinomial".

    Returns:
        Union[float, pd.Series]:
            The variance (or variances, if considering a categorical distribution to be
            multinomial, for example) of the given distribution.
    """
    if x_type == "continuous":
        return _variance_numerical(x)
    elif x_type == "datetime":
        return _variance_datetime(x)
    elif categorical_mode == "square":
        return _variance_square_sum(x)
    elif categorical_mode == "entropy":
        return _variance_entropy(x)
    elif categorical_mode == "multinomial":
        return _variances_multinomial(x)
    else:
        return None
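
To illustrate the datetime path of the two functions above (toy timestamps, assumed purely for illustration):

import pandas as pd

from fairlens.metrics.statistics import compute_distribution_mean, compute_distribution_variance

dates = pd.Series(["2021-01-01", "2021-01-03", "2021-01-05"])

# Mean timestamp: 2021-01-03, the earliest date plus the average offset.
print(compute_distribution_mean(dates, x_type="datetime"))

# Spread reported as a timedelta (roughly 2 days here).
print(compute_distribution_variance(dates, x_type="datetime"))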