# Logistic regression for ACS income dataset

## Imports and settings

In [1]:
import sys

sys.path.append('../')

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from src.data.datasets import fetch_openml_data, ACS_INCOME_ID
from src.data.utils import num_to_percentile_category
from src.eval.fairness import eval_binary_clf_fairness

In [58]:
# model
est = LogisticRegression(max_iter=5000)

# data
data = fetch_openml_data(ACS_INCOME_ID)
X = data["features"]
y = num_to_percentile_category(data["labels"], 2)
sensitive_id = "SEX"
sensitive = data["features"][sensitive_id]
X_train, X_test, y_train, y_test, sensitive_train, sensitive_test = train_test_split(
    X, y, sensitive, test_size=0.98, random_state=42
)

# eval metrics
stats_list = ["accuracy", "count", "selection rate", "true positive rate", "false positive rate"]

## Methods

### Raw estimator

In [4]:
model = est
model.fit(X_train, y_train)

In [5]:
y_pred = model.predict(X_test)
metric = eval_binary_clf_fairness(y_test, y_pred, sensitive_test)  # Takes ~ 20 seconds
metric.by_group[stats_list]

Unnamed: 0_level_0,accuracy,count,selection rate,true positive rate,false positive rate
SEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.760376,849586,0.654737,0.861812,0.376109
2.0,0.742935,781624,0.369099,0.634631,0.179891


### Exponentiated gradient

The key idea is to reduce fair classification to a sequence of cost-sensitive classification problems, whose solutions yield a randomized classifier with the lowest (empirical) error subject to the desired constraints.

In [6]:
from fairlearn.reductions import ExponentiatedGradient, DemographicParity

In [7]:
constraint = DemographicParity()

model = ExponentiatedGradient(
    estimator=est,
    constraints=constraint
)
model.fit(X_train, y_train, sensitive_features=sensitive_train)  # Takes ~ 2 minutes

In [8]:
y_pred = model.predict(X_test)
metric = eval_binary_clf_fairness(y_test, y_pred, sensitive_test)  # Takes ~ 20 seconds
metric.by_group[stats_list]

Unnamed: 0_level_0,accuracy,count,selection rate,true positive rate,false positive rate
SEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.760643,849586,0.532586,0.755579,0.232542
2.0,0.750239,781624,0.513144,0.816505,0.29698


### Grid search

Simplified version of the exponentiated gradient reduction algorithm. Grid search works by generating a sequence of relabelling and reweightings, and trains a predictor for each.

In [9]:
from fairlearn.reductions import GridSearch

In [10]:
constraint = DemographicParity()

sweep = GridSearch(
    estimator=est,
    constraints=constraint,
    grid_size=34
)
sweep.fit(X_train, y_train, sensitive_features=sensitive_train)  # Takes ~ 2 minutes

In [11]:
model = sweep.predictors_[sweep.best_idx_]
y_pred = model.predict(X_test)
metric = eval_binary_clf_fairness(y_test, y_pred, sensitive_test)  # Takes ~ 20 seconds
metric.by_group[stats_list]

Unnamed: 0_level_0,accuracy,count,selection rate,true positive rate,false positive rate
SEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.754302,849586,0.503186,0.724426,0.2055
2.0,0.74191,781624,0.545318,0.84516,0.331662


### Threshold optimizer

The classifier is obtained by applying group-specific thresholds to the provided estimator. The thresholds are chosen to optimize the provided performance objective subject to the provided fairness constraints.

In [12]:
from fairlearn.postprocessing import ThresholdOptimizer

In [15]:
model = ThresholdOptimizer(
    estimator=est,
    constraints='demographic_parity',
    objective='accuracy_score',
    predict_method='auto'
)
model.fit(X_train, y_train, sensitive_features=sensitive_train)  # Takes ~ 2 minutes

In [19]:
y_pred = model.predict(X_test, sensitive_features=sensitive_test)
metric = eval_binary_clf_fairness(y_test, y_pred, sensitive_test)  # Takes ~ 20 seconds
metric.by_group[stats_list]

Unnamed: 0_level_0,accuracy,count,selection rate,true positive rate,false positive rate
SEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.747074,849586,0.525218,0.73733,0.239814
2.0,0.723231,781624,0.525137,0.798462,0.330375


### Correlation remover

CorrelationRemover applies a linear transformation to the non-sensitive feature columns in order to remove their correlation with the sensitive feature columns while retaining as much information as possible (as measured by the least-squares error).

In [38]:
import pandas as pd
from fairlearn.preprocessing import CorrelationRemover

In [78]:
corr_remover = CorrelationRemover(sensitive_feature_ids=[sensitive_id])
X_train_cr = corr_remover.fit_transform(X_train)
columns = list(X_train.columns)
columns.remove(sensitive_id)
X_train_cr = pd.DataFrame(X_train_cr, columns=columns)
X_train_cr[sensitive_id] = X_train[sensitive_id].values
X_train_cr = X_train_cr[X_train.columns]
X_train_cr.index = X_train.index

In [81]:
X_train_cr.head(3)

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,ST
479130,25.104897,1.031771,19.752889,4.962328,5122.550117,17.144289,2.111766,32.721408,2.0,5.974838,14.962219
175101,25.104897,1.031771,20.752889,4.962328,1782.550117,38.144289,0.111766,42.721408,2.0,0.974838,5.962219
103053,31.901269,5.970096,17.232586,5.035458,8732.28693,23.98175,-0.105196,57.438553,1.0,9.023683,6.03556


In [82]:
X_train.head(3)

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,ST
479130,25.0,1.0,20.0,5.0,4700.0,15.0,2.0,30.0,2.0,6.0,15.0
175101,25.0,1.0,21.0,5.0,1360.0,36.0,0.0,40.0,2.0,1.0,6.0
103053,32.0,6.0,17.0,5.0,9130.0,26.0,0.0,60.0,1.0,9.0,6.0


In [83]:
model = est
model.fit(X_train_cr, y_train)

In [84]:
y_pred = model.predict(X_test)
metric = eval_binary_clf_fairness(y_test, y_pred, sensitive_test)  # Takes ~ 20 seconds
metric.by_group[stats_list]

Unnamed: 0_level_0,accuracy,count,selection rate,true positive rate,false positive rate
SEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.76117,849586,0.667493,0.873622,0.39014
2.0,0.742489,781624,0.352773,0.614475,0.166293


### Adversarial fairness classifier

The predictor model takes the features `X` as input and seeks to predict `y`. The adversarial model takes the features `X` and the predicted labels `y` as input and seeks to predict the sensitive feature `A`. The predictor and adversarial models are trained jointly to minimize the prediction error of the predictor model while maximizing the prediction error of the adversarial model.

In [87]:
from fairlearn.adversarial import AdversarialFairnessClassifier
import torch

In [None]:
predictor_model = None
adversary_model = None
predictor_optimizer = None
adversary_optimizer = None

In [None]:
model = AdversarialFairnessClassifier(
    backend='torch',
    predictor_model=predictor_model,
    adversary_model=adversary_model,
    predictor_optimizer=predictor_optimizer,
    adversary_optimizer=adversary_optimizer,
    constraints='demographic_parity',
    epochs=10,
    progress_updates=1
)
model.fit(X_train, y_train, sensitive_features=sensitive_train)  # Takes ~ 2 minutes

In [None]:
y_pred = model.predict(X_test, sensitive_features=sensitive_test)
metric = eval_binary_clf_fairness(y_test, y_pred, sensitive_test)  # Takes ~ 20 seconds
metric.by_group[stats_list]

### Adversarial fairness regressor