# Logistic regression for ACS income dataset

## Imports and settings

In [1]:
import sys

# Put the parent directory on the module search path; presumably this is what lets
# the `src.*` imports in the next cell resolve. The path is relative, so it depends
# on the kernel's working directory.
sys.path.append('../')

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from src.data.datasets import fetch_openml_data, ACS_INCOME_ID
from src.data.utils import num_to_percentile_category
from src.eval.fairness import eval_binary_clf_fairness

In [3]:
# Columns of the per-group metric table shown after each method.
stats_list = [
    "accuracy",
    "count",
    "selection rate",
    "true positive rate",
    "false positive rate",
]

# Base estimator shared by every method in this notebook.
est = LogisticRegression(max_iter=5000)

# Load the dataset. The helper presumably turns the numeric labels into 2
# percentile-based categories (a binary target) -- confirm against its definition.
data = fetch_openml_data(ACS_INCOME_ID)
X = data["features"]
y = num_to_percentile_category(data["labels"], 2)

# The sensitive attribute is the SEX feature column; it also remains part of X.
sensitive = X["SEX"]

# test_size=0.98 holds out 98% of the rows for evaluation, leaving 2% for fitting;
# the fixed random_state keeps the split reproducible.
(
    X_train, X_test,
    y_train, y_test,
    sensitive_train, sensitive_test,
) = train_test_split(X, y, sensitive, test_size=0.98, random_state=42)

## Methods

### Raw estimator

In [4]:
# Fit an independent, unfitted copy of the base estimator. Binding `model = est`
# would only alias the object, so fitting it would mutate the shared template `est`
# that the mitigation methods further down receive.
from sklearn.base import clone

model = clone(est)
model.fit(X_train, y_train)

In [5]:
# Predict on the held-out split, then compute the fairness metrics from the true
# labels, the predictions and the sensitive attribute of the same rows.
y_pred = model.predict(X_test)
metric = eval_binary_clf_fairness(y_test, y_pred, sensitive_test)  # Takes ~ 20 seconds
# Show the per-group table restricted to the columns listed in stats_list.
metric.by_group[stats_list]

### Exponentiated gradient

The key idea is to reduce fair classification to a sequence of cost-sensitive classification problems, whose solutions yield a randomized classifier with the lowest (empirical) error subject to the desired constraints.

In [None]:
from fairlearn.reductions import ExponentiatedGradient, DemographicParity

In [None]:
# Fairness constraint handed to the reduction: demographic parity.
constraint = DemographicParity()

# Wrap the shared base estimator in the exponentiated-gradient reduction.
model = ExponentiatedGradient(
    estimator=est,
    constraints=constraint
)
# The sensitive attribute is supplied separately from X at fit time.
model.fit(X_train, y_train, sensitive_features=sensitive_train)  # Takes ~ 2 minutes

In [None]:
# The exponentiated-gradient reduction yields a randomized classifier, so its
# predictions are sampled. Pin the seed (same value as the train/test split) so the
# metric table is reproducible across runs.
y_pred = model.predict(X_test, random_state=42)
metric = eval_binary_clf_fairness(y_test, y_pred, sensitive_test)  # Takes ~ 20 seconds
# Show the per-group table restricted to the columns listed in stats_list.
metric.by_group[stats_list]

Unnamed: 0_level_0,accuracy,count,selection rate,true positive rate,false positive rate
SEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.760821,849586,0.532531,0.755686,0.232269
2.0,0.750133,781624,0.513281,0.816542,0.297188


### Grid search

A simplified version of the exponentiated gradient reduction algorithm. Grid search generates a sequence of relabellings and reweightings and trains a predictor for each.

In [None]:
from fairlearn.reductions import GridSearch

In [None]:
# Fairness constraint handed to the reduction: demographic parity.
constraint = DemographicParity()

# Grid-search reduction around the shared base estimator.
# NOTE(review): grid_size=34 presumably sets how many grid points (and therefore
# candidate predictors) are generated -- confirm against the fairlearn docs.
sweep = GridSearch(
    estimator=est,
    constraints=constraint,
    grid_size=34
)
# The sensitive attribute is supplied separately from X at fit time.
sweep.fit(X_train, y_train, sensitive_features=sensitive_train)  # Takes ~ 2 minutes

In [None]:
# Take the predictor stored at the index the sweep recorded as best, then evaluate
# it on the held-out split.
model = sweep.predictors_[sweep.best_idx_]
y_pred = model.predict(X_test)
metric = eval_binary_clf_fairness(y_test, y_pred, sensitive_test)  # Takes ~ 20 seconds
# Show the per-group table restricted to the columns listed in stats_list.
metric.by_group[stats_list]

Unnamed: 0_level_0,accuracy,count,selection rate,true positive rate,false positive rate
SEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.754302,849586,0.503186,0.724426,0.2055
2.0,0.74191,781624,0.545318,0.84516,0.331662


### Threshold optimizer

In [None]:
# Threshold optimizer
def threshold_opt(est, constraint, objective, X, y_true, sensitive):
    """Fit a fairlearn ``ThresholdOptimizer`` around ``est`` and return it.

    Parameters
    ----------
    est
        Base estimator handed to ``ThresholdOptimizer`` as ``estimator``.
    constraint
        Forwarded unchanged as the ``constraints`` argument.
    objective
        Forwarded unchanged as the ``objective`` argument.
    X, y_true
        Features and true labels passed to ``fit``.
    sensitive
        Sensitive-attribute values passed to ``fit`` as ``sensitive_features``.

    Returns
    -------
    ThresholdOptimizer
        The mitigator after ``fit`` has been called on it.
    """
    # Imported locally because no cell in the notebook brings ThresholdOptimizer
    # into scope; without this the function fails with NameError when called.
    from fairlearn.postprocessing import ThresholdOptimizer

    mitigator = ThresholdOptimizer(
        estimator=est,
        constraints=constraint,
        objective=objective,
        predict_method='auto'
    )
    mitigator.fit(X, y_true, sensitive_features=sensitive)
    return mitigator

### Correlation remover

### Adversarial fairness classifier

### Adversarial fairness regressor