# Demographic Parity Demo: Prediction Time

In [48]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 1: Generate synthetic dataset with 3 features and 2 sensitive attributes
# The dataset will have 1000 samples
# The sensitive attributes indicate the group membership (privileged or unprivileged)
# We use make_classification to create a synthetic dataset
X, sensitive_features = make_classification(n_samples=1000, n_features=3, n_informative=3, n_redundant=0,
                                            n_clusters_per_class=1, n_classes=2, random_state=52)

# Step 2: Split the dataset into train and test sets
# We split the dataset into 80% training and 20% testing
X_train, X_test, sensitive_train, sensitive_test = train_test_split(X, sensitive_features, test_size=0.2, random_state=42)

# Step 3: Train a logistic regression classifier
# We use logistic regression as a simple classifier
clf = LogisticRegression(random_state=42)
clf.fit(X_train, sensitive_train)

# Step 4: Predict on the test set
predictions = clf.predict(X_test)

# Step 5: Evaluate the model accuracy
# We calculate the accuracy of the model on the test set
accuracy = accuracy_score(sensitive_test, predictions)
print(f'Accuracy: {accuracy}')

# Step 6: Calculate demographic parity
# Demographic parity checks if the rate of positive predictions (TPR) is similar across different groups
# We define two groups: privileged and unprivileged
privileged_group = 1  # privileged group label
unprivileged_group = 0  # unprivileged group label

# Separate the predictions for each group
privileged_predictions = predictions[sensitive_test == privileged_group]
unprivileged_predictions = predictions[sensitive_test == unprivileged_group]

# Calculate the positive rate for each group
privileged_positive_rate = privileged_predictions.mean()
unprivileged_positive_rate = unprivileged_predictions.mean()

print(f'Privileged Group Positive Rate: {privileged_positive_rate}')
print(f'Unprivileged Group Positive Rate: {unprivileged_positive_rate}')

# Step 7: Check if demographic parity holds
# Demographic parity holds if the difference in positive rates between groups is small

demographic_parity = abs(privileged_positive_rate - unprivileged_positive_rate) < 0.1
print(f'Demographic Parity: {demographic_parity}')


Accuracy: 0.97
Privileged Group Positive Rate: 0.9423076923076923
Unprivileged Group Positive Rate: 0.0
Demographic Parity: False


# Demographic Parity: Training Time Enforcement with Demographic Parity Loss

In [47]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Generate synthetic dataset with 3 features and 2 sensitive attributes
X, sensitive_features = make_classification(n_samples=1000, n_features=3, n_informative=3, n_redundant=0,
                                            n_clusters_per_class=1, n_classes=2, random_state=50)

# Split the dataset into train and test sets
X_train, X_test, sensitive_train, sensitive_test = train_test_split(X, sensitive_features, test_size=0.2, random_state=42)

# Define logistic regression classifier with demographic parity loss regularization
class LogisticRegressionWithDemographicParityLoss(LogisticRegression):
    def __init__(self, C=1.0, demographic_parity_weight=0.2, **kwargs):
        self.demographic_parity_weight = demographic_parity_weight
        super(LogisticRegressionWithDemographicParityLoss, self).__init__(C=C, **kwargs)

    def fit(self, X, y, sample_weight=None):
        # Separate training data into privileged and unprivileged groups
        privileged_indices = np.where(sensitive_train == 1)[0]
        unprivileged_indices = np.where(sensitive_train == 0)[0]

        # Calculate positive rates for privileged and unprivileged groups
        privileged_positive_rate = np.mean(y[privileged_indices] == 1)
        unprivileged_positive_rate = np.mean(y[unprivileged_indices] == 1)

        # Compute demographic parity loss
        demographic_parity_loss = np.abs(privileged_positive_rate - unprivileged_positive_rate)

        # Add demographic parity loss as a regularization term to the objective function
        self.C_ = self.C * (1 + self.demographic_parity_weight * demographic_parity_loss)

        # Fit logistic regression with modified regularization parameter
        super(LogisticRegressionWithDemographicParityLoss, self).fit(X, y, sample_weight)

# Train logistic regression classifier with demographic parity loss regularization
clf = LogisticRegressionWithDemographicParityLoss(demographic_parity_weight=0.1, random_state=42)
clf.fit(X_train, sensitive_train)

# Predict on the test set
predictions = clf.predict(X_test)

# Evaluate the model accuracy
accuracy = accuracy_score(sensitive_test, predictions)
print(f'Accuracy: {accuracy}')

# Calculate demographic parity
privileged_group = 1
unprivileged_group = 0
privileged_positive_rate = np.mean(predictions[sensitive_test == privileged_group] == 1)
print(f'Privileged Group Positive Rate: {privileged_positive_rate}')
unprivileged_positive_rate = np.mean(predictions[sensitive_test == unprivileged_group] == 1)
print(f'Unprivileged Group Positive Rate: {unprivileged_positive_rate}')
demographic_parity = abs(privileged_positive_rate - unprivileged_positive_rate) < 0.9
print(f'Demographic Parity: {demographic_parity}')


Accuracy: 0.95
Privileged Group Positive Rate: 1.0
Unprivileged Group Positive Rate: 0.10638297872340426
Demographic Parity: True


# Calibration to Achieve Demographic Parity

In [50]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score

# Generate synthetic dataset with 3 features and 2 sensitive attributes
X, sensitive_features = make_classification(n_samples=1000, n_features=3, n_informative=3, n_redundant=0,
                                            n_clusters_per_class=1, n_classes=2, random_state=42)

# Split the dataset into train and test sets
X_train, X_test, sensitive_train, sensitive_test = train_test_split(X, sensitive_features, test_size=0.2, random_state=42)

# Train logistic regression classifier
clf = LogisticRegression(random_state=42)
clf.fit(X_train, sensitive_train)

# Calibrate classifier to achieve demographic parity
calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv='prefit')
calibrated_clf.fit(X_train, sensitive_train)

# Predict probabilities on the test set
probabilities = calibrated_clf.predict_proba(X_test)

# Evaluate the model accuracy
accuracy = accuracy_score(sensitive_test, predictions)
print(f'Accuracy: {accuracy}')

# Adjust probabilities for demographic parity
privileged_indices = np.where(sensitive_test == 1)[0]
unprivileged_indices = np.where(sensitive_test == 0)[0]
privileged_positive_rate = np.mean(probabilities[privileged_indices, 1])
print(f'Privileged Group Positive Rate Pre-Calibration: {privileged_positive_rate}')
unprivileged_positive_rate = np.mean(probabilities[unprivileged_indices, 1])
print(f'Unprivileged Group Positive Rate Pre-Calibration: {unprivileged_positive_rate}')
calibrated_probabilities = probabilities.copy()
calibrated_probabilities[privileged_indices, 1] *= unprivileged_positive_rate / privileged_positive_rate

# Calculate demographic parity
privileged_positive_rate = np.mean(calibrated_probabilities[privileged_indices, 1])
print(f'Privileged Group Positive Rate Post-Calibration: {privileged_positive_rate}')
unprivileged_positive_rate = np.mean(calibrated_probabilities[unprivileged_indices, 1])
print(f'Unprivileged Group Positive Rate Post-Calibration: {unprivileged_positive_rate}')
demographic_parity = abs(privileged_positive_rate - unprivileged_positive_rate) < 0.1
print(f'Demographic Parity: {demographic_parity}')


Accuracy: 0.495
Privileged Group Positive Rate Pre-Calibration: 0.8976257038713569
Unprivileged Group Positive Rate Pre-Calibration: 0.13274220047738053
Privileged Group Positive Rate Post-Calibration: 0.13274220047738058
Unprivileged Group Positive Rate Post-Calibration: 0.13274220047738053
Demographic Parity: True
