In [1]:
%pip install fairlearn

Note: you may need to restart the kernel to use updated packages.


In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from fairlearn.metrics import demographic_parity_difference

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
data = pd.read_csv(url, names=columns, na_values=" ?", skipinitialspace=True)

In [5]:
# Handle missing data
data_cleaned = data.dropna()

In [6]:
# Convert categorical variables to dummy variables
data_encoded = pd.get_dummies(data_cleaned, drop_first=True)

In [7]:
# Check class balance before sampling
print(f'Class distribution before sampling:\n{data_encoded["income_>50K"].value_counts()}')

Class distribution before sampling:
income_>50K
False    24720
True      7841
Name: count, dtype: int64


In [8]:
# Perform random sampling to get a subset of the data
sampled_data = data_encoded.sample(frac=0.5, random_state=42)

In [9]:
# Check class balance after sampling
print(f'Class distribution after sampling:\n{sampled_data["income_>50K"].value_counts()}')

Class distribution after sampling:
income_>50K
False    12356
True      3924
Name: count, dtype: int64


In [10]:
# Define features and target variable
X = sampled_data.drop('income_>50K', axis=1)
y = sampled_data['income_>50K']

In [11]:
# Split sampled data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Fit logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_binary)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7920761670761671


In [16]:
# Define the sensitive features in the test set
sensitive_features_test = X_test[['sex_Male', 'race_Asian-Pac-Islander', 'race_Black', 'race_Other', 'race_White']]

# Calculate demographic parity difference for sex
dp_difference_sex = demographic_parity_difference(y_test, y_pred_binary, sensitive_features=sensitive_features_test['sex_Male'])
print(f"Demographic Parity Difference (Sex): {dp_difference_sex * 100:.2f}%")

# Calculate demographic parity difference for race
# Combine one-hot encoded race columns into a single race column for demographic parity calculation
sensitive_features_test['race'] = sensitive_features_test[['race_Asian-Pac-Islander', 'race_Black', 'race_Other', 'race_White']].idxmax(axis=1)
dp_difference_race = demographic_parity_difference(y_test, y_pred_binary, sensitive_features=sensitive_features_test['race'])
print(f"Demographic Parity Difference (Race): {dp_difference_race * 100:.2f}%")

Demographic Parity Difference (Sex): 5.36%
Demographic Parity Difference (Race): 11.02%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sensitive_features_test['race'] = sensitive_features_test[['race_Asian-Pac-Islander', 'race_Black', 'race_Other', 'race_White']].idxmax(axis=1)
