# Algorithmic Fairness, Accountability, and Ethics, Spring 2024

## Mandatory Assignment 2

Please use the following code to prepare the dataset.
 

In [47]:
from folktables.acs import adult_filter
from folktables import ACSDataSource, BasicProblem, generate_categories
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy.stats import  pearsonr

In [26]:

data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["CA"], download=True)


def adult_filter(data):
    """Mimic the filters in place for Adult data.
    Adult documentation notes: Extraction was done by Barry Becker from
    the 1994 Census database. A set of reasonably clean records was extracted
    using the following conditions:
    ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0))
    """
    df = data
    df = df[df['AGEP'] > 16]
    df = df[df['PINCP'] > 100]
    df = df[df['WKHP'] > 0]
    df = df[df['PWGTP'] >= 1]
    df = df[df["RAC1P"] < 3] ## keep only Whites and African-Americans
    return df


ACSIncomeNew = BasicProblem(
    features=[
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'RELP',
        'WKHP',
        'PWGTP',
        'SEX',
        'RAC1P',
    ],
    target='PINCP',
    target_transform=lambda x: x > 25000,    
    group=['SEX', 'RAC1P'],
    preprocess=adult_filter,
    postprocess=lambda x: np.nan_to_num(x, -1),
)

definition_df = data_source.get_definitions(download=True)
categories = generate_categories(features=ACSIncomeNew.features, definition_df=definition_df)
features, labels, groups = ACSIncomeNew.df_to_pandas(acs_data, categories=categories, dummies=True)

# Drop the "redundant" columns
features = features.drop(["RAC1P_White alone", 
                          "SEX_Male", 
                          "SCHL_1 or more years of college credit, no degree",  
                          "MAR_Divorced", 
                          "RELP_Adopted son or daughter",
                          'COW_Working without pay in family business or farm' ], axis = 1) 

print("Columns with the protected features:")
for i, f in enumerate(features.columns):
    if ("RAC1P" in f) or ("SEX" in f):
        print("Column ID: %s" %i, "(%s)"%f)
        
features.head()

Columns with the protected features:
Column ID: 54 (SEX_Female)
Column ID: 55 (RAC1P_Black or African American alone)


Unnamed: 0,AGEP,WKHP,PWGTP,"COW_Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions","COW_Employee of a private not-for-profit, tax-exempt, or charitable organization",COW_Federal government employee,"COW_Local government employee (city, county, etc.)","COW_Self-employed in own incorporated business, professional practice or farm","COW_Self-employed in own not incorporated business, professional practice, or farm",COW_State government employee,...,RELP_Other nonrelative,RELP_Other relative,RELP_Parent-in-law,RELP_Reference person,RELP_Roomer or boarder,RELP_Son-in-law or daughter-in-law,RELP_Stepson or stepdaughter,RELP_Unmarried partner,SEX_Female,RAC1P_Black or African American alone
0,21,20.0,52,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,65,8.0,33,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,33,40.0,53,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,18,18.0,106,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,27,50.0,23,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# TASK 1

In [6]:
# Split dataset into train and test sets
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    features.values, labels.values.reshape(-1), groups, test_size=0.3, random_state=0, shuffle=True)

# Subsample for efficiency
# N = 20000  # Subsample size

# Generate random unique indices for subsampling
# indices_train = np.random.choice(X_train.shape[0], N, replace=False)
# indices_test = np.random.choice(X_test.shape[0], N, replace=False)

# Subsample using the random indices
### Train set
# X_train = X_train[indices_train]
# y_train = y_train[indices_train]
# group_train = group_train.iloc[indices_train]
### Test set
# X_test = X_test[indices_test]
# y_test = y_test[indices_test]
# group_test = group_test.iloc[indices_test]

group_test_dict = {
    'Males': group_test['SEX'] == 1,
    'Females': group_test['SEX'] == 2,
    'Whites': group_test['RAC1P'] == 1,
    'African-Americans': group_test['RAC1P'] == 2
}

# Scale all features (even OHE)
scaler = StandardScaler().fit(X_train)
Xs_train = scaler.transform(X_train)
Xs_test = scaler.transform(X_test)
n_features = X_train.shape[1]

# Subset for protected and non-protected features
Xs_train_p = Xs_train[:, 54:]
Xs_test_p = Xs_test[:, 54:]
Xs_train_np = Xs_train[:, :54]
Xs_test_np = Xs_test[:, :54]

In [36]:
# Fit model
model = LogisticRegression(penalty='l2', max_iter=1000, solver='newton-cholesky')
model.fit(Xs_train_np, y_train)
y_hat = model.predict(Xs_test_np)

# Calculate and print F1 Scores for each group
for group, group_idxs in group_test_dict.items():
    print(f'F1, {group}: {f1_score(y_test[group_idxs], y_hat[group_idxs]):.2f}')

# Calculate and print positive rates for each group
for group, group_idxs in group_test_dict.items():
    print(f'Positive Rate, {group}: {np.mean(y_hat[group_idxs]):.2f}')

F1, Males: 0.89
F1, Females: 0.84
F1, Whites: 0.87
F1, African-Americans: 0.84
Positive Rate, Males: 0.76
Positive Rate, Females: 0.72
Positive Rate, Whites: 0.75
Positive Rate, African-Americans: 0.71


### Evaluating the model using cross validation.

In [54]:
model = LogisticRegression(penalty='l2', max_iter=1000)
print('Cross-validation scores:', cross_validate(model, Xs_train_np, y_train, groups=group_train, cv=5, scoring=["f1", "accuracy"]))

Cross-validation scores: {'fit_time': array([0.24236155, 0.25329971, 0.1817987 , 0.2692976 , 0.21836972]), 'score_time': array([0.00460768, 0.00299978, 0.00703955, 0.00419807, 0.00473189]), 'test_f1': array([0.87530562, 0.87428273, 0.87194696, 0.87205446, 0.87394958]), 'test_accuracy': array([0.8215 , 0.81925, 0.8165 , 0.81675, 0.82   ])}


# TASK 2

In [55]:
def debias_features(Xs_np, Xs_p, lambda_ = 1):
    assert Xs_np.shape[0]==Xs_p.shape[0]
    
    # Find orthonormal basis of protected features
    orthbasis = scipy.linalg.orth(Xs_p)

    # 𝜆 · (𝒙𝑗 − 𝒓𝑗)
    # Debias nonprotected features
    Xs_np_debiased = Xs_np - orthbasis @ orthbasis.T @ Xs_np

    # Return debiased nonprotected features
    return Xs_np_debiased + lambda_ * (Xs_np - Xs_np_debiased)

# Xs_train_np_debiased = debias_features(Xs_train_np, Xs_train_p)
# Xs_train_debiased = np.concatenate([Xs_train_np_debiased, Xs_train_p], axis=1)

In [62]:
accuracies = []

for l in np.linspace(0, 1, 10):
    Xs_train_np_debiased = debias_features(Xs_train_np, Xs_train_p, lambda_=l)
    model.fit(Xs_train_np_debiased, y_train)
    y_hat = model.predict(Xs_test_np)
    # print(f'F1, {group}: {f1_score(y_test, y_hat):.2f}')
    accuracies.append(f1_score(y_test, y_hat))

In [52]:
model = LogisticRegression(penalty='l2', max_iter=1000)
print('Cross-validation scores:', cross_validate(model, Xs_train_np_debiased, y_train, groups=group_train, cv=5, scoring=["f1", "accuracy"]))

Cross-validation scores: {'fit_time': array([0.52206874, 0.48006392, 0.55329776, 0.40183496, 0.46727371]), 'score_time': array([0.0072279 , 0.00736165, 0.00605154, 0.004987  , 0.00700068]), 'test_f1': array([0.87334264, 0.86798337, 0.87038001, 0.86870441, 0.87188426]), 'test_accuracy': array([0.8185 , 0.8095 , 0.81325, 0.811  , 0.81625])}


### FAIR PCA

In [69]:
class FairPCA:
    def __init__(self, Xs, p_idxs, n_components):
        self.fit(Xs, p_idxs, n_components)

    def fit(self, Xs, p_idxs, n_components):
        # Extract protected features
        Xs_p = Xs[:, p_idxs]

        # Compute projection matrix (U)
        Z = Xs_p
        #Z = Z - Z.mean(0) # Since we alredy standardised everything, there is not much sense in removing the mean
        R = scipy.linalg.null_space(Z.T @ Xs)
        eig_vals, L = scipy.linalg.eig(R.T @ Xs.T @ Xs @ R)
        self.U = R @ L[:, :n_components]

    def project(self, Xs):
        return Xs @ self.U
    
fair_pca = FairPCA(Xs_train, [54, 55], 30)
Xs_train_debiased_PCA = fair_pca.project(Xs_train)
Xs_test_debiased_PCA = fair_pca.project(Xs_test)

In [72]:
model = LogisticRegression(penalty='l2', max_iter=1000)
print('Cross-validation scores:', cross_validate(model, Xs_train_debiased_PCA, y_train, groups=group_train, cv=5, scoring=["f1", "accuracy"]))

Cross-validation scores: {'fit_time': array([0.10342455, 0.10191202, 0.06066394, 0.06963634, 0.07271314]), 'score_time': array([0.00315213, 0.01088214, 0.00299978, 0.00508189, 0.00654531]), 'test_f1': array([0.87064156, 0.86411632, 0.8708615 , 0.86586847, 0.86737354]), 'test_accuracy': array([0.8145 , 0.80375, 0.81375, 0.80675, 0.80925])}
