# Notebook Setup

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels as sm
import statsmodels.formula.api as smf
import sklearn as sk

In [37]:
df = pd.read_csv('../data/clean-data/cleaned_nys_data.csv')
df_original = df.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233144 entries, 0 to 233143
Data columns (total 20 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   action_taken                       233144 non-null  int64  
 1   derived_race                       233144 non-null  object 
 2   derived_ethnicity                  233144 non-null  object 
 3   applicant_sex                      233144 non-null  int64  
 4   applicant_age                      233144 non-null  object 
 5   income                             226538 non-null  float64
 6   debt_to_income_ratio               100684 non-null  float64
 7   applicant_credit_score_type        233144 non-null  int64  
 8   loan_amount                        233144 non-null  float64
 9   loan_to_value_ratio                224447 non-null  float64
 10  interest_rate                      176143 non-null  float64
 11  rate_spread                        1519

  df = pd.read_csv('../data/clean-data/cleaned_nys_data.csv')


# Impact Analysis Using HMDA Loan Approval Data

In [72]:
# Selecting Black and White applications and creating variable denoting whether white
df_slice = df.loc[(df['derived_race'] == "Black or African American") | (df['derived_race'] == "White")]
df_race = df_slice.copy()
df_race['binary_race'] = 0
df_race.loc[df_race['derived_race'] == "White", 'binary_race'] = 1

# Transform derived ethnicity to binary column for Hispanic or not
df_race['binary_ethnicity'] = 0
df_race.loc[df_race['derived_ethnicity'] == "Not Hispanic or Latino", 'binary_ethnicity'] = 1

In [81]:
age_map = {'<25': 1, '25-34': 2, '35-44' : 3, '45-54' : 4, '55-64' : 5, '65-74' : 6, '>74' : 7,'8888' : 8}
df_race['age_categorical'] = df_race['applicant_age'].map(age_map)
# 

In [82]:
df_race.columns

Index(['action_taken', 'derived_race', 'derived_ethnicity', 'applicant_sex',
       'applicant_age', 'income', 'debt_to_income_ratio',
       'applicant_credit_score_type', 'loan_amount', 'loan_to_value_ratio',
       'interest_rate', 'rate_spread', 'loan_type', 'loan_purpose',
       'lien_status', 'property_value', 'occupancy_type',
       'tract_minority_population_percent', 'aus-1', 'denial_reason-1',
       'binary_race', 'binary_ethnicity', 'age_categorical'],
      dtype='object')

In [88]:
df_race['applicant_age'].value_counts()

applicant_age
35-44    16584
25-34    15798
45-54    13146
55-64    10761
65-74     6047
<25       2150
>74       2126
8888         2
Name: count, dtype: int64

In [86]:
df_race['age_categorical'].value_counts()

age_categorical
3    16584
2    15798
4    13146
5    10761
6     6047
1     2150
7     2126
8        2
Name: count, dtype: int64

In [83]:
df_race[['age_categorical', 'applicant_age']]

Unnamed: 0,age_categorical,applicant_age
0,4,45-54
2,2,25-34
3,2,25-34
6,3,35-44
7,2,25-34
...,...,...
233129,4,45-54
233135,4,45-54
233138,3,35-44
233140,5,55-64


In [None]:
# Cleaning rate_spread
df_race = df_race[df_race['rate_spread'] != 'Exempt']
df_race['rate_spread'] = pd.to_numeric(df_race['rate_spread'])

In [73]:
df_race = df_race.dropna()

Model 1: logistic regression that predicts loan outcome
Model 2: loan amount (MLR)
Model 3: interest rate (MLR), rate_spread

## Model 2: loan amount (MLR)

In [90]:
X

Unnamed: 0,const,tract_minority_population_percent,binary_race,debt_to_income_ratio,income,property_value,applicant_sex,age_categorical,applicant_credit_score_type,loan_to_value_ratio,loan_type,loan_purpose,lien_status,occupancy_type,aus-1
0,1.0,21.09,0,37.0,104.0,405000.0,2,4,1,96.500,2,1,1,1,3
2,1.0,25.74,1,37.0,174.0,755000.0,1,2,2,80.000,1,1,1,1,1
3,1.0,13.10,1,37.0,178.0,615000.0,2,2,2,80.000,1,1,1,1,1
6,1.0,29.25,1,40.0,308.0,575000.0,1,3,1,80.000,1,1,1,1,1
7,1.0,28.19,1,45.0,129.0,465000.0,1,2,3,90.000,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233129,1.0,20.07,1,40.0,248.0,725000.0,1,4,9,69.723,1,2,2,1,6
233135,1.0,8.88,0,38.0,116.0,345000.0,2,4,2,79.412,1,2,2,1,6
233138,1.0,31.05,1,45.0,183.0,1625000.0,2,3,2,30.769,1,2,1,1,6
233140,1.0,29.37,1,41.0,86.0,475000.0,2,5,3,42.110,1,1,1,1,2


In [95]:
pd.set_option('display.max_rows', 300)  # show all rows

In [None]:
df_race = df_race.dropna()

X = df_race[['tract_minority_population_percent', "binary_race", "debt_to_income_ratio", 'income', "property_value", 'applicant_sex',
             'applicant_age', 'applicant_credit_score_type', 'loan_to_value_ratio', 'loan_type', 'loan_purpose', 'lien_status', 
             'occupancy_type', 'aus-1']]

X = pd.get_dummies(X, columns=['binary_race', 'applicant_sex', 'applicant_age', 'aus-1', 'occupancy_type', 'applicant_credit_score_type', 'loan_type', 'loan_purpose', 'lien_status'], drop_first=True)
X = X.astype(float)

y = df_race['loan_amount']

X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:            loan_amount   R-squared:                       0.790
Model:                            OLS   Adj. R-squared:                  0.790
Method:                 Least Squares   F-statistic:                     5959.
Date:                Thu, 10 Apr 2025   Prob (F-statistic):               0.00
Time:                        16:26:19   Log-Likelihood:            -8.9893e+05
No. Observations:               66614   AIC:                         1.798e+06
Df Residuals:                   66571   BIC:                         1.798e+06
Df Model:                          42                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

## Model 3: interest rate (MLR), rate_spread

In [53]:
df_race.columns


Index(['action_taken', 'derived_race', 'derived_ethnicity', 'applicant_sex',
       'applicant_age', 'income', 'debt_to_income_ratio',
       'applicant_credit_score_type', 'loan_amount', 'loan_to_value_ratio',
       'interest_rate', 'rate_spread', 'loan_type', 'loan_purpose',
       'lien_status', 'property_value', 'occupancy_type',
       'tract_minority_population_percent', 'aus-1', 'denial_reason-1',
       'binary_race', 'binary_ethnicity', 'binary_denied', 'age_categorical'],
      dtype='object')

In [None]:
X = df_race[['tract_minority_population_percent', "binary_race", "debt_to_income_ratio", 'income', "property_value", 'applicant_sex',
             'applicant_age', 'applicant_credit_score_type', 'loan_to_value_ratio', 'loan_type', 'loan_purpose', 'lien_status', 
             'occupancy_type', 'aus-1']]

X = pd.get_dummies(X, columns=['binary_race', 'applicant_sex', 'applicant_age', 'aus-1', 'occupancy_type', 'applicant_credit_score_type', 'loan_type', 'loan_purpose', 'lien_status'], drop_first=True)
X = X.astype(float)

y = df_race['interest_rate']

X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          interest_rate   R-squared:                       0.171
Model:                            OLS   Adj. R-squared:                  0.170
Method:                 Least Squares   F-statistic:                     326.7
Date:                Thu, 10 Apr 2025   Prob (F-statistic):               0.00
Time:                        16:27:30   Log-Likelihood:            -1.1545e+05
No. Observations:               66614   AIC:                         2.310e+05
Df Residuals:                   66571   BIC:                         2.314e+05
Df Model:                          42                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [None]:

X = df_race[['tract_minority_population_percent', "binary_race", "debt_to_income_ratio", 'income', "property_value", 'applicant_sex',
             'age_categorical', 'applicant_credit_score_type', 'loan_to_value_ratio', 'loan_type', 'loan_purpose', 'lien_status', 
             'occupancy_type', 'aus-1']]

y = df_race['rate_spread']

X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:            rate_spread   R-squared:                       0.043
Model:                            OLS   Adj. R-squared:                  0.043
Method:                 Least Squares   F-statistic:                     161.2
Date:                Thu, 10 Apr 2025   Prob (F-statistic):               0.00
Time:                        15:46:23   Log-Likelihood:                -89164.
No. Observations:               50030   AIC:                         1.784e+05
Df Residuals:                   50015   BIC:                         1.785e+05
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

# Recreation of model 
Train a model based on outcome (denial vs acceptance) using similar predictors
Look for bias in the outcomes of the model -> conclusion about biased data creating biased models in future 

In [58]:
from sklearn.model_selection import train_test_split

# select columns that were explicitly approved or denied
filtered_df = df.loc[(df["action_taken"] == 1) | (df["action_taken"] == 2) | (df["action_taken"] == 7) | (df["action_taken"] == 3)]

# remove na values
filtered_df = filtered_df.dropna(subset=["debt_to_income_ratio","income","loan_to_value_ratio","loan_amount","property_value"])

# create binary accepted column
filtered_df["binary_accepted"] = True
filtered_df.loc[filtered_df["action_taken"] == 7, 'binary_accepted'] = False
filtered_df.loc[filtered_df["action_taken"] == 3, 'binary_accepted'] = False
counts = filtered_df['binary_accepted'].value_counts()

# create binary race column - 1 for White
df_slice = filtered_df.loc[(filtered_df['derived_race'] == "Black or African American") | (filtered_df['derived_race'] == "White")]
filtered_df = df_slice.copy()
filtered_df['binary_race'] = 0
filtered_df.loc[filtered_df['derived_race'] == "White", 'binary_race'] = 1

# split data

X = filtered_df[["debt_to_income_ratio","income","loan_to_value_ratio","loan_amount","property_value"]]
y = filtered_df["binary_accepted"]

train, test = train_test_split(filtered_df, test_size=0.2)
X_train, X_test = train[["debt_to_income_ratio","income","loan_to_value_ratio","loan_amount","property_value"]], \
test[["debt_to_income_ratio","income","loan_to_value_ratio","loan_amount","property_value"]] 
y_train, y_test = train["binary_accepted"], test["binary_accepted"] 

In [59]:
filtered_df['binary_accepted'].value_counts(), filtered_df['action_taken'].value_counts()

(binary_accepted
 True     73052
 False    11674
 Name: count, dtype: int64,
 action_taken
 1    69923
 3    11612
 2     3129
 7       62
 Name: count, dtype: int64)

Model 1: Logistic Regression

In [60]:
# logistic regression
from sklearn.linear_model import LogisticRegression

lr = sk.linear_model.LogisticRegression(max_iter = 100000)

lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
acc_lr = sk.metrics.accuracy_score(y_pred,y_test)
print(acc_lr)

0.8593768440930013


Model 2: Decision Tree

In [61]:
# decision tree model
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics # check model accuracy

dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
acc_dt = metrics.accuracy_score(y_pred,y_test)
print(acc_dt)

0.7810102679098312


Model 3: K-Nearest Neighbors

In [62]:
# knn
from sklearn.neighbors import KNeighborsClassifier # KNN

knc = KNeighborsClassifier(n_neighbors=3) # for k=3
knc.fit(X_train,y_train)
y_pred = knc.predict(X_test)
acc_knn = sk.metrics.accuracy_score(y_pred,y_test)
print(acc_knn)

0.8301664109524372


In [63]:
# plot overall accuracy comparison between models
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree','K-Nearest Neighbours'],
    'Score': [acc_lr, acc_dt, acc_knn]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
0,Logistic Regression,0.859377
2,K-Nearest Neighbours,0.830166
1,Decision Tree,0.78101


In order to test our hypothesis that the HDMA model is biased against Black applicants, we compare the accuracy rates across races for each of the models we trained, which use the same metrics. 

In [64]:
# split data and get predictions between Black and white applicants
x_var = ["debt_to_income_ratio","income","loan_to_value_ratio","loan_amount","property_value"]

test_b = test.loc[test['derived_race'] == "Black or African American"]
test_w = test.loc[test['derived_race'] == "White"]

X_test_b, X_test_w = test_b[x_var], test_w[x_var]
y_test_b, y_test_w = test_b['binary_accepted'], test_w['binary_accepted']

# knn
knn_b = knc.predict(X_test_b)
knn_w = knc.predict(X_test_w)

# lr
lr_b = lr.predict(X_test_b)
lr_w = lr.predict(X_test_w)

# dt
dt_b = dt.predict(X_test_b)
dt_w = dt.predict(X_test_w)

In [65]:
# calculate accuracy
# knn
acc_knn_b = sk.metrics.accuracy_score(knn_b, y_test_b)
acc_knn_w = sk.metrics.accuracy_score(knn_w, y_test_w)
knn_diff = acc_knn_w - acc_knn_b
# lr
acc_lr_b = sk.metrics.accuracy_score(lr_b, y_test_b)
acc_lr_w = sk.metrics.accuracy_score(lr_w, y_test_w)
lr_diff = acc_lr_w - acc_lr_b
# dt
acc_dt_b = sk.metrics.accuracy_score(dt_b, y_test_b)
acc_dt_w = sk.metrics.accuracy_score(dt_w, y_test_w)
dt_diff = acc_dt_w - acc_dt_b
# plot differences in accuracy for race between models
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree','K-Nearest Neighbours'],
    'Score': [lr_diff, dt_diff, knn_diff]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
0,Logistic Regression,0.104281
2,K-Nearest Neighbours,0.085135
1,Decision Tree,0.072779


Here we can see the difference in accuracy score between Black and white applicants. The logistic regression model, which was the most accurate overall, also has the highest disparity in terms of race. If they're not already doing so, it would be beneficial for the HMDA algorithm to consider separate values for race as they assess which loans were paid back, etc.

In the section below, we retrain our models to this time include binary race (1 for white, 0 for Black) explicitly as an input. Then, we report on the fairness metrics of each of those models.

In [66]:
import fairlearn
from fairlearn.metrics import demographic_parity_ratio, MetricFrame, selection_rate, false_positive_rate, false_negative_rate
from sklearn import metrics
from sklearn.metrics import accuracy_score

X_train, X_test = train[["debt_to_income_ratio","income","loan_to_value_ratio","loan_amount","property_value", "binary_race"]], \
test[["debt_to_income_ratio","income","loan_to_value_ratio","loan_amount","property_value", "binary_race"]] 

dt_race = DecisionTreeClassifier()
dt_race.fit(X_train,y_train)
y_pred_dt = dt_race.predict(X_test)

knn_race = KNeighborsClassifier(n_neighbors=3)
knn_race.fit(X_train,y_train)
y_pred_knn = knn_race.predict(X_test)

lr_race = sk.linear_model.LogisticRegression(max_iter = 100000)
lr_race.fit(X_train,y_train)
y_pred_lr = lr_race.predict(X_test)

In [67]:
# define fairness metrics
metrics = {
    'accuracy' : accuracy_score,
    'selection rate' : selection_rate,
    'FPR' : false_positive_rate,
    'FNR' : false_negative_rate
}

In [68]:
# Bias report on logistic regression model    
mf_lr = MetricFrame(metrics = metrics, y_true=y_test, y_pred=y_pred_lr, sensitive_features=X_test['binary_race'])
mf_lr.by_group

Unnamed: 0_level_0,accuracy,selection rate,FPR,FNR
binary_race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.766702,1.0,1.0,0.0
1,0.870983,0.999137,0.997936,0.000686


In [69]:
# Bias report on k-nearest neighbors model
mf_knn = MetricFrame(metrics = metrics, y_true=y_test, y_pred=y_pred_knn, sensitive_features=X_test['binary_race'])
mf_knn.by_group

Unnamed: 0_level_0,accuracy,selection rate,FPR,FNR
binary_race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.754507,0.937964,0.893182,0.048409
1,0.839774,0.938313,0.882869,0.053498


In [70]:
# Bias report on decision tree model
mf_dt = MetricFrame(metrics = metrics, y_true=y_test, y_pred=y_pred_dt, sensitive_features=X_test['binary_race'])
mf_dt.by_group

Unnamed: 0_level_0,accuracy,selection rate,FPR,FNR
binary_race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.677094,0.755567,0.668182,0.217842
1,0.796879,0.859031,0.741486,0.123609


In order to illustrate how biased data may negatively affect any models trained on the HDMA dataset, we're creating fake data points to test our own. The following dataframe has a series of fictional applications with baseline values equal to the average for each column. From there, some numbers are adjusted to represent "riskier" debt history, or higher incomes, etc. Each set of values comes in pairs: one Black applicant and one white applicant with exactly the same information. We can say that for Pair 2, there were two different outcomes based solely on race.

In [71]:
# Pair 1: average values
# Pair 2: safer application - less debt, higher income
# Pair 3: safer application - less debt, higher income, lower loan amount
# Pair 4: riskier application - high debt, high loan amount

sample_data = pd.DataFrame({'debt_to_income_ratio': [42.7, 42.7, 35.0, 35.0, 33.0, 33.0, 48.0, 48.0],
                            'income': [145.28, 145.28, 200.28, 200.28, 210.28, 210.28, 145.28, 145.28],
                            'loan_to_value_ratio':[1094.02, 1094.02, 1094.02, 1094.02, 1094.02, 1094.02, 1094.02, 1094.02],
                            'loan_amount': [295623.77, 295623.77, 295623.77, 295623.77, 275000.77, 275000.77, 400000, 400000],
                            'property_value': [561332.65, 561332.65, 561332.65, 561332.65, 561332.65, 561332.65, 561332.65, 561332.65],
                            'binary_race': [0, 1, 0, 1, 0, 1, 0, 1]})

sample_pred = dt_race.predict(sample_data)
print(sample_pred)

[False False False False False False False False]
