In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../..')

from sklearn.metrics import confusion_matrix, accuracy_score
from utils.use_regression import (create_dummy_vars, create_formula, run_regression, calculate_vif, 
                                  calcuate_confusion_matrix, convert_results_to_df)

### 1. Import Data

In [3]:
hmda19_df = pd.read_csv('../../data/hmda_lar/cleaned_data/2_hmda2019_210823.csv', dtype = str)

hmda19_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4529912 entries, 0 to 4529911
Columns: 119 entries, activity_year to lmi_def
dtypes: object(119)
memory usage: 4.0+ GB


### 2. Filter for Conventional Originations and Denials and where income is above 0

In [4]:
hmda19_df['income'] = pd.to_numeric(hmda19_df['income'])

hmda19_df2 = hmda19_df[(hmda19_df['loan_type'] == '1') & (hmda19_df['income'] > 0) &\
                       ((hmda19_df['loan_outcome'] == '1') | (hmda19_df['loan_outcome'] == '3'))].copy()

print(len(hmda19_df2))

2677473


### 3. Create Dummy Variables for Regression

#### Select columns for dummy variables

In [5]:
regression_cols = [{'loan_outcome': {'denied': ['3']}},
                   
                   ### Reference: White
                   {'app_race_ethnicity': {'black': ['3'], 'latino': ['6'], 'asian': ['2'], 'native': ['1'],
                                           'pac_islander': ['4'], 'race_na': ['7'], 'asian_cb': ['2', '4']}},
                   
                   ### Reference: Coapplicant
                   {'co_applicant': {'no_coapplicant': ['2'], 'na_coapplicant': ['3']}},
                   
                   ### Reference: Male
                   {'applicant_sex_cat': {'female': ['2'], 'sex_na': ['3', '6']}},
                   
                   ### Reference: Between 34-44 or Between 34-54
                   {'applicant_age_cat': {'less_than25': ['1'], 'between25_34': ['2'], 
                                          'between45_54': ['4'], 'between55_64': ['5'], 'between65_74': ['6'],
                                          'greater74': ['7'], 'age_na': ['8'],
                                          'younger_than_34': ['1', '2'], 'older_than_55': ['5', '6', '7'],
                                          'older_than65': ['6', '7']}},
                   
                   ### Reference: Bucket 2 & 3
                   {'prop_value_cat': {'pvr_bucket1': ['1'], 'pvr_bucket4': ['4'], 'pvr_bucket5': ['5'], 
                                        'pvr_bucket6': ['6'], 'pvr_bucket_none': ['7']}},
                   
                   
                   ### Reference: 30yr Mortgage
                   {'mortgage_term': {'less30yrs_mortgage': ['2'], 'more30yrs_mortgage': ['3'], 
                                      'mortgage_term_na': ['4'], 'not30yr_mortgage': ['2', '3']}},
                   
                   ### Reference: TransUnion
                   {'app_credit_model': {'equifax': ['1'], 'experian': ['2'], 'other_model': ['4', '6'], 
                                         'more_than_one': ['5'], 'model_na': ['7']}},
                   
                   {'dti_cat': {'dti_manageable': ['2'], 'dti_unmanageable': ['3'], 
                                'dti_struggling': ['4'], 'dti_na': ['5', '6']}},
                   
                   ### Reference: 20 pct downpayment
                   {'downpayment_flag': {'less20pct_downpayment': ['2'],'downpayment_na': ['3', '5']}},
                   
                   ### Reference: Upper LMI
                   {'lmi_def': {'low_lmi': ['1'], 'moderate_lmi': ['2'], 'middle_lmi': ['3'], 'na_lmi': ['5']}},
                   
                   ### Reference: White Cat 1
                   {'diverse_def': {'white_cat2': ['2'], 'white_cat3': ['3'], 'white_cat4': ['4'], 
                                      'white_cat_na': ['0', '5']}},
                   
                   ### Reference: Banks
                   {'lender_def': {'credit_union': ['2'], 'independent': ['3'],  'lender_na': ['4', '6']}},
                   
                   ### Reference: Desktop
                   {'main_aus': {'non_desktop': ['2', '3', '4', '5', '6'], 'aus_na': ['7']}},
                   
                   ### Reference: 99th Percentile
                   {'metro_percentile': {'metro_90th': ['9'], 'metro_80th': ['8'],
                                         'metro_70th': ['7'], 'metro_60th': ['6'], 'metro_50th': ['5'],
                                         'metro_40th': ['4'], 'metro_30th': ['3'], 'metro_20th': ['2'],
                                         'metro_10th': ['1'], 'metro_less10th': ['0'], 'micro_area': ['111'],
                                         'metro_none': ['000']}}]

In [6]:
continous_vars = ['income_log', 'loan_log', 'lar_count', 'property_value_ratio', 'prop_zscore']

for continuous_var in continous_vars:
    hmda19_df2[continuous_var] = pd.to_numeric(hmda19_df2[continuous_var])

In [7]:
for columns in regression_cols:
    ### Function to create dummy variables
    hmda19_df2 = create_dummy_vars(hmda19_df2, columns)

#### Independent Variables

In [8]:
variables = ['black', 'latino', 'asian_cb', 'native', 'race_na',
             'no_coapplicant', 'na_coapplicant',
             'female', 'sex_na',
             'less_than25', 'between25_34', 'between45_54', 'between55_64', 'older_than65', 'age_na',
             'income_log', 'loan_log',
             'pvr_bucket1', 'pvr_bucket4', 'pvr_bucket5', 'pvr_bucket6', 'pvr_bucket_none',
             'less30yrs_mortgage', 'more30yrs_mortgage', 'mortgage_term_na',
             'equifax', 'experian', 'other_model', 'more_than_one', 'model_na',
             'dti_manageable', 'dti_unmanageable', 'dti_struggling', 'dti_na',
             'less20pct_downpayment','downpayment_na',
             'moderate_lmi', 'middle_lmi', 'low_lmi', 'na_lmi',
             'credit_union', 'independent',  'lender_na',
             'lar_count',
             'non_desktop', 'aus_na',
             'white_cat2', 'white_cat3', 'white_cat4', 'white_cat_na',
             'metro_90th', 'metro_80th', 'metro_70th', 'metro_60th', 'metro_50th', 'metro_40th', 
             'metro_30th', 'metro_20th', 'metro_10th', 'metro_less10th', 'micro_area', 'metro_none']

print(len(variables))

62


### 2. Run Collinearity Test

In [9]:
hmda_independent_vars = hmda19_df2[variables]

vif_df = calculate_vif(hmda_independent_vars)

100%|██████████| 62/62 [33:03<00:00, 32.00s/it]


#### Varibales that are above the 2.5 threshold

In [10]:
vif_df[(vif_df['threshold'] == '1')].sort_values(by = ['vif'], ascending = False)

Unnamed: 0,independent_var,vif,threshold
21,pvr_bucket_none,8.63,1
33,dti_na,7.86,1
49,white_cat_na,7.66,1
39,na_lmi,6.72,1
24,mortgage_term_na,5.38,1
35,downpayment_na,3.97,1
16,loan_log,3.09,1
15,income_log,2.98,1
50,metro_90th,2.93,1


#### Remove variables with high VIFs
- Keeping income, loan and metro_90th

In [11]:
to_keep = ['income_log', 'loan_log', 'metro_90th']

highvif_vars = vif_df[(vif_df['threshold'] == '1') & ~(vif_df['independent_var'].isin(to_keep))]\
              ['independent_var'].unique().tolist()

variables2 = [var for var in variables if var not in highvif_vars]

### 3. Filter Out High Vif Variables
- Property Value Ratios NA
- Mortgage Term NA
- DTI NA
- Downpayment NA
- NA Lmi
- White Cat NA

In [12]:
hmda19_df3 = hmda19_df2[(hmda19_df2['prop_value_cat'] != '7') & (hmda19_df2['mortgage_term'] != '4') &\
                        (hmda19_df2['dti_cat'] != '5') & (hmda19_df2['dti_cat'] != '6') &\
                        (hmda19_df2['downpayment_flag'] != '3') & (hmda19_df2['lmi_def'] != '5') &\
                        (hmda19_df2['diverse_def'] != '0') & (hmda19_df2['diverse_def'] != '5')].copy()

print(len(hmda19_df3))

2498421


#### Also filtering out CLTV above 100

In [13]:
hmda19_df3['combined_loan_to_value_ratio'] = pd.to_numeric(hmda19_df3['combined_loan_to_value_ratio'])

hmda19_df4 = hmda19_df3[(hmda19_df3['combined_loan_to_value_ratio'] <= 100)]

print(len(hmda19_df4))

2433071


#### Replace variables

In [14]:
# high_vif_vars = ['pvr_bucket_none', 'mortgage_term_na', 'dti_na', 'downpayment_na', 'na_lmi', 'white_cat_na']

vars_to_removes = ['pvr_bucket1', 'pvr_bucket4', 'pvr_bucket5', 'pvr_bucket6', 'less20pct_downpayment']

variables3 = [var for var in variables2 if var not in vars_to_removes]
variables3.insert(17, 'property_value_ratio')
variables3.insert(28, 'combined_loan_to_value_ratio')

#### Variables to use

In [15]:
pd.Series(variables3)

0                            black
1                           latino
2                         asian_cb
3                           native
4                          race_na
5                   no_coapplicant
6                   na_coapplicant
7                           female
8                           sex_na
9                      less_than25
10                    between25_34
11                    between45_54
12                    between55_64
13                    older_than65
14                          age_na
15                      income_log
16                        loan_log
17            property_value_ratio
18              less30yrs_mortgage
19              more30yrs_mortgage
20                         equifax
21                        experian
22                     other_model
23                   more_than_one
24                        model_na
25                  dti_manageable
26                dti_unmanageable
27                  dti_struggling
28    combined_loan_

### 4. Run Regression
#### Regression Formula

In [16]:
regression_formula = create_formula(variables3)
regression_formula

'denied ~ black + latino + asian_cb + native + race_na + no_coapplicant + na_coapplicant + female + sex_na + less_than25 + between25_34 + between45_54 + between55_64 + older_than65 + age_na + income_log + loan_log + property_value_ratio + less30yrs_mortgage + more30yrs_mortgage + equifax + experian + other_model + more_than_one + model_na + dti_manageable + dti_unmanageable + dti_struggling + combined_loan_to_value_ratio + moderate_lmi + middle_lmi + low_lmi + credit_union + independent + lender_na + lar_count + non_desktop + aus_na + white_cat2 + white_cat3 + white_cat4 + metro_90th + metro_80th + metro_70th + metro_60th + metro_50th + metro_40th + metro_30th + metro_20th + metro_10th + metro_less10th + micro_area + metro_none'

In [17]:
print('Number of records: ' + str(len(hmda19_df4)))

Number of records: 2433071


In [21]:
model = run_regression(data = hmda19_df4, formula = regression_formula).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.189819
         Iterations 8


0,1,2,3
Dep. Variable:,denied,No. Observations:,2433071.0
Model:,Logit,Df Residuals:,2433017.0
Method:,MLE,Df Model:,53.0
Date:,"Tue, 24 Aug 2021",Pseudo R-squ.:,0.2256
Time:,05:56:02,Log-Likelihood:,-461840.0
converged:,True,LL-Null:,-596360.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-8.2670,0.228,-36.282,0.000,-8.714,-7.820
black,0.6017,0.012,49.182,0.000,0.578,0.626
latino,0.3688,0.010,37.397,0.000,0.349,0.388
asian_cb,0.3841,0.011,35.651,0.000,0.363,0.405
native,0.5083,0.041,12.364,0.000,0.428,0.589
race_na,0.3448,0.012,28.135,0.000,0.321,0.369
no_coapplicant,0.2178,0.007,32.756,0.000,0.205,0.231
na_coapplicant,-0.0773,0.079,-0.975,0.329,-0.233,0.078
female,-0.0744,0.007,-11.380,0.000,-0.087,-0.062


### 5. Findings:
#### Racial and ethnic findings
- Black applicants are almost twice as likely to be denied 
- Latinx/Hispanic are almost 1.4 times
- Native Applicants are 1.7 times
- Asian/Pacific Isalnder are 1.5

In [22]:
national_findings_df = convert_results_to_df(model)

races = ['black', 'latino', 'native', 'asian_cb']
national_findings_df[(national_findings_df['variable_name'].isin(races))]

Unnamed: 0,variable_name,pseudo_rsquared,coefficient,standard_error,z_value,p_value,odds_ratio
1,black,0.225557,0.601675,0.012234,49.181903,0.0,1.825173
2,latino,0.225557,0.36882,0.009862,37.396918,4.385107e-306,1.446027
3,asian_cb,0.225557,0.384095,0.010774,35.65102,2.2720399999999998e-278,1.468284
4,native,0.225557,0.508317,0.041112,12.364051,4.090157e-35,1.662491


#### DTI Categories Findings

In [23]:
dti_vars = ['dti_manageable', 'dti_unmanageable', 'dti_struggling']

national_findings_df[(national_findings_df['variable_name'].isin(dti_vars))]

Unnamed: 0,variable_name,pseudo_rsquared,coefficient,standard_error,z_value,p_value,odds_ratio
26,dti_manageable,0.225557,-0.001563,0.007939,-0.196817,0.843971,0.998439
27,dti_unmanageable,0.225557,0.366916,0.008159,44.972003,0.0,1.443277
28,dti_struggling,0.225557,3.819702,0.010734,355.841752,0.0,45.590638


In [25]:
national_findings_df.to_csv('../../findings/national_findings/national_findings_210823.csv', index = False)

### 6. Additional statistical tests
#### Check for collinearity again

In [26]:
cols = national_findings_df['variable_name'].unique().tolist()[1:]

hmda_independent_vars2 = hmda19_df4[cols]
vif_df2 = calculate_vif(hmda_independent_vars2)

100%|██████████| 53/53 [20:54<00:00, 23.67s/it]


No new additional varibales that are collinear

In [27]:
vif_df2[(vif_df2['threshold'] == '1')]

Unnamed: 0,independent_var,vif,threshold
15,income_log,3.0,1
16,loan_log,2.89,1
41,metro_90th,2.87,1


#### Calculate Confusion Matrix

In [28]:
calcuate_confusion_matrix(hmda19_df3, model, cols, ['denied'])

  return 1/(1+np.exp(-X))


Confusion Matrix : 
 [[  49337  125019]
 [  20266 2303799]]
Overall accuracy:  94.18492720001953
Denied accuracy:  28.296703296703296
Loan accuracy :  99.12799340810176


### 7. Export Findings and Data

#### Write data to be used for metros and lenders

In [29]:
cols_to_export = cols + ['denied', 'loan_outcome', 'younger_than_34', 'older_than_55', 'not30yr_mortgage', 
                         'metro_code', 'lei', 'app_race_ethnicity', 'app_credit_model', 'property_value_ratio']

In [30]:
hmda19_df5 = hmda19_df4[cols_to_export]
hmda19_df5.to_csv('../../data/hmda_lar/cleaned_data/3_hmda2019_regressiondata_210823.csv', index = False)