In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../..')

from utils.use_regression import (create_formula, run_regression, convert_results_to_df, calculate_vif)

### 1. Import Data

In [3]:
# Load the 2019 HMDA regression dataset. The credit-model, metro-code and LEI
# columns are forced to strings so pandas does not infer a numeric dtype.
hmda19_dtypes = {'app_credit_model': str, 'metro_code': str, 'lei': str}
hmda19_df = pd.read_csv(
    '../../data/hmda_lar/cleaned_data/3_hmda2019_regressiondata_210823.csv',
    dtype = hmda19_dtypes,
)

# Column/dtype overview of the loaded frame.
hmda19_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2433071 entries, 0 to 2433070
Data columns (total 63 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   black                         float64
 1   latino                        float64
 2   asian_cb                      float64
 3   native                        float64
 4   race_na                       float64
 5   no_coapplicant                float64
 6   na_coapplicant                float64
 7   female                        float64
 8   sex_na                        float64
 9   less_than25                   float64
 10  between25_34                  float64
 11  between45_54                  float64
 12  between55_64                  float64
 13  older_than65                  float64
 14  age_na                        float64
 15  income_log                    float64
 16  loan_log                      float64
 17  property_value_ratio          float64
 18  less30yrs_mortgage    

In [4]:
# Load the lender definitions lookup; every column is read as text.
lenders_df = pd.read_csv(
    '../../data/supplemental_hmda_data/cleaned/lender_definitions_em210513.csv',
    dtype = str,
)

### 2. Clean and filter data

In [5]:
# Keep only the identifier, name and lender-definition columns as an
# independent copy of the lookup table.
lender_name_columns = ['lei', 'respondent_name', 'lender_def']
lenders_df2 = lenders_df.loc[:, lender_name_columns].copy()

In [6]:
# Row count before filtering.
print(len(hmda19_df))

# Keep rows whose co-applicant, age and lender NA indicators are all non-zero.
# NOTE(review): the comparison is `!= 0`; confirm against the upstream cleaning
# step that this is the intended encoding of the *_na flags.
usable_rows = (
    hmda19_df['na_coapplicant'].ne(0)
    & hmda19_df['age_na'].ne(0)
    & hmda19_df['lender_na'].ne(0)
)
hmda19_df2 = hmda19_df[usable_rows]

# Row count after filtering.
print(len(hmda19_df2))

2433071
2421691


### 3. Get big lenders

In [7]:
# Count applications per lender (LEI), including rows with a missing LEI.
#
# The column names are set explicitly with rename_axis/reset_index(name=...)
# instead of renaming the 'index'/'lei' columns produced by reset_index():
# those default names differ between pandas 1.x and 2.x, so the old rename
# silently produced the wrong columns on newer pandas.
lenders_apps_df = (
    hmda19_df2['lei']
    .value_counts(dropna = False)
    .rename_axis('lei')
    .reset_index(name = 'total_apps')
)

# Only lenders with at least this many applications are analysed individually.
min_total_apps = 5000
lenders_apps_df2 = lenders_apps_df[(lenders_apps_df['total_apps'] >= min_total_apps)]
print(len(lenders_apps_df2))

lenders_apps_df2.head(3)

72


Unnamed: 0,lei,total_apps
0,KB1H1DSPRFMYMCUFXT09,113658
1,549300HW662MN1WU8550,108623
2,549300FGXN1K3HLB1R50,104549


### 4. List independent Variables

In [8]:
# Full list of regression predictors, one thematic group per line.
# The order of the list is preserved exactly.
independent_vars = [
    # race / ethnicity indicators
    'black', 'latino', 'asian_cb', 'native', 'race_na',
    # sex indicators
    'female', 'sex_na',
    # co-applicant indicator
    'no_coapplicant',
    # age bands
    'younger_than_34', 'older_than_55',
    # continuous measures and loan term
    'income_log',
    'loan_log',
    'property_value_ratio',
    'not30yr_mortgage',
    # credit scoring model indicators
    'equifax', 'experian', 'other_model', 'more_than_one', 'model_na',
    # debt-to-income bands
    'dti_manageable', 'dti_unmanageable', 'dti_struggling',
    'combined_loan_to_value_ratio',
    # low/moderate/middle income (LMI) indicators
    'low_lmi', 'moderate_lmi', 'middle_lmi',
    # automated underwriting system (AUS) indicators
    'non_desktop', 'aus_na',
    # white_cat2..4 indicators
    'white_cat2', 'white_cat3', 'white_cat4',
]

### 5. Count the values that show up for each lender
- Remove continuous variables because we are counting loans and denials

In [9]:
# Continuous predictors are excluded from the per-lender counts because only
# dummy variables can be tallied by loans and denials.
continous_vars = ['income_log', 'loan_log', 'combined_loan_to_value_ratio', 'property_value_ratio']
independent_vars2 = [
    predictor
    for predictor in independent_vars
    if predictor not in continous_vars
]

In [10]:
# Distinct LEIs of the selected large lenders, in order of first appearance.
lenders = lenders_apps_df2['lei'].drop_duplicates().tolist()

# Empty accumulators.
lenders_list, df_holder = [], []

#### Count all the independent variables for each lender, by loans and denials

In [11]:
# For every dummy variable, count records per (lender, flag value) split by
# loan outcome, then stack the per-variable tables into one long frame.
lender_var_holder = []

for independent_var in independent_vars2:
    index_values = ['lei', independent_var]

    lender_var_df = pd.pivot_table(
        hmda19_df2,
        index = index_values,
        columns = ['loan_outcome'],
        values = ['denied'],
        aggfunc = 'count',
        fill_value = 0,
    ).reset_index()

    # Replace the two-level pivot header with flat names. The two outcome
    # columns are labelled by position.
    # NOTE(review): this relies on the pivot ordering the 'loan_outcome'
    # values as (loan, denied) -- confirm against the data.
    lender_var_df.columns = ['lei', 'variable_flag', 'loan', 'denied']
    lender_var_df['variable_name'] = independent_var
    lender_var_holder.append(lender_var_df)

lender_varcount_df = pd.concat(lender_var_holder)

# Number of distinct lenders in the stacked counts.
lender_varcount_df['lei'].nunique()

2734

#### Finding missing records for each lender
- Focus on positive variables

In [12]:
# Keep only the rows whose flag value is 0.
# NOTE(review): the accompanying markdown says "positive variables" while this
# filter selects variable_flag == 0 -- confirm the dummy encoding.
flag_is_zero = lender_varcount_df['variable_flag'].eq(0)
lender_varcount_df2 = lender_varcount_df[flag_is_zero]

# Accumulator for placeholder rows.
missing_rows_list = []

In [13]:
# Add a zero-count placeholder row for every (lender, variable) pair that has
# no row in lender_varcount_df2, so each lender ends up with one row per
# dummy variable.

# Re-initialised in this cell so that re-running it never accumulates
# duplicate placeholder rows from a previous run.
missing_rows_list = []
missing_row_columns = ['lei', 'variable_flag', 'loan', 'denied', 'variable_name']

# Variables already present for each lender, computed in one pass instead of
# re-filtering the whole frame once per lender. sort=False keeps lenders in
# order of first appearance.
# NOTE(review): groupby skips rows whose 'lei' is missing; confirm no such
# rows are expected at this stage.
vars_by_lender = lender_varcount_df2.groupby('lei', sort = False)['variable_name'].unique()

for lender, lender_vars in vars_by_lender.items():
    present_vars = set(lender_vars)

    for var in independent_vars2:
        if var not in present_vars:
            missing_rows_list.append({'lei': lender, 'variable_flag': 0, 'loan': 0,
                                      'denied': 0, 'variable_name': var})

if missing_rows_list:
    # Build all placeholder rows in a single DataFrame, then append them with
    # pd.concat (DataFrame.append was removed in pandas 2.0).
    missing_rows_df = pd.DataFrame(missing_rows_list, columns = missing_row_columns)
    lender_varcount_df3 = pd.concat([lender_varcount_df2, missing_rows_df])
else:
    # Nothing is missing: avoid concatenating an empty frame and just take an
    # independent copy so later column assignments do not touch the filtered view.
    missing_rows_df = pd.DataFrame(columns = missing_row_columns)
    lender_varcount_df3 = lender_varcount_df2.copy()

- Calculate the denial and loan percentage

In [14]:
# Total records per row and the share that are loans vs. denials.
# Rows with a zero total produce NaN percentages (0 / 0).
loan_counts = lender_varcount_df3['loan']
denied_counts = lender_varcount_df3['denied']
total_counts = loan_counts + denied_counts

lender_varcount_df3['total_count'] = total_counts
lender_varcount_df3['loan_pct'] = loan_counts / total_counts * 100
lender_varcount_df3['denied_pct'] = denied_counts / total_counts * 100

- Filter for the select lenders

In [15]:
# Restrict the counts to the selected large lenders.
is_selected_lender = lender_varcount_df3['lei'].isin(lenders)
lender_varcount_df4 = lender_varcount_df3[is_selected_lender]

# Sanity check: every selected lender is present in the filtered counts.
lender_varcount_df4['lei'].nunique() == len(lenders)

True

#### Which variables are zero
- Credit models and underwriters are specific to individual lenders
- Many lenders stick to Experian, Equifax, and TransUnion

In [16]:
# How often each variable has no records at all for a selected lender.
zero_count_rows = lender_varcount_df4[lender_varcount_df4['total_count'].eq(0)]
zero_count_rows['variable_name'].value_counts(dropna = False)

more_than_one    59
other_model      45
non_desktop       7
model_na          3
aus_na            3
experian          2
equifax           2
Name: variable_name, dtype: int64

In [17]:
# Credit scoring model indicators.
model_vars = ['equifax', 'experian', 'other_model', 'more_than_one', 'model_na']

# Lenders with zero records for at least one credit-model indicator.
is_model_var = lender_varcount_df4['variable_name'].isin(model_vars)
model_has_no_records = lender_varcount_df4['total_count'].eq(0)
missing_credit = lender_varcount_df4[is_model_var & model_has_no_records]['lei'].nunique()

print('Number of lenders with at least one credit model missing: ' + str(missing_credit))

Number of lenders with at least one credit model missing: 68


In [18]:
# Automated underwriting system (AUS) indicators.
aus = ['non_desktop', 'aus_na']

# Lenders with zero records for at least one underwriting indicator.
is_aus_var = lender_varcount_df4['variable_name'].isin(aus)
aus_has_no_records = lender_varcount_df4['total_count'].eq(0)
missing_aus = lender_varcount_df4[is_aus_var & aus_has_no_records]['lei'].nunique()

print('Number of lenders with at least one underwriter missing: ' + str(missing_aus))

Number of lenders with at least one underwriter missing: 10


#### Select dummy variables that are greater than zero
- Leaving out variables where the credit model and AUS don't exist in the lender's data

In [19]:
# Keep only the (lender, variable) rows that have at least one record.
has_records = lender_varcount_df4['total_count'].gt(0)
lender_varcount_df5 = lender_varcount_df4[has_records]

### 7. Run regression on individual lenders

In [20]:
# Fit one regression per selected lender and collect the per-variable results.
#
# For each lender the predictors are the dummy variables that have at least
# one record for that lender (from lender_varcount_df5) plus the continuous
# variables. When fitting or result conversion fails, the lender still gets
# one all-NaN row per dummy variable so it can be identified downstream.
# NOTE(review): create_formula / run_regression / convert_results_to_df come
# from utils.use_regression; the fitted result exposes `mle_retvals` and
# `prsquared`, which suggests a statsmodels discrete-choice model -- confirm.
lender_holder = []

# Column order of the placeholder rows produced when a fit fails.
nan_result_columns = ['lei', 'variable_name', 'standard_error', 'z_value', 'p_value',
                      'odds_ratio', 'iteration_flag', 'psuedo_rsquare', 'total_apps']

for lender in lenders:
    lender_df = hmda19_df2[(hmda19_df2['lei'] == lender)]
    total_apps = len(lender_df)

    lender_independent_vars = lender_varcount_df5[(lender_varcount_df5['lei'] == lender)]\
                              ['variable_name'].unique().tolist()
    lender_independent_vars2 = lender_independent_vars + continous_vars

    regression_formula = create_formula(lender_independent_vars2)
    model = run_regression(data = lender_df, formula = regression_formula)

    try:
        results = model.fit()
        # Whether the optimizer reported convergence for this lender.
        info = results.mle_retvals['converged']

        results_df = convert_results_to_df(results)
        results_df.insert(0, 'lei', lender)
        results_df.insert(1, 'psuedo_rsquare', results.prsquared)
        results_df['iteration_flag'] = info
        results_df['total_apps'] = total_apps

    except Exception as fit_error:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop the notebook, and report
        # the reason instead of discarding it silently.
        print('Regression failed for lender ' + str(lender) + ': ' + repr(fit_error))

        # One all-NaN placeholder row per dummy variable, built in a single
        # DataFrame (also valid when the lender has no dummy variables).
        independent_nan_list = [
            {'lei': lender, 'variable_name': regression_var, 'standard_error': np.nan,
             'z_value': np.nan, 'p_value': np.nan, 'odds_ratio': np.nan,
             'iteration_flag': np.nan, 'psuedo_rsquare': np.nan, 'total_apps': total_apps}
            for regression_var in lender_independent_vars
        ]
        results_df = pd.DataFrame(independent_nan_list, columns = nan_result_columns)

    lender_holder.append(results_df)

lender_results_df = pd.concat(lender_holder)

         Current function value: 0.182879
         Iterations: 35




Optimization terminated successfully.
         Current function value: 0.097206
         Iterations 9
         Current function value: 0.328955
         Iterations: 35




         Current function value: 0.179183
         Iterations: 35




Optimization terminated successfully.
         Current function value: 0.251815
         Iterations 14
Optimization terminated successfully.
         Current function value: 0.096063
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.145295
         Iterations 8
         Current function value: 0.318299
         Iterations: 35




Optimization terminated successfully.
         Current function value: 0.093371
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.163173
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.101158
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.051905
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.339966
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.054217
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.289879
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.166566
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.133698
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.179477
  



Optimization terminated successfully.
         Current function value: 0.284854
         Iterations 7


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


         Current function value: inf
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.208540
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.163964
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.096274
         Iterations 9
         Current function value: 0.098960
         Iterations: 35




Optimization terminated successfully.
         Current function value: 0.182855
         Iterations 10
         Current function value: 0.177530
         Iterations: 35




         Current function value: 0.106795
         Iterations: 35




         Current function value: 0.010143
         Iterations: 35




Optimization terminated successfully.
         Current function value: 0.077796
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.154990
         Iterations 34
Optimization terminated successfully.
         Current function value: 0.185807
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.074433
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.434546
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.121258
         Iterations 8
         Current function value: 0.008661
         Iterations: 35




Optimization terminated successfully.
         Current function value: 0.118822
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.116361
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.075483
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.233596
         Iterations 7
         Current function value: 0.100104
         Iterations: 35




         Current function value: 0.042117
         Iterations: 35




Optimization terminated successfully.
         Current function value: 0.250088
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.062918
         Iterations 31
Optimization terminated successfully.
         Current function value: 0.131574
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.070439
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.335768
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.191946
         Iterations 8
         Current function value: 0.250917
         Iterations: 35




         Current function value: 0.056319
         Iterations: 35




         Current function value: 0.229305
         Iterations: 35




         Current function value: 0.029377
         Iterations: 35




Optimization terminated successfully.
         Current function value: 0.147730
         Iterations 18
Optimization terminated successfully.
         Current function value: 0.566335
         Iterations 7
         Current function value: 0.121210
         Iterations: 35




Optimization terminated successfully.
         Current function value: 0.087099
         Iterations 10
         Current function value: 0.070897
         Iterations: 35




         Current function value: 0.131111
         Iterations: 35




         Current function value: 0.083009
         Iterations: 35




         Current function value: 0.088143
         Iterations: 35


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


         Current function value: inf
         Iterations: 35
         Current function value: 0.044748
         Iterations: 35




### 8. Join Dataframes and filter for significant results

In [21]:
# Attach the per-lender record counts to each regression result row; result
# rows without a matching count row are kept (left join).
lender_results_df2 = lender_results_df.merge(
    lender_varcount_df5,
    how = 'left',
    on = ['lei', 'variable_name'],
)

#### Number of lenders that didn't produce any results

In [22]:
# Rows where the pseudo R-squared, p-value and z-value are all missing.
has_no_results = (
    lender_results_df2['psuedo_rsquare'].isnull()
    & lender_results_df2['p_value'].isnull()
    & lender_results_df2['z_value'].isnull()
)
no_results_lenders = lender_results_df2[has_no_results]['lei']

# Number of distinct lenders without any regression output.
no_results_lenders.nunique()

4

#### Lenders for which the equation didn't work

In [23]:
# Lenders whose model either fits poorly (pseudo R-squared below 0.1) or did
# not converge. Rows with missing values match neither condition.
has_poor_fit = lender_results_df2['psuedo_rsquare'].lt(.1)
did_not_converge = lender_results_df2['iteration_flag'].eq(False)
equation_lenders = lender_results_df2[has_poor_fit | did_not_converge]['lei']

equation_lenders.nunique()

26

#### Lenders with valid results

In [24]:
# Keep results from models that converged and reach a pseudo R-squared of at
# least 0.1.
has_adequate_fit = lender_results_df2['psuedo_rsquare'].ge(.1)
did_converge = lender_results_df2['iteration_flag'].eq(True)
lender_results_df3 = lender_results_df2[has_adequate_fit & did_converge]

print(lender_results_df3['lei'].nunique())

42


No overlap between lenders with valid results and lenders where the equation didn't work or that produced no results at all

In [25]:
# Any valid-result rows belonging to a lender that is also in the poor-fit or
# no-results groups (expected to be empty).
in_equation_lenders = lender_results_df3['lei'].isin(equation_lenders)
in_no_results_lenders = lender_results_df3['lei'].isin(no_results_lenders)
lender_results_df3[in_equation_lenders | in_no_results_lenders]

Unnamed: 0,lei,psuedo_rsquare,variable_name,pseudo_rsquared,coefficient,standard_error,z_value,p_value,odds_ratio,iteration_flag,total_apps,variable_flag,loan,denied,total_count,loan_pct,denied_pct


### 9. Test for collinearity

In [26]:
vif_list = []

# Compute variance inflation factors (VIF) for each lender with valid results.
for lender in lender_results_df3['lei'].unique():
    lender_vars_df = lender_results_df3[(lender_results_df3['lei'] == lender)]

    # Use a loop-local name here: the previous code rebound `independent_vars`,
    # which clobbered the notebook-level predictor list defined earlier and
    # broke any earlier cell re-run after this one.
    # The first entry is skipped by position.
    # NOTE(review): presumably the first result row is the model intercept,
    # which is not a data column -- confirm against convert_results_to_df.
    lender_vif_vars = lender_vars_df['variable_name'].unique().tolist()[1:]

    lender_df = hmda19_df2[(hmda19_df2['lei'] == lender)][lender_vif_vars]

    vif_df = calculate_vif(lender_df)
    vif_df['lei'] = lender

    vif_list.append(vif_df)

lenders_vif_df = pd.concat(vif_list)

100%|██████████| 28/28 [00:06<00:00,  4.32it/s]
  vif = round(1/(1-rsq), 2)
100%|██████████| 30/30 [00:03<00:00,  8.50it/s]
100%|██████████| 29/29 [00:02<00:00, 10.48it/s]
100%|██████████| 30/30 [00:02<00:00, 13.20it/s]
100%|██████████| 29/29 [00:01<00:00, 15.89it/s]
100%|██████████| 29/29 [00:01<00:00, 21.77it/s]
100%|██████████| 30/30 [00:01<00:00, 25.21it/s]
100%|██████████| 29/29 [00:01<00:00, 26.81it/s]
100%|██████████| 29/29 [00:01<00:00, 26.66it/s]
100%|██████████| 31/31 [00:01<00:00, 25.73it/s]
100%|██████████| 30/30 [00:00<00:00, 30.90it/s]
100%|██████████| 29/29 [00:00<00:00, 38.74it/s]
100%|██████████| 30/30 [00:00<00:00, 40.11it/s]
100%|██████████| 29/29 [00:00<00:00, 44.22it/s]
100%|██████████| 30/30 [00:00<00:00, 44.29it/s]
100%|██████████| 30/30 [00:00<00:00, 44.40it/s]
100%|██████████| 29/29 [00:00<00:00, 46.91it/s]
100%|██████████| 29/29 [00:00<00:00, 46.85it/s]
100%|██████████| 27/27 [00:00<00:00, 52.11it/s]
100%|██████████| 30/30 [00:00<00:00, 48.61it/s]
100%|███████

In [27]:
# Ignore the VIF rows for the income and loan-amount log variables.
vif_excluded_vars = ['income_log', 'loan_log']
lenders_vif_df2 = lenders_vif_df[~lenders_vif_df['independent_var'].isin(vif_excluded_vars)]

# Lenders with at least one remaining variable whose threshold flag is '1'.
# NOTE(review): 'threshold' is produced by calculate_vif and compared as the
# string '1' -- confirm its meaning and dtype in utils.use_regression.
over_threshold = lenders_vif_df2['threshold'].eq('1')
collinarity_lenders = lenders_vif_df2[over_threshold]['lei'].unique()

#### Lenders with collinearity issues
- 12 lenders

In [28]:
# Number of distinct lenders flagged by the VIF threshold check.
len(collinarity_lenders)

12

#### Filter out lenders with collinearity issues
- 30 lenders excluded earlier (4 with no results + 26 with poor fit or no convergence)
- 12 lenders with collinearity issues
- 30 lenders move forward

In [29]:
# Drop every result row that belongs to a lender flagged for collinearity.
is_flagged_lender = lender_results_df3['lei'].isin(collinarity_lenders)
lender_results_df4 = lender_results_df3[~is_flagged_lender]

lender_results_df4['lei'].nunique()

30

### 10. Focus on lenders with racial and ethnic results
- 26 lenders 

In [30]:
# Race / ethnicity predictors of interest.
races = ['black', 'latino', 'asian_cb', 'native']

# Result rows for those predictors only.
is_race_var = lender_results_df4['variable_name'].isin(races)
lender_results_df5 = lender_results_df4[is_race_var]

# Keep results significant at the 5% level.
is_significant = lender_results_df5['p_value'].lt(.05)
lender_results_df6 = lender_results_df5[is_significant]

print(lender_results_df6['lei'].nunique())

26


#### Join with names

In [31]:
# Attach lender names and definitions by LEI; results without a match in the
# lookup are kept (left join).
lender_results_df7 = lender_results_df6.merge(lenders_df2, how = 'left', on = ['lei'])

In [32]:
# Distinct lenders remaining after joining on the lender names.
lender_results_df7['lei'].nunique()

26

#### Filter out results with fewer than 75 applicants

In [33]:
# Keep results backed by at least 75 records for the variable.
has_enough_applicants = lender_results_df7['total_count'].ge(75)
lender_results_df8 = lender_results_df7[has_enough_applicants]

lender_results_df8['lei'].nunique()

26

#### Disparity range:

In [34]:
# Largest and smallest odds ratio among the remaining results.
odds_ratios = lender_results_df8['odds_ratio']
print(odds_ratios.max())
print(odds_ratios.min())

3.5832032532791747
1.2573673512101307


#### 25 lenders with statistically significant disparities (odds ratio of at least 1.45)

In [35]:
# Distinct lenders with at least one result whose odds ratio is 1.45 or more.
has_disparity = lender_results_df8['odds_ratio'].ge(1.45)
lender_results_df8[has_disparity]['lei'].nunique()

25

In [36]:
# Results backed by at least 1,000 records with an odds ratio of 1.95 or
# more, ordered from the largest odds ratio down.
is_large_disparity = (
    lender_results_df8['total_count'].ge(1000)
    & lender_results_df8['odds_ratio'].ge(1.95)
)
lender_results_df9 = lender_results_df8[is_large_disparity].sort_values(by = ['odds_ratio'],
                                                                        ascending = False)

print(lender_results_df9['lei'].nunique())

# Display the key columns grouped by lender name, smallest odds ratio first.
report_columns = ['lei', 'respondent_name', 'variable_name', 'total_count', 'p_value', 'odds_ratio']
lender_results_df9[report_columns].sort_values(by = ['respondent_name', 'odds_ratio'])

7


Unnamed: 0,lei,respondent_name,variable_name,total_count,p_value,odds_ratio
21,5493001SXWZ4OFP8Z903,"DHI MORTGAGE COMPANY, LTD.",latino,2154.0,2.423503e-07,2.037769
20,5493001SXWZ4OFP8Z903,"DHI MORTGAGE COMPANY, LTD.",black,1276.0,1.87591e-10,2.613202
18,549300H3IZO24NSOO931,"EAGLE HOME MORTGAGE, LLC",latino,2837.0,1.388851e-14,2.124229
17,549300H3IZO24NSOO931,"EAGLE HOME MORTGAGE, LLC",black,1281.0,3.007772e-11,2.301073
0,549300MGPZBLQDIL7538,FAIRWAY INDEPENDENT MORTGAGE CORPORATION,black,2014.0,1.438527e-10,2.091842
43,549300LYRWPSYPK6S325,FREEDOM MORTGAGE CORPORATION,latino,1043.0,1.885856e-05,2.243939
13,549300DD4R4SYK5RAQ92,"MOVEMENT MORTGAGE, LLC",latino,2228.0,3.507122e-09,2.119191
12,549300DD4R4SYK5RAQ92,"MOVEMENT MORTGAGE, LLC",black,1289.0,1.383497e-06,2.131013
33,5493003GQDUH26DNNH17,NAVY FEDERAL CREDIT UNION,black,1467.0,6.205845e-15,2.056585
32,5493004WMLN60ZJ2ON46,PULTE MORTGAGE LLC,latino,1296.0,9.610523e-05,2.156586


In [37]:
# Export the key columns, ordered by lender name and odds ratio, to the
# findings folder without the index column.
findings_columns = ['lei', 'respondent_name', 'variable_name', 'total_count', 'p_value', 'odds_ratio']
findings_df = lender_results_df9[findings_columns].sort_values(by = ['respondent_name', 'odds_ratio'])
findings_df.to_csv('../../findings/lender_findings/1_lender_findings210823.csv', index = False)