In [1]:
### Enable IPython's autoreload extension in mode 2, which re-imports all modules
### before each cell executes, so edits to the imported helper modules take effect
### without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../..')

from utils.clean_data import (clean_location, clean_race_ethnicity, find_same_race, clean_credit_model, 
                              find_coapplicants, clean_outcomes, find_aus_patterns, clean_aus)

### 1. Import HMDA data
- Data can be downloaded from the CFPB site: [2019 LAR dataset](https://ffiec.cfpb.gov/data-publication/snapshot-national-loan-level-dataset/2019)
- The date the file was downloaded was appended to the raw file name.
- 99 Columns
- 17,545,457 records
- [Data Dictionary](https://ffiec.cfpb.gov/documentation/2019/lar-data-fields/)

In [3]:
### Load the raw 2019 LAR snapshot, reading every column as text (dtype = str)
raw_lar_path = '../../data/hmda_lar/raw_data/2019_public_lar_csv210810.csv'
hmda19_df = pd.read_csv(raw_lar_path, dtype=str)

### Summarize the columns and row count of the raw frame
hmda19_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17545457 entries, 0 to 17545456
Data columns (total 99 columns):
 #   Column                                    Dtype 
---  ------                                    ----- 
 0   activity_year                             object
 1   lei                                       object
 2   derived_msa_md                            object
 3   state_code                                object
 4   county_code                               object
 5   census_tract                              object
 6   conforming_loan_limit                     object
 7   derived_loan_product_type                 object
 8   derived_dwelling_category                 object
 9   derived_ethnicity                         object
 10  derived_race                              object
 11  derived_sex                               object
 12  action_taken                              object
 13  purchaser_type                            object
 14  preapproval     

### 2. Clean Data


#### Dropping columns I don't need (21 in total) to make the data easier to work with:

##### The following columns were added by the CFPB, not using them. 
    - derived_loan_product_type
    - derived_dwelling_category
    - derived_ethnicity
    - derived_race
    - derived_sex
##### Focusing on the applicant's first ethnicity
    - applicant_ethnicity_2
    - applicant_ethnicity_3
    - applicant_ethnicity_4
    - applicant_ethnicity_5
##### Focusing on the co-applicant's first ethnicity. Don't need these columns to find co-applicants.
    - co_applicant_ethnicity_2
    - co_applicant_ethnicity_3
    - co_applicant_ethnicity_4
    - co_applicant_ethnicity_5
##### Focusing on the applicant's first race
    - applicant_race_2
    - applicant_race_3
    - applicant_race_4
    - applicant_race_5
##### Focusing on the co-applicant's first race. Don't need these columns to find co-applicants.
    - co_applicant_race_2
    - co_applicant_race_3
    - co_applicant_race_4
    - co_applicant_race_5
    
#### Using 78 columns instead of 99

In [4]:
### Columns to leave out of the working dataset
remove_cols = [
    # Derived columns added by the CFPB
    'derived_loan_product_type', 'derived_dwelling_category', 'derived_ethnicity',
    'derived_race', 'derived_sex',
    # Applicant ethnicity fields after the first
    'applicant_ethnicity_2', 'applicant_ethnicity_3', 'applicant_ethnicity_4', 'applicant_ethnicity_5',
    # Co-applicant ethnicity fields after the first
    'co_applicant_ethnicity_2', 'co_applicant_ethnicity_3', 'co_applicant_ethnicity_4',
    'co_applicant_ethnicity_5',
    # Applicant race fields after the first
    'applicant_race_2', 'applicant_race_3', 'applicant_race_4', 'applicant_race_5',
    # Co-applicant race fields after the first
    'co_applicant_race_2', 'co_applicant_race_3', 'co_applicant_race_4',
    'co_applicant_race_5',
]

### Every raw column that is not slated for removal, in the raw column order
new_headers = [column for column in hmda19_df.columns if column not in remove_cols]

print(len(new_headers))

78


#### Create smaller subset of HMDA data
- Deleting the original HMDA df to clear memory

In [5]:
### Take an independent copy of the retained columns, then drop the reference
### to the full raw frame
hmda19_df2 = hmda19_df.loc[:, new_headers].copy()
del hmda19_df

### Summarize the columns and row count of the reduced frame
hmda19_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17545457 entries, 0 to 17545456
Data columns (total 78 columns):
 #   Column                                    Dtype 
---  ------                                    ----- 
 0   activity_year                             object
 1   lei                                       object
 2   derived_msa_md                            object
 3   state_code                                object
 4   county_code                               object
 5   census_tract                              object
 6   conforming_loan_limit                     object
 7   action_taken                              object
 8   purchaser_type                            object
 9   preapproval                               object
 10  loan_type                                 object
 11  loan_purpose                              object
 12  lien_status                               object
 13  reverse_mortgage                          object
 14  open_end_line_of

### 2. Clean Location

In [6]:
### Count the records for every distinct county code / census tract pair,
### keeping pairs with missing values as their own groups
location_df = (
    hmda19_df2
    .groupby(by=['county_code', 'census_tract'], dropna=False)
    .size()
    .reset_index(name='count')
)

### Swap in placeholder text so the cleaning function can run on every row:
### the literal 'Na' becomes 'ii-ii' and true nulls become '00-00', which keeps
### the two cases separate
location_df = location_df.replace(to_replace='Na', value='ii-ii').fillna('00-00')

In [7]:
### How many distinct county / census tract combinations there are
print(len(location_df))

location_keys = location_df[['county_code', 'census_tract']]

### Combinations where the county code or the census tract is 'Na' (placeholder 'ii-ii')
print(location_keys.eq('ii-ii').any(axis=1).values.sum())
### Combinations where the county code or the census tract is NULL (placeholder '00-00')
print(location_keys.eq('00-00').any(axis=1).values.sum())

location_df.sample(3, random_state=303)

76439
1
3489


Unnamed: 0,county_code,census_tract,count
65426,48201,48201431201,3
6782,6037,6037599100,3
64125,48113,48113017812,98


In [8]:
### Apply clean_location row by row to get one location code per combination
location_codes = location_df.apply(clean_location, axis=1)
location_df['location_code'] = location_codes

### The first two characters of the code are the state FIPS, the next three the county FIPS
location_df['state_fips'] = location_codes.str.slice(0, 2)
location_df['county_fips'] = location_codes.str.slice(2, 5)

### Total records whose state and county FIPS both came back as dash placeholders
no_location = location_df['state_fips'].eq('--') & location_df['county_fips'].eq('---')
nulls_df = location_df.loc[no_location]
print('Number of records with location nulls: ' + str(nulls_df['count'].sum()))

### The count and the combined code are not needed for the join back to HMDA
location_df2 = location_df.drop(columns=['count', 'location_code'])

Number of records with location nulls: 349945


In [9]:
### Undo the placeholders in one pass:
###   '--' and '---' (missing state / county FIPS) become NaN
###   '00-00' goes back to NaN and 'ii-ii' back to 'Na', so the key columns
###   match the original HMDA values when joining
location_df2 = (
    location_df2
    .replace(to_replace='--', value=np.nan)
    .replace(to_replace='---', value=np.nan)
    .replace(to_replace='00-00', value=np.nan)
    .replace(to_replace='ii-ii', value='Na')
)

location_df2.head(1)

Unnamed: 0,county_code,census_tract,state_fips,county_fips
0,1001,1001020100,1,1


In [10]:
### Attach the state and county FIPS columns to every HMDA record
hmda19_df2 = hmda19_df2.merge(location_df2, how='left', on=['county_code', 'census_tract'])

### Records where both FIPS columns are missing
nulls_records = hmda19_df2[['county_fips', 'state_fips']].isnull().all(axis=1).values.sum()

### This number matches the one from above:
print("Null Records that don't have fips data: " + str(nulls_records))

Null Records that don't have fips data: 349945


### 3. Clean Race and Ethnicity
- 1: Native American
- 2: Asian
- 3: Black
- 4: Pacific Islander
- 5: White
- 6: Latino
- 7: Race NA

In [11]:
### Count the records for each applicant race / ethnicity pairing, nulls included
main_race_eth = (
    hmda19_df2
    .groupby(by=['applicant_race_1', 'applicant_ethnicity_1'], dropna=False)
    .size()
    .reset_index(name='count')
)

### Stand in '000' for missing values for cleaning purposes
main_race_eth = main_race_eth.fillna('000')
print(len(main_race_eth))
main_race_eth.head(2)

164


Unnamed: 0,applicant_race_1,applicant_ethnicity_1,count
0,1,1,29471
1,1,11,1858


In [12]:
### Run clean_race_ethnicity on each race / ethnicity pairing
main_race_eth['app_race_ethnicity'] = main_race_eth.apply(clean_race_ethnicity, axis=1)

### Turn the '000' placeholder back into NaN so the keys match the HMDA data,
### and drop the count column, which is not needed for the join
main_race_eth = main_race_eth.replace(to_replace='000', value=np.nan).drop(columns=['count'])

### Attach the cleaned category to every HMDA record
hmda19_df2 = hmda19_df2.merge(main_race_eth, how='left',
                              on=['applicant_race_1', 'applicant_ethnicity_1'])

hmda19_df2['app_race_ethnicity'].value_counts(dropna=False)

5    9923840
7    3729763
6    1697089
3    1135436
2     928462
1      92074
4      38793
Name: app_race_ethnicity, dtype: int64

### 4. Clean Co Race and Ethnicity
- 1: Native American
- 2: Asian
- 3: Black
- 4: Pacific Islander
- 5: White
- 6: Latino
- 7: Race NA
- 8: No Coapp

In [13]:
### Count the records for each co-applicant race / ethnicity pairing, nulls included
coapp_race_ethnicity = (
    hmda19_df2
    .groupby(by=['co_applicant_race_1', 'co_applicant_ethnicity_1'], dropna=False)
    .size()
    .reset_index(name='count')
)

### Stand in '000' for missing values while the cleaning function runs
coapp_race_ethnicity = coapp_race_ethnicity.fillna('000')
coapp_race_ethnicity.head(1)

Unnamed: 0,co_applicant_race_1,co_applicant_ethnicity_1,count
0,1,1,9973


In [14]:
### Reuse clean_race_ethnicity for the co-applicant pairings (it has a no co-app flag)
coapp_race_ethnicity['coapp_race_ethnicity'] = coapp_race_ethnicity.apply(clean_race_ethnicity, axis=1)

### Drop the count column, then turn the '000' placeholder back into NaN for the join
coapp_race_ethnicity = (
    coapp_race_ethnicity
    .drop(columns=['count'])
    .replace(to_replace='000', value=np.nan)
)

### Attach the cleaned co-applicant category to every HMDA record
hmda19_df2 = hmda19_df2.merge(coapp_race_ethnicity, how='left',
                              on=['co_applicant_race_1', 'co_applicant_ethnicity_1'])

hmda19_df2['coapp_race_ethnicity'].value_counts(dropna=False)

8    9158472
5    4581280
7    2360361
6     683338
2     422998
3     289797
1      30843
4      18368
Name: coapp_race_ethnicity, dtype: int64

### 6. Same or Different Race for Co-Applicant
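- 1: Same
- 2: Different
- 3: Not Applicable
- 4: No co-applicant (code 4 appears in the output below with the same count, 9,158,472, as the "No Coapp" group above)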

In [15]:
### Count the records for each pairing of applicant and co-applicant race / ethnicity
coapp_same_race = (
    hmda19_df2
    .groupby(by=['app_race_ethnicity', 'coapp_race_ethnicity'], dropna=False)
    .size()
    .reset_index(name='count')
)

coapp_same_race.sample(2, random_state=303)

Unnamed: 0,app_race_ethnicity,coapp_race_ethnicity,count
9,2,2,323202
51,7,4,457


In [16]:
### Flag each pairing with find_same_race (applicant vs. co-applicant race / ethnicity)
coapp_same_race['coapp_same_race'] = coapp_same_race.apply(find_same_race, axis=1)

### The count column is not needed for the join
coapp_same_race = coapp_same_race.drop(columns=['count'])

### Attach the flag to every HMDA record
hmda19_df2 = hmda19_df2.merge(coapp_same_race, how='left',
                              on=['app_race_ethnicity', 'coapp_race_ethnicity'])

hmda19_df2['coapp_same_race'].value_counts(dropna=False)

4    9158472
1    5409249
3    2423876
2     553860
Name: coapp_same_race, dtype: int64

### 7. Clean Credit Models
- 1: Equifax
- 2: Experian
- 3: TransUnion
- 4: Vantage
- 5: More than one
- 6: Other Model
- 7: Credit Na

In [17]:
### Count the records for each applicant credit score type, nulls included
credit_models = (
    hmda19_df2
    .groupby(by=['applicant_credit_score_type'], dropna=False)
    .size()
    .reset_index(name='count')
)

credit_models.head(1)

Unnamed: 0,applicant_credit_score_type,count
0,1,3406553


In [18]:
### Standardize each credit score type with clean_credit_model
credit_models['app_credit_model'] = credit_models.apply(clean_credit_model, axis=1)

### The count column is not needed for the join
credit_models = credit_models.drop(columns=['count'])

### Attach the standardized credit model to every HMDA record
hmda19_df2 = hmda19_df2.merge(credit_models, how='left', on=['applicant_credit_score_type'])

hmda19_df2['app_credit_model'].value_counts(dropna=False)

7    6787023
1    3406553
3    2895349
2    2752732
6    1172497
5     498468
4      32835
Name: app_credit_model, dtype: int64

### 8. Find Co-Applicants

- 9999 in age means [no co-applicant](https://s3.amazonaws.com/cfpb-hmda-public/prod/help/2018-public-LAR-code-sheet.pdf)
- 8888 in age means not applicable

In [19]:
### Co-applicant fields that are grouped together to find co-applicants
coapp_cols = ['coapp_race_ethnicity', 'co_applicant_sex', 'co_applicant_age', 'co_applicant_credit_score_type']

### Count the records for each distinct combination of those fields, nulls included
coapp_comb_df = (
    hmda19_df2
    .groupby(by=coapp_cols, dropna=False)
    .size()
    .reset_index(name='count')
)

coapp_comb_df.head(1)

Unnamed: 0,coapp_race_ethnicity,co_applicant_sex,co_applicant_age,co_applicant_credit_score_type,count
0,1,1,25-34,1,241


#### Co-Applicants

- 1: Co-Applicants
- 2: No co-applicants
- 3: Not Applicable

In [20]:
### Classify each combination of co-applicant fields with find_coapplicants
coapp_comb_df['co_applicant'] = coapp_comb_df.apply(find_coapplicants, axis=1)

### The count column is not needed for the join
coapp_comb_df = coapp_comb_df.drop(columns=['count'])

### Attach the co-applicant flag to every HMDA record
hmda19_df2 = hmda19_df2.merge(coapp_comb_df, how='left', on=coapp_cols)

hmda19_df2['co_applicant'].value_counts(dropna=False)

2    9269362
1    7107953
3    1168142
Name: co_applicant, dtype: int64

### 9. Standardize Outcomes
- 1: Loan originated
- 2: Application approved but not accepted
- 3: Application denied
- 4: Application withdrawn by applicant
- 5: File closed for incompleteness
- 6: Purchased loan
- 7: Preapproval request denied
- 8: Preapproval request approved but not accepted

In [21]:
### Count the records for each action_taken code (nulls included), ordered from
### most to least frequent.
### The frame is assembled directly from the counts' index and values instead of
### reset_index().rename(...): pandas 2.0 changed value_counts() to name its
### result 'count' and its index after the column, so the old rename mapping
### ({'index': 'action_taken', 'action_taken': 'count'}) would turn the
### 'action_taken' column into a second 'count' column and break the merge on
### 'action_taken' that follows. Building the columns by name gives the same
### two-column frame on either pandas version.
action_counts = hmda19_df2['action_taken'].value_counts(dropna = False)
action_taken = pd.DataFrame({'action_taken': action_counts.index,
                             'count': action_counts.values})

action_taken.head(2)

Unnamed: 0,action_taken,count
0,1,9325241
1,3,2538965


#### Outcomes:
- 1: Loans
- 3: Denials
- 4: Other Outcomes
- 6: Purchase loans

In [22]:
### Map each action_taken code to a loan outcome with clean_outcomes
action_taken['loan_outcome'] = action_taken.apply(clean_outcomes, axis=1)

### The count column is not needed for the join
action_taken = action_taken.drop(columns=['count'])

### Attach the outcome to every HMDA record
hmda19_df2 = hmda19_df2.merge(action_taken, how='left', on=['action_taken'])

hmda19_df2['loan_outcome'].value_counts(dropna=False)

1    9325241
4    3415558
3    2538965
6    2265693
Name: loan_outcome, dtype: int64

### 10. Standardize Automated Underwriting System

In [23]:
### The five automated underwriting system (AUS) columns
aus = ['aus_1', 'aus_2', 'aus_3', 'aus_4', 'aus_5']

### Collect every distinct combination of the AUS columns (nulls included) to
### find all the patterns; the group sizes themselves are not kept
aus_df = (
    hmda19_df2
    .groupby(by=aus, dropna=False)
    .size()
    .reset_index(name='count')
    .drop(columns=['count'])
)

aus_df.head(2)

Unnamed: 0,aus_1,aus_2,aus_3,aus_4,aus_5
0,1,1,1,1,1
1,1,1,1,1,2


#### Aus Cat
- 1: One AUS was used
- 2: Same AUS was used multiple times
- 3: Different AUS was used 
- 4: Exempt

In [24]:
### Run find_aus_patterns to calculate unique values and nulls for each AUS combination
aus_df = find_aus_patterns(aus_df)

### Assign an AUS category to each combination with clean_aus
aus_df['aus_cat'] = aus_df.apply(clean_aus, axis=1)

### The helper columns are not needed once the category is set
aus_df = aus_df.drop(columns=['number_of_values', 'number_of_nulls'])

### Attach the AUS category to every HMDA record
hmda19_df2 = hmda19_df2.merge(aus_df, how='left', on=aus)
hmda19_df2['aus_cat'].value_counts(dropna=False)

1    16144858
3      778268
4      471452
2      150879
Name: aus_cat, dtype: int64

### Write out new csv

In [25]:
### Summarize the cleaned frame (columns, dtypes, row count) before writing it out
hmda19_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17545457 entries, 0 to 17545456
Data columns (total 87 columns):
 #   Column                                    Dtype 
---  ------                                    ----- 
 0   activity_year                             object
 1   lei                                       object
 2   derived_msa_md                            object
 3   state_code                                object
 4   county_code                               object
 5   census_tract                              object
 6   conforming_loan_limit                     object
 7   action_taken                              object
 8   purchaser_type                            object
 9   preapproval                               object
 10  loan_type                                 object
 11  loan_purpose                              object
 12  lien_status                               object
 13  reverse_mortgage                          object
 14  open_end_line_of

In [26]:
### Save the cleaned frame as a CSV, leaving out the row index
cleaned_csv_path = '../../data/hmda_lar/cleaned_data/1_hmda2019_210823.csv'
hmda19_df2.to_csv(cleaned_csv_path, index=False)