In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc
import statsmodels.formula.api as sm
from stargazer.stargazer import Stargazer
%matplotlib inline

# Lift pandas' display caps: with None, every column and every row of a
# displayed frame is rendered instead of being truncated.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Relative folder that the data-loading cells join their file names onto.
DATA_DIR = 'data'

In [None]:
# Load the accepted-loan table from DATA_DIR.
# NOTE: unpickling can execute arbitrary code -- only load this file if it
# comes from a trusted source.
accepted_pickle_path = os.path.join(DATA_DIR, 'accepted_merged.pickle')
df_acp = pd.read_pickle(accepted_pickle_path)

In [None]:
# Load the demographic table; 'zip3' is forced to str so the codes stay text
# rather than being parsed as numbers.
demographic_csv_path = os.path.join(DATA_DIR, 'cps_data/demographic_cleaned.csv')
df_demograph = pd.read_csv(demographic_csv_path, dtype={'zip3': str})

In [None]:
# Analysis parameters.
# `race` selects the f'is_{race}' column of df_demograph used to rank zip3 areas.
race: str = 'black'
# NOTE(review): normalize_period is not referenced by any other cell visible
# in this notebook -- confirm whether it is still needed.
normalize_period: str = '2016Q2'
# Number of top-ranked zip3 areas kept when building `black_zip`.
top_n: int = 30

In [None]:
# Average the is_<race> column within each zip3, rank the zip3 codes from the
# highest average to the lowest, and keep the first `top_n` codes as a list.
race_share_by_zip = df_demograph.groupby('zip3')[f'is_{race}'].mean()
ranked_zip3 = race_share_by_zip.sort_values(ascending=False)
black_zip = ranked_zip3.index[:top_n].tolist()
# Employment-length label -> number of years.
# The integer key 0 is the placeholder that missing emp_length values are
# filled with before this lookup is applied.
# NOTE(review): '< 1 year' and '1 year' both map to 1 -- confirm this is intended.
emp2yr = {
    0: 0,
    '< 1 year': 1,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10,
}

# Two-letter state codes grouped by the region label used in the regressions.
west = ['CA', 'OR', 'UT', 'WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']
south_west = ['AZ', 'TX', 'NM', 'OK']
south_east = ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN']
mid_west = ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND']
north_east = ['CT', 'NY', 'PA', 'NJ', 'RI', 'MA', 'MD', 'VT', 'NH', 'ME']


def finding_regions(state):
    """Return the region label for a state code.

    The West, SouthWest, SouthEast and MidWest lists are checked in that
    order. Anything that matches none of them is labelled 'NorthEast'; the
    `north_east` list itself is never consulted, so unknown or missing
    values also come back as 'NorthEast'.
    """
    # The lists are looked up at call time, exactly like a chain of
    # `state in <list>` tests.
    labelled_groups = (
        ('West', west),
        ('SouthWest', south_west),
        ('SouthEast', south_east),
        ('MidWest', mid_west),
    )
    for label, members in labelled_groups:
        if state in members:
            return label
    return 'NorthEast'

In [None]:
# df_rej = pd.read_pickle(os.path.join(DATA_DIR, 'rejected.pickle'))

In [None]:
# X_rej = df_rej[['dti', 'addr_state', 'emp_length', 'year', 'loan_amnt']].copy()
# X_rej['black_zip']  = (df_rej['zip3'].isin(black_zip)).astype(int)
# X_rej['is_funded'] = 0
# X_rej['post'] = (pd.to_datetime(df_rej['issue_d']) >= '2016-03-01').astype(int)
# X_rej['emp_length'].fillna(0, inplace=True)
# X_rej['emp_length'] = X_rej['emp_length'].apply(lambda x: emp2yr[x]).astype(int)
# X_rej['region'] = X_rej['addr_state'].apply(lambda x: finding_regions(x))
# del df_rej

In [None]:
# Build the accepted-loan feature frame used by the regressions.
# Note: this adds a 'year' column to df_acp itself before the subset is copied.
df_acp['year'] = df_acp['issue_d'].dt.year
X_acp = df_acp[[
    'dti', 'addr_state', 'emp_length', 'year', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate',
    'fico_range_low', 'delinq_2yrs', 'home_ownership', 'annual_inc', 'open_acc', 'revol_bal', 'term', 'grade', 'sub_grade',
]].copy()

# 1 when the loan's zip3 is one of the selected top zip3 codes, else 0.
X_acp['black_zip'] = df_acp['zip3'].isin(black_zip).astype(int)
# Every row in this frame comes from the accepted-loan table.
X_acp['is_funded'] = 1
# 1 for loans issued on or after 2016-03-01, else 0.
X_acp['post'] = (pd.to_datetime(df_acp['issue_d']) >= '2016-03-01').astype(int)

# Missing employment length becomes the placeholder key 0, then every label
# is translated to years. The result is assigned back rather than calling
# fillna(..., inplace=True) on the selected column: that chained in-place
# form does not update the frame under pandas Copy-on-Write, which would
# leave NaN in place and break the lookup. A label missing from emp2yr
# yields NaN and fails loudly at astype(int).
X_acp['emp_length'] = X_acp['emp_length'].fillna(0).map(emp2yr).astype(int)

# Region label derived from the state code.
X_acp['region'] = X_acp['addr_state'].map(finding_regions)

In [None]:
# X is the same object as X_acp (not a copy), so the columns added and
# rewritten below also appear on X_acp.
X = X_acp  # pd.concat([X_acp, X_rej])

# Cap dti at 100. The result is assigned back instead of using
# clip(..., inplace=True) on the selected column: that chained in-place form
# does not modify the frame under pandas Copy-on-Write.
X['dti'] = X['dti'].clip(upper=100)
X['post_black_zip'] = X['post'] * X['black_zip']

# Upper cap for the amount columns; also the divisor of the *_norm columns.
amount_cap = 40000

for c in ['loan_amnt', 'funded_amnt', 'funded_amnt_inv']:
    # Missing amounts become 1, then values above the cap are clipped to it.
    X[c] = X[c].fillna(1).clip(upper=amount_cap)
    X['log_{}'.format(c)] = np.log(X[c] + 1)
    X['{}_norm'.format(c)] = X[c] / amount_cap

In [None]:
# Boolean row selectors over X: rows with post == 0, and all remaining rows.
pre_mask = X['post'].eq(0)
post_mask = X['post'].ne(0)

PRE: regression on loans issued before 2016-03-01 (`post == 0`)

In [None]:
# Formula string passed to sm.ols for both the pre-period and post-period fits.
# int_rate is the dependent variable; C(...) marks a term as categorical and
# C(year)*C(region) includes both factors together with their interaction.
# The literal pieces below concatenate to exactly the same string as before,
# including the run of spaces ahead of "+ revol_bal".
# NOTE(review): sub_grade presumably nests grade, which would make C(grade)
# collinear with C(sub_grade) -- confirm both terms are intended.
reg_str = (
    "int_rate ~ dti + black_zip + emp_length + C(year)*C(region) + loan_amnt_norm"
    " + fico_range_low + delinq_2yrs + open_acc "
    "    + revol_bal + C(term) + C(grade) + C(sub_grade)"
)

In [None]:
# Fit the formula in reg_str by OLS on the rows selected by pre_mask and show
# the full summary table as the cell output.
df = X.loc[pre_mask]
pre_model = sm.ols(formula=reg_str, data=df)
result_pre = pre_model.fit()
result_pre.summary()

POST: regression on loans issued on or after 2016-03-01 (`post == 1`)

In [None]:
# Fit the formula in reg_str by OLS on the rows selected by post_mask and show
# the full summary table as the cell output. `df` is rebound here, so after
# this cell it refers to the post-period rows.
df = X.loc[post_mask]
post_model = sm.ols(formula=reg_str, data=df)
result_post = post_model.fit()
result_post.summary()

In [None]:
# Put the pre-period and post-period fits side by side in one Stargazer table.
out_stargazer = Stargazer([result_pre, result_post])

# Render the LaTeX once and reuse it for both the printed output and the file,
# so the two are guaranteed to be the same text.
latex_table = out_stargazer.render_latex()
print(latex_table)

# Written to the working directory; the file name has no extension.
with open("regression_our", "w") as f:
    f.write(latex_table)