In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc
import statsmodels.formula.api as sm
from stargazer.stargazer import Stargazer
%matplotlib inline

# Notebook-wide pandas display settings: remove the column and row truncation
# limits so frames are rendered in full (beware of displaying very large frames).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Directory, relative to the working directory, that holds the input data files.
DATA_DIR = 'data'

In [None]:
# Accepted-loan records (later labelled is_funded = 1).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
accepted_path = os.path.join(DATA_DIR, 'accepted_merged.pickle')
df_acp = pd.read_pickle(accepted_path)

In [None]:
# Demographic table keyed by zip3; zip3 is read as text rather than parsed as a number.
demographic_path = os.path.join(DATA_DIR, 'cps_data/demographic_cleaned.csv')
df_demograph = pd.read_csv(demographic_path, dtype={'zip3': str})

In [None]:
# --- Analysis parameters -----------------------------------------------------
# Demographic group: selects the `is_<race>` column of the demographic table.
race = 'black'
# Cutoff date: rows dated on or after it are flagged post = 1.
normalize_period = '2016-03-01'
# How many top-ranked zip3 areas are kept when building the zip lists.
top_n = 20
# Every integer 0..999 as text, WITHOUT zero padding ('0', '1', ..., '999').
# NOTE(review): despite the name this is not limited to New York City, and
# zero-padded zip3 text such as '012' would not match any entry — confirm how
# zip3 is stored in the input files.
nyc_zip = list(map(str, range(1000)))

In [None]:
# Per-zip3 table with a 'Percent' column (presumably the Muslim population
# share — confirm against the data source).
# The file is resolved through DATA_DIR like every other input instead of a
# hard-coded absolute path on one user's machine, so the notebook runs elsewhere.
mus_by_zip = pd.read_csv(
    os.path.join(DATA_DIR, 'muslim_by_county_cleaned.csv'),
    dtype={'zip3': str},  # keep zip3 as text
)
# Keep only zip3 codes in the analysis universe and rank from highest to lowest
# 'Percent', matching how black_zip is ranked (descending); the previous
# ascending sort selected the top_n LOWEST values.
mus_by_zip = (
    mus_by_zip[mus_by_zip['zip3'].isin(nyc_zip)]
    .sort_values(by='Percent', ascending=False)
)
# .head() is explicitly positional, unlike a bare [:top_n] slice on a Series.
mus_zip = mus_by_zip['zip3'].head(top_n).tolist()
len(mus_zip)

In [None]:
# Rank zip3 areas by the mean of the is_<race> indicator and keep the top_n
# highest-ranked codes as the flagged zip list.
race_share_by_zip = df_demograph.groupby('zip3')[f'is_{race}'].mean()
ranked_zip3 = race_share_by_zip.sort_values(ascending=False).index
black_zip = ranked_zip3[:top_n].tolist()
# Alternative (disabled): black_zip = mus_zip

# Employment-length label -> number of years. Both '< 1 year' and '1 year' map
# to 1; the integer key 0 is the placeholder substituted for missing values.
emp2yr = {
    0: 0,
    '< 1 year': 1,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10,
}

# State abbreviations grouped into five regions (50 states plus DC).
west = ['CA', 'OR', 'UT', 'WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']
south_west = ['AZ', 'TX', 'NM', 'OK']
south_east = ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN']
mid_west = ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND']
north_east = ['CT', 'NY', 'PA', 'NJ', 'RI', 'MA', 'MD', 'VT', 'NH', 'ME']


def finding_regions(state):
    """Return the region label for a state abbreviation.

    Args:
        state: Two-letter state code (any other value is accepted too).

    Returns:
        'West', 'SouthWest', 'SouthEast' or 'MidWest' when ``state`` is in the
        corresponding list; otherwise 'NorthEast'. Note that the fallback also
        applies to values that appear in no list at all (missing values,
        territories, typos), not only to the ``north_east`` states.
    """
    # The lists are looked up at call time, in the same order as the checks
    # they replace; the first list containing the state wins.
    labelled_groups = (
        ('West', west),
        ('SouthWest', south_west),
        ('SouthEast', south_east),
        ('MidWest', mid_west),
    )
    for label, members in labelled_groups:
        if state in members:
            return label
    return 'NorthEast'

In [None]:
# Rejected-application records (later labelled is_funded = 0).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
rejected_path = os.path.join(DATA_DIR, 'rejected.pickle')
df_rej = pd.read_pickle(rejected_path)

In [None]:
# Feature frame for the rejected applications (outcome is_funded = 0).
X_rej = df_rej[['dti', 'addr_state', 'emp_length', 'year', 'loan_amnt', 'zip3']].copy()
# 1 when the applicant's zip3 is one of the flagged top_n zip3 areas.
X_rej['black_zip'] = df_rej['zip3'].isin(black_zip).astype(int)
X_rej['is_funded'] = 0
# 1 when the record is dated on or after the cutoff date.
X_rej['post'] = (pd.to_datetime(df_rej['issue_d']) >= normalize_period).astype(int)
# Missing employment length becomes the placeholder 0, then labels are mapped
# to years. The result is assigned back: a chained `X_rej[col].fillna(...,
# inplace=True)` acts on a temporary Series and is a no-op under pandas
# Copy-on-Write, which would leave NaN in place and break the label lookup.
# A label missing from emp2yr maps to NaN, so astype(int) still fails loudly.
X_rej['emp_length'] = X_rej['emp_length'].fillna(0).map(emp2yr).astype(int)
X_rej['region'] = X_rej['addr_state'].map(finding_regions)
# Free the raw frame; only X_rej is needed from here on. (Re-running this cell
# therefore requires re-running the cell that loads df_rej.)
del df_rej

In [None]:
# Feature frame for the accepted loans (outcome is_funded = 1).
# 'year' is derived from the issue date and stored on df_acp so that it can be
# selected together with the other feature columns.
df_acp['year'] = df_acp['issue_d'].dt.year
X_acp = df_acp[['dti', 'addr_state', 'emp_length', 'year', 'loan_amnt', 'zip3']].copy()
# 1 when the borrower's zip3 is one of the flagged top_n zip3 areas.
X_acp['black_zip'] = df_acp['zip3'].isin(black_zip).astype(int)
X_acp['is_funded'] = 1
# 1 when the loan is dated on or after the cutoff date.
X_acp['post'] = (pd.to_datetime(df_acp['issue_d']) >= normalize_period).astype(int)
# Missing employment length becomes the placeholder 0, then labels are mapped
# to years. The result is assigned back: a chained `X_acp[col].fillna(...,
# inplace=True)` acts on a temporary Series and is a no-op under pandas
# Copy-on-Write, which would leave NaN in place and break the label lookup.
# A label missing from emp2yr maps to NaN, so astype(int) still fails loudly.
X_acp['emp_length'] = X_acp['emp_length'].fillna(0).map(emp2yr).astype(int)
X_acp['region'] = X_acp['addr_state'].map(finding_regions)

In [None]:
# Stack accepted and rejected records into one modelling frame.
X = pd.concat([X_acp, X_rej])
# Every transformation below is assigned back to the column. The previous
# chained form `X[col].method(..., inplace=True)` operates on a temporary
# Series: it warns on recent pandas and silently does nothing under
# Copy-on-Write, which would leave the columns uncapped / unfilled.
# Cap debt-to-income at 100.
X['dti'] = X['dti'].clip(upper=100)
# Interaction of the post-cutoff flag with the flagged-zip indicator.
X['post_black_zip'] = X['post'] * X['black_zip']
# Missing amounts become 1, then amounts are capped at 40000.
X['loan_amnt'] = X['loan_amnt'].fillna(1).clip(upper=40000)
X['log_loan_amnt'] = np.log(X['loan_amnt'] + 1)
# Amount rescaled by the 40000 cap, so values are at most 1.
X['loan_amnt_norm'] = X['loan_amnt'] / 40000

In [None]:
# Keep only rows whose zip3 appears in nyc_zip (rows with other or missing
# zip3 values are dropped).
zip3_in_universe = X['zip3'].isin(nyc_zip)
X = X.loc[zip3_in_universe]

In [None]:
# Boolean row selectors for the two samples: before the cutoff (post == 0)
# and its complement.
pre_mask = X['post'].eq(0)
post_mask = ~pre_mask

PRE: regression on the pre-period sample (rows with `post == 0`)

In [None]:
# OLS of the funding indicator on applicant features, fitted on the pre-period
# rows only. C(year)*C(region) adds year and region dummies plus their
# interactions.
formula_pre = "is_funded ~ dti + black_zip + emp_length + C(year)*C(region) + loan_amnt_norm"
df = X.loc[pre_mask]
result_pre = sm.ols(formula=formula_pre, data=df).fit()
result_pre.summary()

POST: regression on the post-period sample (rows with `post == 1`)

In [None]:
# Same specification as the pre-period model, fitted on the post-period rows
# only. C(year)*C(region) adds year and region dummies plus their interactions.
formula_post = "is_funded ~ dti + black_zip + emp_length + C(year)*C(region) + loan_amnt_norm"
df = X.loc[post_mask]
result_post = sm.ols(formula=formula_post, data=df).fit()
result_post.summary()

In [None]:
# Put the pre- and post-period fits side by side in one LaTeX table, echo it
# in the notebook, and save the same text to the file "regression_our".
out_stargazer = Stargazer([result_pre, result_post])
latex_table = str(out_stargazer.render_latex())
print(latex_table)
with open("regression_our", "w") as f:
    f.write(latex_table)