In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc
%matplotlib inline

# Display settings: render every column and every row of a frame (no truncation).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Folder that the input files in this notebook are read from.
DATA_DIR = 'data'

In [None]:
# Load the accepted-loans frame from its pickle under DATA_DIR.
accepted_path = os.path.join(DATA_DIR, 'accepted_merged.pickle')
df_acp = pd.read_pickle(accepted_path)

# Year component of the parsed `issue_d` column, stored as its own column.
df_acp['year'] = pd.to_datetime(df_acp['issue_d']).dt.year

In [None]:
# Analysis parameters.
race = 'black'               # selects the `is_<race>` indicator column in the demographic file
normalize_period = '2016Q2'  # not referenced by any cell visible in this notebook section
top_n = 30                   # number of highest-share ZIP3 areas to flag

# Read zip3 as text so the codes are not parsed as integers.
df_demograph = pd.read_csv(
    os.path.join(DATA_DIR, 'cps_data/demographic_cleaned.csv'),
    dtype={'zip3': str},
)

# Mean of the `is_<race>` indicator per ZIP3, then the `top_n` ZIP3 codes with
# the largest mean. The slice uses `top_n` so the parameter above takes effect.
race_share_by_zip = df_demograph.groupby('zip3')[f'is_{race}'].mean()
black_zip = race_share_by_zip.sort_values(ascending=False).index[:top_n].tolist()

# Maps the raw `emp_length` labels to integer years. The key 0 is the
# placeholder used for missing values when the modelling frame is built.
# NOTE(review): '< 1 year' maps to 1, the same as '1 year' — confirm intended.
emp2yr = {
    '< 1 year': 1,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10,
    0: 0,
}

# State abbreviations grouped by region.
west = ['CA', 'OR', 'UT','WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']
south_west = ['AZ', 'TX', 'NM', 'OK']
south_east = ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN' ]
mid_west = ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND']
north_east = ['CT', 'NY', 'PA', 'NJ', 'RI','MA', 'MD', 'VT', 'NH', 'ME']

def finding_regions(state):
    """Return the region label for a state abbreviation.

    The region lists are checked in a fixed order (West, SouthWest, SouthEast,
    MidWest) and the first list containing ``state`` decides the label.
    Anything found in none of them — including values that are not state
    codes at all — is labelled 'NorthEast'.
    """
    ordered_regions = (
        ('West', west),
        ('SouthWest', south_west),
        ('SouthEast', south_east),
        ('MidWest', mid_west),
    )
    for label, members in ordered_regions:
        if state in members:
            return label
    # Fallback: `north_east` is never consulted, every remaining value lands here.
    return 'NorthEast'

In [None]:
# Build the modelling frame from a copy of the selected loan columns.
feature_cols = [
    'dti', 'addr_state', 'emp_length', 'year', 'loan_amnt', 'fico_range_low',
    'delinq_2yrs', 'home_ownership', 'annual_inc', 'open_acc', 'revol_bal',
]
X_san = df_acp[feature_cols].copy()

# 1 when the loan's zip3 is one of the flagged ZIP3 codes, else 0.
X_san['black_zip'] = df_acp['zip3'].isin(black_zip).astype(int)

# 1 when the loan was issued on or after 2016-03-01, else 0.
X_san['post'] = (pd.to_datetime(df_acp['issue_d']) >= '2016-03-01').astype(int)

# Missing employment length becomes the placeholder 0, then every label is
# translated to integer years through `emp2yr`. The result is assigned back
# rather than using `inplace=True` on a column selection, which does not
# update the frame under pandas Copy-on-Write.
# A label absent from `emp2yr` maps to NaN and makes the integer cast fail.
X_san['emp_length'] = X_san['emp_length'].fillna(0).map(emp2yr).astype(int)

# Region label derived from the state abbreviation.
X_san['region'] = X_san['addr_state'].map(finding_regions)

In [None]:
# Distinct issue years in the raw frame and in the modelling frame, side by side.
tuple(frame['year'].unique() for frame in (df_acp, X_san))

In [None]:
# Print every loan status present so the groupings below can be checked against the data.
print(df_acp['loan_status'].unique())

# Statuses counted as a default. Each status must be its own list element:
# without the comma between the two "Late" entries Python joins the adjacent
# literals into one string that matches no row.
# NOTE(review): 'Charged Off' is in neither list, so any such loans are
# excluded when the frame is filtered to mask_def | mask_no_def — confirm
# that this is intended.
default_statuses = [
    'In Grace Period',
    'Late (31-120 days)',
    'Late (16-30 days)',
    'Default',
]
mask_def = df_acp['loan_status'].isin(default_statuses)

# Statuses counted as a non-default.
mask_no_def = df_acp['loan_status'].isin(['Fully Paid'])

In [None]:
# Binary outcome: 1 where the loan status is in the default group, 0 otherwise.
# Assigning the aligned mask directly avoids chained assignment
# (`X_san['default'][mask_def] = 1`), which pandas does not guarantee to write
# back to the frame.
X_san['default'] = mask_def.astype(int)

# Keep only loans in either status group. The explicit copy makes X_san an
# independent frame, so later column assignments do not act on a slice.
X_san = X_san.loc[mask_def | mask_no_def].copy()

In [None]:
# Upper cap applied to loan_amnt; also the divisor for the normalised column.
LOAN_AMNT_CAP = 40000

# Missing amounts become 1 and values above the cap are clipped to it. The
# result is assigned back instead of using `inplace=True` on a column
# selection, which does not update the frame under pandas Copy-on-Write.
X_san['loan_amnt'] = X_san['loan_amnt'].fillna(1).clip(upper=LOAN_AMNT_CAP)

# log(amount + 1) and amount expressed as a fraction of the cap.
X_san['log_loan_amnt'] = np.log(X_san['loan_amnt'] + 1)
X_san['loan_amnt_norm'] = X_san['loan_amnt'] / LOAN_AMNT_CAP

In [None]:
# Upper cap applied to annual_inc.
ANNUAL_INC_CAP = 1000000

# Missing incomes become 1 (so their log is 0) and values above the cap are
# clipped to it. Assigned back rather than mutated through `inplace=True` on
# a column selection, which does not update the frame under pandas Copy-on-Write.
X_san['annual_inc'] = X_san['annual_inc'].fillna(1).clip(upper=ANNUAL_INC_CAP)

# Despite the `_norm` suffix this column holds the natural log of income.
# NOTE(review): an income of exactly 0 would give -inf here — confirm the data
# has no zero incomes, or add a lower clip.
X_san['annual_inc_norm'] = np.log(X_san['annual_inc'])

In [None]:
# Cap dti at 100. Assigned back rather than using `inplace=True` on a column
# selection, which does not update the frame under pandas Copy-on-Write.
X_san['dti'] = X_san['dti'].clip(upper=100)

# Interaction term: 1 only for loans with both post == 1 and black_zip == 1.
X_san['post_black_zip'] = X_san['post'] * X_san['black_zip']

In [None]:
# Distinct issue years in the raw frame and in the modelling frame, side by side.
tuple(frame['year'].unique() for frame in (df_acp, X_san))

In [None]:
# Annotated correlation heatmap on a 15 x 15 figure.
sns.set(rc={'figure.figsize': (15, 15)})

# X_san also holds string columns (addr_state, home_ownership, region).
# Recent pandas versions raise when `corr()` meets non-numeric data, so the
# correlation is computed on the numeric columns only.
numeric_features = X_san.select_dtypes(include='number')
sns.heatmap(numeric_features.corr(), cmap="YlGnBu", annot=True)

In [None]:
# Logit of `default` on `post`, the `post_black_zip` interaction and the
# remaining covariates, with year, region and home ownership as categoricals.
# NOTE(review): `black_zip` itself is not a term in this formula — confirm
# that omitting it alongside the interaction is intended.
import statsmodels.formula.api as sm

df = X_san
interaction_formula = (
    "default ~ dti + post + post_black_zip + emp_length + C(year) + C(region) + loan_amnt_norm + "
    "    delinq_2yrs + annual_inc_norm + fico_range_low + C(home_ownership) + open_acc"
)
interaction_model = sm.logit(formula=interaction_formula, data=df)
result = interaction_model.fit()

In [None]:
# Display the summary table of the fitted model currently held in `result`.
result.summary()

In [None]:
# Logit of `default` on `post`, `black_zip` and the remaining covariates, with
# year, region and home ownership as categoricals. This formula has no
# `post_black_zip` term. Fitting it rebinds `result`.
import statsmodels.formula.api as sm

df = X_san
main_effect_formula = (
    "default ~ dti + post + black_zip + emp_length + C(year) + C(region) + loan_amnt_norm + "
    "    delinq_2yrs + annual_inc_norm + fico_range_low + C(home_ownership) + open_acc"
)
main_effect_model = sm.logit(formula=main_effect_formula, data=df)
result = main_effect_model.fit()

In [None]:
# Display the summary table of the fitted model currently held in `result`.
result.summary()