In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc
%matplotlib inline

# Never truncate columns or rows when a frame/series is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Directory (relative to the notebook) that holds every input file read below.
DATA_DIR = 'data'

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Load the accepted-loan file and derive the helper columns used by the
# "Loans" plots below.
df_acp = pd.read_csv(os.path.join(DATA_DIR, 'Lending_Club_Accepted_2014_2018.csv'))

# Bucket debt-to-income. pd.cut intervals are open on the left by default, so
# without include_lowest=True a dti of exactly 0 would fall outside "<40" and
# become NaN (and silently vanish from the pie chart).
df_acp['dti_cat'] = pd.cut(df_acp['dti'],
                           bins=[0, 40, 100, 1000],
                           labels=["<40", "40-100", "100-1000"],
                           include_lowest=True)

# Parse the issue date once, then derive the issue year from it.
df_acp['issue_d'] = pd.to_datetime(df_acp['issue_d'])
df_acp['issue_d_year'] = df_acp['issue_d'].dt.year

# First three characters of the zip code (missing values become the string 'nan',
# exactly as str(x)[:3] did).
df_acp['zip3'] = df_acp['zip_code'].astype(str).str[:3]

## 1. Loans

In [None]:
# Distribution of loan amounts, one violin per issue year.
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(data=df_acp, x="issue_d_year", y="loan_amnt", palette="Pastel1", ax=ax)
plt.show()

In [None]:
# Share of accepted loans by stated purpose.
fig, ax = plt.subplots(figsize=(8, 6))
purpose_counts = df_acp['purpose'].value_counts()
purpose_counts.plot.pie(ax=ax)
ax.legend()
plt.show()

In [None]:
# Share of accepted loans in each debt-to-income bucket.
dti_cat_counts = df_acp['dti_cat'].value_counts()
ax = dti_cat_counts.plot.pie()
ax.set_title('Debt to Income - Accepted')
ax.legend()
plt.show()

In [None]:
# Share of accepted loans by application type.
application_type_counts = df_acp['application_type'].value_counts()
application_type_counts.plot.pie()

In [None]:
# Total funded amount per (issue month, loan status), reshaped to a
# month x status table for the heatmap.
funded_by_month_status = (
    df_acp
    .groupby(['issue_d', 'loan_status'], as_index=False)['funded_amnt']
    .sum()
)
funded_by_month_status['issue_d'] = funded_by_month_status['issue_d'].dt.strftime('%Y/%m')

# Column order: statuses from most to least frequent across all loans.
loan_status_rank = df_acp['loan_status'].value_counts().index
status_df = funded_by_month_status.pivot(
    index='issue_d', columns='loan_status', values='funded_amnt'
)[loan_status_rank]

In [None]:
# Heatmap of funded amount: rows are issue months, columns are loan statuses.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(status_df, cmap="YlGnBu", ax=ax)
ax.set_title('Loan Status Change')
plt.show()

In [None]:
# Number of accepted loans in each status, most frequent first.
loan_status_counts = df_acp['loan_status'].value_counts()
loan_status_counts

## Demographics

In [None]:
# Rebind df_acp to the pre-merged accepted-loan frame; the columns derived from
# the CSV load earlier in the notebook are not carried over by this statement.
# NOTE(review): unpickling executes arbitrary code - only load pickles produced
# by a trusted pipeline.
accepted_merged_path = os.path.join(DATA_DIR, 'accepted_merged.pickle')
df_acp = pd.read_pickle(accepted_merged_path)

In [None]:
# df_acp.head()

In [None]:
# Compare funded loan volume over time in the zip3 areas with the highest share
# of `race` against all other zip3 areas, each normalised to its own value in
# `normalize_period`.
race = 'white'
normalize_period = '2016Q2'
top_n = 30

df_demograph = pd.read_csv(os.path.join(DATA_DIR, 'cps_data/demographic_cleaned.csv'), dtype={'zip3': str})

# NOTE: the name `black_zip` is historical - it holds the `top_n` zip3 codes
# with the highest mean of the `is_{race}` column, whatever `race` is set to.
black_zip = df_demograph.groupby('zip3')[f'is_{race}'].mean()
black_zip = black_zip.sort_values(ascending=False).index[:top_n].tolist()

# .copy() so that adding `issue_q` writes to an independent frame rather than
# to a slice of df_acp (avoids SettingWithCopyWarning / lost assignment).
loan_df = df_acp[['funded_amnt', 'issue_d', 'zip3']].copy()
loan_df['issue_q'] = pd.to_datetime(loan_df['issue_d']).dt.to_period('Q')

in_top_zip = loan_df['zip3'].isin(black_zip)
loan_black = loan_df[in_top_zip]
loan_non_black = loan_df[~in_top_zip]

# Quarterly total funded amount, divided by the total in the reference quarter.
loan_black_amt = loan_black.groupby('issue_q')['funded_amnt'].sum()
loan_black_amt /= loan_black_amt[normalize_period]
loan_non_black_amt = loan_non_black.groupby('issue_q')['funded_amnt'].sum()
loan_non_black_amt /= loan_non_black_amt[normalize_period]

fig = plt.figure(figsize=(8,6))
loan_black_amt.plot(label=f'{race}')
loan_non_black_amt.plot(label=f'non-{race}')
plt.axvline(x=normalize_period, color='black', linestyle='--')
plt.title('Loan Amount around LC Scandal')
plt.legend()
plt.xlabel('Time')
# The series are normalised quarterly sums (not averages), so label them as such.
plt.ylabel('Normalized Total Funded Amount')
# File name follows the parameters above; with the current settings this is
# still 'fund_amnt_2016q2_white.png'.
plt.savefig(f'fund_amnt_{normalize_period.lower()}_{race}.png')

## Regression

In [None]:
# Recompute the zip3 group used as the regression indicator. Uses `top_n`
# (set in the demographics cell) instead of repeating the literal 30, so the
# plot and the regression cannot drift apart.
black_zip = df_demograph.groupby('zip3')[f'is_{race}'].mean()
black_zip = black_zip.sort_values(ascending=False).index[:top_n].tolist()

# Employment-length label -> number of years. The integer key 0 is the
# placeholder that the feature cells substitute for missing values.
# NOTE(review): '< 1 year' maps to 1, the same as '1 year' - confirm intended.
emp2yr = {'10+ years':10, '3 years':3, '4 years':4, '6 years':6, '1 year':1, '7 years':7, '8 years':8, '5 years':5, '2 years':2, '9 years':9, '< 1 year':1, 0:0}

# Two-letter state codes grouped into five coarse regions.
west = ['CA', 'OR', 'UT','WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']
south_west = ['AZ', 'TX', 'NM', 'OK']
south_east = ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN' ]
mid_west = ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND']
north_east = ['CT', 'NY', 'PA', 'NJ', 'RI','MA', 'MD', 'VT', 'NH', 'ME']

# (members, label) pairs checked in order; 'NorthEast' is the fallback and is
# therefore not listed here.
_REGION_LABELS = (
    (west, 'West'),
    (south_west, 'SouthWest'),
    (south_east, 'SouthEast'),
    (mid_west, 'MidWest'),
)

def finding_regions(state):
    """Return the region label for a state code.

    Any value that is not in the West, SouthWest, SouthEast or MidWest lists
    (including unknown codes and missing values) is reported as 'NorthEast'.
    """
    for members, label in _REGION_LABELS:
        if state in members:
            return label
    return 'NorthEast'

In [None]:
# Load the rejected-application records.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
rejected_path = os.path.join(DATA_DIR, 'rejected.pickle')
df_rej = pd.read_pickle(rejected_path)

In [None]:
# Peek at the first five rejected applications.
df_rej.head(n=5)

In [None]:
# Regression features for rejected applications (is_funded = 0).
# .copy() makes X_rej an independent frame, so the column assignments below
# do not write into a slice of df_rej.
X_rej = df_rej[['dti', 'addr_state', 'emp_length', 'year', 'loan_amnt']].copy()
X_rej['black_zip'] = df_rej['zip3'].isin(black_zip).astype(int)
X_rej['is_funded'] = 0
# 1 for applications dated on/after 2016-03-01, else 0.
X_rej['post'] = (pd.to_datetime(df_rej['issue_d']) >= '2016-03-01').astype(int)
# Missing employment length -> 0, then translate labels to years via emp2yr.
# Assigning the result back (instead of fillna(inplace=True) on a column
# selection) is required for the fill to take effect under copy-on-write.
X_rej['emp_length'] = X_rej['emp_length'].fillna(0).map(emp2yr).astype(int)
X_rej['region'] = X_rej['addr_state'].map(finding_regions)

# The raw rejected frame is no longer needed; release it.
del df_rej
gc.collect()

In [None]:
# Issue year of each accepted loan. Every other use of issue_d from this frame
# goes through pd.to_datetime, so do the same here; it is a no-op when the
# column is already datetime and avoids an AttributeError on `.dt` otherwise.
df_acp['year'] = pd.to_datetime(df_acp['issue_d']).dt.year

In [None]:
# Regression features for accepted loans (is_funded = 1); same columns as X_rej.
X_acp = df_acp[['dti', 'addr_state', 'emp_length', 'year', 'loan_amnt']].copy()
X_acp['black_zip'] = df_acp['zip3'].isin(black_zip).astype(int)
X_acp['is_funded'] = 1
# 1 for loans issued on/after 2016-03-01, else 0.
X_acp['post'] = (pd.to_datetime(df_acp['issue_d']) >= '2016-03-01').astype(int)
# Missing employment length -> 0, then translate labels to years via emp2yr.
# Assign back rather than using fillna(inplace=True) on a column selection,
# which does not modify the frame under copy-on-write.
X_acp['emp_length'] = X_acp['emp_length'].fillna(0).map(emp2yr).astype(int)
X_acp['region'] = X_acp['addr_state'].map(finding_regions)

In [None]:
# Stack accepted and rejected rows into one modelling frame. ignore_index=True
# gives the result a fresh 0..n-1 index; otherwise the two inputs' row labels
# overlap and label-based operations on X become ambiguous.
X = pd.concat([X_acp, X_rej], ignore_index=True)

In [None]:
# Inspect which feature columns the combined frame carries.
X.columns

In [None]:
# Cap debt-to-income at 100. Assign the result back: an in-place clip on
# X['dti'] operates on a column selection and is not guaranteed to update X
# (it does not under pandas copy-on-write).
X['dti'] = X['dti'].clip(upper=100)

In [None]:
# Interaction term: 1 only for rows that are both post-cutoff and in the zip group.
X['post_black_zip'] = X['post'].mul(X['black_zip'])

In [None]:
# Loan-amount features: fill missing amounts with 1, cap at 40000, then derive
# a log version and a version scaled by the cap.
# The fill/clip result is assigned back because in-place calls on X['loan_amnt']
# act on a column selection and do not reliably modify X.
X['loan_amnt'] = X['loan_amnt'].fillna(1).clip(upper=40000)
X['log_loan_amnt'] = np.log(X['loan_amnt'] + 1)
X['loan_amnt_norm'] = X['loan_amnt'] / 40000

In [None]:
import statsmodels.formula.api as sm

# Fit on a reproducible random subsample of at most one million rows.
# min(...) keeps the cell runnable when X has fewer rows than that
# (DataFrame.sample raises if n exceeds the population without replacement).
df = X.sample(n=min(1000000, len(X)), random_state=1)

# Logit of funding on the post x zip-group interaction plus controls.
# NOTE(review): the interaction post_black_zip enters without the black_zip
# main effect - confirm this specification is intended.
result = sm.logit(formula = "is_funded ~ dti + post + post_black_zip + emp_length + C(year)*C(region) + loan_amnt_norm", data = df).fit()

In [None]:
# Coefficient table for the logit fitted in the previous cell.
result.summary()

In [None]:
# sns.set(rc={'figure.figsize':(11.7,8.27)})
# sns.heatmap(df.corr(), cmap="YlGnBu", annot=True)

In [None]:
# df['loan_amnt_norm'].hist()

In [None]:
# df.plot(y='is_funded', x='loan_amnt_norm', kind='scatter')

In [None]:
import statsmodels.formula.api as sm

# Same reproducible subsample as the previous model (same seed), capped at the
# size of X so the cell does not fail on smaller inputs.
df = X.sample(n=min(1000000, len(X)), random_state=1)

# Variant with the zip-group main effect (black_zip) and no interaction term.
result = sm.logit(formula = "is_funded ~ dti + post + black_zip + emp_length + C(year)*C(region) + loan_amnt_norm", data = df).fit()

In [None]:
# Coefficient table for the logit fitted in the previous cell.
result.summary()

## Sanity checks

In [None]:
# Feature frame for the default model: accepted loans only, with extra
# borrower attributes alongside the columns used in the funding model.
X_san = df_acp[['dti', 'addr_state', 'emp_length', 'year', 'loan_amnt', 'fico_range_low', 'delinq_2yrs', 'home_ownership', 'annual_inc']].copy()
X_san['black_zip'] = df_acp['zip3'].isin(black_zip).astype(int)
X_san['is_funded'] = 1
# 1 for loans issued on/after 2016-03-01, else 0.
X_san['post'] = (pd.to_datetime(df_acp['issue_d']) >= '2016-03-01').astype(int)
# Missing employment length -> 0, then translate labels to years via emp2yr.
# Assign back rather than using fillna(inplace=True) on a column selection,
# which does not modify the frame under copy-on-write.
X_san['emp_length'] = X_san['emp_length'].fillna(0).map(emp2yr).astype(int)
X_san['region'] = X_san['addr_state'].map(finding_regions)

In [None]:
# List the statuses present, then build row masks over df_acp for the two
# outcome groups used by the default model.
print(df_acp['loan_status'].unique())

default_statuses = ['Charged Off', 'In Grace Period', 'Late (31-120 days)', 'Late (16-30 days)', 'Default']
repaid_statuses = ['Fully Paid']

mask_def = df_acp['loan_status'].isin(default_statuses)
mask_no_def = df_acp['loan_status'].isin(repaid_statuses)

In [None]:
# Binary target: 1 for rows flagged by mask_def, 0 otherwise.
X_san['default'] = 0
# Set ONLY the `default` column. Assigning through X_san[mask_def] = 1 would
# overwrite every column (dti, year, region, ...) of those rows with 1.
X_san.loc[mask_def, 'default'] = 1
# Keep only loans with a resolved outcome in either group; .copy() so later
# column assignments act on an independent frame.
X_san = X_san[mask_def | mask_no_def].copy()

In [None]:
# Loan-amount features for the default model: fill missing with 1, cap at
# 40000, then derive log and cap-scaled versions.
# The fill/clip result is assigned back because in-place calls on a column
# selection do not reliably modify X_san.
X_san['loan_amnt'] = X_san['loan_amnt'].fillna(1).clip(upper=40000)
X_san['log_loan_amnt'] = np.log(X_san['loan_amnt'] + 1)
X_san['loan_amnt_norm'] = X_san['loan_amnt'] / 40000

In [None]:
# Cap debt-to-income at 100 (assigned back; an in-place clip on a column
# selection is not guaranteed to update X_san).
X_san['dti'] = X_san['dti'].clip(upper=100)
# Interaction term: 1 only for rows that are both post-cutoff and in the zip group.
X_san['post_black_zip'] = X_san['post'] * X_san['black_zip']

In [None]:
import statsmodels.formula.api as sm

# Logit of the default flag on the same regressors as the funding model,
# fitted on every row of X_san (no subsampling).
default_formula = "default ~ dti + post + black_zip + emp_length + C(year)*C(region) + loan_amnt_norm"
df = X_san
result = sm.logit(formula=default_formula, data=df).fit()

In [None]:
# Coefficient table for the default model fitted in the previous cell.
result.summary()

In [None]:
# Pairwise correlations between the numeric columns of X_san.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
# numeric_only=True: X_san also holds string columns (addr_state,
# home_ownership, region); DataFrame.corr raises on those in pandas >= 2.0
# unless they are excluded explicitly.
sns.heatmap(X_san.corr(numeric_only=True), cmap="YlGnBu", annot=True)

In [None]:
# Distinct values of `year` left in X_san after the outcome filtering.
# X_san.plot(x='fico_range_low', y='year', kind='scatter')
X_san['year'].unique()

In [None]:
# Histogram of `year` for the rows kept in X_san.
X_san['year'].hist()

In [None]:
# Histogram of `year` for all accepted loans, for comparison with X_san.
df_acp['year'].hist()

In [None]:
# Distinct state codes in X and the number of rows with a missing state.
X['addr_state'].unique(), X.addr_state.isna().sum()

In [None]:
# Share of rows with is_funded == 1: overall, then split by the `post` flag.
funded = X['is_funded']

# total funding likelihood
print(funded.mean())

# pre funding likelihood
mask = X['post'].eq(0)
print(funded[mask].mean())

# post funding likelihood
mask = X['post'].eq(1)
print(funded[mask].mean())

In [None]:
# Preview only the first rows. display.max_rows is set to None at the top of
# the notebook, so a bare `df_acp` would try to render the entire frame.
df_acp.head()