# Loan Analysis - Exploratory Data Analysis

### The objective is to identify predictors of default so that at the time of loan application, we can use those variables for approval/rejection of the loan

In [None]:
import pandas as pd
import numpy as np

# Load the raw loan records. low_memory=False makes pandas infer each column's
# dtype from the whole file rather than chunk by chunk (avoids mixed dtypes).
loan_data = pd.read_csv('loan.csv', low_memory=False)
# Report (rows, columns) of the raw data.
print(loan_data.shape)

### Now let us understand what the data looks like

In [None]:
# Preview the first five rows.
loan_data.head(5)

### Now let us do a missing-value analysis; we will discard features with 25% or more missing values

In [None]:
# Build a per-column summary of missing values.
# The null mask is computed once; its column-wise mean is the fraction of
# missing rows, so no filled copy of the whole frame is needed to get the
# row count.
null_mask = loan_data.isnull()

# Number of missing entries per column.
a = null_mask.sum(axis=0)

# Fraction of missing entries per column, rounded to two decimals.
b = null_mask.mean(axis=0).round(2)

c = loan_data.columns

missing_df = pd.DataFrame({'missing_vals': a, 'missing_ratio': b, 'cols': c})

In [None]:
# Columns whose rounded missing ratio is at least 0.25.
missing_df_g25 = missing_df.loc[missing_df['missing_ratio'].ge(0.25)]
# How many columns are flagged for removal.
missing_df_g25['cols'].count()

In [None]:
# Remove, in place, every column flagged as having too many missing values.
drop_c = missing_df_g25['cols']
loan_data.drop(columns=drop_c, inplace=True)
print(loan_data.shape)

In [None]:
# List numeric and categorical columns using the public select_dtypes API
# instead of the private DataFrame._get_numeric_data().
# 'bool' is included because the private helper also treated boolean columns
# as numeric.
int_feat = loan_data.select_dtypes(include=['number', 'bool']).columns.tolist()
print("Numeric variables are - ", int_feat)
cat_feat = list(loan_data.select_dtypes(include=['object']).columns)
print("Categorical variables are - ",cat_feat)

### Let's get rid of features of no use

In [None]:
# Identifier-like columns that are dropped as "features of no use".
drop_c1 = [
    'id',
    'member_id',
    'url',
    'zip_code',
]
loan_data.drop(columns=drop_c1, inplace=True)
print(loan_data.shape)

### Customer behavior variables are not available at the time of loan application, and thus they cannot be used as predictors for credit approval. Getting rid of these features.

In [None]:
# Columns removed because they are not available when the loan is applied for.
con_var = [
    'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'open_acc',
    'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
    'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
    'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
    'recoveries', 'collection_recovery_fee',
    'last_pymnt_d', 'last_pymnt_amnt', 'last_credit_pull_d',
    'application_type',
]
loan_data.drop(columns=con_var, inplace=True)
print(loan_data.shape)

In [None]:
# Re-list numeric and categorical columns after the drops, using the public
# select_dtypes API instead of the private DataFrame._get_numeric_data().
# 'bool' is included because the private helper also treated boolean columns
# as numeric.
int_feat = loan_data.select_dtypes(include=['number', 'bool']).columns.tolist()
print("Numeric variables are - ", int_feat)
cat_feat = list(loan_data.select_dtypes(include=['object']).columns)
print("Categorical variables are - ",cat_feat)

In [None]:
# Print column dtypes and non-null counts for the remaining features.
loan_data.info()

In [None]:
# Discard loans whose status is "Current".
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning on a filtered slice.
loan_data = loan_data[loan_data.loan_status != "Current"].copy()
print(loan_data.shape)

In [None]:
# Strip the trailing "%" from the interest-rate strings and store them as floats.
# NOTE(review): this relies on int_rate still being a string column, so the cell
# presumably cannot be re-run after it has succeeded once - confirm.
loan_data['int_rate'] = loan_data['int_rate'].str.rstrip("%").astype(float)

In [None]:
# Binary target: 0 for 'Fully Paid', 1 for any other status.
# A vectorised comparison replaces the per-row Python lambda.
loan_data['loan_status_binary'] = loan_data['loan_status'].ne('Fully Paid').astype('int64')

In [None]:
# Preview the cleaned data, including the new loan_status_binary column.
loan_data.head()

### Now let us start with univariate analysis - first try to understand the continuous variables (binning might be good for these features)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Loan Amount
# Summary statistics followed by a histogram of the raw loan amounts.
print(loan_data["loan_amnt"].describe())

import plotly.express as px
# The unused `df = px.data.tips()` (an unrelated sample dataset) was removed.
fig = px.histogram(loan_data, x="loan_amnt")
fig.show()

In [None]:
# Bucket loan amounts into labelled ranges; each bin is right-inclusive.
bins = [0, 5000, 10000, 15000, 20000, 25000, 35000]
slot = ['0-5,000', '5,000-10,000', '10,000-15,000', '15,000-20,000', '20,000-25,000','> 25,000']
loan_data['loan_amnt_range'] = pd.cut(loan_data['loan_amnt'], bins, labels=slot)

import plotly.express as px
# Reuse `slot` for the axis order so the labels cannot drift apart; the unused
# `df = px.data.tips()` (an unrelated sample dataset) was removed.
fig = px.histogram(loan_data, x="loan_amnt_range",
                   category_orders=dict(loan_amnt_range=slot))
fig.show()

In [None]:
## Annual Income
# Print floats in plain fixed-point notation instead of scientific notation.
pd.set_option('display.float_format', '{:f}'.format)
annual_inc_summary = loan_data["annual_inc"].describe()
print(annual_inc_summary)

#### Though median value is 59K, the highest value is quite high. We need to remove outliers and see.

In [None]:
# Remove income outliers: keep rows strictly below the 99th percentile.
q = loan_data["annual_inc"].quantile(0.99)
# .copy() gives an independent frame, so the column assignments in later cells
# do not raise SettingWithCopyWarning on a filtered slice.
loan_data = loan_data[loan_data["annual_inc"] < q].copy()
loan_data["annual_inc"].describe()

In [None]:
# Bucket annual income into labelled ranges; each bin is right-inclusive.
bins = [0, 25_000, 50_000, 75_000, 100_000, 1_000_000]
slot = [
    '0-25,000',
    '25,000-50,000',
    '50,000-75,000',
    '75,000-1,00,000',
    '1,00,000 and above',
]
loan_data['annual_inc_range'] = pd.cut(x=loan_data['annual_inc'], bins=bins, labels=slot)

In [None]:
# Histogram of the income buckets. The unused `df = px.data.tips()` (an
# unrelated sample dataset) was removed.
fig = px.histogram(loan_data, x="annual_inc_range")
fig.show()

In [None]:
## DTI
# Summary statistics followed by a histogram of the raw dti values.
print(loan_data["dti"].describe())

# The unused `df = px.data.tips()` (an unrelated sample dataset) was removed.
fig = px.histogram(loan_data, x="dti")
fig.show()

In [None]:
# Bucket DTI into 5-point ranges.
# The seven edges define six intervals, and each label now matches its
# interval. Previously '20-25' was missing, so (20, 25] was labelled '25-30'
# and (25, 30] was labelled '>30'.
bins = [0, 5, 10, 15, 20, 25, 30]
slot = ['0-5', '5-10', '10-15', '15-20', '20-25', '25-30']
# include_lowest=True keeps dti == 0 in the first bucket instead of turning it
# into NaN (pd.cut intervals are otherwise open on the left).
loan_data['dti_range'] = pd.cut(loan_data['dti'], bins, labels=slot, include_lowest=True)

# Reuse `slot` for the axis order so the labels and the ordering stay in sync;
# the unused `df = px.data.tips()` (an unrelated sample dataset) was removed.
fig = px.histogram(loan_data, x="dti_range",
                   category_orders=dict(dti_range=slot))
fig.show()

In [None]:
import plotly.offline as py 
py.init_notebook_mode(connected=True) # this code, allow us to work with offline plotly version
import plotly.graph_objs as go # it's like "plt" of matplot
import plotly.tools as tls # It's useful to we get some tools of plotly
import warnings # This library will be used to ignore some warnings
from collections import Counter # To do counter of some features


# Bar chart of the number of loans per final status.
# Each subset is filtered and counted once, then reused for both x and y.
fully_paid_counts = loan_data.loc[loan_data['loan_status'] == 'Fully Paid', "loan_status"].value_counts()
charged_off_counts = loan_data.loc[loan_data['loan_status'] == 'Charged Off', "loan_status"].value_counts()

# First plot
tr0 = go.Bar(
    x = fully_paid_counts.index.values,
    y = fully_paid_counts.values,
    name='Fully Paid'
)

#Second plot
tr1 = go.Bar(
    x = charged_off_counts.index.values,
    y = charged_off_counts.values,
    name='Charged Off'
)

data = [tr0, tr1]

# The earlier empty `go.Layout()` was dead code (it was overwritten at once).
layout = go.Layout(
    yaxis=dict(
        title='Count'
    ),
    xaxis=dict(
        title='Loan payment Status'
    ),
    title='Loan Payment Status'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='grouped-bar')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def generate_graph(x_val, categories=None):
    """Plot the default rate (in %) for each category of ``loan_data[x_val]``.

    The column is converted in place to an ordered categorical so the bars
    appear in ``categories`` order; values not listed become NaN. Each bar is
    annotated with its height to one decimal place.

    Args:
        x_val: Name of the ``loan_data`` column to group by.
        categories: Ordered list of category labels. Defaults to the
            loan-amount range labels.

    Returns:
        The matplotlib Axes returned by ``sns.barplot``.
    """
    if categories is None:
        categories = ['0-5,000', '5,000-10,000', '10,000-15,000', '15,000-20,000', '20,000-25,000','> 25,000']
    loan_data[x_val] = pd.Categorical(loan_data[x_val], categories=categories, ordered=True)
    # loan_status_binary is 0/1, so mean * 100 is the percentage of defaults.
    # np.mean replaces the Python-level sum()/len(), which is slow because
    # seaborn calls the estimator repeatedly.
    splot = sns.barplot(x=x_val, y='loan_status_binary',
                        data=loan_data, estimator=lambda x: np.mean(x) * 100, palette="RdYlBu")
    plt.xticks(rotation='vertical')
    for p in splot.patches:
        # Place the value label 10 points above the top-centre of each bar.
        splot.annotate(format(p.get_height(), '.1f'),
                       (p.get_x() + p.get_width() / 2., p.get_height()),
                       ha = 'center', va = 'center',
                       xytext = (0, 10),
                       textcoords = 'offset points')
    return splot


generate_graph('loan_amnt_range')
plt.xlabel(" Loan Amount", size=16)
plt.ylabel(" % of default ", size=16)

### How home ownership helps in understanding who is more likely to pay off

In [None]:
# Count loans per home-ownership type, separately for each final status.
paid_by_home = loan_data.loc[loan_data['loan_status'].eq('Fully Paid'), "home_ownership"].value_counts()
charged_by_home = loan_data.loc[loan_data['loan_status'].eq('Charged Off'), "home_ownership"].value_counts()

# Trace for fully paid loans
tr0 = go.Bar(x=paid_by_home.index.values, y=paid_by_home.values, name='Fully Paid')

# Trace for charged-off loans
tr1 = go.Bar(x=charged_by_home.index.values, y=charged_by_home.values, name='Charged Off')

data = [tr0, tr1]
layout = go.Layout(title='Home Ownership Distribution')
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='Home-Ownership')

In [None]:
def generate_graph(x_val, categories=None):
    """Plot the default rate (in %) for each category of ``loan_data[x_val]``.

    The column is converted in place to an ordered categorical so the bars
    appear in ``categories`` order; values not listed become NaN. Each bar is
    annotated with its height to one decimal place.

    Args:
        x_val: Name of the ``loan_data`` column to group by.
        categories: Ordered list of category labels. Defaults to the
            home-ownership labels.

    Returns:
        The matplotlib Axes returned by ``sns.barplot``.
    """
    if categories is None:
        categories = ['RENT', 'MORTGAGE', 'OWN', 'OTHER', 'NONE']
    loan_data[x_val] = pd.Categorical(loan_data[x_val], categories=categories, ordered=True)
    # loan_status_binary is 0/1, so mean * 100 is the percentage of defaults.
    # np.mean replaces the Python-level sum()/len(), which is slow because
    # seaborn calls the estimator repeatedly.
    splot = sns.barplot(x=x_val, y='loan_status_binary',
                        data=loan_data, estimator=lambda x: np.mean(x) * 100, palette="RdYlBu")
    plt.xticks(rotation='vertical')
    for p in splot.patches:
        # Place the value label 10 points above the top-centre of each bar.
        splot.annotate(format(p.get_height(), '.1f'),
                       (p.get_x() + p.get_width() / 2., p.get_height()),
                       ha = 'center', va = 'center',
                       xytext = (0, 10),
                       textcoords = 'offset points')
    return splot

generate_graph('home_ownership')
plt.xlabel(" Home Ownership ")
plt.ylabel(" % of default ")

### Now, let us understand if the purpose of loan taken & loan amount has any relation with loan payment

In [None]:
# Split the data by final status, then draw grouped box plots of the loan
# amount for each loan purpose.
df_fp = loan_data.loc[loan_data['loan_status'].eq('Fully Paid')]
df_co = loan_data.loc[loan_data['loan_status'].eq('Charged Off')]

tr0 = go.Box(
    x=df_fp["purpose"],
    y=df_fp["loan_amnt"],
    name='Fully Paid',
    marker={'color': '#3D9970'},
)
tr1 = go.Box(
    x=df_co["purpose"],
    y=df_co["loan_amnt"],
    name='Charged Off',
    marker={'color': '#FF4136'},
)
data = [tr0, tr1]

layout = go.Layout(
    yaxis={'title': 'Loan Amount Range', 'zeroline': False},
    xaxis={'title': 'Purpose of Loan Taken'},
    boxmode='group',
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='box-age-cat')

### How the home ownership type along with annual income helps in understanding who is more likely to pay off

In [None]:
# Split the data by final status, then draw grouped box plots of annual
# income for each home-ownership type.
df_fp = loan_data.loc[loan_data['loan_status'].eq('Fully Paid')]
df_co = loan_data.loc[loan_data['loan_status'].eq('Charged Off')]

tr0 = go.Box(
    x=df_fp["home_ownership"],
    y=df_fp["annual_inc"],
    name='Fully Paid',
    marker={'color': '#3D9970'},
)
tr1 = go.Box(
    x=df_co["home_ownership"],
    y=df_co["annual_inc"],
    name='Charged Off',
    marker={'color': '#FF4136'},
)
data = [tr0, tr1]

layout = go.Layout(
    yaxis={'title': 'Annual Income', 'zeroline': False},
    xaxis={'title': 'Home Ownership Status'},
    boxmode='group',
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='box-age-cat')

### How home ownership along with the interest rate charged on loans helps in understanding who is more likely to pay off

In [None]:
# Split the data by final status, then draw grouped box plots of the interest
# rate for each home-ownership type.
df_fp = loan_data.loc[loan_data['loan_status'].eq('Fully Paid')]
df_co = loan_data.loc[loan_data['loan_status'].eq('Charged Off')]

tr0 = go.Box(
    x=df_fp["home_ownership"],
    y=df_fp["int_rate"],
    name='Fully Paid',
    marker={'color': '#3D9970'},
)
tr1 = go.Box(
    x=df_co["home_ownership"],
    y=df_co["int_rate"],
    name='Charged Off',
    marker={'color': '#FF4136'},
)
data = [tr0, tr1]

layout = go.Layout(
    yaxis={'title': 'Interest Rate', 'zeroline': False},
    xaxis={'title': 'Home Ownership Status'},
    boxmode='group',
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='box-age-cat')

### Does the installment paid on loans help in understanding who is more likely to pay off

In [None]:
# Grouped box plots of the installment amount for each home-ownership type,
# split by final loan status.
df_fp = loan_data[loan_data['loan_status']== 'Fully Paid']
df_co = loan_data[loan_data['loan_status']== 'Charged Off']

tr0 = go.Box(y=df_fp["installment"], x=df_fp["home_ownership"], name='Fully Paid', marker=dict(color='#3D9970'))

tr1 = go.Box(y=df_co["installment"], x=df_co["home_ownership"], name='Charged Off', marker=dict(color='#FF4136'))

data = [tr0, tr1]

# Axis title spelling fixed (was 'Home Owenership Status').
layout = go.Layout(yaxis=dict(title='Installment', zeroline=False),
                   xaxis=dict(title='Home Ownership Status'), boxmode='group')

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='box-age-cat')

### Which employment tenures are possibly the riskiest for paying off the loan

In [None]:
# Count loans per employment length, separately for each final status.
paid_by_emp = loan_data.loc[loan_data['loan_status'].eq('Fully Paid'), "emp_length"].value_counts()
charged_by_emp = loan_data.loc[loan_data['loan_status'].eq('Charged Off'), "emp_length"].value_counts()

# Trace for fully paid loans
tr0 = go.Bar(x=paid_by_emp.index.values, y=paid_by_emp.values, name='Fully Paid')

# Trace for charged-off loans
tr1 = go.Bar(x=charged_by_emp.index.values, y=charged_by_emp.values, name='Charged Off')

data = [tr0, tr1]
layout = go.Layout(title='Employee Length Distribution')
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='Emp_Length')

### Does the background verification of the borrower's income help in understanding who is more likely to pay off

In [None]:
# Count loans per income-verification status, separately for each final status.
paid_by_verif = loan_data.loc[loan_data['loan_status'].eq('Fully Paid'), "verification_status"].value_counts()
charged_by_verif = loan_data.loc[loan_data['loan_status'].eq('Charged Off'), "verification_status"].value_counts()

# Trace for fully paid loans
tr0 = go.Bar(x=paid_by_verif.index.values, y=paid_by_verif.values, name='Fully Paid')

# Trace for charged-off loans
tr1 = go.Bar(x=charged_by_verif.index.values, y=charged_by_verif.values, name='Charged Off')

data = [tr0, tr1]
layout = go.Layout(title='Income Verification Status')
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='Emp_VS')

### Does the verification status along with the borrower's annual income help in understanding who is more likely to pay off

In [None]:
# Grouped box plots of annual income for each verification status, split by
# final loan status.
df_fp = loan_data[loan_data['loan_status']== 'Fully Paid']
df_co = loan_data[loan_data['loan_status']== 'Charged Off']

tr0 = go.Box(y=df_fp["annual_inc"], x=df_fp["verification_status"], name='Fully Paid', marker=dict(color='#3D9970'))

tr1 = go.Box(y=df_co["annual_inc"], x=df_co["verification_status"], name='Charged Off', marker=dict(color='#FF4136'))

data = [tr0, tr1]

# The y-axis shows annual_inc, so it is titled 'Annual Income' (it previously
# read 'Loan Amount').
layout = go.Layout(yaxis=dict(title='Annual Income', zeroline=False),
                   xaxis=dict(title='Verification Status'), boxmode='group')

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='box-age-cat')