## Project Index

1. **[Import packages](#import_packages)**
    
2. **[Preprocessing](#Preprocessing)**
    1. **[Null Values Treatment](#null_values)**
    2. **[Upper Lower Capping](#capping)**
        
3. **[EDA](#EDA)**

4. **[Model Building](#Model_Building)**

5. **[Model Evaluation](#Model_Evaluation)**

6. **[Conclusion](#Conclusion)**

<a id='import_packages'></a>
## Data And Packages Loading

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [70]:
data = pd.read_csv('XYZCorp_LendingData.txt', sep = '\t', na_values = None)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
data.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,default_ind
0,1077501,1296599,5000.0,5000.0,4975.0,36 months,10.65,162.87,B,B2,...,,,,,,,,,,0
1,1077430,1314167,2500.0,2500.0,2500.0,60 months,15.27,59.83,C,C4,...,,,,,,,,,,1
2,1077175,1313524,2400.0,2400.0,2400.0,36 months,15.96,84.33,C,C5,...,,,,,,,,,,0
3,1076863,1277178,10000.0,10000.0,10000.0,36 months,13.49,339.31,C,C1,...,,,,,,,,,,0
4,1075358,1311748,3000.0,3000.0,3000.0,60 months,12.69,67.79,B,B5,...,,,,,,,,,,0


In [4]:
# Dataframe Shape
data.shape

(855969, 73)

In [71]:
# Drop 'id' and 'member_id' as it will not play significant role into model building process
data.drop(['id','member_id'], inplace = True, axis = 1)

In [6]:
# List of All Data Columns
data.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       'home_ownership', 'annual_inc', 'verification_status', 'issue_d',
       'pymnt_plan', 'desc', 'purpose', 'title', 'zip_code', 'addr_state',
       'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths',
       'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'application_type', 'annual_inc_joint', 'dti_joint',
       'verification_status_joint', 'acc_now_delinq', 'tot_coll_amt',
       'tot

In [19]:
# Statistics for Numerical Data
data.describe()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,...,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,default_ind
count,855969.0,855969.0,855969.0,855969.0,855969.0,855969.0,855969.0,855969.0,855969.0,416157.0,...,11609.0,13288.0,13288.0,13288.0,13288.0,788656.0,13288.0,13288.0,13288.0,855969.0
mean,14745.571335,14732.378305,14700.061226,13.19232,436.238072,75071.19,18.122165,0.311621,0.680915,34.149943,...,71.486993,1.354305,2.945515,5840.443332,61.024526,32163.57,0.947772,1.524232,1.841963,0.054286
std,8425.340005,8419.471653,8425.805478,4.368365,243.726876,64264.47,17.423629,0.857189,0.964033,21.8685,...,23.015293,1.48371,2.595313,5108.500262,20.018117,37699.64,1.441667,2.697601,2.975049,0.226581
min,500.0,500.0,0.0,5.32,15.69,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.0,0.0
25%,8000.0,8000.0,8000.0,9.99,260.55,45000.0,11.88,0.0,0.0,15.0,...,58.5,0.0,1.0,2405.0,47.9,14000.0,0.0,0.0,0.0,0.0
50%,13000.0,13000.0,13000.0,12.99,382.55,65000.0,17.61,0.0,0.0,31.0,...,75.0,1.0,2.0,4485.5,62.1,23800.0,0.0,0.0,2.0,0.0
75%,20000.0,20000.0,20000.0,15.99,571.56,90000.0,23.9,0.0,1.0,50.0,...,87.5,2.0,4.0,7701.25,75.3,39900.0,1.0,2.0,3.0,0.0
max,35000.0,35000.0,35000.0,28.99,1445.46,9500000.0,9999.0,39.0,8.0,188.0,...,223.3,22.0,43.0,83047.0,151.4,9999999.0,15.0,33.0,32.0,1.0


<a id='Preprocessing'></a>
## Data Preprocessing

In [72]:
# Saperating Dependant and Independant Variables
y = data['default_ind']
x = data.drop('default_ind', axis = 1)

In [73]:
x.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,5000.0,5000.0,4975.0,36 months,10.65,162.87,B,B2,,10+ years,...,,,,,,,,,,
1,2500.0,2500.0,2500.0,60 months,15.27,59.83,C,C4,Ryder,< 1 year,...,,,,,,,,,,
2,2400.0,2400.0,2400.0,36 months,15.96,84.33,C,C5,,10+ years,...,,,,,,,,,,
3,10000.0,10000.0,10000.0,36 months,13.49,339.31,C,C1,AIR RESOURCES BOARD,10+ years,...,,,,,,,,,,
4,3000.0,3000.0,3000.0,60 months,12.69,67.79,B,B5,University Medical Group,1 year,...,,,,,,,,,,


<a id='null_values'></a>
## Missing Value Treatment

In [74]:
drop_columns = []

def get_na(data): 
    null_vars = data.isnull().sum()
    null_vars = null_vars[null_vars > 0]
    if(len(null_vars) > 0):
        null_vars.sort_values(inplace=True)
        print(str(null_vars/data.shape[0] * 100))
        # null_vars.plot.bar(figsize=(15,4))
    else:
        print("No column have NA values")

In [75]:
get_na(x)

title                           0.003855
last_credit_pull_d              0.005841
collections_12_mths_ex_med      0.006542
revol_util                      0.052105
last_pymnt_d                    1.035318
emp_length                      5.030673
emp_title                       5.776261
total_rev_hi_lim                7.863953
tot_cur_bal                     7.863953
tot_coll_amt                    7.863953
next_pymnt_d                   29.553757
mths_since_last_delinq         51.381767
mths_since_last_major_derog    75.099682
mths_since_last_record         84.674211
desc                           85.769111
inq_fi                         98.447607
all_util                       98.447607
max_bal_bc                     98.447607
open_rv_24m                    98.447607
open_rv_12m                    98.447607
total_bal_il                   98.447607
open_il_24m                    98.447607
open_il_12m                    98.447607
open_il_6m                     98.447607
open_acc_6m     

In [76]:
## Removing Columns those are having more than 75% NA entries

drop_columns_list = ['mths_since_last_major_derog', 'mths_since_last_record','desc','inq_fi','all_util','max_bal_bc','open_rv_24m','open_rv_12m','total_bal_il','open_il_24m','open_il_12m','open_il_6m','open_acc_6m','total_cu_tl','inq_last_12m','mths_since_rcnt_il','il_util','verification_status_joint','annual_inc_joint','dti_joint']
x.drop(drop_columns_list, inplace = True, axis = 1)

In [77]:
get_na(x)

title                          0.003855
last_credit_pull_d             0.005841
collections_12_mths_ex_med     0.006542
revol_util                     0.052105
last_pymnt_d                   1.035318
emp_length                     5.030673
emp_title                      5.776261
tot_coll_amt                   7.863953
tot_cur_bal                    7.863953
total_rev_hi_lim               7.863953
next_pymnt_d                  29.553757
mths_since_last_delinq        51.381767
dtype: float64


## Imputing NA values for independent variable

In [78]:
x.title.head()

In [79]:
x.title.fillna('None', inplace = True)

In [80]:
x.last_credit_pull_d.head()

0    Jan-2016
1    Sep-2013
2    Jan-2016
3    Jan-2015
4    Jan-2016
Name: last_credit_pull_d, dtype: object

In [81]:
x.next_pymnt_d.fillna(method='ffill', inplace = True)

In [48]:
x.next_pymnt_d.tail()

855964    Feb-2016
855965    Feb-2016
855966    Feb-2016
855967    Feb-2016
855968    Feb-2016
Name: next_pymnt_d, dtype: object

In [83]:
x.collections_12_mths_ex_med.value_counts()

0.0     844768
1.0      10320
2.0        709
3.0         80
4.0         22
5.0          7
6.0          2
7.0          1
16.0         1
20.0         1
14.0         1
10.0         1
Name: collections_12_mths_ex_med, dtype: int64

In [84]:
x.collections_12_mths_ex_med.fillna(0, inplace = True)

In [85]:
x.revol_util.head()

0    83.7
1     9.4
2    98.5
3    21.0
4    53.9
Name: revol_util, dtype: float64

In [86]:
x.revol_util.fillna(round(x.revol_util.mean(),1), inplace = True)

In [87]:
x.last_pymnt_d.head()

0    Jan-2015
1    Apr-2013
2    Jun-2014
3    Jan-2015
4    Jan-2016
Name: last_pymnt_d, dtype: object

In [88]:
x.last_pymnt_d.fillna( method ='ffill', inplace = True)

In [110]:
x.emp_length.head()

0    10+ years
1     < 1 year
2    10+ years
3    10+ years
4       1 year
Name: emp_length, dtype: object

In [108]:
# Replace unwanted symbols and String for emp_length 
x.emp_length = x.emp_length.str.replace(r'\D', '')

In [111]:
x.emp_length.head()

0    10
1     1
2    10
3    10
4     1
Name: emp_length, dtype: object

In [113]:
x.emp_length.value_counts()

10    282090
1     122452
2      75986
3      67392
5      53812
4      50643
7      43204
8      42421
6      41446
9      33462
Name: emp_length, dtype: int64

In [116]:
x.emp_length.fillna( 0, inplace = True)

In [118]:
# Convering datatype of emp_length to int as it is representing number of years
x.emp_length = x.emp_length.astype(int)

In [119]:
x.emp_title.head()

0                         NaN
1                       Ryder
2                         NaN
3         AIR RESOURCES BOARD
4    University Medical Group
Name: emp_title, dtype: object

In [120]:
x.emp_title.fillna('None', inplace = True)

In [130]:
x.tot_coll_amt.tail()

855964    0.0
855965    0.0
855966    0.0
855967    0.0
855968    0.0
Name: tot_coll_amt, dtype: float64

In [139]:
x.tot_coll_amt.fillna( 0 , inplace = True)

In [141]:
x.tot_cur_bal.tail()

855964     25274.0
855965    140285.0
855966     34178.0
855967     58418.0
855968     33307.0
Name: tot_cur_bal, dtype: float64

In [145]:
x.tot_cur_bal = x.tot_cur_bal.fillna(round(x.tot_cur_bal.mean(), 2), inplace = True)

In [167]:
x.total_rev_hi_lim.tail()

855964    17100.0
855965    10200.0
855966    18000.0
855967    27000.0
855968    41700.0
Name: total_rev_hi_lim, dtype: float64

In [164]:
x.next_pymnt_d.tail()

855964    Feb-2016
855965    Feb-2016
855966    Feb-2016
855967    Feb-2016
855968    Feb-2016
Name: next_pymnt_d, dtype: object

In [158]:
x.mths_since_last_delinq.head()

0    None
1    None
2    None
3    None
4    None
Name: mths_since_last_delinq, dtype: object

In [161]:
x.mths_since_last_delinq = data.mths_since_last_delinq.fillna(round(x.mths_since_last_delinq.mean(), 2), inplace = False)

In [165]:
get_na(x)

last_credit_pull_d          0.005841
next_pymnt_d               29.553757
mths_since_last_delinq     51.381767
total_rev_hi_lim          100.000000
dtype: float64


<a id='capping'></a>
## Upper Lower Capping

 <a id='EDA'></a>
## Exploratory Data Analysis

<a id='Model_Building'></a>
## Model Building

As this is kind of classification problem .


In [None]:
# Logistic Regression

logistic_model = 


In [None]:
# Random Forest

rf_model = 


In [None]:
# SVM - Support Vector Machine

svm_model = 


In [None]:
# KNN - K Nearest Neighbour

knn_model =


<a id='Model_Evaluation'></a>
## Model Evaluation

<a id='conclusion'></a>
## Conclusion