In [None]:
#Importing the python libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Filtering out the warnings
import warnings
warnings.filterwarnings('ignore')

#Set the max columns and rows
pd.set_option('display.max_columns',150)
pd.set_option('display.max_rows',150)

In [None]:
#Import the data set
# Read application_data.csv (path is relative to the notebook's working directory)
# into `data`, the frame every later cell works on.
data = pd.read_csv("application_data.csv")
# Preview the first rows as a quick sanity check of the load.
data.head()


In [None]:
# Report the frame's dimensions, then the dtype of each column.
dataset_shape = data.shape
print("Shape of the dataset: ", dataset_shape)
column_dtypes = data.dtypes
print("Data types of the dataset: \n", column_dtypes)

In [None]:
# Percentage of null entries per column: null count divided by row count, times 100.
null_counts = data.isnull().sum()
null_counts / len(data) * 100

In [None]:
# Drop the columns which have more than 50% missing values
missing_pct = data.isnull().sum() / len(data) * 100
# One-column boolean frame (column label 0): True where a column exceeds the threshold.
df = pd.DataFrame(missing_pct > 50)
df_drop = df.loc[df[0]].index
data = data.drop(columns=df_drop)
data

In [None]:
# Rows where CNT_FAM_MEMBERS is null.
data.loc[data['CNT_FAM_MEMBERS'].isna()]

In [None]:
# Summary statistics for AMT_ANNUITY.
data['AMT_ANNUITY'].describe()

In [None]:
# Replace missing family-member counts with 0.
# The filled column is assigned back to the frame: calling fillna(inplace=True) on an
# attribute-accessed Series is chained assignment, which pandas deprecates and which no
# longer updates `data` once copy-on-write is active.
data['CNT_FAM_MEMBERS'] = data['CNT_FAM_MEMBERS'].fillna(0)

### Replaced nulls in CNT_FAM_MEMBERS with 0 [ Assumption : there may not be any family members present. Note : Family Status is Unknown ] 

In [None]:
# Rows where AMT_ANNUITY is null.
data.loc[data['AMT_ANNUITY'].isna()]

In [None]:
### Very few AMT_ANNUITY values are NaN, so they are replaced with the column mean.
# The mean is computed on the non-null values; the filled column is assigned back because
# an in-place fill on the attribute-accessed Series is deprecated chained assignment and
# does not update `data` under pandas copy-on-write.
data['AMT_ANNUITY'] = data['AMT_ANNUITY'].fillna(data['AMT_ANNUITY'].mean())

In [None]:
# Rows where DAYS_LAST_PHONE_CHANGE is null.
data.loc[data['DAYS_LAST_PHONE_CHANGE'].isna()]

In [None]:
# Summary statistics for DAYS_LAST_PHONE_CHANGE.
data['DAYS_LAST_PHONE_CHANGE'].describe()

In [None]:
# Number of rows whose DAYS_LAST_PHONE_CHANGE equals 0.
data.loc[data['DAYS_LAST_PHONE_CHANGE'] == 0, 'DAYS_LAST_PHONE_CHANGE'].count()

In [None]:
# Replace the missing DAYS_LAST_PHONE_CHANGE value(s) with 0.
# Assigned back to the frame rather than filled in place on the attribute-accessed Series,
# which is deprecated chained assignment and has no effect under pandas copy-on-write.
data['DAYS_LAST_PHONE_CHANGE'] = data['DAYS_LAST_PHONE_CHANGE'].fillna(0)

**In DAYS_LAST_PHONE_CHANGE column ====> only one record has null value so replaced it with 0.**

In [None]:
# First rows where NAME_TYPE_SUITE is null.
data.loc[data['NAME_TYPE_SUITE'].isna()].head()

In [None]:
# Distinct values present in NAME_TYPE_SUITE.
data['NAME_TYPE_SUITE'].unique()

In [None]:
# Replace missing NAME_TYPE_SUITE entries with 'Unaccompanied'.
# Assigned back to the frame rather than filled in place on the attribute-accessed Series,
# which is deprecated chained assignment and has no effect under pandas copy-on-write.
data['NAME_TYPE_SUITE'] = data['NAME_TYPE_SUITE'].fillna('Unaccompanied')

### In NAME_TYPE_SUITE, we replaced the NaNs with the mode, i.e. 'Unaccompanied'. Also, NaN (or Unknown) in this field is similar to 'Unaccompanied'.

In [None]:
# First rows where OCCUPATION_TYPE is null.
data.loc[data['OCCUPATION_TYPE'].isna()].head()

In [None]:
# Distinct values present in OCCUPATION_TYPE.
data['OCCUPATION_TYPE'].unique()

In [None]:
# Replace missing OCCUPATION_TYPE entries with the catch-all label 'Others'.
# Assigned back to the frame rather than filled in place on the attribute-accessed Series,
# which is deprecated chained assignment and has no effect under pandas copy-on-write.
data['OCCUPATION_TYPE'] = data['OCCUPATION_TYPE'].fillna('Others')

**In OCCUPATION_TYPE, we replaced the NULLS with 'Others'.**

In [None]:
#Check the datatypes of all the columns
# Displayed again after the null handling above so the remaining columns can be reviewed.
data.dtypes

In [None]:
# Columns removed before the analysis, listed group by group and dropped in a single call.
columns_to_drop = [
    # building-related aggregates
    'FLOORSMAX_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'FLOORSMAX_MODE',
    'YEARS_BEGINEXPLUATATION_MEDI', 'FLOORSMAX_MEDI', 'TOTALAREA_MODE', 'EMERGENCYSTATE_MODE',
    # social-circle counters
    'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE',
    'DEF_60_CNT_SOCIAL_CIRCLE',
    # document flags
    'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6',
    'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
    'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16',
    'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21',
    # external source scores
    'EXT_SOURCE_2', 'EXT_SOURCE_3',
    # credit-bureau request counters
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
    # phone-change column
    'DAYS_LAST_PHONE_CHANGE',
    # region, contact-flag and application-time columns
    'REGION_POPULATION_RELATIVE', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
    'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START', 'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
]

# As with the separate drops, a KeyError is raised if any listed column is absent.
data = data.drop(columns=columns_to_drop)



In [None]:
# (rows, columns) of the frame after the column drops above.
data.shape

In [None]:
#Check AMT_INCOME_TOTAL for outliers
# The column is passed as `x=` explicitly: a lone positional argument is treated as `x` by
# older seaborn releases and as `data` by newer ones, which changes the plot orientation.
sns.boxplot(x=data['AMT_INCOME_TOTAL'])

In [None]:
# Quartiles plus the 99th percentile and the maximum of AMT_INCOME_TOTAL.
data['AMT_INCOME_TOTAL'].quantile(q=[0.25, 0.5, 0.75, 0.99, 1])

In [None]:
# AMT_INCOME_TOTAL has extreme high values: keep only the rows at or below its 99th
# percentile (the cut-off is a quantile of the column, not an IQR-based fence).
data = data[data['AMT_INCOME_TOTAL']<=data.AMT_INCOME_TOTAL.quantile(0.99)]

In [None]:
# Same quantiles again, now computed on the filtered frame.
data['AMT_INCOME_TOTAL'].quantile(q=[0.25, 0.5, 0.75, 0.99, 1])

In [None]:
#Check AMT_ANNUITY for outliers
# The column is passed as `x=` explicitly: a lone positional argument is treated as `x` by
# older seaborn releases and as `data` by newer ones, which changes the plot orientation.
sns.boxplot(x=data['AMT_ANNUITY'])

In [None]:
# Quartiles, upper percentiles and the maximum of AMT_ANNUITY.
data['AMT_ANNUITY'].quantile(q=[0.25, 0.5, 0.75, 0.95, 0.99, 1])

In [None]:
# AMT_ANNUITY has extreme high values: keep only the rows at or below its 99th
# percentile (the cut-off is a quantile of the column, not an IQR-based fence).
data = data[data['AMT_ANNUITY'] <= data.AMT_ANNUITY.quantile(0.99)]

In [None]:
#Check AMT_GOODS_PRICE for outliers
# The column is passed as `x=` explicitly: a lone positional argument is treated as `x` by
# older seaborn releases and as `data` by newer ones, which changes the plot orientation.
sns.boxplot(x=data['AMT_GOODS_PRICE'])

In [None]:
# Quartiles, upper percentiles and the maximum of AMT_GOODS_PRICE.
data['AMT_GOODS_PRICE'].quantile(q=[0.25, 0.5, 0.75, 0.95, 0.99, 1])

In [None]:
# AMT_GOODS_PRICE has extreme high values: drop the rows above its 99th percentile.
# This cell previously filtered on AMT_INCOME_TOTAL a second time (copy-paste slip), so the
# goods-price outliers inspected in the preceding cells were never removed.
goods_price_cap = data['AMT_GOODS_PRICE'].quantile(0.99)
# Negating `>` (rather than using `<=`) keeps rows whose goods price is missing, so this
# step removes outliers only and does not silently discard NaN rows.
data = data[~(data['AMT_GOODS_PRICE'] > goods_price_cap)]

In [None]:
# Store the magnitude of DAYS_BIRTH, then derive Age as that value divided by 365.
data['DAYS_BIRTH'] = data['DAYS_BIRTH'].abs()
data['Age'] = data['DAYS_BIRTH'] / 365

**Checking for Data Imbalance**

In [None]:
# Share of each TARGET value, as a percentage of all rows.
target_counts = data['TARGET'].value_counts()
target_counts / len(data) * 100

In [None]:
# Bar chart of row counts per TARGET value.
# The column is named via `x=`/`data=`: a lone positional argument is interpreted
# differently across seaborn releases (as `x` in older ones, as `data` in newer ones).
sns.countplot(x='TARGET', data=data)

In [None]:
# Bucket AMT_CREDIT: one bin up to 150000, 50000-wide bins up to 900000, then an open-ended top bin.
bins = [0] + list(range(150000, 900001, 50000)) + [1000000000]
# Labels are "<lower>-<upper>" for every bounded bin plus a final catch-all label.
slots = [f'{lower}-{upper}' for lower, upper in zip(bins[:-2], bins[1:-1])] + ['900000 and above']

data['AMT_CREDIT_BIN'] = pd.cut(data['AMT_CREDIT'], bins=bins, labels=slots)

In [None]:
# Bucket AMT_INCOME_TOTAL: 25000-wide bins up to 500000, then an open-ended top bin.
bins = [0] + list(range(25000, 500001, 25000)) + [10000000000]
# Labels are "<lower>-<upper>" for every bounded bin plus a final catch-all label.
slot = [f'{lower}-{upper}' for lower, upper in zip(bins[:-2], bins[1:-1])] + ['500000 and above']

data['AMT_INCOME_BIN'] = pd.cut(data['AMT_INCOME_TOTAL'], bins, labels=slot)

**Division of Dataset based on Target column**

In [None]:
# Split the frame by TARGET value: data0 holds TARGET == 0, data1 holds TARGET == 1.
data0 = data.loc[data['TARGET'] == 0]
data1 = data.loc[data['TARGET'] == 1]

****Univariate Analysis****

In [None]:
#NAME_CONTRACT_TYPE
# One pie per TARGET group showing the share of each contract type.
plt.figure(figsize = (15,12))

for panel, (frame, heading) in enumerate([(data0, 'TARGET = 0'), (data1, 'TARGET = 1')], start=1):
    contract_counts = frame.NAME_CONTRACT_TYPE.value_counts()
    plt.subplot(1, 2, panel)
    plt.title(heading)
    plt.pie(contract_counts, labels=contract_counts.index, startangle=70, autopct='%1.1f%%')

plt.show()

`Inference` : We see that the ratio of Revolving Loans to Cash Loans is same in both the cases i.e., defaulters and non defaulters

In [None]:
#CODE_GENDER
# Count of applicants per gender code, one panel per TARGET group.
# Both panels use the same explicit category order; previously only the first panel had
# one, so the bars of the two panels could appear in different positions.
gender_order = ["M", "F", "XNA"]

plt.figure(figsize = (10,5))

plt.subplot(1,2,1)
plt.title('TARGET = 0')
sns.countplot(x='CODE_GENDER', data=data0, order=gender_order)

plt.subplot(1,2,2)
plt.title('TARGET = 1')
sns.countplot(x='CODE_GENDER', data=data1, order=gender_order)
plt.show()

In [None]:
# Row count per CODE_GENDER value across the whole frame.
data['CODE_GENDER'].value_counts()

`Inference` : We see that female applicants take more loans than male applicants.
7% of Female applicants are defaulters.
10.45% of Male applicants are defaulters.

In [None]:
### Bar plot for Distribution of Applicants by occupation

fig = plt.figure(figsize=[15,7])
# Occupation frequencies, largest first.
occupation_counts = data['OCCUPATION_TYPE'].value_counts().sort_values(ascending=False)
occupation_counts.plot.bar()
plt.title('Bar plot for Distribution of Applicants by Occupation')

In [None]:
#Univariate analysis of categorical data
# Occupation frequencies (largest first), one panel per TARGET group.
fig = plt.figure(figsize=[15,7])

plt.subplot(1, 2, 1)
data0['OCCUPATION_TYPE'].value_counts().sort_values(ascending=False).plot.bar()
plt.title('Target 0')

plt.subplot(1, 2, 2)
data1['OCCUPATION_TYPE'].value_counts().sort_values(ascending=False).plot.bar(color='r')
plt.title('Target 1')

# tight_layout takes padding options, not a figure, and only affects the rendered output
# when it runs before the figure is shown.
plt.tight_layout()
plt.show()

`Inference` : We can see that most of the loans are taken by Labourers followed by Sales Staff, Core Staff etc. and the defaulters to non defaulters also follow the same sequence.

In [None]:
#NAME_INCOME_TYPE 
# Income-type frequencies (largest first), one panel per TARGET group.
fig = plt.figure(figsize=[15,7])

plt.subplot(1, 2, 1)
data0['NAME_INCOME_TYPE'].value_counts().sort_values(ascending=False).plot.bar(color='b')
plt.title('Target 0')

plt.subplot(1, 2, 2)
data1['NAME_INCOME_TYPE'].value_counts().sort_values(ascending=False).plot.bar(color='r')
plt.title('Target 1')

# tight_layout takes padding options, not a figure, and only affects the rendered output
# when it runs before the figure is shown.
plt.tight_layout()
plt.show()

We see that working professionals take most of the loans.

In [None]:
#NAME_EDUCATION_TYPE 
# Education-type frequencies (largest first), one panel per TARGET group.
fig = plt.figure(figsize=[10,3])

plt.subplot(1, 2, 1)
data0['NAME_EDUCATION_TYPE'].value_counts().sort_values(ascending=False).plot.bar(color='b')
plt.title('Target 0')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
data1['NAME_EDUCATION_TYPE'].value_counts().sort_values(ascending=False).plot.bar(color='r')
plt.title('Target 1')
plt.xticks(rotation=45)

# tight_layout takes padding options, not a figure, and only affects the rendered output
# when it runs before the figure is shown.
plt.tight_layout()
plt.show()

`Inference` : We see that clients with Secondary Education take most number of loans.

In [None]:
#NAME_HOUSING_TYPE
plt.figure(figsize = (12,3.5))

# Both panels share the category order of the TARGET = 0 frame (most frequent first).
housing_order = data0.NAME_HOUSING_TYPE.value_counts().index
for panel, (frame, heading) in enumerate([(data0, 'TARGET = 0'), (data1, 'TARGET = 1')], start=1):
    plt.subplot(1, 2, panel)
    plt.title(heading)
    sns.countplot(x=frame.NAME_HOUSING_TYPE, data=frame, order=housing_order)
    plt.xticks(rotation=45)

plt.show()

`Inference` : We also notice that most of the loans are taken by clients who have their own House/Apartments.

In [None]:
#NAME_FAMILY_STATUS
plt.figure(figsize = (15,5))

# Both panels share the category order of the TARGET = 0 frame (most frequent first).
family_order = data0.NAME_FAMILY_STATUS.value_counts().index
for panel, (frame, heading) in enumerate([(data0, 'TARGET = 0'), (data1, 'TARGET = 1')], start=1):
    plt.subplot(1, 2, panel)
    plt.title(heading)
    sns.countplot(x=frame.NAME_FAMILY_STATUS, data=frame, order=family_order)
    plt.xticks(rotation=45)

plt.show()

`Inference` : We also notice that most of the loans are taken by clients who are married.

**Univariate - Continuous variables**

In [None]:
#AMT_ANNUITY
# Distribution of AMT_ANNUITY, one panel per TARGET group.
plt.figure(figsize = (15,5))

for panel, (frame, heading) in enumerate([(data0, 'Target = 0'), (data1, 'Target = 1')], start=1):
    plt.subplot(1, 2, panel)
    plt.title(heading)
    sns.distplot(frame.AMT_ANNUITY)
plt.show()

`Inference` : We notice that the Annuity amount data is more skewed towards left in the case of non-defaulters. Around 70% of people with low loan annuity are bound to pay their loan on time.

In [None]:
#DAYS_BIRTH
# Distribution of DAYS_BIRTH divided by 365, one panel per TARGET group.
plt.figure(figsize = (15,5))

for panel, (frame, heading) in enumerate([(data0, 'Target = 0'), (data1, 'Target = 1')], start=1):
    plt.subplot(1, 2, panel)
    plt.title(heading)
    sns.distplot(frame.DAYS_BIRTH/365)
plt.show()

`Inference` : We notice that the age of applicants in the case of non-defaulters are normally distributed whereas they are left skewed in the case of defaulters. We also notice that most of the defaulters have age around 30 years and as the age increases the number of defaulters decreases.

In [None]:
# Applicants per income bin; TARGET = 0 on the left panel, TARGET = 1 on the right.
plt.figure(figsize=(20,5))

income_panels = [
    (2, data1, "Distribution of Applicant's Income Target 1"),
    (1, data0, "Distribution of Applicant's Income Target 0"),
]
for panel, frame, heading in income_panels:
    plt.subplot(1, 2, panel)
    sns.countplot(x=frame['AMT_INCOME_BIN'], palette='rocket')
    plt.title(heading)
    plt.xlabel("Income Categories")
    plt.ylabel("Number of Applicants")
    plt.xticks(rotation=45)

plt.show()

`Inference` : Most of the applicants lie between 50k and 225k income range.

In [None]:
# Applicants per credit bin; TARGET = 0 on the left panel, TARGET = 1 on the right.
plt.figure(figsize=(15,5))

credit_panels = [
    (2, data1, "Distribution of Applicant's Credit, Target = 1"),
    (1, data0, "Distribution of Applicant's Credit, Target = 0"),
]
for panel, frame, heading in credit_panels:
    plt.subplot(1, 2, panel)
    sns.countplot(x=frame['AMT_CREDIT_BIN'], palette='rocket')
    plt.title(heading)
    plt.xticks(rotation=45)
plt.show()

`Inference` : In both categories, most applicants have taken the credit in the range of 250k to 300k.

### Bivariate Analysis 

**Bivariate - Continuous : Continuous**

In [None]:
# Columns considered for the correlation study (reused by the following cells).
columnsCorr = ['CODE_GENDER','FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 
               'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 
               'DAYS_ID_PUBLISH', 'FLAG_CONT_MOBILE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS']
plt.figure(figsize=(12,12))
# Restrict to numeric columns before correlating: the list includes label columns (e.g.
# CODE_GENDER holds values such as "M"/"F"), which older pandas silently skipped in
# .corr() but pandas >= 2.0 rejects with an error.
sns.heatmap(data0[columnsCorr].select_dtypes(include='number').corr(),annot=True)
plt.show()

In [None]:
# Correlation heatmap for the TARGET = 1 group.
plt.figure(figsize=(12,12))
# Restrict to numeric columns before correlating: non-numeric columns in the selection were
# silently skipped by older pandas but make .corr() raise on pandas >= 2.0.
ax = sns.heatmap(data1[columnsCorr].select_dtypes(include='number').corr(),annot=True)
# The y-limit adjustment that used to follow plt.show() was removed: it ran after the
# figure had already been rendered, so it never changed the displayed heatmap.
plt.show()

In [None]:
# Ten strongest pairwise correlations (by absolute value) in the TARGET = 0 group.
# Only numeric columns are correlated; non-numeric ones make .corr() raise on pandas >= 2.0.
corr0 = data0[columnsCorr].select_dtypes(include='number').corr()
# Boolean mask of the strict upper triangle, so each pair appears once and the diagonal is
# excluded. Built with the builtin `bool` dtype: the `np.bool` alias was removed in NumPy 1.24.
upper_mask0 = np.triu(np.ones(corr0.shape, dtype=bool), k=1)
corr0_df = corr0.where(upper_mask0).unstack().reset_index()
corr0_df.columns = ['Variable1', 'Variable2', 'Correlation']
# Masked-out cells are NaN after `where`; drop them to keep the real pairs only.
corr0_df = corr0_df.dropna(subset=['Correlation'])
corr0_df['Correlation'] = corr0_df['Correlation'].abs()
corr0_df.sort_values('Correlation', ascending=False).head(10)

In [None]:
# Ten strongest pairwise correlations (by absolute value) in the TARGET = 1 group.
# Only numeric columns are correlated; non-numeric ones make .corr() raise on pandas >= 2.0.
corr1 = data1[columnsCorr].select_dtypes(include='number').corr()
# Boolean mask of the strict upper triangle, so each pair appears once and the diagonal is
# excluded. Built with the builtin `bool` dtype: the `np.bool` alias was removed in NumPy 1.24.
upper_mask1 = np.triu(np.ones(corr1.shape, dtype=bool), k=1)
corr1_df = corr1.where(upper_mask1).unstack().reset_index()
corr1_df.columns = ['Variable1', 'Variable2', 'Correlation']
# Masked-out cells are NaN after `where`; drop them to keep the real pairs only.
corr1_df = corr1_df.dropna(subset=['Correlation'])
corr1_df['Correlation'] = corr1_df['Correlation'].abs()
corr1_df.sort_values('Correlation', ascending=False).head(10)

`Inference` : 

After comparing the two data sets for Target = 1 and Target = 0, we find that the sets of correlated columns are same for both the datasets.

1. AMT_GOODS_PRICE has more effect on AMT_CREDIT than AMT_INCOME_TOTAL [ As opposed to our assumptions ]
2. CNT_FAM_MEMBERS has a high correlation with CNT_CHILDREN

In [None]:
# AMT_INCOME_TOTAL against AMT_ANNUITY, one panel per TARGET group.
plt.figure(figsize=[15,7])

plt.subplot(1, 2, 1)
sns.scatterplot(x='AMT_INCOME_TOTAL', y='AMT_ANNUITY', data=data0, color='b')
plt.title('Target 0')

plt.subplot(1, 2, 2)
sns.scatterplot(x='AMT_INCOME_TOTAL', y='AMT_ANNUITY', data=data1, color='r')
plt.title('Target 1')

# Lay out the current figure before showing it. The earlier call passed `fig`, which this
# cell never assigns (so it relied on a variable left over from another cell), and it ran
# after plt.show(), when it could no longer affect the output.
plt.tight_layout()
plt.show()

In [None]:
# AMT_INCOME_TOTAL against AMT_CREDIT, one panel per TARGET group.
fig = plt.figure(figsize=[15,7])

plt.subplot(1, 2, 1)
sns.scatterplot(x='AMT_INCOME_TOTAL', y='AMT_CREDIT', data=data0, color='b')
plt.title('Target 0')

plt.subplot(1, 2, 2)
sns.scatterplot(x='AMT_INCOME_TOTAL', y='AMT_CREDIT', data=data1, color='r')
plt.title('Target 1')

# tight_layout takes padding options, not a figure, and only affects the rendered output
# when it runs before the figure is shown.
plt.tight_layout()
plt.show()


`Inference` : In the case of defaulters [Target 1] , Maximum loan given to people with income till 250k is around 130k.

In [None]:

# AMT_ANNUITY against AMT_CREDIT, one panel per TARGET group.
fig = plt.figure(figsize=[15,7])

plt.subplot(1, 2, 1)
sns.scatterplot(x='AMT_ANNUITY', y='AMT_CREDIT', data=data0, color='b')
plt.title('Target 0')

plt.subplot(1, 2, 2)
sns.scatterplot(x='AMT_ANNUITY', y='AMT_CREDIT', data=data1, color='r')
plt.title('Target 1')

# tight_layout takes padding options, not a figure, and only affects the rendered output
# when it runs before the figure is shown.
plt.tight_layout()
plt.show()

`Inference` : 
    For the same credit amount - some applicants have lower AMT_ANNUITY,other have higher. 
    This can be due to two things :
        1) Risk Involved
        2) Tenure
        
The upper edge of the cone has the applicants with the best possible AMT_ANNUITY amounts. 
This also marks the lowest AMT_ANNUITY one could get for a particular AMT_CREDIT.

The lower edge of the cone has the applicants who were assumed to be at a greater risk of being a defaulter.
This also marks the highest AMT_ANNUITY the bank can receive from the applicants.

We notice that the applicants with higher risk are likely to be defaulters.

**Bivariate: Continuous : Categorical**

In [None]:
# List the columns still present in the frame at this point.
data.columns

In [None]:
# Income distribution of the TARGET = 0 group, split by each categorical feature in turn.
features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS']
plt.figure(figsize = (15,20))
for panel, feature in enumerate(features, start=1):
    # 3x2 grid; panels are filled in the order the features are listed.
    plt.subplot(3, 2, panel)
    sns.boxplot(x=feature, y=data0.AMT_INCOME_TOTAL, data=data0)
    plt.xticks(rotation=45)

We notice the following from above plots:

- Females have comparatively lower income than Males and Others.
- People with comparatively higher income own cars.
- Income has no significant effect on owning a house or a flat.
- Applicants with Higher education and Academic degree have comparatively higher income than others.
- Applicants are likely to have better AMT_INCOME_TOTAL as their education level increases.
- Family Status has no significant effect on income.


In [None]:
#### We checked and the above inferences apply to target 1 as well. 

# Income distribution of the TARGET = 1 group, split by each categorical feature in turn.
features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS']
plt.figure(figsize = (15,20))
for panel, feature in enumerate(features, start=1):
    # 3x2 grid; panels are filled in the order the features are listed.
    plt.subplot(3, 2, panel)
    sns.boxplot(x=feature, y=data1.AMT_INCOME_TOTAL, data=data1)
    plt.xticks(rotation=45)

In [None]:
# AMT_CREDIT by education type, coloured by family status; TARGET = 0 left, TARGET = 1 right.
plt.figure(figsize=(15,5))

# Category orders come from the full frame so both panels are laid out identically.
education_order = data['NAME_EDUCATION_TYPE'].value_counts().index
family_order = data['NAME_FAMILY_STATUS'].value_counts().index

plt.subplot(1,2,2)
sns.boxplot(x=data1['NAME_EDUCATION_TYPE'],y=data1['AMT_CREDIT'],hue=data1['NAME_FAMILY_STATUS'],
            hue_order=family_order,order=education_order)
plt.title('Target 1 Family Status Distribution vs Credit amount')
# The x axis carries NAME_EDUCATION_TYPE (family status is the hue), so it is labelled as
# education rather than 'Family Status'.
plt.xlabel('Education Type')
plt.xticks(rotation=45)

plt.subplot(1,2,1)
sns.boxplot(x=data0['NAME_EDUCATION_TYPE'],y=data0['AMT_CREDIT'],hue=data0['NAME_FAMILY_STATUS'],
            hue_order=family_order,order=education_order)
plt.title('Target 0 Family Status Distribution vs Credit amount')
plt.xlabel('Education Type')
plt.xticks(rotation=45)

plt.show()

`Inference` : It is clearly visible that married applicants are more likely to be defaulters than other categories in NAME_FAMILY_STATUS

**Bivariate: Categorical : Categorical**

In [None]:
# TARGET = 0 group: applicants per family status, split by housing type.
plt.figure(figsize=(12,7))
sns.countplot(x='NAME_FAMILY_STATUS', hue='NAME_HOUSING_TYPE', data=data0)


`Inference`: Most of the clients own a house/apartment irrespective of their family status. 

In [None]:
# TARGET = 0 group: applicants per FLAG_OWN_REALTY value, split by gender code.
plt.figure(figsize=(12,7))
sns.countplot(x='FLAG_OWN_REALTY', hue='CODE_GENDER', data=data0)


`Inference`: Most of the female applicants own a house or apartment when compared to male applicants.

In [None]:
# TARGET = 0 group: applicants per FLAG_OWN_CAR value, split by gender code.
plt.figure(figsize=(12,7))
sns.countplot(x='FLAG_OWN_CAR', hue='CODE_GENDER', data=data0)

`Inference`: We notice that most of the female clients don't own a car when compared to male clients.

In [None]:
# TARGET = 0 group: applicants per education type, split by gender code.
plt.figure(figsize=(12,7))
sns.countplot(x='NAME_EDUCATION_TYPE', hue='CODE_GENDER', data=data0)

`Inference` : Clearly female applicants lead in the area of education when compared to male clients be it any level of education.

In [None]:
# Income density curves per gender code, one panel per TARGET group.
fig = plt.figure(figsize=[15,7])

plt.subplot(1, 2, 1)
for i in data0['CODE_GENDER'].unique():
    subset=data0[data0['CODE_GENDER']==i]
    sns.distplot(subset['AMT_INCOME_TOTAL'],hist=False, label=i)
plt.title('Target 0')
# Each curve is drawn with a label; show the legend so the curves can be told apart.
plt.legend()

plt.subplot(1, 2, 2)
for i in data1['CODE_GENDER'].unique():
    subset=data1[data1['CODE_GENDER']==i]
    sns.distplot(subset['AMT_INCOME_TOTAL'],hist=False, label=i)
plt.title('Target 1')
plt.legend()

# tight_layout takes padding options, not a figure, and only affects the rendered output
# when it runs before the figure is shown.
plt.tight_layout()
plt.show()

We notice that the clients with income between 50000 & 250000 take most of the loans and most of the defaulters are also from the same range across all genders.

In [None]:
# Read previous_application.csv (path is relative to the notebook's working directory);
# it is joined to the current applications on SK_ID_CURR in the next cell.
prev_app = pd.read_csv("previous_application.csv")

In [None]:
# Left-join each TARGET group to the previous applications on SK_ID_CURR, so applicants
# without a matching previous record are kept (with nulls in the joined columns).
appT1previous = pd.merge(data1, prev_app, how='left', on='SK_ID_CURR')
appT0previous = pd.merge(data0, prev_app, how='left', on='SK_ID_CURR')

In [None]:
# (rows, columns) of the TARGET = 0 frame after the left join with the previous applications.
appT0previous.shape

In [None]:
# Preview of the joined TARGET = 1 frame.
appT1previous.head()

In [None]:
# NAME_CONTRACT_STATUS is assumed to have no nulls in the previous-application file, so a
# null after the left join means no previous record was found for that applicant.
for merged_frame in (appT1previous, appT0previous):
    merged_frame['NAME_CONTRACT_STATUS'] = merged_frame['NAME_CONTRACT_STATUS'].fillna('No History')

In [None]:
#Current customers, previous loan status 
# Count of previous applications per contract status; TARGET = 0 left, TARGET = 1 right.
fig=plt.figure(figsize=(12,7))

for panel, frame, heading in [(2, appT1previous, 'Target 1'), (1, appT0previous, 'Target 0')]:
    plt.subplot(1, 2, panel)
    status_ax = sns.countplot(x='NAME_CONTRACT_STATUS', data=frame)
    status_ax.set_xticklabels(status_ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    plt.title(heading)

plt.show()



In [None]:
#previous status and median credit
# Median AMT_CREDIT_y per previous contract status; TARGET = 1 left, TARGET = 0 right.

fig=plt.figure(figsize=(13,7))

for panel, frame, heading in [(1, appT1previous, 'Target 1'), (2, appT0previous, 'Target 0')]:
    plt.subplot(1, 2, panel)
    credit_ax = sns.barplot(x='NAME_CONTRACT_STATUS', y='AMT_CREDIT_y', estimator=np.median,
                            hue='TARGET', data=frame)
    credit_ax.set_xticklabels(credit_ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    plt.title(heading)

plt.show()


In [None]:
#checking 
# Median AMT_CREDIT_y per previous contract status for the TARGET = 1 group.
# The column is selected before aggregating: taking the median of every column first is
# wasteful and fails on non-numeric columns in pandas >= 2.0.
appT1previous.groupby('NAME_CONTRACT_STATUS')['AMT_CREDIT_y'].median()

In [None]:
#checking the cause of rejection in previous applications
# Relative frequency of each rejection reason among *refused* previous applications.
# The refused-only subset used to be computed and then ignored (the plot was drawn from the
# whole frame, mixing in non-refused applications); it is now the data that is plotted.
fig=plt.figure(figsize=(6,5))

plt.subplot(1,2,1)
subset=appT1previous[appT1previous['NAME_CONTRACT_STATUS']=='Refused']
subset.CODE_REJECT_REASON.value_counts(normalize=True).plot.bar(color='blue')
plt.title('Target 1')

plt.subplot(1,2,2)
subset=appT0previous[appT0previous['NAME_CONTRACT_STATUS']=='Refused']
subset.CODE_REJECT_REASON.value_counts(normalize=True).plot.bar(color='red')
plt.title('Target 0')


In [None]:
#bivariate analysis - checking the amount asked while filling the application vs the final credit amount 
# AMT_APPLICATION against AMT_CREDIT_y; TARGET = 0 left, TARGET = 1 right.

plt.figure(figsize=(12,7))
plt.subplot(1,2,2)
bbc=sns.scatterplot(x='AMT_APPLICATION',y='AMT_CREDIT_y', data=appT1previous)
# Rotate the ticks without replacing their text: feeding get_xticklabels() back into
# set_xticklabels() on a numeric axis freezes whatever label text exists before the figure
# is drawn, which can leave the axis with blank or stale labels.
plt.xticks(rotation=45, horizontalalignment='right')
plt.title('Target 1')

plt.subplot(1,2,1)
ddc=sns.scatterplot(x='AMT_APPLICATION',y='AMT_CREDIT_y', data=appT0previous)
plt.xticks(rotation=45, horizontalalignment='right')
plt.title('Target 0')
plt.show()

In [None]:
#Shift in contract type (previous to current)
# Current contract type (x axis) split by previous contract type (hue);
# TARGET = 0 left, TARGET = 1 right.

fig=plt.figure(figsize=(12,7))

for panel, frame, heading in [(2, appT1previous, 'Target 1'), (1, appT0previous, 'Target 0')]:
    plt.subplot(1, 2, panel)
    shift_ax = sns.countplot(x='NAME_CONTRACT_TYPE_x', hue='NAME_CONTRACT_TYPE_y', data=frame)
    shift_ax.set_xticklabels(shift_ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    plt.title(heading)

plt.show()