### Bank loan risk analysis 

In [0]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
# Colab-only step: `google.colab` is not importable outside a Colab runtime.
# `uploaded` is indexed by file name in the next cell ('application_train.csv').
from google.colab import files
uploaded = files.upload()

In [0]:
import io

# Wrap the uploaded payload in an in-memory buffer so pandas can parse it like a file.
csv_buffer = io.BytesIO(uploaded['application_train.csv'])
app_train = pd.read_csv(csv_buffer)
app_train.head()

In [0]:
# (number of rows, number of columns) of the loaded table
app_train.shape

In [0]:
# prints a concise summary of the frame (index, dtypes, memory usage)
app_train.info()

#### Explore the missing values

In [0]:
def missing_value_table(df):
    """Summarise the missing values of a DataFrame, one row per affected column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns 'Missing values' (count of
        nulls) and '% of total values' (count as a percentage of ``len(df)``).
        Only columns that contain at least one null are listed, sorted by the
        percentage in descending order. The result is empty when nothing is
        missing, including when ``df`` has no rows.
    """
    # count the nulls once and reuse the result for the percentage
    mis_val = df.isnull().sum()
    # filter on the count rather than on the percentage: for a frame without
    # rows the percentage is NaN, and NaN != 0 would keep every column
    mis_val = mis_val[mis_val > 0]
    mis_val_percent = 100 * mis_val / len(df)
    # combine both series side by side under their final column names
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing values', '% of total values'],
    )
    return mis_val_table.sort_values('% of total values', ascending=False)


In [0]:
# the ten columns with the largest share of missing values
missing_value_table(app_train).head(10)

### Encode object-type (categorical) columns

In [0]:
# number of columns per dtype
app_train.dtypes.value_counts()

In [0]:
# number of distinct values in each object-typed column
object_columns = app_train.select_dtypes('object')
object_columns.nunique()

In [0]:
# One-hot encode the object-typed columns.
# NOTE: this rebinds `app_train`, so the un-encoded frame is no longer
# available to later cells.
app_train = pd.get_dummies(app_train)
app_train.shape

### Exploratory data analysis (EDA)

In [0]:
# keep a handle on the labels, then look at the first raw DAYS_BIRTH values
train_labels = app_train['TARGET']
app_train['DAYS_BIRTH'].head(5)

In [0]:
# convert DAYS_BIRTH to years (dividing by -365 also flips the sign)
age_in_years = app_train['DAYS_BIRTH'].div(-365)
age_in_years.describe()

In [0]:
# summary statistics of the raw DAYS_EMPLOYED column
(app_train['DAYS_EMPLOYED']).describe()

In [0]:
# check the feature distribution by plotting a histogram
fig, ax = plt.subplots()
app_train['DAYS_EMPLOYED'].plot.hist(ax=ax)
ax.set_xlabel("Days employed")
plt.show()

In [0]:
# remove outliers
# 365243 days (~1000 years) cannot be a real employment length.
DAYS_EMPLOYED_PLACEHOLDER = 365243
is_placeholder = app_train['DAYS_EMPLOYED'] == DAYS_EMPLOYED_PLACEHOLDER
# OR with any flag from a previous run so that re-executing this cell (after the
# placeholder has already been blanked out) does not reset the flag to all False.
app_train['DAYS_EMPLOYED_ANOM'] = is_placeholder | app_train.get('DAYS_EMPLOYED_ANOM', False)
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form works on an intermediate object and does
# not update the frame when pandas copy-on-write is enabled.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({DAYS_EMPLOYED_PLACEHOLDER: np.nan})
ax = app_train['DAYS_EMPLOYED'].plot.hist()
ax.set_xlabel("Days employed")
plt.show()

In [0]:
# correlation of every column with TARGET, sorted in ascending order
target_corr = app_train.corr()['TARGET']
correlations = target_corr.sort_values()
# the head holds the most negative correlations
correlations.head()

In [0]:
# the largest values sit at the tail of the ascending sort
# (NaN correlations, if any, are sorted last and would show up here too)
correlations.tail()

In [0]:
# age has a large correlation with the target
# use positive day counts so that a larger value means an older client
app_train['DAYS_BIRTH'] = app_train['DAYS_BIRTH'].abs()
app_train['TARGET'].corr(app_train['DAYS_BIRTH'])

In [0]:
# histogram of the client age in years
plt.style.use('fivethirtyeight')
age_in_years = app_train['DAYS_BIRTH'] / 365
plt.hist(age_in_years, bins=25, edgecolor='k')
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()

In [0]:
plt.figure(figsize=(10,8))
# KDE plot (kernel density estimate) of the age in years, one curve per target class
for target_value in (0, 1):
    sns.kdeplot(app_train.loc[app_train['TARGET']==target_value,'DAYS_BIRTH']/365,
                label='target==%d' % target_value)
plt.xlabel("Age")
plt.ylabel("Distribution")
# kdeplot only attaches the labels; recent seaborn versions do not draw a legend
# on their own, and without one the two curves cannot be told apart
plt.legend()
plt.show()

In [0]:
# bin the age data
# Take an explicit copy: adding columns to a slice of app_train triggers
# SettingWithCopyWarning and is not guaranteed to behave as intended.
age_data = app_train[['TARGET','DAYS_BIRTH']].copy()
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH']/365

# ten 5-year-wide bins with edges 20, 25, ..., 70
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'],bins=np.linspace(20,70,num=11))
age_data.head()

In [0]:
# Mean of every column per age bin; the TARGET mean is what the next cell plots
# as the default rate.
# observed=False keeps every bin (also empty ones) and pins the behaviour against
# the changing pandas default for categorical groupers.
age_groups = age_data.groupby('YEARS_BINNED', observed=False).mean()
age_groups

In [0]:
# bar chart of the default rate (in percent) per age bin
bin_labels = age_groups.index.astype(str)
default_rate_pct = 100 * age_groups['TARGET']
plt.bar(bin_labels, default_rate_pct)
plt.xticks(rotation=75)
plt.xlabel("age group")
plt.ylabel("default rate")
plt.show()

In [0]:
# correlations between TARGET, the three EXT_SOURCE columns and DAYS_BIRTH
ext_columns = ['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']
ext_data = app_train[ext_columns]
ext_data_corrs = ext_data.corr()
ext_data_corrs

In [0]:
# annotated heatmap of the correlation matrix computed above
plt.figure(figsize=(10, 8))
sns.heatmap(ext_data_corrs, annot=True, cmap='RdYlBu_r')
plt.show()

In [0]:
plt.figure(figsize=(10,10))
# one KDE panel per external source, one curve per target class
for i,source in enumerate(['EXT_SOURCE_3','EXT_SOURCE_2','EXT_SOURCE_1']):
    plt.subplot(3,1,i+1)
    sns.kdeplot(app_train.loc[app_train['TARGET']==0,source],label='target==0')
    sns.kdeplot(app_train.loc[app_train['TARGET']==1,source],label='target==1')
    plt.title('Distribution of %s' % source)
    # kdeplot only attaches the labels; recent seaborn versions do not draw a
    # legend on their own, so add one per panel
    plt.legend()
plt.tight_layout(h_pad=2.5)
plt.show()

### Data Preprocessing

In [0]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# separate the label from the features and hold out 20% of the rows for testing
label = app_train['TARGET']
train = app_train.drop(columns = ['TARGET'])
x_train,x_test,y_train,y_test= train_test_split(train,label,test_size = 0.2,random_state = 0)
# remember the column names: the transforms below return plain arrays
features = list(train.columns)

# `Imputer` was never imported in this notebook (NameError) and no longer exists
# in sklearn.preprocessing; SimpleImputer is its replacement.
imputer = SimpleImputer(strategy='median')
std = StandardScaler()
# impute missing values (statistics are learned on the training split only)
imputer.fit(x_train)
x_train = imputer.transform(x_train)
x_test = imputer.transform(x_test)
# data standardization (again fitted on the training split only)
std.fit(x_train)
x_train = std.transform(x_train)
x_test = std.transform(x_test)

In [0]:
# class counts of the training labels and the resulting imbalance ratio
n_positive = (y_train == 1).sum()
n_negative = (y_train == 0).sum()
print("Before OverSampling, counts of label '1': {}".format(n_positive))
print("Before OverSampling, counts of label '0': {}".format(n_negative))
print("The ratio between label '0' and label '1' is: {} ".format(n_negative / n_positive))

### Baseline model: Logistic regression

In [0]:
from sklearn.linear_model import LogisticRegression
# C is the inverse regularisation strength, so this small value regularises strongly
log_reg = LogisticRegression(C=0.0001)
log_reg.fit(x_train,y_train)

### Predict test result

In [0]:
# keep the second probability column, i.e. the score for label 1
predictions = log_reg.predict_proba(x_test)[:,1]

In [0]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the logistic regression scores on the held-out split
test_auc = roc_auc_score(y_test,predictions)
test_auc

### Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier
# 100 trees, fixed seed for reproducibility, n_jobs=-1 to use all available cores
random_forest = RandomForestClassifier(n_estimators=100,random_state=10,n_jobs=-1)
random_forest.fit(x_train,y_train)

In [0]:
# Score the held-out split. The original referenced an undefined name `test`;
# the model was fitted on the imputed and scaled arrays, so it must be scored on
# the matching `x_test`.
predictions = random_forest.predict_proba(x_test)[:,1]

In [0]:
# ROC AUC of the random forest scores on the held-out split
test_auc = roc_auc_score(y_test,predictions)
test_auc

### LightGBM

In [0]:
import lightgbm as lgb
# n_estimators is only an upper bound: early stopping picks the actual number of
# trees; class_weight='balanced' compensates for the label imbalance
model = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary', 
                                   class_weight = 'balanced', learning_rate = 0.05, 
                                   reg_alpha = 0.1, reg_lambda = 0.1, 
                                   subsample = 0.8, n_jobs = -1, random_state = 50)

# NOTE(review): the held-out split doubles as the early-stopping validation set,
# so the AUC reported for 'x_test' is optimistic; consider a separate validation split.
# The `early_stopping_rounds` / `verbose` keywords of fit() were removed in
# LightGBM 4; the callbacks below are the supported equivalent.
model.fit(x_train, y_train, eval_metric = 'auc',
          eval_set = [(x_test, y_test), (x_train, y_train)],
          eval_names = ['x_test', 'x_train'],
          callbacks = [lgb.early_stopping(stopping_rounds = 100),
                       lgb.log_evaluation(period = 200)])

### Try to over-sample the imbalanced data set using SMOTE

In [0]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2)
# `fit_resample` is the current imbalanced-learn API; the old `fit_sample` alias
# has been removed. np.asarray replaces the deprecated Series.ravel() and still
# hands SMOTE a plain 1-D label array.
X_train_res, y_train_res = sm.fit_resample(x_train, np.asarray(y_train))

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))

### Apply ML models again for balanced data

In [0]:
# logistic regression: refit the same estimator on the over-sampled training
# data (the earlier fit is overwritten) and score the untouched held-out split
log_reg.fit(X_train_res, y_train_res)
predictions = log_reg.predict_proba(x_test)[:, 1]
test_auc = roc_auc_score(y_true=y_test, y_score=predictions)
test_auc

In [0]:
# Random Forest: refit the same estimator on the over-sampled training data
# (the earlier fit is overwritten) and score the untouched held-out split
random_forest.fit(X_train_res, y_train_res)
predictions = random_forest.predict_proba(x_test)[:, 1]
test_auc = roc_auc_score(y_true=y_test, y_score=predictions)
test_auc

In [0]:
# LightGBM on the over-sampled training data. The original refitted on the
# un-resampled x_train / y_train, so it never used the SMOTE output it was
# meant to evaluate.
# The `early_stopping_rounds` / `verbose` keywords of fit() were removed in
# LightGBM 4; the callbacks below are the supported equivalent.
model.fit(X_train_res, y_train_res, eval_metric = 'auc',
          eval_set = [(x_test, y_test), (X_train_res, y_train_res)],
          eval_names = ['x_test', 'x_train'],
          callbacks = [lgb.early_stopping(stopping_rounds = 100),
                       lgb.log_evaluation(period = 200)])