## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,recall_score,precision_recall_curve,auc,roc_curve,roc_auc_score,classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from collections import Counter
%matplotlib inline

  from numpy.core.umath_tests import inner1d


## Important Functions

In [2]:
def one_hot_encoder(df):
    """Encode every object-dtype column of ``df`` and return the result.

    Columns with more than two distinct values are expanded with
    ``pd.get_dummies``; columns with one or two distinct values are replaced
    by integer codes from ``LabelEncoder``. Non-object columns pass through
    unchanged. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame; dummy columns are appended in the order their source
        columns appear in ``df``.
    """
    # Work on a copy: the label-encoding branch assigns into the frame and
    # would otherwise overwrite the caller's column.
    df = df.copy()
    object_cols = [name for name in df.columns if df[name].dtype == object]
    for name in object_cols:
        # unique() keeps NaN, so a missing value counts as a distinct level.
        n_levels = len(df[name].unique())
        if n_levels > 2:
            df = pd.get_dummies(data=df, columns=[name])
        else:
            df[name] = LabelEncoder().fit_transform(df[name])
    return df

## Import Data

In [18]:
# Load the application tables from the working directory.
# train carries the TARGET column; test does not.
train = pd.read_csv('./application_train.csv')
test = pd.read_csv('./application_test.csv')

In [31]:
# Load the bureau table (rows are keyed by SK_ID_CURR and SK_ID_BUREAU).
bureau = pd.read_csv('./bureau.csv')

In [32]:
# Encode the object-dtype columns of bureau, then preview the first rows.
bureau = one_hot_encoder(bureau)
bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,...,CREDIT_TYPE_Interbank credit,CREDIT_TYPE_Loan for business development,CREDIT_TYPE_Loan for purchase of shares (margin lending),CREDIT_TYPE_Loan for the purchase of equipment,CREDIT_TYPE_Loan for working capital replenishment,CREDIT_TYPE_Microloan,CREDIT_TYPE_Mobile operator loan,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Unknown type of loan
0,215354,5714462,-497,0,-153.0,-153.0,,0,91323.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,215354,5714463,-208,0,1075.0,,,0,225000.0,171342.0,...,0,0,0,0,0,0,0,0,0,0
2,215354,5714464,-203,0,528.0,,,0,464323.5,,...,0,0,0,0,0,0,0,0,0,0
3,215354,5714465,-203,0,,,,0,90000.0,,...,0,0,0,0,0,0,0,0,0,0
4,215354,5714466,-629,0,1197.0,,77674.5,0,2700000.0,,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Load the monthly balance history; it is aggregated per SK_ID_BUREAU below.
bureau_balance = pd.read_csv('./bureau_balance.csv')

In [None]:
# Recency weight per monthly record: exp(-MONTHS_BALANCE / min_value), where
# min_value is one third of the smallest MONTHS_BALANCE in the table.
min_value = bureau_balance['MONTHS_BALANCE'].min() / 3
bureau_balance['weight'] = np.exp(-bureau_balance['MONTHS_BALANCE'] / min_value)

bureau_balance = one_hot_encoder(bureau_balance)

# Aggregation spec used by the groupby on SK_ID_BUREAU.
bureau_balance_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
cols_to_drop = []
passthrough = ('weight', 'MONTHS_BALANCE', 'SK_ID_BUREAU')

# Snapshot the column names first: the loop adds *_WEIGHTED columns as it runs.
for col in [name for name in bureau_balance.columns if name not in passthrough]:
    weighted = col + '_WEIGHTED'
    bureau_balance[weighted] = bureau_balance[col] * bureau_balance['weight']
    bureau_balance_aggregations[col] = ['sum', 'mean']
    bureau_balance_aggregations[weighted] = ['sum', 'mean']
    cols_to_drop.append(col)

# The helper column was only needed to build the weighted features.
del bureau_balance['weight']

In [None]:
# Collapse the monthly rows to one row per SK_ID_BUREAU using the spec built above.
bureau_balance_agg = bureau_balance.groupby('SK_ID_BUREAU').agg(bureau_balance_aggregations)
# Flatten the (column, function) pairs into names such as MONTHS_BALANCE_MIN.
bureau_balance_agg.columns = pd.Index(
    [column + "_" + func.upper() for column, func in bureau_balance_agg.columns.tolist()]
)
# Left join keeps every bureau row, including those without balance history.
bureau = bureau.join(bureau_balance_agg, on='SK_ID_BUREAU', how='left')

In [None]:
# Preview bureau after the balance aggregates have been joined on.
bureau.head()

In [None]:
# The per-credit key is no longer needed once the balance aggregates are joined.
del bureau['SK_ID_BUREAU']

In [None]:
# Aggregation plan for the numeric bureau columns.
# CREDIT_ACTIVE and CREDIT_CURRENCY are categorical and are one-hot encoded
# separately, so they do not appear here.
# TODO(original note): CREDIT_DAY_OVERDUE was meant to be weighted with an
# exponential curve; only its mean and variance are taken for now.
#
# Every entry must be an aggregation name pandas understands; the standard
# deviation is spelled 'std' ('sd' is not recognised and fails inside .agg()).
num_aggregations = {
        'DAYS_CREDIT': [ 'mean', 'var', 'std'],
        'DAYS_CREDIT_ENDDATE': [ 'mean', 'var'],
        'DAYS_CREDIT_UPDATE': ['mean', 'var'],
        'CREDIT_DAY_OVERDUE': ['mean', 'var'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': [ 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': [ 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        # Columns produced by the bureau_balance aggregation.
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }

In [26]:
# Preview the first rows of the training applications.
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Preview the first rows of the test applications.
test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


In [28]:
# List the training columns.
train.columns

Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=122)

In [29]:
# List the test columns for comparison with train.
test.columns

Index(['SK_ID_CURR', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=121)

## Exploratory Analysis

In [None]:
# Missing-value policy (thresholds are fractions of the training rows):
#   * 80% or more missing -> drop the column from train
#   * 5% up to 80%        -> fill with the training-set mode
#   * under 5% missing    -> drop the affected training rows
# Test rows are never dropped, so every test id still gets a prediction;
# test gaps are filled with the training mode instead.
DROP_COLUMN_NAN_FRACTION = 0.8
DROP_ROWS_NAN_FRACTION = 0.05

nan_fraction = train.isnull().mean()
columns_with_nans = list(train.columns[train.isnull().any()])
print('No of columns having NANs : ' + str(len(columns_with_nans)))

# .copy() so the assignments below write into real frames, not slices.
train = train.loc[:, nan_fraction < DROP_COLUMN_NAN_FRACTION].copy()
test = test.copy()

for i in columns_with_nans:
    if i not in train.columns:
        # Removed just above for being mostly empty; nothing left to impute.
        continue
    # Take the mode before any rows are dropped for this column.
    mode_value = train[i].mode()[0]
    if nan_fraction[i] < DROP_ROWS_NAN_FRACTION:
        train = train.dropna(axis=0, subset=[i])
    else:
        train[i] = train[i].fillna(mode_value)
    if i in test.columns:
        test[i] = test[i].fillna(mode_value)

In [None]:
# Name of the label column; it exists in train only.
target_col = 'TARGET'
# Keep the test columns that also exist in train, in their current order.
test = test.loc[:, test.columns.isin(train.columns)]

In [None]:
# Class balance of TARGET: share of rows with TARGET == 0 and its complement.
total = len(train["TARGET"])
approved_loans = int((train["TARGET"] == 0).sum())
normal_percentage = float(approved_loans)/float(total)
class_imbalance = 1 - normal_percentage
print("The percentage of approved loans is " + str(normal_percentage*100))
print("The percentage of stopped loans is " + str(class_imbalance*100))

In [None]:
# Bar chart of the TARGET class counts. The column is passed as `x=` because
# current seaborn releases no longer accept it positionally, and the Axes is
# not printed since its repr carries no information.
sns.countplot(x="TARGET", data=train)
plt.show()

In [None]:
# Split the training rows by label for the per-group summaries below.
train_approved = train.loc[train['TARGET'].eq(0)]
train_unapproved = train.loc[train['TARGET'].eq(1)]

In [None]:
# Share of 'Revolving loans' contracts within each outcome group.
# The count is looked up by its label: indexing value_counts() with an integer
# depends on frequency order and on pandas' deprecated positional fallback.
for group_name, group in (('Approved', train_approved), ('Unapproved', train_unapproved)):
    contract_counts = group.NAME_CONTRACT_TYPE.value_counts()
    print(group_name + ' Loans Revolving Loans percentage')
    print(float(contract_counts.get('Revolving loans', 0))/float(contract_counts.sum())*100)

In [None]:
# Share of female applicants (CODE_GENDER == 'F') within each outcome group.
# The count is looked up by its label rather than by position, so the figure
# stays correct even if 'F' is not the most frequent value in a group.
for group_name, group in (('Approved', train_approved), ('Unapproved', train_unapproved)):
    gender_counts = group.CODE_GENDER.value_counts()
    print(group_name + ' Loans Female Count Percentage')
    print(float(gender_counts.get('F', 0))/gender_counts.sum()*100)

In [None]:
# Share of applicants who own a car (FLAG_OWN_CAR == 'Y') in each outcome group.
# The previous positional lookup returned the most frequent value, whichever
# that was; looking up 'Y' by label makes the number match its caption.
for group_name, group in (('Approved', train_approved), ('Unapproved', train_unapproved)):
    car_counts = group.FLAG_OWN_CAR.value_counts()
    print(group_name + ' Loans Owning a Car Count Percentage')
    print(float(car_counts.get('Y', 0))/car_counts.sum()*100)

In [None]:
# Document flags: every column whose name contains 'FLAG_DOC'.
docs = [column for column in train.columns if 'FLAG_DOC' in column]
# Remaining flags: names containing 'FLAG_' but neither 'FLAG_DOC' nor '_FLAG_'.
live = [
    column
    for column in train.columns
    if 'FLAG_' in column and 'FLAG_DOC' not in column and '_FLAG_' not in column
]

In [None]:
def add_engineered_features(df, doc_cols, live_cols):
    """Return a copy of ``df`` with the NEW_* ratio and summary features added.

    The same definitions are applied to train and test so the two frames
    cannot drift apart. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Application frame holding the raw AMT_*, DAYS_*, EXT_SOURCE_*,
        CNT_CHILDREN and OWN_CAR_AGE columns.
    doc_cols : list of str
        Document-flag columns used for the row-wise kurtosis.
    live_cols : list of str
        Flag columns summed row-wise into NEW_LIVE_IND_SUM.

    Returns
    -------
    pandas.DataFrame
        New frame with the NEW_* columns appended in a fixed order.
    """
    # NOTE(review): a zero denominator (e.g. DAYS_EMPLOYED == 0) would yield
    # inf/NaN in the ratio columns - confirm whether that occurs in the data.
    out = df.copy()
    ext_sources = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

    out['NEW_CREDIT_TO_ANNUITY_RATIO'] = out['AMT_CREDIT'] / out['AMT_ANNUITY']
    out['NEW_CREDIT_TO_GOODS_RATIO'] = out['AMT_CREDIT'] / out['AMT_GOODS_PRICE']
    out['NEW_DOC_IND_KURT'] = out[doc_cols].kurtosis(axis=1)
    out['NEW_LIVE_IND_SUM'] = out[live_cols].sum(axis=1)
    # +1 so applicants without children divide by one.
    out['NEW_INC_PER_CHLD'] = out['AMT_INCOME_TOTAL'] / (1 + out['CNT_CHILDREN'])
    out['NEW_EMPLOY_TO_BIRTH_RATIO'] = out['DAYS_EMPLOYED'] / out['DAYS_BIRTH']
    out['NEW_ANNUITY_TO_INCOME_RATIO'] = out['AMT_ANNUITY'] / (1 + out['AMT_INCOME_TOTAL'])
    out['NEW_SOURCES_PROD'] = out['EXT_SOURCE_1'] * out['EXT_SOURCE_2'] * out['EXT_SOURCE_3']
    out['NEW_EXT_SOURCES_MEAN'] = out[ext_sources].mean(axis=1)
    out['NEW_SCORES_STD'] = out[ext_sources].std(axis=1)
    # Rows whose std is undefined get the frame's own mean std.
    out['NEW_SCORES_STD'] = out['NEW_SCORES_STD'].fillna(out['NEW_SCORES_STD'].mean())
    out['NEW_CAR_TO_BIRTH_RATIO'] = out['OWN_CAR_AGE'] / out['DAYS_BIRTH']
    out['NEW_CAR_TO_EMPLOY_RATIO'] = out['OWN_CAR_AGE'] / out['DAYS_EMPLOYED']
    out['NEW_PHONE_TO_BIRTH_RATIO'] = out['DAYS_LAST_PHONE_CHANGE'] / out['DAYS_BIRTH']
    out['NEW_PHONE_TO_BIRTH_RATIO_EMPLOYER'] = out['DAYS_LAST_PHONE_CHANGE'] / out['DAYS_EMPLOYED']
    out['NEW_CREDIT_TO_INCOME_RATIO'] = out['AMT_CREDIT'] / out['AMT_INCOME_TOTAL']
    return out


# Apply the identical feature definitions to both frames.
train = add_engineered_features(train, docs, live)
test = add_engineered_features(test, docs, live)

In [None]:
# Encode the object-dtype columns shared by train and test.
#   * more than two levels -> one-hot columns, built from one shared level list
#     so train and test end up with the same dummy columns in the same order
#   * one or two levels    -> integer codes from a single LabelEncoder fitted
#     on both frames, so a value maps to the same code in train and test
# Training rows whose value never occurs in test are removed first.
cols_to_dummy = []
cols_to_encode = []

# Work on real frames so the column assignments below do not hit slices.
train = train.copy()
test = test.copy()

shared_categoricals = [
    name for name in train.columns
    if train[name].dtype == object and name in test.columns
]
for i in shared_categoricals:
    values_to_remove = list(set(train[i].unique()) - set(test[i].unique()))
    # Level count is taken before filtering and decides the encoding used.
    length = len(train[i].unique())
    if values_to_remove:
        train = train[~train[i].isin(values_to_remove)].copy()
    if length > 2:
        # A level present in only one frame becomes an all-zero column in the other.
        levels = sorted(set(train[i].dropna()) | set(test[i].dropna()))
        train[i] = pd.Categorical(train[i], categories=levels)
        test[i] = pd.Categorical(test[i], categories=levels)
        train = pd.get_dummies(data=train, columns=[i])
        test = pd.get_dummies(data=test, columns=[i])
        cols_to_dummy.append(i)
    else:
        cols_to_encode.append(i)
        le = LabelEncoder()
        # Fit once on both frames; fitting per frame could assign different codes.
        le.fit(pd.concat([train[i], test[i]]))
        train[i] = le.transform(train[i])
        test[i] = le.transform(test[i])

## Standardize Data

In [None]:
# Set the applicant ids aside: they identify rows but are not model features.
# pop() returns the column and removes it from the frame in one step.
train_ids = train.pop('SK_ID_CURR')
test_ids = test.pop('SK_ID_CURR')

In [None]:
# Scale non-object columns with more than two distinct values (NaN counts as a value).
columns_to_scale = [
    name
    for name in train.columns
    if train[name].nunique(dropna=False) > 2 and train[name].dtype != object
]

In [None]:
# Standardize each selected column: fit on train, apply the same fit to test,
# and replace the raw column with its 'Scaled_' counterpart in both frames.
for col in columns_to_scale:
    scaled_name = 'Scaled_' + col
    scaler = StandardScaler()
    train[scaled_name] = scaler.fit_transform(train[col].values.reshape(-1, 1))
    del train[col]
    test[scaled_name] = scaler.transform(test[col].values.reshape(-1, 1))
    del test[col]

## Test Train Split

In [None]:
# Separate features from the label with .loc (.ix was removed from pandas),
# then hold out 30% of the rows, stratified on the label.
is_feature = train.columns != target_col
x_features = train.loc[:, is_feature]
x_labels = train.loc[:, ~is_feature]
x_features_train, x_features_test, x_labels_train, x_labels_test = train_test_split(
    x_features,
    x_labels,
    test_size=0.3,
    random_state=42,
    stratify=x_labels,
)
print("length of training data")
print(len(x_features_train))
print("length of test data")
print(len(x_features_test))

In [None]:
import lightgbm as lgb

In [None]:
# Fit LightGBM on all features and score the hold-out split at a 0.5 threshold.
clf = lgb.LGBMClassifier()
clf.fit(x_features_train, x_labels_train.values.ravel())
pred_proba = clf.predict_proba(x_features_test)
pred = np.where(pred_proba[:, 1] > 0.5, 1, 0).tolist()
cnf_matrix = confusion_matrix(x_labels_test, pred)
# Rows are the real class, columns the predicted class.
(tn, fp), (fn, tp) = cnf_matrix
print("the recall for this model is :", tp/float(tp+fn))
fig = plt.figure(figsize=(6, 3))
print("TP", tp)
print("TN", tn)
print("FP", fp)
print("FN", fn)
sns.heatmap(cnf_matrix, cmap="coolwarm_r", annot=True, linewidths=0.5)
plt.title("Confusion_matrix")
plt.xlabel("Predicted_class")
plt.ylabel("Real class")
plt.show()
print("\n----------Classification Report------------------------------------")
print(classification_report(x_labels_test, pred))

In [None]:
# ROC curve for the hold-out predictions. The score is stored as `roc_auc`
# so the `auc` function imported from sklearn.metrics is not overwritten.
fpr, tpr, _ = roc_curve(x_labels_test, pred_proba[:, 1])
roc_auc = roc_auc_score(x_labels_test, pred_proba[:, 1])
plt.plot(fpr, tpr, label="data 1, auc="+str(roc_auc))
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc=4)
plt.show()

## Top 30 features

In [None]:
# Rank the features by the fitted model's importances and keep the top 30 names.
col_names = ['Feature Name', 'Feature Importance']
a = (
    pd.DataFrame(index=x_features_train.columns, data=clf.feature_importances_)
    .reset_index()
    .rename(columns=dict(zip(['index', 0], col_names)))
)
b = a.sort_values(col_names[1], ascending=False).head(30)
feature_names = b[col_names[0]].tolist()

In [None]:
# Restrict both splits to the selected feature columns.
x_features_train_selected = x_features_train.loc[:, feature_names]
x_features_test_selected = x_features_test.loc[:, feature_names]

In [None]:
# Refit LightGBM on the selected features only and score the same hold-out split.
clf_selected = lgb.LGBMClassifier()
clf_selected.fit(x_features_train_selected, x_labels_train.values.ravel())
pred_proba = clf_selected.predict_proba(x_features_test_selected)
pred = np.where(pred_proba[:, 1] > 0.5, 1, 0).tolist()
cnf_matrix = confusion_matrix(x_labels_test, pred)
# Rows are the real class, columns the predicted class.
(tn, fp), (fn, tp) = cnf_matrix
print("the recall for this model is :", tp/float(tp+fn))
fig = plt.figure(figsize=(6, 3))
print("TP", tp)
print("TN", tn)
print("FP", fp)
print("FN", fn)
sns.heatmap(cnf_matrix, cmap="coolwarm_r", annot=True, linewidths=0.5)
plt.title("Confusion_matrix")
plt.xlabel("Predicted_class")
plt.ylabel("Real class")
plt.show()
print("\n----------Classification Report------------------------------------")
print(classification_report(x_labels_test, pred))

In [None]:
# ROC curve for the reduced-feature model. The score is stored as `roc_auc`
# so the `auc` function imported from sklearn.metrics is not overwritten.
fpr, tpr, _ = roc_curve(x_labels_test, pred_proba[:, 1])
roc_auc = roc_auc_score(x_labels_test, pred_proba[:, 1])
plt.plot(fpr, tpr, label="data 1, auc="+str(roc_auc))
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc=4)
plt.show()

In [None]:
# Score the test applications with the full-feature model.
# DataFrame.as_matrix() no longer exists in pandas, so .values is used.
# Selecting the fitted feature columns by name fixes their order and raises a
# KeyError if test is missing one, instead of silently misaligning features.
pred_proba = clf.predict_proba(test[x_features_train.columns].values)

In [None]:
# Keep only the second predict_proba column (presumably the TARGET == 1 class -
# confirm against clf.classes_). Not idempotent: re-running this cell fails
# because pred_proba is then one-dimensional.
pred_proba = pred_proba[:, 1]

In [None]:
# Write one row per test applicant: its id and the predicted value, under
# explicit headers (the prediction column was previously written as "0").
submission = pd.DataFrame(
    {'SK_ID_CURR': test_ids.values, target_col: pred_proba},
    columns=['SK_ID_CURR', target_col],
)
submission.to_csv('output.csv', index=False)