In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# A comparison of L0 models

To choose the best model for Level 0 of our stacking ensemble, we tried three models on every table and kept the one that gave the best score on the training data.

In [2]:
#importing the necessary libraries
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder




In [3]:
#function for one hot encoding of the categorical columns
def one_hot_encoding_dataframe(df):
    """One-hot encode the text (object / string dtype) columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. ``pd.get_dummies`` builds the encoded result, so the
        columns of ``df`` itself are not rewritten here.

    Returns
    -------
    tuple
        ``(encoded_df, new_added_columns, all_columns)``:
        the encoded frame, the list of dummy columns that were added (in the
        order they appear in ``encoded_df``), and ``encoded_df.columns``.
    """
    original_columns = set(df.columns)
    # Accept both the classic ``object`` dtype and pandas' dedicated string
    # dtypes, so text columns are still encoded when pandas infers the latter.
    cat_columns = [
        name
        for name, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
    ]
    df = pd.get_dummies(df, columns=cat_columns, dummy_na=False)
    # Walk the frame's own columns instead of taking a set difference: set
    # iteration order is arbitrary, which made the order of the returned list
    # (and of every aggregation built from it) change from run to run.
    new_added_columns = [name for name in df.columns if name not in original_columns]
    return df, new_added_columns, df.columns

In [4]:
# Load the main application tables: application_train.csv carries the TARGET label,
# application_test.csv is read alongside it.
main_df=pd.read_csv('/kaggle/input/home-credit-default-risk/application_train.csv')
test_df=pd.read_csv('/kaggle/input/home-credit-default-risk/application_test.csv')
# Keep the label next to the applicant key, and a separate copy of the test-set keys.
train_target = main_df[['TARGET', 'SK_ID_CURR']]
test_target = test_df['SK_ID_CURR'].copy()


### We will also plot the top 20 features that help in predicting the target
We use LightGBM's `feature_importances_` to rank the features and show the top 20. The function below draws that plot.

In [22]:
#plot the top 20 features
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Ignore every FutureWarning for the rest of the session so the notebook output stays readable.
# NOTE(review): this also hides deprecation notices raised by the libraries used in later cells.
warnings.simplefilter(action='ignore', category=FutureWarning)

def plotImp(model, X, num=20, fig_size=(40, 20)):
    """Show a horizontal bar chart of the ``num`` largest feature importances.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``
    X : pandas.DataFrame
        Only ``X.columns`` is read; it supplies the feature names and must line
        up with ``model.feature_importances_``.
    num : int
        How many of the highest-valued features to draw.
    fig_size : tuple
        Figure size handed to ``plt.figure``.

    Note: ``sns.set(font_scale=5)`` is called on every invocation, which changes
    seaborn's global settings for later plots as well.
    """
    importance_table = pd.DataFrame(
        {'Value': model.feature_importances_, 'Feature': X.columns}
    )
    ranked = importance_table.sort_values(by="Value", ascending=False)
    top_rows = ranked.iloc[:num]

    plt.figure(figsize=fig_size)
    sns.set(font_scale=5)
    sns.barplot(x="Value", y="Feature", data=top_rows)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.show()

## 1) Credit Card balance table

In [5]:
def credit_card_bal(df):
    """Aggregate credit-card balance rows into one feature row per ``SK_ID_CURR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Credit-card balance rows (the notebook passes the contents of
        ``credit_card_balance.csv``). The frame is copied first, so the caller's
        object does not receive the helper columns created here.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR``. Columns are named
        ``<source column>_<STAT>_(CREDIT_CARD)`` plus ``CC_COUNT``, the number of
        rows seen for the applicant.

    Notes
    -----
    * ``diff_tr_tc`` is now the difference ``AMT_TOTAL_RECEIVABLE -
      AMT_PAYMENT_TOTAL_CURRENT``. It used to be computed with ``/``, which made
      it the plain inverse of ``ratio_tc_tr`` despite its name.
    * NOTE(review): the previous version also label-encoded
      ``NAME_CONTRACT_STATUS`` and computed ``sum_draw`` / ``sum_cntdraw``
      (row sums of the AMT_DRAWINGS_* and CNT_DRAWINGS_* columns), but none of
      them was part of ``aggs``, so they never reached the result. They are left
      out here; add them to ``aggs`` if they were meant to be features.
    """
    # Work on a copy: the helper columns below are an implementation detail and
    # should not leak into the frame the caller passed in.
    df = df.copy()

    # Row-level engineered features. Divisions by zero yield inf, which the
    # training cells replace with NaN before imputing.
    df['ratio_ab_acl'] = df['AMT_BALANCE'] / df['AMT_CREDIT_LIMIT_ACTUAL']
    df['ratio_tc_tr'] = df['AMT_PAYMENT_TOTAL_CURRENT'] / df['AMT_TOTAL_RECEIVABLE']
    df['ratio_pc_ar'] = df['AMT_PAYMENT_CURRENT'] / df['AMT_RECIVABLE']
    df['diff_tr_tc'] = df['AMT_TOTAL_RECEIVABLE'] - df['AMT_PAYMENT_TOTAL_CURRENT']
    df['ratio_pc_ptc'] = df['AMT_PAYMENT_CURRENT'] / df['AMT_PAYMENT_TOTAL_CURRENT']

    # Statistics to compute per applicant for each column.
    aggs = {
        'MONTHS_BALANCE': ['min', 'max', 'size'],
        'CNT_DRAWINGS_ATM_CURRENT': ['max'],
        'CNT_DRAWINGS_CURRENT': ['max'],
        'CNT_DRAWINGS_POS_CURRENT': ['max'],
        'CNT_INSTALMENT_MATURE_CUM': ['mean', 'sum'],
        'AMT_BALANCE': ['min', 'max', 'mean', 'sum'],
        'AMT_CREDIT_LIMIT_ACTUAL': ['max', 'mean', 'var'],
        'AMT_DRAWINGS_ATM_CURRENT': ['max'],
        'AMT_DRAWINGS_CURRENT': ['max'],
        'AMT_DRAWINGS_POS_CURRENT': ['max'],
        'AMT_PAYMENT_CURRENT': ['max'],
        'AMT_PAYMENT_TOTAL_CURRENT': ['max'],
        'AMT_RECEIVABLE_PRINCIPAL': ['mean', 'sum'],
        'AMT_RECIVABLE': ['mean', 'sum'],
        'AMT_TOTAL_RECEIVABLE': ['mean'],

        # Engineered features
        'ratio_ab_acl': ['min', 'max', 'mean'],
        'ratio_tc_tr': ['min', 'max', 'mean'],
        'ratio_pc_ar': ['min', 'max', 'mean'],
        'diff_tr_tc': ['min', 'max', 'mean'],
        'ratio_pc_ptc': ['min', 'max', 'mean']
    }

    per_applicant = df.groupby('SK_ID_CURR')
    cc_aggs = per_applicant.agg(aggs)
    # Flatten the (column, statistic) pairs into single-level names.
    cc_aggs.columns = pd.Index(
        [source + "_" + stat.upper() + '_(CREDIT_CARD)' for source, stat in cc_aggs.columns.tolist()]
    )
    cc_aggs['CC_COUNT'] = per_applicant.size()

    return cc_aggs

In [6]:
# Read the credit-card balance table and collapse it to one feature row per SK_ID_CURR.
credit_data = pd.read_csv('/kaggle/input/home-credit-default-risk/credit_card_balance.csv')
df_credit=credit_card_bal(credit_data)


In [7]:
# Attach the per-applicant credit-card aggregates to the application table and
# keep only the rows that carry a label.
df_credit = main_df.merge(df_credit, how='left', on='SK_ID_CURR')
df_credit = df_credit[df_credit['TARGET'].notnull()]

y_train = df_credit['TARGET']
# Select the newly merged columns with an ordered list. A set is not accepted
# as a DataFrame indexer by pandas 2.x, and its arbitrary iteration order would
# make the feature order differ from run to run.
train_column = [col for col in df_credit.columns if col not in main_df.columns]
X_train = df_credit[train_column]

train_column = X_train.columns
# Ratios with a zero denominator produce +/-inf; treat them as missing.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X_train = imputer.fit_transform(X_train)
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)


### Since the classes are imbalanced, we need to assign them weights

In [8]:
import sklearn
from sklearn.utils.class_weight import compute_class_weight

# Split the labelled applications by class.
classes_zero = main_df[main_df['TARGET'] == 0]
classes_one = main_df[main_df['TARGET'] == 1]

# Convert parts into NumPy arrays for weight computation
zero_numpy = classes_zero['TARGET'].to_numpy()
one_numpy = classes_one['TARGET'].to_numpy()
all_together = np.concatenate((zero_numpy, one_numpy))
unique_classes = np.unique(all_together)

# Compute weights. `classes` and `y` must be passed by keyword: current
# scikit-learn releases declare them keyword-only, so the positional call fails.
# The function is imported explicitly rather than reached through the
# `sklearn.utils.class_weight` attribute chain, which only resolves when that
# submodule happens to have been imported already.
weights = compute_class_weight('balanced', classes=unique_classes, y=all_together)

In [9]:
# Turn the weight array into a {position: weight} dictionary for `class_weight=`.
wts = {position: weight for position, weight in enumerate(weights)}

### To assign the positive-class weight for LightGBM's boosted trees (`scale_pos_weight`)

In [10]:
# Number of TARGET == 0 rows per TARGET == 1 row; passed to LightGBM as scale_pos_weight.
poswt = classes_zero.shape[0] / classes_one.shape[0]
poswt

## Training models and finding their auc scores on train data to find the best model

### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

# Class-weighted logistic regression using the liblinear solver.
clf = LogisticRegression(solver='liblinear', class_weight=wts)

# Fit on the current training matrix.
clf.fit(X_train, y_train)


In [12]:
# The AUC is computed on the same rows the model was fitted on (a training score).
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=clf.predict_proba(X_train)[:, 1]),
)

### Decision tree

In [13]:
from sklearn import tree
# Depth-limited, class-weighted decision tree, scored on its own training rows.
DTclf = tree.DecisionTreeClassifier(max_depth=6, class_weight=wts)
DTclf.fit(X_train, y_train)
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=DTclf.predict_proba(X_train)[:, 1]),
)

### LightGBM

In [19]:
# LightGBM with GOSS boosting; scale_pos_weight carries the class-imbalance ratio.
lgclf = LGBMClassifier(
    boosting_type='goss',
    n_estimators=10000,
    learning_rate=0.005,
    num_leaves=30,
    max_depth=6,
    subsample_for_bin=24000,
    reg_alpha=0.436193,
    reg_lambda=0.479169,
    min_split_gain=0.024766,
    subsample=1,
    scale_pos_weight=poswt,
)
# NOTE(review): no eval_set is passed to fit, so it is unclear what eval_metric
# is evaluated on here — confirm against the LightGBM documentation.
lgclf.fit(X_train, y_train, eval_metric='auc')


In [23]:
# Training-set AUC of the LightGBM model.
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=lgclf.predict_proba(X_train)[:, 1]),
)

**The best model is Lightgbm**

### Top 20 features for credit card balance table

In [24]:
# Feature names come from the merged credit-card frame; importances from the fitted LightGBM model.
plotImp(model=lgclf, X=df_credit[train_column])

## 2) For Installments table

In [25]:
def install_bal(inst_df):
    """Aggregate instalment-payment rows into one feature row per ``SK_ID_CURR``.

    Parameters
    ----------
    inst_df : pandas.DataFrame
        Instalment payment rows (the notebook passes the contents of
        ``installments_payments.csv``). The frame is copied first, so the
        caller's object is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR`` with:

        * ``INST_<column>_<STAT>`` aggregates over all rows,
        * ``cnt_inst``, the number of rows per applicant,
        * ``<window>_ltp_flag_*`` / ``<window>_lsp_flag_*`` statistics of
          ``late_pay`` / ``less_pay`` restricted to rows whose
          ``DAYS_ENTRY_PAYMENT`` is within the last 90, 180, 365, 2*365 or
          3*365 days. ``*_cnt`` columns sum the corresponding 0/1 flag.

    Changes compared with the previous version
    ------------------------------------------
    * The 365243 value is replaced by NaN *before* the features derived from
      the ``DAYS_*`` columns are computed, so it can no longer leak into
      ``late_pay`` and everything built on it.
    * The replacement is written back with an assignment. The chained
      ``inst_df[col].replace(..., inplace=True)`` form acts on a temporary
      object under pandas Copy-on-Write and then has no effect.
    """
    inst_df = inst_df.copy()

    # 365243 is treated as an outlier value elsewhere in this notebook; blank it
    # out first so the derived features below never see it.
    for col in inst_df.columns:
        if col.startswith('DAYS'):
            inst_df[col] = inst_df[col].replace(365243, np.nan)

    # Row-level features: scheduled-minus-actual day and amount differences, their
    # equally weighted blend, and 0/1 flags for a positive difference.
    inst_df['late_pay'] = inst_df['DAYS_INSTALMENT'] - inst_df['DAYS_ENTRY_PAYMENT']
    inst_df['less_pay'] = inst_df['AMT_INSTALMENT'] - inst_df['AMT_PAYMENT']
    inst_df['late_lp'] = 0.5 * inst_df['late_pay'] + 0.5 * inst_df['less_pay']
    inst_df['ltp_flag'] = (inst_df['late_pay'] > 0).astype(int)
    inst_df['lsp_flag'] = (inst_df['less_pay'] > 0).astype(int)

    inst_df, _, _ = one_hot_encoding_dataframe(inst_df)

    # Every non-key column gets its mean; a few get the full set of statistics.
    key_columns = ('SK_ID_CURR', 'SK_ID_PREV')
    multi_stat_columns = {'late_pay', 'less_pay', 'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER'}
    inst_df_agg = {}
    for col in inst_df.columns:
        if col in key_columns:
            continue
        if col in multi_stat_columns:
            inst_df_agg[col] = ['mean', 'sum', 'max', 'min']
        else:
            inst_df_agg[col] = ['mean']

    per_applicant = inst_df.groupby('SK_ID_CURR')
    inst_agg = per_applicant.agg(inst_df_agg)
    inst_agg.columns = ["INST_" + source + "_" + stat.upper() for source, stat in inst_agg.columns]

    inst_agg['cnt_inst'] = per_applicant['SK_ID_PREV'].count()

    # (days back, column prefix, value column, flag column, statistics).
    # 'cnt' means "sum of the 0/1 flag column"; the other entries are statistics
    # of the value column. Order matters: it fixes the output column order.
    window_specs = (
        (365 * 3, '3365_ltp', 'late_pay', 'ltp_flag', ('mean', 'min', 'max', 'cnt')),
        (365 * 2, '2365_ltp', 'late_pay', 'ltp_flag', ('mean', 'max', 'cnt')),
        (365, '365_ltp', 'late_pay', 'ltp_flag', ('mean', 'max', 'cnt')),
        (180, '180_ltp', 'late_pay', 'ltp_flag', ('mean', 'max', 'cnt')),
        (90, '90_ltp', 'late_pay', 'ltp_flag', ('mean', 'min', 'max')),
        (365 * 2, '2365_lsp', 'less_pay', 'lsp_flag', ('mean', 'min', 'max', 'cnt')),
        (365, '365_lsp', 'less_pay', 'lsp_flag', ('mean', 'min', 'max', 'cnt')),
        (180, '180_lsp', 'less_pay', 'lsp_flag', ('mean', 'min', 'max')),
        (90, '90_lsp', 'less_pay', 'lsp_flag', ('mean', 'min', 'max')),
    )
    for days_back, prefix, value_col, flag_col, stats in window_specs:
        recent = inst_df[inst_df['DAYS_ENTRY_PAYMENT'] >= -days_back].groupby('SK_ID_CURR')
        for stat in stats:
            if stat == 'cnt':
                inst_agg[f'{prefix}_flag_cnt'] = recent[flag_col].sum()
            else:
                inst_agg[f'{prefix}_flag_{stat}'] = recent[value_col].agg(stat)

    return inst_agg

In [26]:
inst_data = pd.read_csv('/kaggle/input/home-credit-default-risk/installments_payments.csv')

# Aggregate to one row per applicant, attach to the application table and keep
# only the rows that carry a label.
df_inst = install_bal(inst_data)
df_inst = main_df.merge(df_inst, how='left', on='SK_ID_CURR')
df_inst = df_inst[df_inst['TARGET'].notnull()]

y_train = df_inst['TARGET']
# Select the newly merged columns with an ordered list. A set is not accepted
# as a DataFrame indexer by pandas 2.x, and its arbitrary iteration order would
# make the feature order differ from run to run.
train_column = [col for col in df_inst.columns if col not in main_df.columns]
X_train = df_inst[train_column]

train_column = X_train.columns
X_train = X_train.replace([np.inf, -np.inf], np.nan)
imputer1 = SimpleImputer(missing_values=np.nan, strategy='median')
X_train = imputer1.fit_transform(X_train)
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)

### Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

# Class-weighted logistic regression using the liblinear solver.
clf2 = LogisticRegression(solver='liblinear', class_weight=wts)

# Fit on the current training matrix and report the AUC on those same rows.
clf2.fit(X_train, y_train)
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=clf2.predict_proba(X_train)[:, 1]),
)

### Decision tree

In [28]:
from sklearn import tree
# Depth-limited, class-weighted decision tree, scored on its own training rows.
DTclf2 = tree.DecisionTreeClassifier(max_depth=6, class_weight=wts)
DTclf2.fit(X_train, y_train)
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=DTclf2.predict_proba(X_train)[:, 1]),
)

### LightGBM

In [29]:
# LightGBM with GOSS boosting; scale_pos_weight carries the class-imbalance ratio.
lgclf2 = LGBMClassifier(
    boosting_type='goss',
    n_estimators=10000,
    learning_rate=0.005,
    num_leaves=30,
    max_depth=6,
    subsample_for_bin=24000,
    reg_alpha=0.436193,
    reg_lambda=0.479169,
    min_split_gain=0.024766,
    subsample=1,
    scale_pos_weight=poswt,
)
# NOTE(review): no eval_set is passed to fit, so it is unclear what eval_metric
# is evaluated on here — confirm against the LightGBM documentation.
lgclf2.fit(X_train, y_train, eval_metric='auc')
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=lgclf2.predict_proba(X_train)[:, 1]),
)

**The best model is Lightgbm**

### Top 20 features for installments table

In [31]:
# Feature names come from the merged installments frame; importances from the fitted LightGBM model.
plotImp(model=lgclf2, X=df_inst[train_column])

## 3) POS table

In [32]:
def pos_appl(pos_df):
    """Aggregate POS/cash balance rows into one feature row per ``SK_ID_CURR``.

    Rows whose ``NAME_CONTRACT_STATUS`` is ``'XNA'`` are dropped, the remaining
    text columns are one-hot encoded, and the result is summarised per applicant:

    * ``POS_<column>_<STAT>`` over all rows (mean for every non-key column;
      sum, mean and max for ``MONTHS_BALANCE``),
    * ``pos_cnt``, the number of rows per applicant,
    * mean / max of ``CNT_INSTALMENT_FUTURE`` over rows with
      ``MONTHS_BALANCE`` >= -24 and >= -12,
    * mean / max of instalment and days-past-due columns restricted to rows
      flagged Active and to rows flagged Completed.

    Returns a DataFrame indexed by ``SK_ID_CURR``.
    """
    keep_rows = pos_df['NAME_CONTRACT_STATUS'] != 'XNA'
    pos_df = pos_df[keep_rows]
    pos_df, _dummy_columns, _all_columns = one_hot_encoding_dataframe(pos_df)

    # MONTHS_BALANCE gets three statistics; every other non-key column its mean.
    pos_data_agg = {}
    for name in pos_df.columns:
        if name == 'MONTHS_BALANCE':
            pos_data_agg[name] = ['sum', 'mean', 'max']
        elif name not in ('SK_ID_CURR', 'SK_ID_PREV'):
            pos_data_agg[name] = ['mean']

    pos_agg = pos_df.groupby('SK_ID_CURR').agg(pos_data_agg)
    pos_agg.columns = ["POS_" + source + "_" + stat.upper() for source, stat in pos_agg.columns]
    pos_agg['pos_cnt'] = pos_df.groupby('SK_ID_CURR')['SK_ID_PREV'].count()

    # Remaining-instalment statistics over the most recent months only.
    recent_windows = (
        (-24, '24_c_inst_mean', '24_c_inst_max'),
        (-12, '12_c_inst_mean', '12_c_inst_max'),
    )
    for oldest_month, mean_name, max_name in recent_windows:
        recent_future = (
            pos_df[pos_df.MONTHS_BALANCE >= oldest_month]
            .groupby('SK_ID_CURR')['CNT_INSTALMENT_FUTURE']
        )
        pos_agg[mean_name] = recent_future.mean()
        pos_agg[max_name] = recent_future.max()

    # (status dummy column, ((output name, source column, statistic), ...)).
    # The order of the entries fixes the order of the output columns.
    status_features = (
        ('NAME_CONTRACT_STATUS_Active', (
            ('active_inst_mean', 'CNT_INSTALMENT', 'mean'),
            ('active_inst_max', 'CNT_INSTALMENT', 'max'),
            ('active_dpd_mean', 'SK_DPD', 'mean'),
            ('active_dpd_max', 'SK_DPD', 'max'),
            ('active_inst_fut_mean', 'CNT_INSTALMENT_FUTURE', 'mean'),
            ('active_inst_fut_max', 'CNT_INSTALMENT_FUTURE', 'max'),
            ('active_dpd_def_mean', 'SK_DPD_DEF', 'mean'),
            ('active_dpd_def_max', 'SK_DPD_DEF', 'max'),
        )),
        ('NAME_CONTRACT_STATUS_Completed', (
            ('com_dpd_mean', 'SK_DPD', 'mean'),
            ('com_dpd_max', 'SK_DPD', 'max'),
            ('com_dpd_def_mean', 'SK_DPD_DEF', 'mean'),
            ('com_dpd_def_max', 'SK_DPD_DEF', 'max'),
            ('com_inst_fut_mean', 'CNT_INSTALMENT_FUTURE', 'mean'),
            ('com_inst_fut_max', 'CNT_INSTALMENT_FUTURE', 'max'),
            ('com_inst_mean', 'CNT_INSTALMENT', 'mean'),
            ('com_inst_max', 'CNT_INSTALMENT', 'max'),
        )),
    )
    for status_column, features in status_features:
        by_applicant = pos_df[pos_df[status_column] == 1].groupby('SK_ID_CURR')
        for output_name, source_column, stat in features:
            pos_agg[output_name] = by_applicant[source_column].agg(stat)

    return pos_agg


In [33]:
pos_data = pd.read_csv('/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv')

# Aggregate to one row per applicant, attach to the application table and keep
# only the rows that carry a label.
df_pos = pos_appl(pos_data)
df_pos = main_df.merge(df_pos, how='left', on='SK_ID_CURR')
df_pos = df_pos[df_pos['TARGET'].notnull()]

y_train = df_pos['TARGET']
# Select the newly merged columns with an ordered list. A set is not accepted
# as a DataFrame indexer by pandas 2.x, and its arbitrary iteration order would
# make the feature order differ from run to run.
train_column = [col for col in df_pos.columns if col not in main_df.columns]
X_train = df_pos[train_column]

train_column = X_train.columns
X_train = X_train.replace([np.inf, -np.inf], np.nan)
imputer1 = SimpleImputer(missing_values=np.nan, strategy='median')
X_train = imputer1.fit_transform(X_train)
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)

### Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression

# Class-weighted logistic regression using the liblinear solver.
clf3 = LogisticRegression(solver='liblinear', class_weight=wts)

# Fit on the current training matrix and report the AUC on those same rows.
clf3.fit(X_train, y_train)
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=clf3.predict_proba(X_train)[:, 1]),
)

### Decision tree

In [35]:
from sklearn import tree
# Depth-limited, class-weighted decision tree, scored on its own training rows.
DTclf3 = tree.DecisionTreeClassifier(max_depth=6, class_weight=wts)
DTclf3.fit(X_train, y_train)
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=DTclf3.predict_proba(X_train)[:, 1]),
)

### LightGBM

In [36]:
# LightGBM with GOSS boosting; scale_pos_weight carries the class-imbalance ratio.
lgclf3 = LGBMClassifier(
    boosting_type='goss',
    n_estimators=10000,
    learning_rate=0.005,
    num_leaves=30,
    max_depth=6,
    subsample_for_bin=24000,
    reg_alpha=0.436193,
    reg_lambda=0.479169,
    min_split_gain=0.024766,
    subsample=1,
    scale_pos_weight=poswt,
)
# NOTE(review): no eval_set is passed to fit, so it is unclear what eval_metric
# is evaluated on here — confirm against the LightGBM documentation.
lgclf3.fit(X_train, y_train, eval_metric='auc')
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=lgclf3.predict_proba(X_train)[:, 1]),
)

**The best model is Lightgbm**

### Top 20 features for POS table

In [37]:
# Feature names come from the merged POS frame; importances from the fitted LightGBM model.
plotImp(model=lgclf3, X=df_pos[train_column])

## 4) Bureau and Bureau Balance tables

In [38]:
def bureau_bal(bureau_balance_data,bureau_data):
    """Build one feature row per ``SK_ID_CURR`` from the bureau tables.

    Parameters
    ----------
    bureau_balance_data : pandas.DataFrame
        Bureau balance rows (the notebook passes ``bureau_balance.csv``). They
        are one-hot encoded and summarised per ``SK_ID_BUREAU``.
    bureau_data : pandas.DataFrame
        Bureau credit rows (the notebook passes ``bureau.csv``). They are one-hot
        encoded, extended with engineered columns, joined with the balance
        summary on ``SK_ID_BUREAU`` and then summarised per ``SK_ID_CURR``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR``. Contains ``<column>_<STAT>`` aggregates plus
        extra statistics restricted to active and to closed credits.

    Raises
    ------
    KeyError
        If one of the hard-coded ``CREDIT_TYPE_*`` / ``CREDIT_ACTIVE_*`` dummy
        columns is absent from the encoded bureau data.

    Changes compared with the previous version
    ------------------------------------------
    * The key columns are now really excluded from the aggregation. The old
      test ``col!='SK_ID_CURR' or col!='SK_BUREAU_ID'`` was always true, so the
      mean of both id columns was computed and then dropped again.
    * ``cls_cred_d_mean`` / ``cls_cred_d_max`` keep the closed-credit *debt*
      statistics. They used to be overwritten right away by the closed-credit
      *limit* statistics, which are now stored as ``cls_cred_limit_mean`` /
      ``cls_cred_limit_max`` (mirroring ``active_cred_limit_*``).
    * The 365243 replacement is written back with an assignment; the chained
      ``replace(..., inplace=True)`` form has no effect under pandas
      Copy-on-Write.
    * Balance dummy columns are aggregated in frame order instead of the order
      of the list returned by the encoder, so the column order is reproducible.
    """
    key_columns = ('SK_ID_CURR', 'SK_ID_BUREAU')

    # --- bureau balance: one row per SK_ID_BUREAU --------------------------------
    bureau_balance_data, balance_dummy_columns, _ = one_hot_encoding_dataframe(bureau_balance_data)
    balance_dummy_columns = set(balance_dummy_columns)

    bureau_balance_agg = {'MONTHS_BALANCE': ['min', 'max', 'sum']}
    # The mean of a dummy column is the share of rows that have that category.
    for col in bureau_balance_data.columns:
        if col in balance_dummy_columns:
            bureau_balance_agg[col] = ['mean']

    bal_agg = bureau_balance_data.groupby(['SK_ID_BUREAU']).agg(bureau_balance_agg)
    bal_agg.columns = [source + "_" + stat.upper() for source, stat in bal_agg.columns]

    # --- bureau: row-level engineered columns ------------------------------------
    bureau_data, _, _ = one_hot_encoding_dataframe(bureau_data)

    secured_types = [
        'CREDIT_TYPE_Car loan',
        'CREDIT_TYPE_Loan for the purchase of equipment',
        'CREDIT_TYPE_Mortgage',
        'CREDIT_TYPE_Real estate loan',
        'CREDIT_TYPE_Loan for purchase of shares (margin lending)',
    ]
    unsecured_types = [
        'CREDIT_TYPE_Another type of loan',
        'CREDIT_TYPE_Cash loan (non-earmarked)',
        'CREDIT_TYPE_Consumer credit',
        'CREDIT_TYPE_Credit card',
        'CREDIT_TYPE_Interbank credit',
        'CREDIT_TYPE_Loan for business development',
        'CREDIT_TYPE_Loan for working capital replenishment',
        'CREDIT_TYPE_Microloan',
        'CREDIT_TYPE_Mobile operator loan',
        'CREDIT_TYPE_Unknown type of loan',
    ]
    # Each row has at most one credit type set, so these are 0/1 per row and
    # become counts once summed per applicant below.
    bureau_data['SEC_LOAN_COUNT'] = (bureau_data[secured_types] == 1).sum(axis=1)
    bureau_data['UNSEC_LOAN_COUNT'] = (bureau_data[unsecured_types] == 1).sum(axis=1)

    bureau_data['ex_pay'] = bureau_data['AMT_ANNUITY'] - bureau_data['AMT_CREDIT_SUM']
    bureau_data['debt'] = bureau_data['AMT_CREDIT_SUM_DEBT'] / bureau_data['AMT_CREDIT_SUM']
    bureau_data['annu_per_cred'] = bureau_data['AMT_ANNUITY'] / bureau_data['AMT_CREDIT_SUM']

    for col in bureau_data.columns:
        if col.startswith('DAYS'):
            bureau_data[col] = bureau_data[col].replace(365243, np.nan)

    bureau_data = bureau_data.join(bal_agg, how='left', on=['SK_ID_BUREAU'])

    # --- bureau: one row per SK_ID_CURR ------------------------------------------
    summed_columns = {'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_OVERDUE', 'UNSEC_LOAN_COUNT', 'SEC_LOAN_COUNT'}
    bureau_data_agg = {}
    for col in bureau_data.columns:
        if col in key_columns:
            continue
        if col in summed_columns:
            bureau_data_agg[col] = ['sum']
        elif col == 'DAYS_CREDIT':
            bureau_data_agg[col] = ['min']
        else:
            bureau_data_agg[col] = ['mean']

    per_applicant = bureau_data.groupby('SK_ID_CURR')
    bureau_agg = per_applicant.agg(bureau_data_agg)
    bureau_agg.columns = [source + "_" + stat.upper() for source, stat in bureau_agg.columns]

    bureau_agg['abs_cre_max'] = abs(bureau_agg['DAYS_CREDIT_MIN'] / 365)
    bureau_agg['cre_max_od_max'] = per_applicant['AMT_CREDIT_MAX_OVERDUE'].max()
    bureau_agg['cnt'] = per_applicant['SK_ID_BUREAU'].count()

    # (status dummy column, ((output prefix, source column), ...)); every entry
    # yields '<prefix>_mean' and '<prefix>_max'. Order fixes the column order.
    status_features = (
        ('CREDIT_ACTIVE_Active', (
            ('active_cred', 'AMT_CREDIT_SUM'),
            ('cred_od', 'AMT_CREDIT_MAX_OVERDUE'),
            ('active_cred_debt', 'AMT_CREDIT_SUM_DEBT'),
            ('active_cred_limit', 'AMT_CREDIT_SUM_LIMIT'),
            ('cred_days', 'DAYS_CREDIT'),
            ('cred_ed', 'DAYS_CREDIT_ENDDATE'),
        )),
        ('CREDIT_ACTIVE_Closed', (
            ('cls_cred', 'AMT_CREDIT_SUM'),
            ('B_CLO_AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_MAX_OVERDUE'),
            ('cls_cred_d', 'AMT_CREDIT_SUM_DEBT'),
            ('cls_cred_limit', 'AMT_CREDIT_SUM_LIMIT'),
            ('cls_credd_ed', 'DAYS_CREDIT_ENDDATE'),
            ('cls_credd', 'DAYS_CREDIT'),
        )),
    )
    for status_column, features in status_features:
        by_applicant = bureau_data[bureau_data[status_column] == 1].groupby('SK_ID_CURR')
        for prefix, source_column in features:
            bureau_agg[f'{prefix}_mean'] = by_applicant[source_column].mean()
            bureau_agg[f'{prefix}_max'] = by_applicant[source_column].max()

    return bureau_agg

In [39]:
bureau_bal_data = pd.read_csv('/kaggle/input/home-credit-default-risk/bureau_balance.csv')
bureau_data = pd.read_csv('/kaggle/input/home-credit-default-risk/bureau.csv')

# Aggregate to one row per applicant, attach to the application table and keep
# only the rows that carry a label.
df_bureau = bureau_bal(bureau_bal_data, bureau_data)
df_bureau = main_df.merge(df_bureau, how='left', on='SK_ID_CURR')
df_bureau = df_bureau[df_bureau['TARGET'].notnull()]

y_train = df_bureau['TARGET']
# Select the newly merged columns with an ordered list. A set is not accepted
# as a DataFrame indexer by pandas 2.x, and its arbitrary iteration order would
# make the feature order differ from run to run.
train_column = [col for col in df_bureau.columns if col not in main_df.columns]
X_train = df_bureau[train_column]

train_column = X_train.columns
X_train = X_train.replace([np.inf, -np.inf], np.nan)
imputer1 = SimpleImputer(missing_values=np.nan, strategy='median')
X_train = imputer1.fit_transform(X_train)
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)


### Logistic Regression

In [40]:
from sklearn.linear_model import LogisticRegression

# Class-weighted logistic regression using the liblinear solver.
clf4 = LogisticRegression(solver='liblinear', class_weight=wts)

# Fit on the current training matrix and report the AUC on those same rows.
clf4.fit(X_train, y_train)
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=clf4.predict_proba(X_train)[:, 1]),
)

### Decision tree

In [41]:
from sklearn import tree
# Depth-limited, class-weighted decision tree, scored on its own training rows.
DTclf4 = tree.DecisionTreeClassifier(max_depth=6, class_weight=wts)
DTclf4.fit(X_train, y_train)
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=DTclf4.predict_proba(X_train)[:, 1]),
)

### LightGBM

In [42]:
# LightGBM with GOSS boosting; scale_pos_weight carries the class-imbalance ratio.
lgclf4 = LGBMClassifier(
    boosting_type='goss',
    n_estimators=10000,
    learning_rate=0.005,
    num_leaves=30,
    max_depth=6,
    subsample_for_bin=24000,
    reg_alpha=0.436193,
    reg_lambda=0.479169,
    min_split_gain=0.024766,
    subsample=1,
    scale_pos_weight=poswt,
)
# NOTE(review): no eval_set is passed to fit, so it is unclear what eval_metric
# is evaluated on here — confirm against the LightGBM documentation.
lgclf4.fit(X_train, y_train, eval_metric='auc')
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=lgclf4.predict_proba(X_train)[:, 1]),
)

**The best model is Lightgbm**

### Top 20 features of Bureau and bureau balance tables

In [43]:
# Feature names come from the merged bureau frame; importances from the fitted LightGBM model.
plotImp(model=lgclf4, X=df_bureau[train_column])

## 5) Previous application table

In [44]:
def prev_appl(prev_df):
    """Aggregate previous-application rows into one feature row per ``SK_ID_CURR``.

    Parameters
    ----------
    prev_df : pandas.DataFrame
        Previous application rows (the notebook passes the contents of
        ``previous_application.csv``). The frame is copied first, so the
        caller's object is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR`` with ``PREV_<column>_<STAT>`` aggregates over
        all rows, ``cr_*`` statistics over refused or cancelled applications and
        ``va_*`` statistics over approved applications.

    Changes compared with the previous version
    ------------------------------------------
    The 365243 replacement is written back with an assignment. The chained
    ``prev_df[col].replace(..., inplace=True)`` form acts on a temporary object
    under pandas Copy-on-Write and then has no effect.
    """
    prev_df = prev_df.copy()

    # 365243 is treated as an outlier value elsewhere in this notebook.
    for col in prev_df.columns:
        if col.startswith('DAYS'):
            prev_df[col] = prev_df[col].replace(365243, np.nan)

    # Row-level engineered columns.
    total_repayment = prev_df['CNT_PAYMENT'] * prev_df['AMT_ANNUITY']
    prev_df['extra_paid'] = total_repayment - prev_df['AMT_CREDIT']
    prev_df['to_pay'] = total_repayment - prev_df['AMT_DOWN_PAYMENT']
    # Relative excess of the total repayment over the credit, per payment.
    prev_df['roi'] = (1 / prev_df['CNT_PAYMENT']) * ((total_repayment / prev_df['AMT_CREDIT']) - 1)
    prev_df['si'] = (prev_df['AMT_CREDIT'] * prev_df['roi'] * prev_df['CNT_PAYMENT']) / 100
    prev_df['xap'] = (prev_df['CODE_REJECT_REASON'] == 'XAP').astype(int)
    # 1 when the down payment is at most 40% of the credit amount.
    prev_df['dp'] = (prev_df['AMT_DOWN_PAYMENT'] <= (0.40 * prev_df['AMT_CREDIT'])).astype(int)

    prev_df, _, _ = one_hot_encoding_dataframe(prev_df)

    # A few columns get max and mean; every other non-key column its mean.
    max_mean_columns = {
        'DAYS_TERMINATION', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE', 'AMT_CREDIT',
        'AMT_ANNUITY', 'AMT_DOWN_PAYMENT', 'DAYS_LAST_DUE_1ST_VERSION',
        'HOUR_APPR_PROCESS_START',
    }
    prev_df_agg = {}
    for col in prev_df.columns:
        if col in max_mean_columns:
            prev_df_agg[col] = ['max', 'mean']
        elif col not in ('SK_ID_CURR', 'SK_ID_PREV'):
            prev_df_agg[col] = ['mean']

    prev_agg = prev_df.groupby('SK_ID_CURR').agg(prev_df_agg)
    prev_agg.columns = ["PREV_" + source + "_" + stat.upper() for source, stat in prev_agg.columns]

    refused_or_cancelled = (
        (prev_df['NAME_CONTRACT_STATUS_Refused'] == 1)
        | (prev_df['NAME_CONTRACT_STATUS_Canceled'] == 1)
    )
    approved = prev_df['NAME_CONTRACT_STATUS_Approved'] == 1

    # (row mask, ((output name, source column, statistic), ...)).
    # The order of the entries fixes the order of the output columns.
    status_features = (
        (refused_or_cancelled, (
            ('cr_cred_mean', 'AMT_CREDIT', 'mean'),
            ('cr_cred_max', 'AMT_CREDIT', 'max'),
            ('cr_int_mean', 'roi', 'mean'),
            ('cr_int_max', 'roi', 'max'),
            ('cr_annu_mean', 'AMT_ANNUITY', 'mean'),
            ('cr_annu_max', 'AMT_ANNUITY', 'max'),
            ('cr_dp_mean', 'AMT_DOWN_PAYMENT', 'mean'),
            ('cr_dp_max', 'AMT_DOWN_PAYMENT', 'max'),
            ('cr_lp_mean', 'to_pay', 'mean'),
            ('cr_lp_max', 'to_pay', 'max'),
        )),
        (approved, (
            ('va_cred_mean', 'AMT_CREDIT', 'mean'),
            ('va_cred_max', 'AMT_CREDIT', 'max'),
            ('va_annu_mean', 'AMT_ANNUITY', 'mean'),
            ('va_annu_max', 'AMT_ANNUITY', 'max'),
            ('va_dp_mean', 'AMT_DOWN_PAYMENT', 'mean'),
            ('va_int_mean', 'roi', 'mean'),
            ('va_int_max', 'roi', 'max'),
            ('va_dp_max', 'AMT_DOWN_PAYMENT', 'max'),
            ('va_lp_mean', 'to_pay', 'mean'),
            ('va_lp_max', 'to_pay', 'max'),
        )),
    )
    for row_mask, features in status_features:
        by_applicant = prev_df[row_mask].groupby('SK_ID_CURR')
        for output_name, source_column, stat in features:
            prev_agg[output_name] = by_applicant[source_column].agg(stat)

    return prev_agg

In [45]:
prev_data = pd.read_csv('/kaggle/input/home-credit-default-risk/previous_application.csv')

# Aggregate to one row per applicant, attach to the application table and keep
# only the rows that carry a label.
df_prevapp = prev_appl(prev_data)
df_prevapp = main_df.merge(df_prevapp, how='left', on='SK_ID_CURR')
df_prevapp = df_prevapp[df_prevapp['TARGET'].notnull()]

y_train = df_prevapp['TARGET']
# Select the newly merged columns with an ordered list. A set is not accepted
# as a DataFrame indexer by pandas 2.x, and its arbitrary iteration order would
# make the feature order differ from run to run.
train_column = [col for col in df_prevapp.columns if col not in main_df.columns]
X_train = df_prevapp[train_column]

train_column = X_train.columns
X_train = X_train.replace([np.inf, -np.inf], np.nan)
imputer1 = SimpleImputer(missing_values=np.nan, strategy='median')
X_train = imputer1.fit_transform(X_train)
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)


### Logistic Regression

In [46]:
from sklearn.linear_model import LogisticRegression

# Class-weighted logistic regression using the liblinear solver.
clf5 = LogisticRegression(solver='liblinear', class_weight=wts)

# Fit on the current training matrix and report the AUC on those same rows.
clf5.fit(X_train, y_train)
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=clf5.predict_proba(X_train)[:, 1]),
)

### Decision Tree

In [47]:
from sklearn import tree
# Depth-limited, class-weighted decision tree, scored on its own training rows.
DTclf5 = tree.DecisionTreeClassifier(max_depth=6, class_weight=wts)
DTclf5.fit(X_train, y_train)
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=DTclf5.predict_proba(X_train)[:, 1]),
)

### LightGBM

In [48]:
# LightGBM with GOSS boosting; scale_pos_weight carries the class-imbalance ratio.
lgclf5 = LGBMClassifier(
    boosting_type='goss',
    n_estimators=10000,
    learning_rate=0.005,
    num_leaves=30,
    max_depth=6,
    subsample_for_bin=24000,
    reg_alpha=0.436193,
    reg_lambda=0.479169,
    min_split_gain=0.024766,
    subsample=1,
    scale_pos_weight=poswt,
)
# NOTE(review): no eval_set is passed to fit, so it is unclear what eval_metric
# is evaluated on here — confirm against the LightGBM documentation.
lgclf5.fit(X_train, y_train, eval_metric='auc')
print(
    "ROCAUC Score :",
    roc_auc_score(y_true=y_train, y_score=lgclf5.predict_proba(X_train)[:, 1]),
)

**The best model is Lightgbm**

### Top 20 features of previous applications table

In [49]:
# Feature names come from the merged previous-application frame; importances from the fitted LightGBM model.
plotImp(model=lgclf5, X=df_prevapp[train_column])

## 6) Application train and test data

In [50]:
def appl_train_test(app_train,app_test):
    """Stack the application train and test tables and add engineered features.

    Parameters
    ----------
    app_train : pandas.DataFrame
        Application table for the training rows.
    app_test : pandas.DataFrame
        Application table for the test rows; stacked below ``app_train``.
        Presumably it has no ``TARGET`` column, so its rows end up with a
        null ``TARGET`` after stacking -- confirm against the caller.

    Returns
    -------
    pandas.DataFrame
        The stacked frame with an ``index`` column (the pre-stack row labels,
        produced by ``reset_index()``), the engineered columns added below,
        and object-dtype columns expanded by ``one_hot_encoding_dataframe``.
    """
    # Column groups that feed more than one engineered feature.
    bureau_request_cols = [
        'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
        'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
        'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
    ]
    ext_source_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
    social_circle_cols = [
        'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
        'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
    ]
    contact_flag_cols = [
        'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
        'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
    ]
    document_flag_cols = [
        'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4',
        'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7',
        'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10',
        'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13',
        'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16',
        'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19',
        'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21',
    ]

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
    # same way. reset_index() keeps the old row labels in an 'index' column.
    df = pd.concat([app_train, app_test]).reset_index()

    # 365243 is an outlier value in DAYS_EMPLOYED; treat it as missing.
    # Assign the result back instead of a chained inplace replace, which does
    # not reliably update the frame under pandas Copy-on-Write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace({365243: np.nan})
    # Total credit-bureau enquiries across all six time windows.
    df['hdwmqy'] = df[bureau_request_cols].sum(axis=1)

    # Using domain knowledge
    # share of lifetime spent in the current employment
    df['days_work'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['days_unemp'] = df['DAYS_BIRTH'].abs() - df['DAYS_EMPLOYED'].abs()
    # income, credit and annuity relative to the price of the goods
    df['inc_per_price'] = df['AMT_INCOME_TOTAL'] / df['AMT_GOODS_PRICE']
    df['cred_per_price'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    df['ann_per_price'] = df['AMT_ANNUITY'] / df['AMT_GOODS_PRICE']
    # income divided by (family members + 1)
    df['inc_per'] = df['AMT_INCOME_TOTAL'] / (df['CNT_FAM_MEMBERS'] + 1)
    # income and employment length relative to the credit amount
    df['inc_per_cred'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['emp_per_cred'] = df['DAYS_EMPLOYED'] / df['AMT_CREDIT']

    # annuity relative to the credit amount
    df['pay_rate'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    df['loan_pay'] = df['AMT_INCOME_TOTAL'] - df['AMT_ANNUITY']
    # sum of the four social-circle counts, floor-divided by 4
    df['soc_cir'] = df[social_circle_cols].sum(axis=1) // 4
    df['mean_eq'] = df[bureau_request_cols].mean(axis=1)
    # number of contact flags that are set
    df['contact'] = df[contact_flag_cols].sum(axis=1)

    # Row-wise summary statistics of the three external source scores.
    df['ext_mean'] = df[ext_source_cols].mean(axis=1)
    df['ext_med'] = df[ext_source_cols].median(axis=1)
    df['ext_min'] = df[ext_source_cols].min(axis=1)
    df['ext_max'] = df[ext_source_cols].max(axis=1)

    # Number of document flags equal to 1 (column name kept as spelled,
    # since it is part of the returned frame's schema).
    df['DOCUMNNET_COUNT'] = (df[document_flag_cols] == 1).sum(axis=1)

    # Only the encoded frame is needed; the helper's two extra return values
    # (new column names, full column index) are discarded.
    df, _, _ = one_hot_encoding_dataframe(df)

    return df

In [51]:
# Read the application train/test tables from the Kaggle input directory
# and build the combined, feature-engineered frame from them.
dtrain = pd.read_csv(
    '/kaggle/input/home-credit-default-risk/application_train.csv'
)
dtest = pd.read_csv(
    '/kaggle/input/home-credit-default-risk/application_test.csv'
)

df = appl_train_test(app_train=dtrain, app_test=dtest)


In [None]:
# Split the combined frame: rows with a TARGET are training data, the rest are test data.
train_data_df = df[df['TARGET'].notnull()]
test_data_df = df[df['TARGET'].isnull()]

# Feature columns = everything except the label and the two identifier columns.
# Ordered lists are used instead of sets: indexing a DataFrame with a set raises
# TypeError on pandas >= 2.0, and a set would make the feature order vary between runs.
train_column = [col for col in train_data_df.columns
                if col not in {'TARGET', 'index', 'SK_ID_CURR'}]
test_column = [col for col in test_data_df.columns
               if col not in {'SK_ID_CURR', 'TARGET', 'index'}]

#ytrain to train the model
y_train = train_data_df['TARGET']
X_train = train_data_df[train_column]
# The ratio features can contain +/-inf when a denominator is 0; SimpleImputer
# rejects infinities, so map them to NaN first (same treatment as the
# previous-applications cell).
X_train = X_train.replace([np.inf, -np.inf], np.nan)
from sklearn.impute import SimpleImputer
imputer1 = SimpleImputer(missing_values=np.nan, strategy='median')
X_train = imputer1.fit_transform(X_train)

# Scale every feature to [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)

### Logistic regression

In [52]:
from sklearn.linear_model import LogisticRegression

# Class-weighted logistic regression (liblinear solver) on the scaled features.
# fit() hands back the fitted estimator, so construction and training are chained.
clf6 = LogisticRegression(solver='liblinear', class_weight=wts).fit(X_train, y_train)

# ROC AUC measured on the same rows the model was fitted on.
print(
    "ROCAUC Score :",
    roc_auc_score(y_train, clf6.predict_proba(X_train)[:, 1]),
)

### Decision tree

In [53]:
from sklearn import tree
# Depth-limited, class-weighted decision tree; fit() returns the fitted estimator.
DTclf6 = tree.DecisionTreeClassifier(max_depth=6, class_weight=wts).fit(X_train, y_train)

# ROC AUC measured on the same rows the tree was fitted on.
print(
    "ROCAUC Score :",
    roc_auc_score(y_train, DTclf6.predict_proba(X_train)[:, 1]),
)

### LightGBM

In [54]:
# LightGBM classifier for the application-table features.
lgclf6 = LGBMClassifier(
    # boosting schedule
    boosting_type='goss',
    n_estimators=10000,
    learning_rate=0.005,
    # tree shape
    num_leaves=30,
    max_depth=6,
    # sampling / binning
    subsample=1,
    subsample_for_bin=24000,
    # regularisation
    reg_alpha=0.436193,
    reg_lambda=0.479169,
    min_split_gain=0.024766,
    # class imbalance weight, defined earlier in the notebook
    scale_pos_weight=poswt,
)
lgclf6.fit(X_train, y_train, eval_metric='auc')

# ROC AUC measured on the same rows the model was fitted on.
print(
    "ROCAUC Score :",
    roc_auc_score(y_train, lgclf6.predict_proba(X_train)[:, 1]),
)

**The best model is LightGBM**

### Top 20 features of application train (main) table

In [55]:
# Plot importances of lgclf6, the LightGBM model fitted on this table's features
# (the call previously passed lgclf4, a model from a different table).
# list(...) gives pandas a valid column indexer even when train_column is a set.
plotImp(lgclf6, train_data_df[list(train_column)])

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Depth-limited, class-weighted random forest on the application-table features.
rf = RandomForestClassifier(max_depth=6, class_weight=wts)
# fit() returns the estimator itself, so rf_model refers to the same fitted object as rf.
rf_model = rf.fit(X_train, y_train)


# Hence the most suitable Level 0 model is LightGBM, with the same parameters used for every table