# Kaggle Contest : Home Credit Default Risk
MAFS 6010Z Project 1 
Wong Hoi Ming(20641276)
Wong Sik Tsun(20038819)

https://www.kaggle.com/c/home-credit-default-risk/overview

In this project, we are going to apply gradient boosting trees to predict whether a client will have payment difficulties.

We will go straight into the implementation part, followed by results analysis, as a clear description of the data set can be find in the following link.
https://www.kaggle.com/c/home-credit-default-risk/data

##Data Processing

We first import our data sets. The data of current applications, previous applications and clients' history of required installmet and actual payment will be explored and used for model fitting.

In [1]:
#Import data and install the necessary packages
import pandas as pd
import numpy as np
import time
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, train_test_split
import matplotlib.pyplot as plt

app_train = pd.read_csv("application_train.csv",sep = ",")
app_test = pd.read_csv("application_test.csv",sep = ",")
prev_app = pd.read_csv("previous_application.csv",sep = ",")
instl_pmt = pd.read_csv("installments_payments.csv",sep = ",")

#print(app_train.shape)
    
# We define 3 functions for the exploratory analysis on the features
# This function is written for checking the situation of missing values in each column of a dataframe

def missing_val_table(dataframe):
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0]
    n_miss = dataframe[na_columns].isnull().sum().sort_values(ascending=False)
    ratio = (dataframe[na_columns].isnull().sum() / dataframe.shape[0] * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    return missing_df

# We define two functions for a brief summary of our numerical and categorical variables
def cat_summary(dataframe, col_name, plot=False):
    print(pd.DataFrame({col_name: dataframe[col_name].value_counts(),
                        "Ratio": 100 * dataframe[col_name].value_counts() / len(dataframe)}))  
def num_summary(dataframe, numerical_col, plot=False):
    quantiles = [0.01, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    print(dataframe[numerical_col].describe(quantiles).T)    
    
##########################################################################################
############ Data Cleansing/Feature Engineering in data set application_train ############
##########################################################################################

# We replace illogical data point in days_employed and replace it with na
# since one should not have worked for 1000 years. The 365243 data point is not valid
app_train['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)

# We skim through the categorical variables and check the no. of distinct values in each of them
# We apply domain knowledge to group different values of occupation & organization type
# so that later we will have a lower dimensionality when we do one-hot encoding for the categorical variables
app_train.select_dtypes('object').apply(pd.Series.nunique, axis=0)
np.unique(app_train['ORGANIZATION_TYPE'])
np.unique(str(app_train['OCCUPATION_TYPE']))

## We define a function that will process app_train and app_test in the same manner
def process_applications(df):
# Organization - We regroup the organization type so that the resultant categorization will leave 
# fewer distinct values
    df["ORGANIZATION_TYPE"] = np.where(df["ORGANIZATION_TYPE"].str.contains("Business Entity"), 
                                       "Business_Entity", df["ORGANIZATION_TYPE"])
    df["ORGANIZATION_TYPE"] = np.where(df["ORGANIZATION_TYPE"].str.contains("Industry"), 
                                       "Industry", df["ORGANIZATION_TYPE"])
    df["ORGANIZATION_TYPE"] = np.where(df["ORGANIZATION_TYPE"].str.contains("Trade"),"Trade",
                                   df["ORGANIZATION_TYPE"])
    df["ORGANIZATION_TYPE"] = np.where(df["ORGANIZATION_TYPE"].isin(["Emergency","Police", 
                                                                 "Government", "Postal", 
                                                                "Military", "Security Ministries"]), 
                                       "Official", df["ORGANIZATION_TYPE"])
    df["ORGANIZATION_TYPE"] = np.where(df["ORGANIZATION_TYPE"].str.contains("Transport"),
                                       "Transport", df["ORGANIZATION_TYPE"])
    df["ORGANIZATION_TYPE"] = np.where(df["ORGANIZATION_TYPE"].isin(["School", "Kindergarten", "University"]),
                                       "Education", df["ORGANIZATION_TYPE"])
    df["ORGANIZATION_TYPE"] = np.where(df["ORGANIZATION_TYPE"].isin(["Realtor", "Housing"]), "Property", df["ORGANIZATION_TYPE"])
    df["ORGANIZATION_TYPE"] = np.where(df["ORGANIZATION_TYPE"].isin(["Hotel", "Restaurant","Services","Advertising"]), 
                                   "F&B_Hospitality_Ads", df["ORGANIZATION_TYPE"])
    df["ORGANIZATION_TYPE"] = np.where(df["ORGANIZATION_TYPE"].isin(["Bank", "Insurance"]),
                                       "Financial", df["ORGANIZATION_TYPE"])
    df["ORGANIZATION_TYPE"] = np.where(df["ORGANIZATION_TYPE"].isin(["Cleaning","Electricity", "Telecom", "Mobile", "Security"]), "Utilities", df["ORGANIZATION_TYPE"])
    df["ORGANIZATION_TYPE"] = np.where(df["ORGANIZATION_TYPE"].isin(["Medicine", "Legal Services"]), "Professional", df["ORGANIZATION_TYPE"])
    df["ORGANIZATION_TYPE"] = np.where(df["ORGANIZATION_TYPE"].isin(["Religion", "Culture"]), "Other", df["ORGANIZATION_TYPE"])

# OCCUPATION_TYPE - We do similar the similar thing for occupation
    df["OCCUPATION_TYPE"] = np.where(df["OCCUPATION_TYPE"].isin(["Low-skill Laborers", "Cooking staff", "Security staff",
                                                                 "Cleaning staff", "Waiters/barmen staff", "Laborers"]), 
                                     "Laborers", df["OCCUPATION_TYPE"])
    df["OCCUPATION_TYPE"] = np.where(df["OCCUPATION_TYPE"].isin(["IT staff", "High skill tech staff","Accountants"]), 
                                     "High_skill_staff", df["OCCUPATION_TYPE"])
    df["OCCUPATION_TYPE"] = np.where(df["OCCUPATION_TYPE"].isin(["Secretaries", "HR staff","Realty agents", 
                                                                 "Private service staff"]), 
                                     "Others", df["OCCUPATION_TYPE"])

# We group all the flags of address mismatch into 1 variable by summing them to reduce dimension
# This new variable can be interpreted as degree of mismatch, ranging from 0-6
# ADDRESS_MISMATCH
    cols = ["REG_REGION_NOT_LIVE_REGION","REG_REGION_NOT_WORK_REGION", "LIVE_REGION_NOT_WORK_REGION", 
                "REG_CITY_NOT_LIVE_CITY","REG_CITY_NOT_WORK_CITY","LIVE_CITY_NOT_WORK_CITY"]
    df["ADDRESS_MISMATCH"] = df[cols].sum(axis = 1)
    df.drop(cols, axis = 1, inplace = True)

    #### Adding new variables####

    # 1 DAYS_EMPLOYED_RATIO
    # We believe it is more reasonable to consider one's employment history relative to his/her age
    # So we add a variable to calculate the ratio.
    # The more time he/she spend in working in his/her whole life, he/she may be more responsible or capable,
    # and thus having more ability to repay
    df['DAYS_EMPLOYED_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']

    # The next two transform the credit score from external sources
    # 2 Simple average of EXT_SOURCE_1 to EXT_SOURCE_3
    df["EXTSOURCE_MEAN"] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)

    # 3 Geometric mean of EXT_SOURCE_1 to EXT_SOURCE_3, in case one client scores very high in one, but low in the other two
    df['EXTSOURCES_GM'] = pow(df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3'], 1/3)

    # The next three calculate ratios of annuity, income and credit amount. We postulate that higher income level relative to
    # loan amount should imply better ability to repay
    
    # 4 Ratio of loan annuity to the credit amount of the loan
    df['ANNUITY_CREDIT_RATIO'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

    # 5 Ratio of loan annuity to the income level of the loan
    df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']

    # 6 Ratio of income level of client to credit amount
    df['INCOME_CREDIT_RATIO'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']

    # The next two added variable consider the credit amount relative to the customer consumption
    # 7 Ratio of credit amount to value of goods purchased
    df["CREDIT_GOODS_RATIO"] = df["AMT_CREDIT"]/df["AMT_GOODS_PRICE"]
    # 8 Diff btw credit amount and value of goods purchased
    df["CREDIT_GOODS_DIFF"] = df["AMT_CREDIT"] - df["AMT_GOODS_PRICE"]

process_applications(app_train)
process_applications(app_test)

In [2]:

##########################################################################################
#### Data Cleansing/Aggregation/Feature Engineering in data set previous application #####
##########################################################################################

df = prev_app.copy()
# We note that the below five columns contains illogical inputs 
#for the days in prev_app relative to the current application
# We consider it to be missing values
df['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
df['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
df['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
df['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
df['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)

prev_app.select_dtypes('object').apply(pd.Series.nunique, axis=0)
### Regroup Goods Category to reduce # of one-hot variables to be added in the modelling step
a = ['Auto Accessories','Vehicles']
h = ['Construction Materials','Furniture','Construction Materials','Homewares','House Construction','Gardening']
e = ['Audio/Video', 'Computers', 'Consumer Electronics', 'Photo / Cinema Equipment','Mobile']
m = ['Medical Supplies','Medicine']
c = ['Jewelry', 'Clothing and Accessories']
l = ['Sport and Leisure', 'Tourism','Fitness']
o = ['Additional Service','Weapon','Office Appliances','Insurance','Direct Sales','Animals']
df["NAME_GOODS_CATEGORY"] = df["NAME_GOODS_CATEGORY"].replace(a, 'Automobile-related')
df["NAME_GOODS_CATEGORY"] = df["NAME_GOODS_CATEGORY"].replace(h, 'Home_related')
df["NAME_GOODS_CATEGORY"] = df["NAME_GOODS_CATEGORY"].replace(e, 'Electronics')
df["NAME_GOODS_CATEGORY"] = df["NAME_GOODS_CATEGORY"].replace(m, 'Medical_related')
df["NAME_GOODS_CATEGORY"] = df["NAME_GOODS_CATEGORY"].replace(c, 'Fashion&Jewelry')
df["NAME_GOODS_CATEGORY"] = df["NAME_GOODS_CATEGORY"].replace(l, 'Leisure')
df["NAME_GOODS_CATEGORY"] = df["NAME_GOODS_CATEGORY"].replace(o, 'Other')

### Regroup seller industry to reduce the number of distinct values
df['NAME_SELLER_INDUSTRY'] = df['NAME_SELLER_INDUSTRY'].replace(['Clothing','Jewelry'],'Fashion&Jewelry')
df['NAME_SELLER_INDUSTRY'] = df['NAME_SELLER_INDUSTRY'].replace(['Construction','Furniture'],'Building-related')

## Reclassify type of suite to reduce the number of distinct values
df["NAME_TYPE_SUITE"] = df["NAME_TYPE_SUITE"].replace('Unaccompanied', 'single')
m = ['Children', 'Other_B', 'Other_A','Family', 'Spouse, partner', 'Group of people']
df["NAME_TYPE_SUITE"] = df["NAME_TYPE_SUITE"].replace(m, 'multiple')

del m, a, h, e, c, l, o

# Since the same SK_ID_CURR have multiple records in previous applications
# We need to aggregate them separately for the numerical and categorical features
cat_cols = [col for col in df.columns if df[col].dtypes == "O"]
col_list = df.columns.tolist()
id_list = ["SK_ID_CURR","SK_ID_PREV"]
# Numerical cols are defined as those that are not categorical nor the ID column
num_cols = [col for col in col_list if col not in cat_cols + id_list]
df = pd.get_dummies(df, prefix=cat_cols)
# Update the cat_cols after we add dummies
cat_cols = [col for col in df.columns if col not in num_cols + id_list]
# Previous applications numeric features
num_to_agg = {}
for col in num_cols:
    num_to_agg[col] = ['min', 'max', 'mean']
        
# Previous applications categorical features
cat_to_agg = {}
for col in cat_cols:
    cat_to_agg[col] = ['mean']
        
prev_app_agg = df.groupby('SK_ID_CURR').agg({**num_to_agg, **cat_to_agg})


In [3]:
##########################################################################################
#### Data Cleansing/Aggregation/Feature Engineering in data set installment #####
##########################################################################################

df = instl_pmt.copy()

# Feature Engineering
# "DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT" represents the number of days of delayed installment payment 
df['DAYS_DELAY_PAYMENT'] = df['DAYS_ENTRY_PAYMENT']-df['DAYS_INSTALMENT']
# "AMT_INSTALMENT - AMT_PAYMENT" represents the overdue amount of each installment payment 
df['OVERDUE_AMT_PAYMENT'] = df['AMT_INSTALMENT']-df['AMT_PAYMENT']
# The variable we created above may bias to large payment amount, therefore we also add a relative measure here
df['OVERDUE_AMT_PAYMENT_RELATIVE'] = 0
df.loc[df['AMT_INSTALMENT'] != 0, 'OVERDUE_AMT_PAYMENT_RELATIVE'] = df['AMT_PAYMENT']/df['AMT_INSTALMENT']

# We only use part of the data fields which look more relevant and then group them by the ID of current loan for merging
# All those data fields are with numerical datatype
instl_cols = ['DAYS_INSTALMENT','DAYS_ENTRY_PAYMENT','AMT_INSTALMENT','AMT_PAYMENT','DAYS_DELAY_PAYMENT'
                  ,'OVERDUE_AMT_PAYMENT','OVERDUE_AMT_PAYMENT_RELATIVE']
instl_col_name_agg = {}
for col in instl_cols:
    instl_col_name_agg[col] = ['min', 'max', 'mean','sum']        
     
instl_agg = df.groupby('SK_ID_CURR').agg({**instl_col_name_agg})

In [None]:
################################################################
############ Model Fitting - Gradient Boosting ############
################################################################

train_data = app_train.copy()
test_data = app_test.copy()

# Preset the cross-validation fold to be 10
n_folds = 10

#Store loan ID & labels
labels = train_data['TARGET']
train_ids = train_data['SK_ID_CURR']
test_ids = test_data['SK_ID_CURR']

######### Inner join the applications data set with the aggregate previous application
train_data = train_data.join(prev_app_agg,how='left',on='SK_ID_CURR')
test_data = test_data.join(prev_app_agg,how='left',on='SK_ID_CURR')

######### Inner join the two datasets with the installment data
train_data = train_data.join(instl_agg,how='left',on='SK_ID_CURR')
test_data = test_data.join(instl_agg,how='left',on='SK_ID_CURR')

# Take out the ID & Target first before we align the feature sets by their common features
train_data = train_data.drop(columns = ['SK_ID_CURR', 'TARGET'])
test_data = test_data.drop(columns = ['SK_ID_CURR'])
    
# One-hot encoding for categorical variables
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

# Align the dataframes by the common columns
train_data, test_data = train_data.align(test_data, join = 'inner', axis = 1)

# Extract feature names
feature_names = list(train_data.columns)
    
# Convert train_data to matrices
features = np.array(train_data)
test_features = np.array(test_data)
n_trn = features.shape[0]
n_tst = test_features.shape[0]
    
# Create the kfold object 
k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)

# Create empty array for feature importances, test & out-of-fold predictions
feat_importance = np.zeros(len(feature_names))
test_pred = np.zeros(n_tst)
out_of_fold = np.zeros(n_trn)
# Lists for recording validation and training scores
valid_scores = []
train_scores = []

kf = k_fold.split(features)
for train_index, valid_index in kf:
    train_features, train_labels = features[train_index], labels[train_index]
    # Validation data for the fold
    valid_features, valid_labels = features[valid_index], labels[valid_index]    
    model = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary', 
                                   class_weight = 'balanced', learning_rate = 0.05, 
                                   reg_alpha = 0.1, reg_lambda = 0.1, 
                                   subsample = 0.8, n_jobs = -1, random_state = 50)
    # Train the model
    model.fit(train_features, train_labels, eval_metric = 'auc',
              eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
              eval_names = ['valid', 'train'], categorical_feature = 'auto',
              early_stopping_rounds = 50, verbose = 200)
    
    best_iteration = model.best_iteration_
    out_of_fold[valid_index] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
    
    # Store the feature importance
    feat_importance += model.feature_importances_ / n_folds
    
    # Also use the model fitted in this fold for prediction in the real test data (without target)
    test_pred += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / n_folds
    
    valid_score = model.best_score_['valid']['auc']
    train_score = model.best_score_['train']['auc']
        
    valid_scores.append(valid_score)
    train_scores.append(train_score)
    
valid_auc = roc_auc_score(labels, out_of_fold)
valid_scores.append(valid_auc)
train_scores.append(np.mean(train_scores))
    
# Table for validation scores
fold_names = list(range(n_folds))
fold_names.append('overall')
    
# Dataframe of validation scores
metrics = pd.DataFrame({'fold': fold_names,
                        'train': train_scores,
                        'valid': valid_scores})
print(metrics)




Training until validation scores don't improve for 50 rounds
[200]	train's auc: 0.822014	train's binary_logloss: 0.522323	valid's auc: 0.787875	valid's binary_logloss: 0.537871
Early stopping, best iteration is:
[324]	train's auc: 0.843292	train's binary_logloss: 0.499226	valid's auc: 0.789099	valid's binary_logloss: 0.523107
Training until validation scores don't improve for 50 rounds
[200]	train's auc: 0.822653	train's binary_logloss: 0.52112	valid's auc: 0.776544	valid's binary_logloss: 0.540029
[400]	train's auc: 0.855521	train's binary_logloss: 0.485119	valid's auc: 0.77799	valid's binary_logloss: 0.517723
Early stopping, best iteration is:
[384]	train's auc: 0.85319	train's binary_logloss: 0.487636	valid's auc: 0.778253	valid's binary_logloss: 0.519303
Training until validation scores don't improve for 50 rounds
[200]	train's auc: 0.822551	train's binary_logloss: 0.521648	valid's auc: 0.781403	valid's binary_logloss: 0.537482
Early stopping, best iteration is:
[305]	train's auc: 

In [None]:
#################################################################
######## Feature Importance Plots ###############################
#################################################################
import matplotlib.pyplot as plt
# Create a dataframe to store the feature importances & Sort the importacne in descending order
df_ft_imp = pd.DataFrame({'feature': feature_names, 'importance': feat_importance})
df_ft_imp = df_ft_imp.sort_values('importance', ascending = False).reset_index()
    
# Normalize the feature importances
df_ft_imp['importance_normalized'] = df_ft_imp['importance'] / df_ft_imp['importance'].sum()

# Plot the feature importances in horizontal bar charts
import matplotlib.pyplot as plt
plt.figure(figsize = (20, 12))
ax = plt.subplot()
ax.barh(list(reversed(list(df.index[:30]))), 
    df_ft_imp['importance_normalized'].head(30), 
    align = 'center', color = 'cyan')
    
# Set the yticks and labels
ax.set_yticks(list(reversed(list(df.index[:30]))))
ax.set_yticklabels(df_ft_imp['feature'].head(30))
    
# Plot labeling
plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
plt.show()



In [None]:
submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_pred})
#submission.head()
submission.to_csv('submission.csv', index=False)
