# **1) Import all packages**

In [None]:
import os
import numpy as np 
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import keras
import math
from tensorflow.python.client import device_lib
from sklearn.model_selection import train_test_split 
from numpy.random import seed
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras import backend as K
from keras.models import Sequential
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Model
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, roc_auc_score,roc_curve
# Seed TensorFlow's and NumPy's global random generators.
# The scikit-learn calls in this notebook pass their own explicit random_state.
tf.random.set_seed(42)
seed(42)

# **2) Get Dataset**

In [None]:
# Raw application table.
# NOTE(review): `app_train_df` is not referenced by any later cell visible in
# this notebook - confirm it is still needed before keeping this load.
app_train_df = pd.read_csv('/content/drive/My Drive/Tilburg University/DS&S/Thesis/Google Collab/Home Credit Default Dataset/Data/Data Raw/application_train.csv')
# Feature table the rest of the notebook works on (presumably the processed,
# merged columns, judging by the file name - verify against the upstream notebook).
dataset = pd.read_csv('/content/drive/My Drive/Tilburg University/DS&S/Thesis/Google Collab/Home Credit Default Dataset/Data/Data Raw/1.processed_columns_merged.csv')

In [None]:
# Preview the first rows and the overall shape of the feature table.
# (The shape was previously printed twice; once is enough.)
print(dataset.head())
print(dataset.shape)

   AMT_ANNUITY  AMT_CREDIT  ...  SK_ID_CURR_CNT_POS_CASH  SK_ID_CURR_CNT_INSTALL
0      24700.5    406597.5  ...                     19.0                    19.0
1      35698.5   1293502.5  ...                     28.0                    25.0
2       6750.0    135000.0  ...                      4.0                     3.0
3      29686.5    312682.5  ...                     21.0                    16.0
4      21865.5    513000.0  ...                     66.0                    66.0

[5 rows x 416 columns]
(356255, 416)
(356255, 416)


In [None]:
# Set the identifier column aside and remove it from the feature table.
meta_cols = ['SK_ID_CURR']
meta_df = dataset.loc[:, meta_cols]
dataset = dataset.drop(columns=meta_cols)

# **3) Undersample Dataset**

In [None]:
# Class counts of TARGET before undersampling (displayed as the cell result).
dataset['TARGET'].value_counts()

0.0    282686
1.0     24825
Name: TARGET, dtype: int64

In [None]:
# Balance the two classes by randomly undersampling the TARGET == 0 rows.
# Rows whose TARGET is missing match neither filter below and are dropped.

# Shuffle the dataset.
shuffled_df = dataset.sample(frac=1, random_state=4)

# Keep every row of the TARGET == 1 class.
ones_df = shuffled_df.loc[shuffled_df['TARGET'] == 1]

# Draw exactly as many TARGET == 0 rows as there are TARGET == 1 rows.
# The sample size is derived from the data instead of being hard-coded.
zeros_df = shuffled_df.loc[shuffled_df['TARGET'] == 0].sample(
    n=len(ones_df), random_state=42
)

# Concatenate both classes again (all ones first, then all zeros).
normalized_df = pd.concat([ones_df, zeros_df])

# From here on `dataset` is the balanced table; TARGET is moved into `labels`.
dataset = normalized_df
labels = dataset.pop('TARGET')

In [None]:
# Class counts after undersampling (displayed as the cell result).
labels.value_counts()

0.0    24825
1.0    24825
Name: TARGET, dtype: int64

In [None]:
def process_dataframe(input_df, encoder_dict=None):
    """Label-encode every object-dtype column of ``input_df`` in place.

    Missing values in those columns are replaced by the string ``'NULL'``
    before encoding, so they receive an integer code of their own.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame to encode. It is modified in place and also returned.
    encoder_dict : dict, optional
        Mapping that receives the fitted ``LabelEncoder`` of each encoded
        column, keyed by column name. A new dict is created when omitted.

    Returns
    -------
    tuple
        ``(input_df, categorical_feats, encoder_dict)`` where
        ``categorical_feats`` is the list of encoded column names.
    """
    # Previously this dict was returned untouched (i.e. None), so the fitted
    # encoders were lost and the integer codes could not be mapped back.
    if encoder_dict is None:
        encoder_dict = {}

    # Label encode categoricals
    print('Label encoding categorical features...')
    categorical_feats = input_df.columns[input_df.dtypes == 'object']
    for feat in categorical_feats:
        encoder = LabelEncoder()
        input_df[feat] = encoder.fit_transform(input_df[feat].fillna('NULL'))
        encoder_dict[feat] = encoder
    print('Label encoding complete.')

    return input_df, categorical_feats.tolist(), encoder_dict

In [None]:
# Label-encode the object-typed columns; the call also returns their names.
dataset, categorical_feats, encoder_dict = process_dataframe(dataset)

# Categorical features that are not stored as object dtype and are therefore
# not picked up by process_dataframe.
non_obj_categoricals = [
    'FONDKAPREMONT_MODE',
    'HOUR_APPR_PROCESS_START',
    'HOUSETYPE_MODE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'NAME_INCOME_TYPE',
    'NAME_TYPE_SUITE',
    'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE',
    'STATUS',
    'NAME_CONTRACT_STATUS_CAVG',
    'WALLSMATERIAL_MODE',
    'WEEKDAY_APPR_PROCESS_START',
    'NAME_CONTRACT_TYPE_BAVG',
    'WEEKDAY_APPR_PROCESS_START_BAVG',
    'NAME_CASH_LOAN_PURPOSE',
    'NAME_CONTRACT_STATUS',
    'NAME_PAYMENT_TYPE',
    'CODE_REJECT_REASON',
    'NAME_TYPE_SUITE_BAVG',
    'NAME_CLIENT_TYPE',
    'NAME_GOODS_CATEGORY',
    'NAME_PORTFOLIO',
    'NAME_PRODUCT_TYPE',
    'CHANNEL_TYPE',
    'NAME_SELLER_INDUSTRY',
    'NAME_YIELD_GROUP',
    'PRODUCT_COMBINATION',
    'NAME_CONTRACT_STATUS_CCAVG',
]

# Full categorical list: encoded object columns followed by the listed ones.
categorical_feats = [*categorical_feats, *non_obj_categoricals]

Label encoding categorical features...
Label encoding complete.


In [None]:
# Share of missing values per column (only columns that have any).
null_counts = dataset.isnull().sum()
null_counts = null_counts[null_counts > 0]
null_ratios = null_counts / len(dataset)

# Drop columns that are more than 80% null.
null_thresh = .8
null_cols = null_ratios[null_ratios > null_thresh].index
dataset = dataset.drop(columns=null_cols)
print('Columns dropped for being over {}% null:'.format(100*null_thresh))
for col in null_cols:
    print(col)
    # Keep the feature-name lists in sync with the frame. list.remove()
    # deletes by value; list.pop() expects an integer position and raises
    # TypeError when handed a column name. `non_obj_categoricals` is updated
    # too because a later cell looks each of its names up in dataset.columns.
    for feats in (categorical_feats, non_obj_categoricals):
        while col in feats:
            feats.remove(col)

# Fill the remaining missing values with 0.
dataset = dataset.fillna(0)

Columns dropped for being over 80.0% null:
RATE_INTEREST_PRIMARY
RATE_INTEREST_PRIVILEGED
DAYS_FIRST_DRAWING
RATE_INTEREST_PRIMARY_PRVMAX
RATE_INTEREST_PRIVILEGED_PRVMAX
DAYS_FIRST_DRAWING_PRVMAX
RATE_INTEREST_PRIMARY_PRVMIN
RATE_INTEREST_PRIVILEGED_PRVMIN
DAYS_FIRST_DRAWING_PRVMIN
AMT_DRAWINGS_ATM_CURRENT
AMT_DRAWINGS_OTHER_CURRENT
AMT_DRAWINGS_POS_CURRENT
AMT_INST_MIN_REGULARITY
AMT_PAYMENT_CURRENT
CNT_DRAWINGS_ATM_CURRENT
CNT_DRAWINGS_OTHER_CURRENT
CNT_DRAWINGS_POS_CURRENT
CNT_INSTALMENT_MATURE_CUM
AMT_DRAWINGS_ATM_CURRENT_CCMEAN
AMT_DRAWINGS_OTHER_CURRENT_CCMEAN
AMT_DRAWINGS_POS_CURRENT_CCMEAN
AMT_INST_MIN_REGULARITY_CCMEAN
AMT_PAYMENT_CURRENT_CCMEAN
CNT_DRAWINGS_ATM_CURRENT_CCMEAN
CNT_DRAWINGS_OTHER_CURRENT_CCMEAN
CNT_DRAWINGS_POS_CURRENT_CCMEAN
CNT_INSTALMENT_MATURE_CUM_CCMEAN
AMT_DRAWINGS_ATM_CURRENT_CCMAX
AMT_DRAWINGS_OTHER_CURRENT_CCMAX
AMT_DRAWINGS_POS_CURRENT_CCMAX
AMT_INST_MIN_REGULARITY_CCMAX
AMT_PAYMENT_CURRENT_CCMAX
CNT_DRAWINGS_ATM_CURRENT_CCMAX
CNT_DRAWINGS_OTHER_CURRE

In [None]:
# Spot-check one of the listed categorical columns after the null handling.
print(dataset['CHANNEL_TYPE'])

29616     5
151628    4
84479     7
3341      8
19746     7
         ..
264825    7
48257     4
204475    5
278028    8
73913     8
Name: CHANNEL_TYPE, Length: 49650, dtype: int64


In [None]:
# Build positional-index lookups for the categorical and continuous columns.
cols = dataset.columns

# Column positions of every categorical feature, and of the non-object ones.
cat_feats_idx = np.array([cols.get_loc(name) for name in categorical_feats])
int_feats_idx = [cols.get_loc(name) for name in non_obj_categoricals]
cat_feat_lookup = pd.DataFrame(
    {'feature': categorical_feats, 'column_index': cat_feats_idx}
)

# Everything that is not categorical is treated as continuous.
cont_feat_names = cols[~cols.isin(categorical_feats)]
cont_feats_idx = np.array([cols.get_loc(name) for name in cont_feat_names])
cont_feat_lookup = pd.DataFrame(
    {'feature': cont_feat_names, 'column_index': cont_feats_idx}
)
cont_feat_lookup.head()

Unnamed: 0,feature,column_index
0,AMT_ANNUITY,0
1,AMT_CREDIT,1
2,AMT_GOODS_PRICE,2
3,AMT_INCOME_TOTAL,3
4,AMT_REQ_CREDIT_BUREAU_DAY,4


In [None]:
# Standardise the continuous columns and min-max scale the categorical columns
# listed in `non_obj_categoricals`.
# NOTE(review): both scalers are fitted on the full table, before the
# train/test split further down, so test rows contribute to the fitted
# statistics - confirm this is acceptable for the reported scores.
final_col_names = dataset.columns
dataset = dataset.to_numpy()  # from here on `dataset` is a NumPy array

scaler = StandardScaler()
dataset[:, cont_feats_idx] = scaler.fit_transform(dataset[:, cont_feats_idx])

scaler_2 = MinMaxScaler(feature_range=(0, 1))
dataset[:, int_feats_idx] = scaler_2.fit_transform(dataset[:, int_feats_idx])

print(dataset.shape, dataset.dtype, sep='\n')

(49650, 368)
float64


In [None]:
# Turn the label Series into an (n_samples, 1) column vector.
labels_array = labels.to_numpy()
print(labels_array)
# reshape adds the column axis directly; np.vstack on a 1-D array unpacks
# every element as a separate argument to arrive at the same result.
labelsv2 = labels_array.reshape(-1, 1)
print(labelsv2)
# Class balance in percent (displayed as the cell result).
labels.value_counts(normalize=True) * 100

[1. 1. 1. ... 0. 0. 0.]
[[1.]
 [1.]
 [1.]
 ...
 [0.]
 [0.]
 [0.]]


0.0    50.0
1.0    50.0
Name: TARGET, dtype: float64

In [None]:
# 85/15 train/test split with a fixed seed. No `stratify` argument is passed,
# so the class proportions of the two parts are left to the random shuffle.
X_train, X_test, y_train, y_test =  train_test_split(dataset, labelsv2, train_size=0.85, test_size=0.15, random_state=42)

In [None]:
#np.save("/content/drive/My Drive/Tilburg University/DS&S/Thesis/Google Collab/Home Credit Default Dataset/Downsampling/Downsampled Data/train_downsampled.npy",X_train)
#np.save("/content/drive/My Drive/Tilburg University/DS&S/Thesis/Google Collab/Home Credit Default Dataset/Downsampling/Downsampled Data/train_labels_downsampled.npy",y_train)
#np.save("/content/drive/My Drive/Tilburg University/DS&S/Thesis/Google Collab/Home Credit Default Dataset/Downsampling/Downsampled Data/test_downsampled.npy",X_test)
#np.save("/content/drive/My Drive/Tilburg University/DS&S/Thesis/Google Collab/Home Credit Default Dataset/Downsampling/Downsampled Data/test_labels_downsampled.npy",y_test)

In [None]:
# Load the previously saved downsampled split from Drive.
# NOTE(review): this re-binds X_train / y_train / X_test / y_test, replacing
# the arrays produced by train_test_split; the np.save calls that would write
# these files are commented out, so the files must already exist.
X_train, y_train, X_test, y_test = (
    np.load(npy_path)
    for npy_path in (
        "/content/drive/My Drive/Tilburg University/DS&S/Thesis/Google Collab/Home Credit Default Dataset/Downsampling/Downsampled Data/train_downsampled.npy",
        "/content/drive/My Drive/Tilburg University/DS&S/Thesis/Google Collab/Home Credit Default Dataset/Downsampling/Downsampled Data/train_labels_downsampled.npy",
        "/content/drive/My Drive/Tilburg University/DS&S/Thesis/Google Collab/Home Credit Default Dataset/Downsampling/Downsampled Data/test_downsampled.npy",
        "/content/drive/My Drive/Tilburg University/DS&S/Thesis/Google Collab/Home Credit Default Dataset/Downsampling/Downsampled Data/test_labels_downsampled.npy",
    )
)

# **4) Baseline Model**

In [None]:
# Separate names for the data used by the logistic-regression baseline.
# These are plain references to the same arrays; no copies are made.
log_X_train, log_y_train = X_train, y_train
log_X_test, log_y_test = X_test, y_test

In [None]:
# Fit the logistic-regression baseline; the labels are flattened to 1-D.
logreg = LogisticRegression(random_state=16, max_iter=3000)
logreg.fit(log_X_train, np.ravel(log_y_train))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=3000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# ROC AUC of the baseline model on the test split.
# roc_auc_score ranks samples by a continuous score, so it is given the
# predicted probability of class 1 rather than the hard 0/1 output of
# predict(), which collapses the ROC curve to a single operating point.
# The log_* aliases are used so this cell still scores the original labels
# after y_test has been re-bound to its one-hot form further down.
logit_scores = logreg.predict_proba(log_X_test)[:, 1]
logit_roc_auc = roc_auc_score(log_y_test.ravel(), logit_scores)
print(logit_roc_auc)

0.6958740354771773


# **5) Build Neural Network**

In [None]:
# Shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep='\n')

(42202, 368)
(42202, 1)


In [None]:
# Shapes of the test features and labels, one per line.
print(X_test.shape, y_test.shape, sep='\n')

(7448, 368)
(7448, 1)


In [None]:
# Prepare labels for the neural network: one-hot encode them to match the
# 2-unit softmax output.
# The shape guard makes the cell safe to re-run: labels that are already
# one-hot (last axis wider than 1) are left untouched instead of being encoded
# a second time. num_classes is pinned so both splits always get two columns,
# even if one split happened to contain a single class.
if y_train.ndim == 1 or y_train.shape[-1] == 1:
    y_train = np.array(keras.utils.to_categorical(y_train, num_classes=2))
if y_test.ndim == 1 or y_test.shape[-1] == 1:
    y_test = np.array(keras.utils.to_categorical(y_test, num_classes=2))
print(y_train.shape)
print(y_train[:10])

(42202, 2)
[[0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]]


In [None]:
#Metrics for Neural Network
def recall_m(y_true, y_pred):
    """Recall over all tensor elements: true positives / actual positives.

    Products and labels are clipped to [0, 1] and rounded before counting, and
    the counts are summed over every element of the tensors passed in.
    ``K.epsilon()`` keeps the denominator non-zero.

    NOTE(review): with two-column one-hot labels both columns are counted as
    "positives", so the value is not the recall of class 1 alone - confirm
    this is the intended definition.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over all tensor elements: true positives / predicted positives.

    Products and predictions are clipped to [0, 1] and rounded before
    counting, and the counts are summed over every element of the tensors
    passed in. ``K.epsilon()`` keeps the denominator non-zero.

    NOTE(review): with two-column one-hot labels both columns are counted, so
    the value is not the precision of class 1 alone - confirm this is the
    intended definition.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator so the result is defined when
    both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

# NOTE(review): `val_auc` is not referenced by any later cell visible in this
# notebook; the compile call creates its own AUC metric - confirm it is needed.
val_auc = tf.keras.metrics.AUC()

In [None]:
# Function to build the model
def build_model(input_dim=368, n_classes=2):
    """Build the fully connected classifier and print its summary.

    Architecture: three 80-unit ReLU layers (dropout 0.1 followed by batch
    normalisation after the second and third), one 60-unit ReLU layer, a
    linear ``logits`` layer and a separate ``Softmax`` activation.

    Parameters
    ----------
    input_dim : int, default 368
        Number of input features.
    n_classes : int, default 2
        Number of output units of the ``logits`` layer.

    Returns
    -------
    Model
        The uncompiled Keras model; its outputs are class probabilities.
    """
    # `shape` must be a tuple; `(368)` without the trailing comma is an int.
    inputs = layers.Input(shape=(input_dim,))
    x = layers.Dense(80, activation='relu', name='fc1')(inputs)

    x = layers.Dense(80, activation='relu', name='fc2')(x)
    x = layers.Dropout(rate=0.1, name='Dropout1')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dense(80, activation='relu', name='fc3')(x)
    x = layers.Dropout(rate=0.1, name='Dropout2')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dense(60, activation='relu', name='fc4')(x)

    # The pre-softmax layer is kept as its own named layer so its output can
    # be fetched by name ("logits").
    x = layers.Dense(n_classes, name='logits')(x)
    preds = layers.Activation('softmax', name='Softmax')(x)

    model = Model(inputs=inputs, outputs=preds)
    model.summary()
    return model

# Size the input layer from the data instead of a hard-coded feature count.
model = build_model(input_dim=X_train.shape[1])

Model: "functional_40"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        [(None, 368)]             0         
_________________________________________________________________
fc1 (Dense)                  (None, 80)                29520     
_________________________________________________________________
fc2 (Dense)                  (None, 80)                6480      
_________________________________________________________________
Dropout1 (Dropout)           (None, 80)                0         
_________________________________________________________________
batch_normalization_24 (Batc (None, 80)                320       
_________________________________________________________________
fc3 (Dense)                  (None, 80)                6480      
_________________________________________________________________
Dropout2 (Dropout)           (None, 80)              

In [None]:
# File that the ModelCheckpoint callback below writes the best model to.
filepath = "/content/drive/My Drive/Tilburg University/DS&S/Thesis/Google Collab/Home Credit Default Dataset/Downsampling/Main Notebook/Models/ModelV1.hdf5"

In [None]:
# Set up ModelCheckpoint and EarlyStopping on validation AUC, then compile.
# The monitored key 'val_auc' exists because the AUC metric is named 'auc'.
model_checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_auc',
    mode='max',
    save_best_only=True,
)
es = EarlyStopping(monitor='val_auc', mode='max', patience=5, verbose=0)
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=[
        'acc',
        f1_m,
        precision_m,
        recall_m,
        tf.keras.metrics.AUC(name='auc'),
    ],
)

In [None]:
# Set the epoch count and batch size, then train the model.
# 15% of the training data is held out for validation; the checkpoint and
# early-stopping callbacks both watch the validation AUC.
# `model1` receives the value returned by fit(), not a model.
num_epoch = 100
batch_size = 64
model1 = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epoch,
    validation_split=0.15,
    callbacks=[model_checkpoint, es],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


In [None]:
# Train performance of the checkpointed model.
prediction_model = tf.keras.models.load_model(filepath, custom_objects={'f1_m':f1_m,'precision_m':precision_m, 'recall_m':recall_m, 'auc': tf.keras.metrics.AUC() })
# evaluate() returns the loss followed by the compiled metrics in order:
# acc, f1_m, precision_m, recall_m, auc.
# The F1 value is stored as `f1` so that the `f1_score` function imported
# from sklearn.metrics is not overwritten by a float.
loss, accuracy, f1, precision, recall, auc = prediction_model.evaluate(X_train, y_train, verbose=0, batch_size=64)

print('Train Accuracy:',accuracy)
print('Train Recall:',recall)
print('Train Precision:',precision)
print('Train F1:',f1)
print('Train AUC:',auc)
prediction_model.summary()

Train Accuracy: 0.6973366141319275
Train Recall: 0.6973667144775391
Train Precision: 0.6973667144775391
Train F1: 0.6973666548728943
Train AUC: 0.7662211060523987
Model: "functional_40"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        [(None, 368)]             0         
_________________________________________________________________
fc1 (Dense)                  (None, 80)                29520     
_________________________________________________________________
fc2 (Dense)                  (None, 80)                6480      
_________________________________________________________________
Dropout1 (Dropout)           (None, 80)                0         
_________________________________________________________________
batch_normalization_24 (Batc (None, 80)                320       
_________________________________________________________________
fc3 (Dense)           

In [None]:
# Test performance of the checkpointed model.
# evaluate() returns the loss followed by the compiled metrics in order:
# acc, f1_m, precision_m, recall_m, auc.
# The F1 value is stored as `f1` so that the `f1_score` function imported
# from sklearn.metrics is not overwritten by a float.
loss, accuracy, f1, precision, recall, auc = prediction_model.evaluate(X_test, y_test, verbose=0, batch_size=64)

print('Test Accuracy:',accuracy)
print('Test Recall:',recall)
print('Test Precision:',precision)
print('Test F1:',f1)
print('Test AUC:',auc)
prediction_model.summary()

Test Accuracy: 0.6797798275947571
Test Recall: 0.6797097325325012
Test Precision: 0.6797097325325012
Test F1: 0.6797096729278564
Test AUC: 0.7480786442756653
Model: "functional_40"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        [(None, 368)]             0         
_________________________________________________________________
fc1 (Dense)                  (None, 80)                29520     
_________________________________________________________________
fc2 (Dense)                  (None, 80)                6480      
_________________________________________________________________
Dropout1 (Dropout)           (None, 80)                0         
_________________________________________________________________
batch_normalization_24 (Batc (None, 80)                320       
_________________________________________________________________
fc3 (Dense)                

In [None]:
# Test performance of the best saved model.
# NOTE(review): this file is not written by any cell visible in this notebook,
# and its path uses "MyDrive" where the other paths use "My Drive" - confirm
# both resolve to the same Drive folder.
best_filepath = "/content/drive/MyDrive/Tilburg University/DS&S/Thesis/Google Collab/Home Credit Default Dataset/Downsampling/Main Notebook/Models/DownsampledV1.hdf5"
best_model = tf.keras.models.load_model(best_filepath, custom_objects={'f1_m':f1_m,'precision_m':precision_m, 'recall_m':recall_m, 'auc': tf.keras.metrics.AUC() })
# The F1 value is stored as `f1` so that the `f1_score` function imported
# from sklearn.metrics is not overwritten by a float.
loss, accuracy, f1, precision, recall, auc = best_model.evaluate(X_test, y_test, verbose=0, batch_size=64)

#print('Best Accuracy:',accuracy)
#print('Best Recall:',recall)
#print('Best Precision:',precision)
#print('Best F1:',f1)
print('Best AUC:',auc)
best_model.summary()

Best AUC: 0.7428011298179626
Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 368)]             0         
_________________________________________________________________
fc1 (Dense)                  (None, 80)                29520     
_________________________________________________________________
batch_normalization (BatchNo (None, 80)                320       
_________________________________________________________________
fc2 (Dense)                  (None, 160)               12960     
_________________________________________________________________
batch_normalization_1 (Batch (None, 160)               640       
_________________________________________________________________
fc3 (Dense)                  (None, 320)               51520     
_________________________________________________________________
Dropout1 (Dropout)       

# **6) Generate Soft Targets**

In [None]:
# Load the best model again; its logits are used as soft targets below.
# NOTE(review): this re-binds `model`, which until now held the network built
# in this notebook. Unlike the earlier load_model calls, no 'auc' entry is
# passed in custom_objects - confirm the load works without it.
model = tf.keras.models.load_model(best_filepath, custom_objects={'f1_m':f1_m,'precision_m':precision_m, 'recall_m':recall_m}) 

In [None]:
# Compile the loaded model again with the same optimizer, loss and metrics.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=[
        'acc',
        f1_m,
        precision_m,
        recall_m,
        tf.keras.metrics.AUC(),
    ],
)
model.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 368)]             0         
_________________________________________________________________
fc1 (Dense)                  (None, 80)                29520     
_________________________________________________________________
batch_normalization (BatchNo (None, 80)                320       
_________________________________________________________________
fc2 (Dense)                  (None, 160)               12960     
_________________________________________________________________
batch_normalization_1 (Batch (None, 160)               640       
_________________________________________________________________
fc3 (Dense)                  (None, 320)               51520     
_________________________________________________________________
Dropout1 (Dropout)           (None, 320)              

In [None]:
# Build a sub-model that ends at the "logits" layer (i.e. before the softmax)
# and run it on the training features.
logits_layer = model.get_layer("logits")
model_sans_softmax = Model(inputs=model.input, outputs=logits_layer.output)
model_logits = model_sans_softmax.predict(X_train)

In [None]:
# The soft targets are the teacher's pre-softmax outputs on the training set.
# This is a second name for the same array, not a copy.
soft_targets = model_logits

In [None]:
#Save if needed for later use
#np.save("/content/drive/My Drive/Tilburg University/DS&S/Thesis/Google Collab/Home Credit Default Dataset/Data/Soft Targets/HomeCredit_logits.npy", model_logits)

# **7) Vanilla Decision Tree**

In [None]:
# Vanilla decision tree with a maximum depth of 10, fitted on the hard labels.
# NOTE(review): this re-binds `model` (previously the Keras teacher). If the
# one-hot cell above has run, y_train has two columns here - confirm that
# fitting the tree on two output columns is intended.
model = DecisionTreeClassifier(max_depth=10,random_state=16)
model.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Predict the test split with the vanilla decision tree.
res_pred = model.predict(X_test)

In [None]:
# Score the vanilla tree on the test split (macro-averaged where applicable).
# NOTE(review): the AUC is computed from the hard predictions in `res_pred`,
# not from class probabilities - confirm this is the intended definition.
acc = accuracy_score(y_test, res_pred)
rec = recall_score(y_test, res_pred, average='macro')
prec = precision_score(y_test, res_pred, average='macro')
auc = roc_auc_score(y_test, res_pred, average='macro')
for label, value in (
    ('Accuracy:', acc),
    ('Recall: ', rec),
    ('Precision:', prec),
    ('AUC: ', auc),
):
    print(label, value)

Accuracy: 0.635875402792696
Recall:  0.6359033920009529
Precision: 0.6365391689782252
AUC:  0.6363064384974265


# **8) Knowledge Distillation Decision Tree**


In [None]:
# Student tree for knowledge distillation, maximum depth 10. A regressor is
# used because it is fitted on the continuous soft targets, not class labels.
# Note that this re-binds `model` once more.
model = DecisionTreeRegressor(max_depth=10,random_state=16)

In [None]:
# Fit the student tree to reproduce the teacher's logits on the training set.
model.fit(X_train,soft_targets)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=10,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [None]:
# Predicted logits of the student tree on the test split.
res_pred = model.predict(X_test)

In [None]:
# Turn each predicted row into a one-hot vector marking its largest entry
# (a hard class decision; no softmax is computed here).
winning_class = res_pred.argmax(axis=1)
pred_argmax = np.eye(res_pred.shape[1], dtype=res_pred.dtype)[winning_class]

In [None]:
# Score the student tree on the test split (macro-averaged where applicable).
# NOTE(review): the AUC is computed from the one-hot arg-max predictions, not
# from the predicted logits - confirm this is the intended definition.
acc = accuracy_score(y_test, pred_argmax)
rec = recall_score(y_test, pred_argmax, average='macro')
prec = precision_score(y_test, pred_argmax, average='macro')
auc = roc_auc_score(y_test, pred_argmax, average='macro')
for label, value in (
    ('Accuracy:', acc),
    ('Recall: ', rec),
    ('Precision:', prec),
    ('AUC: ', auc),
):
    print(label, value)

Accuracy: 0.6544038668098818
Recall:  0.6544511021567656
Precision: 0.6548326649414907
AUC:  0.6544511021567656
