# Deep learning-based time-to-event analysis for credit scoring
Gabriel Blumenstock

This Jupyter Notebook can be regarded as an online companion to the following article on deep learning-based time-to-event analysis for credit scoring, published in the Journal of the Operational Research Society:

https://doi.org/10.1080/01605682.2020.1838960

With this Notebook, the reader can re-implement all experiments that were performed with the deep learning-based model "DeepHit" within the scope of this publication using Google Colab. Please note that only the "DeepHit"-experiments are provided here. All statistical and random forest benchmark experiments were performed in R and will not be published here.

The Notebook consists of four parts:
- In **Part 1**, packages are loaded and all necessary functions are defined.
- Hyperparameter tuning is implemented in **Part 2**.
- In **Part 3**, all experiments on performance evaluation are implemented.
- Finally, variable importance experiments can be found in **Part 4**.


### Part 1: Preparations

In this part, packages are imported and all necessary functions are defined. Most functions are based on the code provided by Lee et al. (2018) via https://github.com/chl8856/DeepHit. However, some functions had to be modified, and additional functions were defined, to adapt to the specific setting of interest.

In [None]:
#import packages
!pip install tensorflow==1.12.0
import numpy as np
import os
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected as FC_Net
from termcolor import colored
import time, datetime, os
from google.colab import drive
from google.colab import files
# Mount Google Drive; the notebook copies its datasets from, and saves the
# trained model to, paths below "/content/drive/My Drive/".
drive.mount('/content/drive')
# Passed as random_state to train_test_split so the 80/20 split is reproducible.
# NOTE(review): in the code visible here, `random`, `np.random` and TensorFlow
# are not seeded with this value -- confirm whether that is intended.
seed = 1234

In [None]:
#functions from github.com/chl8856/DeepHit


#UTILS_NETWORK

### FEEDFORWARD NETWORK
def create_FCNet(inputs, num_layers, h_dim, h_fn, o_dim, o_fn, w_init, keep_prob=1.0, w_reg=None):
    """Build a stack of fully connected layers and return the output tensor.

    The stack consists of ``num_layers - 1`` hidden layers of width ``h_dim``
    followed by one output layer of width ``o_dim``.  With ``num_layers == 1``
    only the output layer is created.

    Args:
        inputs: input tensor fed to the first layer.
        num_layers: total number of layers (hidden layers + output layer), >= 1.
        h_dim: number of units in each hidden layer.
        h_fn: hidden activation; ``None`` selects ``tf.nn.relu``.
        o_dim: number of units in the output layer.
        o_fn: output activation; ``None`` is passed through unchanged.
        w_init: weight initializer; ``None`` selects the Xavier initializer.
        keep_prob: dropout keep probability applied after every hidden layer;
            ``None`` disables dropout.
        w_reg: weight regularizer passed to every layer.

    Returns:
        The output tensor of the last layer.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1 (previously this case
            fell through to an unbound local variable).
    """
    if num_layers < 1:
        raise ValueError("num_layers must be >= 1, got %r" % (num_layers,))

    # default activation function for hidden layers
    if h_fn is None:
        h_fn = tf.nn.relu

    # default weight initialization
    if w_init is None:
        w_init = tf.contrib.layers.xavier_initializer()  # Xavier initialization

    # hidden layers: all layers except the last one
    h = inputs
    for _ in range(num_layers - 1):
        h = FC_Net(h, h_dim, activation_fn=h_fn, weights_initializer=w_init, weights_regularizer=w_reg)
        if keep_prob is not None:
            h = tf.nn.dropout(h, keep_prob=keep_prob)

    # output layer (no dropout)
    return FC_Net(h, o_dim, activation_fn=o_fn, weights_initializer=w_init, weights_regularizer=w_reg)


#UTILS_EVAL

### CONCORDANCE INDEX
def c_index(Prediction, Time_survival, Death, Time):
    """Compute the time-dependent concordance index at horizon ``Time``.

    A pair (i, j) is comparable when subject i experienced the event
    (``Death[i] == 1``) no later than ``Time`` and subject j has a strictly
    larger survival time.  A comparable pair is concordant when the predicted
    risk of i is strictly larger than that of j.

    Args:
        Prediction: risk scores, one per subject; shape (N,) or (N, 1).
        Time_survival: event/censoring times; shape (N,) or (N, 1).
        Death: event indicators (1 = event of interest); shape (N,) or (N, 1).
        Time: evaluation horizon.

    Returns:
        The fraction of comparable pairs that are concordant as a float, or
        ``-1`` when there is no comparable pair.
    """
    # Flatten everything: with a column vector (N, 1) np.where yields a
    # (rows, cols) pair, and using both as column indices wrongly flags
    # column 0 as comparable.
    pred = np.asarray(Prediction).ravel()
    times = np.asarray(Time_survival).ravel()
    events = np.asarray(Death).ravel()

    later = times[:, None] < times[None, :]        # A_ij: t_i < t_j
    riskier = pred[:, None] > pred[None, :]        # Q_ij: risk_i > risk_j
    had_event = (times <= Time) & (events == 1)    # N_t,i: event of i before horizon

    comparable = later & had_event[:, None]
    Den = np.sum(comparable)
    if Den == 0:
        return -1  # not able to compute c-index!

    Num = np.sum(comparable & riskier)
    return float(Num / Den)


#CLASS_DEEPHIT

def log(x):
    """Return tf.log of ``x`` shifted by 1e-8, so a zero input does not give -inf."""
    shifted = x + 1e-8
    return tf.log(shifted)

def div(x, y):
    """Return tf.div of ``x`` by ``y`` shifted by 1e-8, guarding against a zero divisor."""
    denominator = y + 1e-8
    return tf.div(x, denominator)

### DEEPHIT NETWORK
class Model_DeepHit:
    """TensorFlow 1.x graph for the DeepHit network with its three loss terms.

    The graph is built once in the constructor.  It consists of a shared
    sub-network, one cause-specific sub-network per event, and a joint softmax
    output layer over all (event, time-category) combinations.  The total loss
    is ``alpha*LOSS_1 + beta*LOSS_2 + gamma*LOSS_3`` and is minimised with Adam.

    Args (constructor):
        sess: the tf.Session used by get_cost/train/predict.
        name: variable scope under which the graph is created.
        input_dims: dict with keys 'x_dim', 'num_Event', 'num_Category'.
        network_settings: dict with keys 'h_dim_shared', 'h_dim_CS',
            'num_layers_shared', 'num_layers_CS', 'active_fn', 'initial_W'.
    """
    def __init__(self, sess, name, input_dims, network_settings):
        self.sess               = sess
        self.name               = name

        # INPUT DIMENSIONS
        self.x_dim              = input_dims['x_dim']

        self.num_Event          = input_dims['num_Event']
        self.num_Category       = input_dims['num_Category']

        # NETWORK HYPER-PARAMETERS
        self.h_dim_shared       = network_settings['h_dim_shared']
        self.h_dim_CS           = network_settings['h_dim_CS']
        self.num_layers_shared  = network_settings['num_layers_shared']
        self.num_layers_CS      = network_settings['num_layers_CS']

        self.active_fn          = network_settings['active_fn']
        self.initial_W          = network_settings['initial_W']
        # L2 regulariser for the sub-networks, L1 regulariser for the output layer.
        # NOTE(review): the regularisation losses these create are not added to
        # LOSS_TOTAL in this class -- confirm whether that is intended.
        self.reg_W              = tf.contrib.layers.l2_regularizer(scale=1.0)
        self.reg_W_out          = tf.contrib.layers.l1_regularizer(scale=1.0)

        self._build_net()

    def _build_net(self):
        """Create placeholders, the network, the three losses and the Adam solver."""
        with tf.variable_scope(self.name):
            #### PLACEHOLDER DECLARATION
            # mb_size is fed by get_cost/train/predict but is not referenced by
            # any graph operation in this class.
            self.mb_size     = tf.placeholder(tf.int32, [], name='batch_size')
            self.lr_rate     = tf.placeholder(tf.float32, [], name='learning_rate')
            self.keep_prob   = tf.placeholder(tf.float32, [], name='keep_probability')   #keeping rate
            # weights of the three loss terms
            self.a           = tf.placeholder(tf.float32, [], name='alpha')
            self.b           = tf.placeholder(tf.float32, [], name='beta')
            self.c           = tf.placeholder(tf.float32, [], name='gamma')

            self.x           = tf.placeholder(tf.float32, shape=[None, self.x_dim], name='inputs')
            self.k           = tf.placeholder(tf.float32, shape=[None, 1], name='labels')     #event/censoring label (censoring:0)
            self.t           = tf.placeholder(tf.float32, shape=[None, 1], name='timetoevents')

            self.fc_mask1    = tf.placeholder(tf.float32, shape=[None, self.num_Event, self.num_Category], name='mask1')  #for Loss 1
            self.fc_mask2    = tf.placeholder(tf.float32, shape=[None, self.num_Category], name='mask2')  #for Loss 2 / Loss 3

            ##### SHARED SUBNETWORK w/ FCNETS
            shared_out = create_FCNet(self.x, self.num_layers_shared, self.h_dim_shared, self.active_fn, self.h_dim_shared, self.active_fn, self.initial_W, self.keep_prob, self.reg_W)
            last_x = self.x  #for residual connection

            # the raw inputs are concatenated with the shared representation
            h = tf.concat([last_x, shared_out], axis=1)

            #(num_layers_CS) layers for cause-specific (num_Event subNets)
            out = []
            for _ in range(self.num_Event):
                cs_out = create_FCNet(h, (self.num_layers_CS), self.h_dim_CS, self.active_fn, self.h_dim_CS, self.active_fn, self.initial_W, self.keep_prob, self.reg_W)
                out.append(cs_out)
            out = tf.stack(out, axis=1) # stack referenced on subject
            out = tf.reshape(out, [-1, self.num_Event*self.h_dim_CS])
            out = tf.nn.dropout(out, keep_prob=self.keep_prob)

            # one softmax over all num_Event * num_Category outputs, reshaped to
            # [batch, num_Event, num_Category]
            out = FC_Net(out, self.num_Event * self.num_Category, activation_fn=tf.nn.softmax, 
                         weights_initializer=self.initial_W, weights_regularizer=self.reg_W_out, scope="Output")
            self.out = tf.reshape(out, [-1, self.num_Event, self.num_Category])

            ##### GET LOSS FUNCTIONS
            self.loss_Log_Likelihood()      #get loss1: Log-Likelihood loss
            self.loss_Ranking()             #get loss2: Ranking loss
            self.loss_Calibration()         #get loss3: Calibration loss

            self.LOSS_TOTAL = self.a*self.LOSS_1 + self.b*self.LOSS_2 + self.c*self.LOSS_3
            self.solver = tf.train.AdamOptimizer(learning_rate=self.lr_rate).minimize(self.LOSS_TOTAL)

    ### LOSS-FUNCTION 1 -- Log-likelihood loss
    def loss_Log_Likelihood(self):
        """Set self.LOSS_1, the negative mean log-likelihood of the masked output."""
        I_1 = tf.sign(self.k)  # 1 for an event (label > 0), 0 for censoring (label 0)

        # Both terms use the same masked sum of the output; which entries are
        # summed depends on the contents of fc_mask1 (built by f_get_fc_mask2
        # in this file).

        #for uncensored: log P(T=t,K=k|x)
        tmp1 = tf.reduce_sum(tf.reduce_sum(self.fc_mask1 * self.out, reduction_indices=2), reduction_indices=1, keep_dims=True)
        tmp1 = I_1 * log(tmp1)

        #for censored: log \sum P(T>t|x)
        tmp2 = tf.reduce_sum(tf.reduce_sum(self.fc_mask1 * self.out, reduction_indices=2), reduction_indices=1, keep_dims=True)
        tmp2 = (1. - I_1) * log(tmp2)

        self.LOSS_1 = - tf.reduce_mean(tmp1 + 1.0*tmp2)

    ### LOSS-FUNCTION 2 -- Ranking loss
    def loss_Ranking(self):
        """Set self.LOSS_2, the pairwise ranking loss summed over the batch."""
        sigma1 = tf.constant(0.1, dtype=tf.float32)  # scale inside the exponential

        eta = []
        for e in range(self.num_Event):
            one_vector = tf.ones_like(self.t, dtype=tf.float32)
            I_2 = tf.cast(tf.equal(self.k, e+1), dtype = tf.float32) #indicator for event
            I_2 = tf.diag(tf.squeeze(I_2))
            tmp_e = tf.reshape(tf.slice(self.out, [0, e, 0], [-1, 1, -1]), [-1, self.num_Category]) #event specific joint prob.

            R = tf.matmul(tmp_e, tf.transpose(self.fc_mask2)) #no need to divide by each individual denominator
            # r_{ij} = risk of i-th pat based on j-th time-condition (last meas. time ~ event time) , i.e. r_i(T_{j})

            diag_R = tf.reshape(tf.diag_part(R), [-1, 1])
            R = tf.matmul(one_vector, tf.transpose(diag_R)) - R # R_{ij} = r_{j}(T_{j}) - r_{i}(T_{j})
            R = tf.transpose(R)                                 # Now, R_{ij} (i-th row j-th column) = r_{i}(T_{i}) - r_{j}(T_{i})

            T = tf.nn.relu(tf.sign(tf.matmul(one_vector, tf.transpose(self.t)) - tf.matmul(self.t, tf.transpose(one_vector))))
            # T_{ij}=1 if t_i < t_j  and T_{ij}=0 if t_i >= t_j

            T = tf.matmul(I_2, T) # only remains T_{ij}=1 when event occurred for subject i

            tmp_eta = tf.reduce_mean(T * tf.exp(-R/sigma1), reduction_indices=1, keep_dims=True)

            eta.append(tmp_eta)
        eta = tf.stack(eta, axis=1) #stack referenced on subjects
        eta = tf.reduce_mean(tf.reshape(eta, [-1, self.num_Event]), reduction_indices=1, keep_dims=True)

        self.LOSS_2 = tf.reduce_sum(eta) #mean over events (line above), then sum over the batch

    ### LOSS-FUNCTION 3 -- Calibration Loss
    def loss_Calibration(self):
        """Set self.LOSS_3, the squared difference between masked risk and event indicator."""
        eta = []
        for e in range(self.num_Event):
            one_vector = tf.ones_like(self.t, dtype=tf.float32)
            I_2 = tf.cast(tf.equal(self.k, e+1), dtype = tf.float32) #indicator for event
            tmp_e = tf.reshape(tf.slice(self.out, [0, e, 0], [-1, 1, -1]), [-1, self.num_Category]) #event specific joint prob.

            # NOTE(review): the sum runs over axis 0 (the batch axis), so r has
            # length num_Category and is broadcast against I_2 of shape
            # [batch, 1] -- confirm this is intended rather than a per-subject
            # sum over axis 1.
            r = tf.reduce_sum(tmp_e * self.fc_mask2, axis=0) #no need to divide by each individual denominator
            tmp_eta = tf.reduce_mean((r - I_2)**2, reduction_indices=1, keep_dims=True)

            eta.append(tmp_eta)
        eta = tf.stack(eta, axis=1) #stack referenced on subjects
        eta = tf.reduce_mean(tf.reshape(eta, [-1, self.num_Event]), reduction_indices=1, keep_dims=True)

        self.LOSS_3 = tf.reduce_sum(eta) #mean over events (line above), then sum over the batch
    
    def get_cost(self, DATA, MASK, PARAMETERS, keep_prob, lr_train):
        """Evaluate LOSS_TOTAL on one batch without updating the weights.

        DATA is (x, k, t), MASK is (mask1, mask2), PARAMETERS is
        (alpha, beta, gamma).  Returns the value of LOSS_TOTAL.
        """
        (x_mb, k_mb, t_mb) = DATA
        (m1_mb, m2_mb) = MASK
        (alpha, beta, gamma) = PARAMETERS
        return self.sess.run(self.LOSS_TOTAL, 
                             feed_dict={self.x:x_mb, self.k:k_mb, self.t:t_mb, self.fc_mask1: m1_mb, self.fc_mask2:m2_mb, 
                                        self.a:alpha, self.b:beta, self.c:gamma, 
                                        self.mb_size: np.shape(x_mb)[0], self.keep_prob:keep_prob, self.lr_rate:lr_train})

    def train(self, DATA, MASK, PARAMETERS, keep_prob, lr_train):
        """Run one Adam step on a batch.

        Takes the same arguments as get_cost.  Returns the list
        [solver result, LOSS_TOTAL value] produced by sess.run.
        """
        (x_mb, k_mb, t_mb) = DATA
        (m1_mb, m2_mb) = MASK
        (alpha, beta, gamma) = PARAMETERS
        return self.sess.run([self.solver, self.LOSS_TOTAL], 
                             feed_dict={self.x:x_mb, self.k:k_mb, self.t:t_mb, self.fc_mask1: m1_mb, self.fc_mask2:m2_mb, 
                                        self.a:alpha, self.b:beta, self.c:gamma, 
                                        self.mb_size: np.shape(x_mb)[0], self.keep_prob:keep_prob, self.lr_rate:lr_train})

    def predict(self, x_test, keep_prob=1.0):
        """Return the network output for x_test, shaped [n, num_Event, num_Category].

        keep_prob defaults to 1.0, i.e. dropout is disabled for prediction.
        """
        return self.sess.run(self.out, feed_dict={self.x: x_test, self.mb_size: np.shape(x_test)[0], self.keep_prob: keep_prob})

def f_get_minibatch(mb_size, x, label, time, mask1, mask2):
    """Sample ``mb_size`` distinct rows and return the matching float32 slices.

    Returns the tuple (x_mb, k_mb, t_mb, m1_mb, m2_mb), where k_mb holds the
    censoring(0)/event(1,2,..) labels and m1_mb/m2_mb are the two loss masks.
    """
    n_rows = np.shape(x)[0]
    picked = random.sample(range(n_rows), mb_size)  # row indices, no repeats

    features = x[picked, :]
    events = label[picked, :]
    durations = time[picked, :]
    first_mask = mask1[picked, :, :]
    second_mask = mask2[picked, :]

    return (features.astype(np.float32),
            events.astype(np.float32),
            durations.astype(np.float32),
            first_mask.astype(np.float32),
            second_mask.astype(np.float32))

def get_valid_performance(DATA, MASK, in_parser, eval_time):
    """Train a DeepHit model and return its best validation c-index.

    The data are split 80/20 (``random_state=seed``).  The model is trained on
    mini-batches of the 80% part; every 1000 iterations the average
    concordance index over all events and all ``eval_time`` horizons is
    computed on the 20% part.  Training stops early once more than five
    consecutive validations fail to improve.

    Args:
        DATA: tuple (data, time, label).
        MASK: tuple (mask1, mask2); mask1 has shape
            [subjects, num_Event, num_Category].
        in_parser: dict of hyper-parameters with keys 'mb_size', 'iteration',
            'keep_prob', 'lr_train', 'alpha', 'beta', 'gamma', 'h_dim_shared',
            'num_layers_shared', 'h_dim_CS', 'num_layers_CS', 'active_fn'
            ('relu', 'elu' or 'tanh').
        eval_time: iterable of evaluation horizons.

    Returns:
        The highest average validation c-index observed, or -99 if no
        validation step was reached (fewer than 1000 iterations).
    """
    ##### DATA & MASK
    (data, time, label) = DATA
    (mask1, mask2) = MASK

    x_dim = np.shape(data)[1]
    _, num_Event, num_Category = np.shape(mask1)  # dim of mask1: [subj, Num_Event, Num_Category]

    ACTIVATION_FN = {'relu': tf.nn.relu, 'elu': tf.nn.elu, 'tanh': tf.nn.tanh}

    ##### HYPER-PARAMETERS
    mb_size = in_parser['mb_size']
    iteration = in_parser['iteration']
    keep_prob = in_parser['keep_prob']
    lr_train = in_parser['lr_train']
    # weights of the log-likelihood, ranking and calibration losses
    loss_weights = (in_parser['alpha'], in_parser['beta'], in_parser['gamma'])

    initial_W = tf.contrib.layers.xavier_initializer()

    ##### MAKE DICTIONARIES
    input_dims = {'x_dim': x_dim,
                  'num_Event': num_Event,
                  'num_Category': num_Category}
    network_settings = {'h_dim_shared': in_parser['h_dim_shared'],
                        'num_layers_shared': in_parser['num_layers_shared'],
                        'h_dim_CS': in_parser['h_dim_CS'],
                        'num_layers_CS': in_parser['num_layers_CS'],
                        'active_fn': ACTIVATION_FN[in_parser['active_fn']],
                        'initial_W': initial_W}

    ##### CREATE DEEPHIT NETWORK
    tf.reset_default_graph()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # The session is closed in the finally-clause so that repeated calls (one
    # per random-search iteration) do not accumulate open sessions.
    try:
        model = Model_DeepHit(sess, "DeepHit", input_dims, network_settings)
        sess.run(tf.global_variables_initializer())

        ### TRAINING-VALIDATION SPLIT
        (tr_data, va_data, tr_time, va_time, tr_label, va_label,
         tr_mask1, _, tr_mask2, _) = train_test_split(data, time, label, mask1, mask2, test_size=0.20, random_state=seed)

        max_valid = -99
        stop_flag = 0

        ### TRAINING - MAIN
        print( "MAIN TRAINING ...")
        print( "EVALUATION TIMES: " + str(eval_time))

        avg_loss = 0
        for itr in range(iteration):
            if stop_flag > 5: #for faster early stopping
                break

            x_mb, k_mb, t_mb, m1_mb, m2_mb = f_get_minibatch(mb_size, tr_data, tr_label, tr_time, tr_mask1, tr_mask2)
            _, loss_curr = model.train((x_mb, k_mb, t_mb), (m1_mb, m2_mb), loss_weights, keep_prob, lr_train)
            avg_loss += loss_curr/1000

            # report and validate only every 1000 iterations
            if (itr+1)%1000 != 0:
                continue

            print('|| ITR: ' + str('%04d' % (itr + 1)) + ' | Loss: ' + colored(str('%.4f' %(avg_loss)), 'yellow' , attrs=['bold']))
            avg_loss = 0

            ### VALIDATION  (based on average C-index of our interest)
            pred = model.predict(va_data)

            va_result1 = np.zeros([num_Event, len(eval_time)])

            for t, t_time in enumerate(eval_time):
                eval_horizon = int(t_time)

                if eval_horizon >= num_Category:
                    print('ERROR: evaluation horizon is out of range')
                    # previously also assigned to an undefined `va_result2`,
                    # which raised a NameError
                    va_result1[:, t] = -1
                else:
                    risk = np.sum(pred[:,:,:(eval_horizon+1)], axis=2) #risk score until eval_time
                    for k in range(num_Event):
                        va_result1[k, t] = c_index(risk[:,k], va_time, (va_label[:,0] == k+1).astype(int), eval_horizon) #-1 for no event (not comparable)
            tmp_valid = np.mean(va_result1)

            if tmp_valid > max_valid:
                stop_flag = 0
                max_valid = tmp_valid
                print( 'updated.... average c-index = ' + str('%.4f' %(tmp_valid)))
            else:
                stop_flag += 1
    finally:
        sess.close()

    return max_valid


#IMPORT_DATA

def f_get_Normalization(X, norm_mode):
    """Normalise every column of the 2-D array ``X`` and return it.

    Args:
        X: 2-D array of shape (subjects, features).  A floating-point array is
            modified in place; any other dtype is first converted to a float64
            copy so the normalised values are not truncated on assignment.
        norm_mode: 'standard' for zero mean / unit variance, 'normal' for
            min-max scaling to [0, 1].  Any other value prints
            "INPUT MODE ERROR!" and leaves the values unchanged.

    Returns:
        The normalised array.  Constant columns are only shifted (to 0) in
        both modes, instead of being divided by zero.
    """
    X = np.asarray(X)
    if not np.issubdtype(X.dtype, np.floating):
        X = X.astype(np.float64)

    _, num_Feature = np.shape(X)

    if norm_mode == 'standard': #zero mean unit variance
        for j in range(num_Feature):
            column = X[:, j]
            centered = column - np.mean(column)
            spread = np.std(column)
            X[:, j] = centered / spread if spread != 0 else centered
    elif norm_mode == 'normal': #min-max normalization
        for j in range(num_Feature):
            column = X[:, j]
            shifted = column - np.min(column)
            value_range = np.max(column) - np.min(column)
            # a constant column has zero range; keep it at 0 rather than NaN
            X[:, j] = shifted / value_range if value_range != 0 else shifted
    else:
        print("INPUT MODE ERROR!")

    return X

### MASK FUNCTIONS
def f_get_fc_mask2(time, label, num_Event, num_Category):
    """Build the [subjects, num_Event, num_Category] mask for the first loss.

    For a subject with an event (label != 0) a single 1 is placed at
    (label - 1, time).  For a censored subject (label == 0) every event row is
    filled with 1 from index time + 1 to the end.
    """
    n_subjects = np.shape(time)[0]
    mask = np.zeros([n_subjects, num_Event, num_Category])
    for row in range(n_subjects):
        event = label[row, 0]
        if event == 0:
            # censored: mark everything after the censoring time
            first_open = int(time[row, 0] + 1)
            mask[row, :, first_open:] = 1
        else:
            # observed event: mark exactly the event/time cell
            mask[row, int(event - 1), int(time[row, 0])] = 1
    return mask

def f_get_fc_mask3(time, meas_time, num_Category):
    """Build the [subjects, num_Category] mask used by the second/third loss.

    If ``meas_time`` has a non-empty shape (longitudinal measurements), row i
    is 1 on the indices after the last measurement time up to and including
    the censoring/event time.  Otherwise (e.g. a scalar such as -1, single
    measurement) row i is 1 on indices 0 .. time inclusive.
    """
    n_subjects = np.shape(time)[0]
    mask = np.zeros([n_subjects, num_Category])
    longitudinal = bool(np.shape(meas_time))
    for row in range(n_subjects):
        # start just after the last measurement time, or at 0 if there is none
        start = int(meas_time[row, 0]) + 1 if longitudinal else 0
        stop = int(time[row, 0]) + 1   # includes the censoring/event time
        mask[row, start:stop] = 1
    return mask


#MAIN_RANDOMSEARCH (PART 1)

### SAVE AND LOAD HYPERPARAMETERS
def save_logging(dictionary, log_name):
    """Write the hyperparameters in ``dictionary`` to ``log_name``, one ``key:value`` per line."""
    with open(log_name, 'w') as handle:
        handle.writelines('%s:%s\n' % (name, setting) for name, setting in dictionary.items())

def load_logging(filename):
    """Read hyperparameters saved as ``key:value`` lines back into a dict.

    Each line containing a ':' is split at the first ':'.  The value is
    converted to ``int`` if possible, otherwise to ``float`` if possible,
    otherwise the text 'None' becomes ``None`` and anything else is kept as a
    string.  Lines without ':' are ignored.

    Args:
        filename: path of the file to read.

    Returns:
        dict mapping each key to its converted value.
    """
    def parse_value(text):
        # Try int before float so that signed integers such as "-5" keep their
        # integer type, and so that strings for which str.isdigit() is true
        # but int() fails (e.g. superscript digits) fall through instead of
        # raising.
        for convert in (int, float):
            try:
                return convert(text)
            except ValueError:
                continue
        return None if text == 'None' else text

    data = dict()
    with open(filename) as f:
        for line in f:
            if ':' not in line:
                continue  # not a key:value line
            key, value = line.strip().split(':', 1)
            data[key] = parse_value(value)
    return data


#own functions

def train_save_deephit(DATA, MASK, in_parser, eval_time, save_path="/content/drive/My Drive/deephitmodel/model"):
    """Train a DeepHit model on 80% of the data and save it as a checkpoint.

    The data are split 80/20 with ``random_state=seed``; only the 80% part is
    used.  The model is trained for exactly ``in_parser['iteration']``
    mini-batch steps (no validation, no early stopping) and then written with
    ``tf.train.Saver``.

    Args:
        DATA: tuple (data, time, label).
        MASK: tuple (mask1, mask2); mask1 has shape
            [subjects, num_Event, num_Category].
        in_parser: dict of hyper-parameters with keys 'mb_size', 'iteration',
            'keep_prob', 'lr_train', 'alpha', 'beta', 'gamma', 'h_dim_shared',
            'num_layers_shared', 'h_dim_CS', 'num_layers_CS', 'active_fn'
            ('relu', 'elu' or 'tanh').
        eval_time: not used by this function; kept so the signature matches
            get_valid_performance.
        save_path: checkpoint path prefix handed to ``saver.save``; defaults
            to the location that was previously hard-coded.

    Returns:
        None.
    """
    ##### DATA & MASK
    (data, time, label) = DATA
    (mask1, mask2) = MASK

    x_dim = np.shape(data)[1]
    _, num_Event, num_Category = np.shape(mask1)  # dim of mask1: [subj, Num_Event, Num_Category]

    ACTIVATION_FN = {'relu': tf.nn.relu, 'elu': tf.nn.elu, 'tanh': tf.nn.tanh}

    ##### HYPER-PARAMETERS
    mb_size = in_parser['mb_size']
    iteration = in_parser['iteration']
    keep_prob = in_parser['keep_prob']
    lr_train = in_parser['lr_train']
    # weights of the log-likelihood, ranking and calibration losses
    loss_weights = (in_parser['alpha'], in_parser['beta'], in_parser['gamma'])

    initial_W = tf.contrib.layers.xavier_initializer()

    ##### MAKE DICTIONARIES
    input_dims = {'x_dim': x_dim,
                  'num_Event': num_Event,
                  'num_Category': num_Category}
    network_settings = {'h_dim_shared': in_parser['h_dim_shared'],
                        'num_layers_shared': in_parser['num_layers_shared'],
                        'h_dim_CS': in_parser['h_dim_CS'],
                        'num_layers_CS': in_parser['num_layers_CS'],
                        'active_fn': ACTIVATION_FN[in_parser['active_fn']],
                        'initial_W': initial_W}

    ##### CREATE DEEPHIT NETWORK
    tf.reset_default_graph()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Close the session even if training or saving fails.
    try:
        model = Model_DeepHit(sess, "DeepHit", input_dims, network_settings)
        saver = tf.train.Saver()

        sess.run(tf.global_variables_initializer())

        ### TRAINING-TESTING SPLIT
        # train_test_split returns [train, test] pairs for every input array;
        # only the training halves (even positions) are needed here.
        split = train_test_split(data, time, label, mask1, mask2, test_size=0.20, random_state=seed)
        tr_data, tr_time, tr_label, tr_mask1, tr_mask2 = split[0::2]

        for _ in range(iteration):
            x_mb, k_mb, t_mb, m1_mb, m2_mb = f_get_minibatch(mb_size, tr_data, tr_label, tr_time, tr_mask1, tr_mask2)
            model.train((x_mb, k_mb, t_mb), (m1_mb, m2_mb), loss_weights, keep_prob, lr_train)

        saver.save(sess, save_path)
    finally:
        sess.close()

def _import_loan_dataset(csv_path, norm_mode):
    """Load one loan CSV and build the DeepHit inputs (shared by the loaders below).

    Reads ``csv_path``, takes the columns 'label' and 'time' plus the feature
    columns named in the notebook-global ``cols``, normalises the features and
    builds both loss masks.

    Returns:
        (DIM, DATA, MASK) with DIM = number of features,
        DATA = (data, time, label) and MASK = (mask1, mask2).
    """
    df = pd.read_csv(csv_path, sep=',')

    label = np.asarray(df[['label']])
    time = np.asarray(df[['time']])
    # Cast to float: f_get_Normalization writes its result back into the
    # array, which would truncate the values if all columns were integers.
    data = np.asarray(df[cols], dtype=float)
    data = f_get_Normalization(data, norm_mode)

    num_Category = int(np.max(time) * 1.2)  #to have enough time-horizon
    num_Event = int(len(np.unique(label)) - 1) #only count the number of events (do not count censoring as an event)

    x_dim = np.shape(data)[1]

    mask1 = f_get_fc_mask2(time, label, num_Event, num_Category)
    mask2 = f_get_fc_mask3(time, -1, num_Category)  # -1: no measurement times (single measurement)

    DIM = (x_dim)
    DATA = (data, time, label)
    MASK = (mask1, mask2)

    return DIM, DATA, MASK


def import_dataset_OWN(norm_mode='standard'):
    """Load 'loandata.csv' from the working directory; see _import_loan_dataset."""
    return _import_loan_dataset('loandata.csv', norm_mode)


def import_dataset_OWN_CR(norm_mode='standard'):
    """Load 'loandataCR.csv' from the working directory; see _import_loan_dataset."""
    return _import_loan_dataset('loandataCR.csv', norm_mode)


def import_dataset_HYP(norm_mode='standard'):
    """Load 'loandataHT.csv' (hyperparameter tuning data); see _import_loan_dataset."""
    return _import_loan_dataset('loandataHT.csv', norm_mode)

### Part 2: Hyperparameter tuning

In this part, hyperparameter tuning is performed for the datasets by means of a random search. For this purpose, the datasets "loandataHT.csv" and "newdataHT.csv" are loaded, and all available variables are used. The sets of candidate values for each hyperparameter of interest are then defined. The variable RS_NR defines how many times a value is randomly drawn for each hyperparameter from these sets and a DeepHit model is trained with the drawn values. The hyperparameters with the best performance (here: the average concordance index recorded at 24, 48, and 72 months after loan issuance for both events) on the test set (containing 20% of the respective dataset) are then used for all subsequent DeepHit models in this study.

In [None]:
#HYPERPARAMETER TUNING - DATASET 1

# Feature columns for dataset 1; `cols` is a notebook-global that the
# import_dataset_* functions read when selecting the features.
cols = ["int.rate", "orig.upb", "fico.score", "dti.r",  "ltv.r", "bal.repaid", "t.act.12m", "t.del.30d.12m", "t.del.60d.12m", "hpi.st.d.t.o", "hpi.zip.o", "hpi.zip.d.t.o", "ppi.c.FRMA", "TB10Y.d.t.o", "FRMA30Y.d.t.o", "ppi.o.FRMA", "equity.est", "hpi.st.log12m", "hpi.r.st.us", "hpi.r.zip.st", "st.unemp.r12m", "st.unemp.r3m", "TB10Y.r12m", "T10Y3MM", "T10Y3MM.r12m"]

# Copy the tuning dataset from Google Drive to the local file name that
# import_dataset_HYP reads ("loandataHT.csv").
!cp "/content/drive/My Drive/datasets/loandataHT.csv" "loandataHT.csv"
(x_dim), (data, time, label), (mask1, mask2) = import_dataset_HYP(norm_mode = 'standard')
# Evaluation horizons passed to get_valid_performance for the validation c-index.
EVAL_TIMES = [24, 48, 72]
DATA = (data, time, label)
MASK = (mask1, mask2)

def get_random_hyperparameters():
    """Draw one random hyperparameter configuration for the random search.

    Layer widths, layer counts, the activation function and beta are drawn
    uniformly (via np.random.randint) from fixed candidate sets; the remaining
    settings are constants.  Returns the configuration as a dict.
    """
    SET_LAYERS        = [1,2,3,5]
    SET_NODES         = [50, 100, 200, 300]
    SET_ACTIVATION_FN = ['relu', 'tanh']
    SET_BETA          = [0.1, 0.5, 1.0, 3.0, 5.0]

    def draw(candidates):
        # uniform pick of one candidate
        return candidates[np.random.randint(len(candidates))]

    # The draws are made in the order in which the keys appear in the dict.
    shared_width = draw(SET_NODES)
    cs_width = draw(SET_NODES)
    shared_depth = draw(SET_LAYERS)
    cs_depth = draw(SET_LAYERS)
    activation = draw(SET_ACTIVATION_FN)
    ranking_weight = draw(SET_BETA)

    return {'mb_size': 128,
            'iteration': 3000,
            'keep_prob': 0.6,
            'lr_train': 1e-4,
            'h_dim_shared': shared_width,
            'h_dim_CS': cs_width,
            'num_layers_shared': shared_depth,
            'num_layers_CS': cs_depth,
            'active_fn': activation,
            'alpha': 1.0,
            'beta': ranking_weight,
            'gamma': 0,
            }

# Random search: draw RS_NR random configurations, train/validate each one and
# keep the configuration with the highest validation score.
RS_NR                = 30   # number of random configurations to try
max_valid = 0.              # best score so far; a run must exceed 0 to be kept

for r_itr in range(RS_NR):
    print('Random search... itr: ' + str(r_itr))
    new_parser = get_random_hyperparameters()
    print(new_parser)

    # best average validation c-index reached while training this configuration
    tmp_max = get_valid_performance(DATA, MASK, new_parser, EVAL_TIMES)

    print('Current: ' + str(tmp_max))

    if tmp_max > max_valid:
        max_valid = tmp_max
        # NOTE: max_parser is only defined once some run scores above 0.
        max_parser = new_parser
        #save_logging(max_parser, "/content/drive/My Drive/hyp_D1.txt")

    print('Current best: ' + str(max_valid))


Random search... itr: 0
{'mb_size': 128, 'iteration': 3000, 'keep_prob': 0.6, 'lr_train': 0.0001, 'h_dim_shared': 200, 'h_dim_CS': 50, 'num_layers_shared': 3, 'num_layers_CS': 3, 'active_fn': 'relu', 'alpha': 1.0, 'beta': 5.0, 'gamma': 0}
MAIN TRAINING ...
EVALUATION TIMES: [24, 48, 72]
|| ITR: 1000 | Loss: [1m[33m122.3980[0m
updated.... average c-index = 0.9277
|| ITR: 2000 | Loss: [1m[33m92.5101[0m
updated.... average c-index = 0.9418
|| ITR: 3000 | Loss: [1m[33m79.4223[0m
updated.... average c-index = 0.9535
Current: 0.9534890118396925
Current best: 0.9534890118396925
Random search... itr: 1
{'mb_size': 128, 'iteration': 3000, 'keep_prob': 0.6, 'lr_train': 0.0001, 'h_dim_shared': 100, 'h_dim_CS': 200, 'num_layers_shared': 3, 'num_layers_CS': 2, 'active_fn': 'relu', 'alpha': 1.0, 'beta': 3.0, 'gamma': 0}
MAIN TRAINING ...
EVALUATION TIMES: [24, 48, 72]
|| ITR: 1000 | Loss: [1m[33m57.8832[0m
updated.... average c-index = 0.9264
|| ITR: 2000 | Loss: [1m[33m38.6292[0m
upd

In [None]:
#HYPERPARAMETER TUNING - Dataset 2

# Feature columns for dataset 2; `cols` is a notebook-global that the
# import_dataset_* functions read when selecting the features.
cols = ["fico.score", "dti.r", "orig.upb", "ltv.r", "int.rate", "t.act.12m", "t.del.30d.12m", "t.del.60d.12m", "bal.repaid", "hpi.st.d.t.o", "hpi.r.st.us", "FRMA30Y.d.t.o", "ppi.c.FRMA", "ppi.o.FRMA", "st.unemp.r12m", "st.unemp.r3m", "TB10Y.d.t.o", "TB10Y.r12m", "T10Y3MM", "T10Y3MM.r12m"]

# Copy the dataset-2 tuning file from Google Drive to the local file name that
# import_dataset_HYP reads ("loandataHT.csv"); this overwrites any local copy
# from the dataset-1 cell.
!cp "/content/drive/My Drive/datasets/newdataHT.csv" "loandataHT.csv"
(x_dim), (data, time, label), (mask1, mask2) = import_dataset_HYP(norm_mode = 'standard')
# Evaluation horizons passed to get_valid_performance for the validation c-index.
EVAL_TIMES = [24, 48, 72]
DATA = (data, time, label)
MASK = (mask1, mask2)

def get_random_hyperparameters():
    """Draw one random hyperparameter configuration for the random search.

    Layer widths, layer counts, the activation function and beta are drawn
    uniformly (via np.random.randint) from fixed candidate sets; the remaining
    settings are constants.  Returns the configuration as a dict.
    """
    SET_LAYERS        = [1,2,3,5]
    SET_NODES         = [50, 100, 200, 300]
    SET_ACTIVATION_FN = ['relu', 'tanh']
    SET_BETA          = [0.1, 0.5, 1.0, 3.0, 5.0]

    def draw(candidates):
        # uniform pick of one candidate
        return candidates[np.random.randint(len(candidates))]

    # The draws are made in the order in which the keys appear in the dict.
    shared_width = draw(SET_NODES)
    cs_width = draw(SET_NODES)
    shared_depth = draw(SET_LAYERS)
    cs_depth = draw(SET_LAYERS)
    activation = draw(SET_ACTIVATION_FN)
    ranking_weight = draw(SET_BETA)

    return {'mb_size': 128,
            'iteration': 3000,
            'keep_prob': 0.6,
            'lr_train': 1e-4,
            'h_dim_shared': shared_width,
            'h_dim_CS': cs_width,
            'num_layers_shared': shared_depth,
            'num_layers_CS': cs_depth,
            'active_fn': activation,
            'alpha': 1.0,
            'beta': ranking_weight,
            'gamma': 0,
            }

# Random search: draw RS_NR random configurations, train/validate each one and
# keep the configuration with the highest validation score.
RS_NR                = 30   # number of random configurations to try
max_valid = 0.              # best score so far; a run must exceed 0 to be kept

for r_itr in range(RS_NR):
    print('Random search... itr: ' + str(r_itr))
    new_parser = get_random_hyperparameters()
    print(new_parser)

    # best average validation c-index reached while training this configuration
    tmp_max = get_valid_performance(DATA, MASK, new_parser, EVAL_TIMES)

    print('Current: ' + str(tmp_max))

    if tmp_max > max_valid:
        max_valid = tmp_max
        # NOTE: max_parser is only defined once some run scores above 0.
        max_parser = new_parser
        #save_logging(max_parser, "/content/drive/My Drive/hyp_D2.txt")

    print('Current best: ' + str(max_valid))


Random search... itr: 0
{'mb_size': 128, 'iteration': 3000, 'keep_prob': 0.6, 'lr_train': 0.0001, 'h_dim_shared': 200, 'h_dim_CS': 100, 'num_layers_shared': 5, 'num_layers_CS': 2, 'active_fn': 'relu', 'alpha': 1.0, 'beta': 3.0, 'gamma': 0}
MAIN TRAINING ...
EVALUATION TIMES: [24, 48, 72]
|| ITR: 1000 | Loss: [1m[33m69.0528[0m
updated.... average c-index = 0.9179
|| ITR: 2000 | Loss: [1m[33m52.8476[0m
updated.... average c-index = 0.9361
|| ITR: 3000 | Loss: [1m[33m45.9812[0m
updated.... average c-index = 0.9366
Current: 0.9365878024789872
Current best: 0.9365878024789872
Random search... itr: 1
{'mb_size': 128, 'iteration': 3000, 'keep_prob': 0.6, 'lr_train': 0.0001, 'h_dim_shared': 100, 'h_dim_CS': 50, 'num_layers_shared': 1, 'num_layers_CS': 1, 'active_fn': 'relu', 'alpha': 1.0, 'beta': 3.0, 'gamma': 0}
MAIN TRAINING ...
EVALUATION TIMES: [24, 48, 72]
|| ITR: 1000 | Loss: [1m[33m63.7027[0m
updated.... average c-index = 0.9256
|| ITR: 2000 | Loss: [1m[33m47.7771[0m
upda

### Part 3: Analysis 1

To perform a performance experiment with DeepHit, the respective set of variables used for the experiment needs to be selected first. After that, samples are iteratively loaded. Based on 80% of each sample, a DeepHit model is trained and the performance is recorded by applying the trained model on a test set containing the remaining 20% of each sample. After that, cause-specific means, total means, and means across all sample results are calculated based on the obtained results. To switch between the experiments, one needs to follow the instructions provided in the code on how to do so.

In [None]:
# ANALYSIS 1 (Experiments 1.1/1.2/1.3/2/3/4.1/4.2/4.3)

# `cols` selects the feature columns that the import_dataset_* functions read.
# Exactly one of the `cols` assignments below should be active; the others
# stay commented out.

#for experiments 1.1, 3, and 4.1:
cols = ["int.rate", "orig.upb", "fico.score", "dti.r",  "ltv.r", "bal.repaid", "t.act.12m", "t.del.30d.12m", "t.del.60d.12m"]

#for experiments 1.2 and 2:
#cols = ["hpi.st.d.t.o", "hpi.zip.o", "hpi.zip.d.t.o", "ppi.c.FRMA", "TB10Y.d.t.o", "FRMA30Y.d.t.o", "ppi.o.FRMA", "equity.est", "hpi.st.log12m", "hpi.r.st.us", "hpi.r.zip.st", "st.unemp.r12m", "st.unemp.r3m", "TB10Y.r12m", "T10Y3MM", "T10Y3MM.r12m"]

#for experiment 1.3:
#cols = ["hpi.st.d.t.o", "ppi.c.FRMA", "TB10Y.d.t.o", "FRMA30Y.d.t.o", "ppi.o.FRMA", "hpi.r.st.us", "st.unemp.r12m", "st.unemp.r3m", "TB10Y.r12m", "T10Y3MM", "T10Y3MM.r12m"]

#for experiment 4.2:
#cols = ["hpi.st.d.t.o", "ppi.c.FRMA", "TB10Y.d.t.o", "FRMA30Y.d.t.o", "ppi.o.FRMA", "hpi.r.st.us", "st.unemp.r12m", "st.unemp.r3m", "TB10Y.r12m", "T10Y3MM", "T10Y3MM.r12m"]

#for experiment 4.3:
#cols = ["int.rate", "orig.upb", "fico.score", "dti.r",  "ltv.r", "bal.repaid", "t.act.12m", "t.del.30d.12m", "t.del.60d.12m", "hpi.st.d.t.o", "ppi.c.FRMA", "TB10Y.d.t.o", "FRMA30Y.d.t.o", "ppi.o.FRMA", "hpi.r.st.us", "st.unemp.r12m", "st.unemp.r3m", "TB10Y.r12m", "T10Y3MM", "T10Y3MM.r12m"]

# How to switch between experiments (manual edits to this cell):

#experiment 1 -> 2:
# activate #!cp and #import_dataset_own_CR
# swap "#" model predict
# swap "#" results 1/2

#experiment 1 -> 3:
# 10x "loandataX.csv" -> "newdataX.csv"
# iterations 3000 -> 1500

#experiment 1 -> 4:
# 10x "loandata" -> "newdata"
# change hyperparameters

# Results table: one column per evaluation horizon (24/48/72 months) and
# event (1 or 2); presumably one row is added per sample -- verify against the
# loop below.
ALL_RSLTS = pd.DataFrame(columns=["24mth c(1)_index", "48mth c(1)_index", "72mth c(1)_index", "24mth c(2)_index", "48mth c(2)_index", "72mth c(2)_index"])

for i in range(10):
  if i==0:
    !cp "/content/drive/My Drive/datasets/loandata1.csv" "loandata.csv" #"loandata1s.csv" for experiment 3 #"newdata1.csv" for experiment 4.1/4.2/4.3
  elif i==1:
    !cp "/content/drive/My Drive/datasets/loandata2.csv" "loandata.csv" #"loandata2s.csv" for experiment 3 #"newdata2.csv" for experiment 4.1/4.2/4.3
  elif i==2:
    !cp "/content/drive/My Drive/datasets/loandata3.csv" "loandata.csv" #"loandata3s.csv" for experiment 3 #"newdata3.csv" for experiment 4.1/4.2/4.3
  elif i==3:
    !cp "/content/drive/My Drive/datasets/loandata4.csv" "loandata.csv" #"loandata4s.csv" for experiment 3 #"newdata4.csv" for experiment 4.1/4.2/4.3
  elif i==4:
    !cp "/content/drive/My Drive/datasets/loandata5.csv" "loandata.csv" #"loandata5s.csv" for experiment 3 #"newdata5.csv" for experiment 4.1/4.2/4.3
  elif i==5:
    !cp "/content/drive/My Drive/datasets/loandata6.csv" "loandata.csv" #"loandata6s.csv" for experiment 3 #"newdata6.csv" for experiment 4.1/4.2/4.3
  elif i==6:
    !cp "/content/drive/My Drive/datasets/loandata7.csv" "loandata.csv" #"loandata7s.csv" for experiment 3 #"newdata7.csv" for experiment 4.1/4.2/4.3
  elif i==7:
    !cp "/content/drive/My Drive/datasets/loandata8.csv" "loandata.csv" #"loandata8s.csv" for experiment 3 #"newdata8.csv" for experiment 4.1/4.2/4.3
  elif i==8:
    !cp "/content/drive/My Drive/datasets/loandata9.csv" "loandata.csv" #"loandata9s.csv" for experiment 3 #"newdata9.csv" for experiment 4.1/4.2/4.3
  elif i==9:
    !cp "/content/drive/My Drive/datasets/loandata10.csv" "loandata.csv" #"loandata10s.csv" for experiment 3 #"newdata10.csv" for experiment 4.1/4.2/4.3

  (x_dim), (data, time, label), (mask1, mask2) = import_dataset_OWN(norm_mode = 'standard')
  EVAL_TIMES = [24, 48, 72]
  DATA = (data, time, label)
  MASK = (mask1, mask2)

  # TRAIN NETWORK BASED ON OPTIMAL HYPERPARAMETERS

  in_parser = {'mb_size': 128,
                   'iteration': 3000, #1500 for experiment 3 #1000 for experiment 4.2
                   'keep_prob': 0.6,
                   'lr_train': 1e-4,
                   'h_dim_shared': 300,
                   'h_dim_CS': 200, #100 for experiment 4.1, 4.2, 4.3
                   'num_layers_shared': 3, #1 for experiment 4.1, 4.2, 4.3
                   'num_layers_CS':5, #3 for experiment 4.1, 4.2, 4.3
                   'active_fn': 'relu',
                   'alpha':1.0,
                   'beta':5.0, #1.0 for experiment 4.1, 4.2, 4.3
                   'gamma':0 }

  max_valid = 0.

  train_save_deephit(DATA, MASK, in_parser, EVAL_TIMES)

  _, num_Event, num_Category  = np.shape(mask1)

  mb_size                     = in_parser['mb_size']

  iteration                   = in_parser['iteration']

  keep_prob                   = in_parser['keep_prob']
  lr_train                    = in_parser['lr_train']

  h_dim_shared                = in_parser['h_dim_shared']
  h_dim_CS                    = in_parser['h_dim_CS']
  num_layers_shared           = in_parser['num_layers_shared']
  num_layers_CS               = in_parser['num_layers_CS']

  if in_parser['active_fn'] == 'relu':
      active_fn                = tf.nn.relu
  elif in_parser['active_fn'] == 'elu':
      active_fn                = tf.nn.elu
  elif in_parser['active_fn'] == 'tanh':
      active_fn                = tf.nn.tanh
  else:
      print('Error!')

  initial_W                   = tf.contrib.layers.xavier_initializer()

  alpha                       = in_parser['alpha']
  beta                        = in_parser['beta']
  gamma                       = in_parser['gamma']
  parameter_name              = 'a' + str('%02.0f' %(10*alpha)) + 'b' + str('%02.0f' %(10*beta)) + 'c' + str('%02.0f' %(10*gamma))

  input_dims                  = { 'x_dim'         : x_dim,
                                  'num_Event'     : num_Event,
                                  'num_Category'  : num_Category}

  network_settings            = { 'h_dim_shared'         : h_dim_shared,
                                  'h_dim_CS'          : h_dim_CS,
                                  'num_layers_shared'    : num_layers_shared,
                                  'num_layers_CS'    : num_layers_CS,
                                  'active_fn'      : active_fn,
                                  'initial_W'         : initial_W }

  tf.reset_default_graph()

  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True
  sess = tf.Session(config=config)

  model = Model_DeepHit(sess, "DeepHit", input_dims, network_settings)
  saver = tf.train.Saver()

  sess.run(tf.global_variables_initializer())

  saver.restore(sess, "/content/drive/My Drive/deephitmodel/model")

  # EVALUATE MODEL (test data being recreated)

  (tr_data,te_data, tr_time,te_time, tr_label,te_label, 
   tr_mask1,te_mask1, tr_mask2,te_mask2)  = train_test_split(data, time, label, mask1, mask2, test_size=0.2, random_state=seed)

  #activate next two lines for experiment_2 (to inject test data from after-crisis dataset)
  #!cp "/content/drive/My Drive/datasets/loandataCR.csv" "loandataCR.csv"
  #(x_dimCR), (te_dataCR, te_timeCR, te_labelCR), (te_mask1CR, te_mask2CR) = import_dataset_OWN_CR(norm_mode = 'standard') #loan_cr.csv now from AC dataset

  pred = model.predict(te_data)
  #activate next line and deactivate last line for experiment_2
  #pred = model.predict(te_dataCR)

  result = np.zeros([num_Event, len(EVAL_TIMES)])

  for t, t_time in enumerate(EVAL_TIMES):
      eval_horizon = int(t_time)

      if eval_horizon >= num_Category:
          print( 'ERROR: evaluation horizon is out of range')
          result[:, t] -1
      else:
          risk = np.sum(pred[:,:,:(eval_horizon+1)], axis=2) #risk score until EVAL_TIMES
          for k in range(num_Event):
              result[k, t] = c_index(risk[:,k], te_time, (te_label[:,0] == k+1).astype(float), eval_horizon)
              #activate the next line and deactivate the previous line for experiment_2
              #result[k, t] = c_index(risk[:,k], te_timeCR, (te_labelCR[:,0] == k+1).astype(float), eval_horizon)

  df1 = pd.DataFrame(result)

  RSLTS = pd.DataFrame(np.zeros((1, 6)))
  RSLTS = RSLTS.rename(columns={0: "24mth c(1)_index", 1: "48mth c(1)_index", 2: "72mth c(1)_index", 3: "24mth c(2)_index", 4: "48mth c(2)_index", 5: "72mth c(2)_index"})
  RSLTS.iat[0,0] = df1.iat[0,0]
  RSLTS.iat[0,1] = df1.iat[0,1]
  RSLTS.iat[0,2] = df1.iat[0,2]
  RSLTS.iat[0,3] = df1.iat[1,0]
  RSLTS.iat[0,4] = df1.iat[1,1]
  RSLTS.iat[0,5] = df1.iat[1,2]

  ALL_RSLTS = pd.concat([ALL_RSLTS, RSLTS])

ALL_RSLTS["c_mean"] = ALL_RSLTS.mean(axis=1)
tmp1 = ALL_RSLTS.iloc[:,0:3].copy()
tmp1["c(1)_mean"] = tmp1.mean(axis=1)
tmp2 = ALL_RSLTS.iloc[:,3:6].copy()
tmp2["c(2)_mean"] = tmp2.mean(axis=1)
ALL_RSLTS = pd.concat([ALL_RSLTS.iloc[:,0:6], tmp1.iloc[:,3], tmp2.iloc[:,3], ALL_RSLTS.iloc[:,6]], axis=1)
ALL_RSLTS = ALL_RSLTS.append(ALL_RSLTS.mean(axis=0), ignore_index=True)
ALL_RSLTS = ALL_RSLTS.rename(index={0: "1", 1: "2", 2: "3", 3: "4", 4: "5", 5: "6", 6: "7", 7: "8", 8: "9", 9: "10", 10: "col_mean"})
ALL_RSLTS = ALL_RSLTS.multiply(100)
ALL_RSLTS = ALL_RSLTS.round(2)

INFO:tensorflow:Restoring parameters from /content/drive/My Drive/deephitmodel/model
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/deephitmodel/model
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/deephitmodel/model
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/deephitmodel/model
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/deephitmodel/model
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/deephitmodel/model
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/deephitmodel/model
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/deephitmodel/model
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/deephitmodel/model
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/deephitmodel/model


In [None]:
### DOWNLOAD RESULTS
# Write the ALL_RSLTS table to a CSV file and pass that file to files.download.
csv_name = "experiment_1_1_DHT.csv"
ALL_RSLTS.to_csv(csv_name)
files.download(csv_name)

### Part 4: Analysis 2

To perform the experiments of Analysis 2, a DeepHit model is first trained on 80% of the "loandataVI.csv"/"newdataVI.csv" dataset, and performance is evaluated on the remaining 20%. After that, each variable in the test set is noised up in turn (one at a time, with the previously noised-up variables restored to their original values), and performance is measured again. Based on the performance differences, variable importance rankings are calculated. The entire procedure can then be repeated for different standard deviations of the noise term, which is set by the "ND_STD" variable at the beginning of the cell. Because the DeepHit model does not need to be retrained when only the standard deviation of the noise term changes, repeated training can be avoided by setting the variable "tyn", also found at the beginning of the cell, to "0".

To perform experiments A to F, one needs to select the respective variables, the respective dataset, and the respective parser, as documented in the code.

In [None]:
# ANALYSIS 2 (Experiments A/B/C/D/E/F)
#
# Variable-importance experiment: optionally call train_save_deephit (tyn = 1),
# restore the model stored under "/content/drive/My Drive/deephitmodel/model",
# evaluate it on a 20% hold-out split, then re-evaluate once per input
# variable with Gaussian noise added to that single column of the test set.
# The output stacks the c-indices, their differences to the noise-free row,
# and the per-column ranks of those differences.

import shutil  # cell-local import; replaces the "!cp" shell call

tyn = 1 #train (1) for the first variance setting, after that do not train (0)
ND_STD = 10 #set the standard deviation of the random normal noise

#for experiments A and D:
cols = ["int.rate", "orig.upb", "fico.score", "dti.r",  "ltv.r", "bal.repaid", "t.act.12m", "t.del.30d.12m", "t.del.60d.12m"]

#for experiment B:
#cols = ["hpi.st.d.t.o", "hpi.zip.o", "hpi.zip.d.t.o", "ppi.c.FRMA", "TB10Y.d.t.o", "FRMA30Y.d.t.o", "ppi.o.FRMA", "equity.est", "hpi.st.log12m", "hpi.r.st.us", "hpi.r.zip.st", "st.unemp.r12m", "st.unemp.r3m", "TB10Y.r12m", "T10Y3MM", "T10Y3MM.r12m"]

#for experiment C:
#cols = ["int.rate", "orig.upb", "fico.score", "dti.r",  "ltv.r", "bal.repaid", "t.act.12m", "t.del.30d.12m", "t.del.60d.12m", "hpi.st.d.t.o", "hpi.zip.o", "hpi.zip.d.t.o", "ppi.c.FRMA", "TB10Y.d.t.o", "FRMA30Y.d.t.o", "ppi.o.FRMA", "equity.est", "hpi.st.log12m", "hpi.r.st.us", "hpi.r.zip.st", "st.unemp.r12m", "st.unemp.r3m", "TB10Y.r12m", "T10Y3MM", "T10Y3MM.r12m"]

#for experiment E and reimplementation of experiment B:
#cols = ["hpi.st.d.t.o", "ppi.c.FRMA", "TB10Y.d.t.o", "FRMA30Y.d.t.o", "ppi.o.FRMA", "hpi.r.st.us", "st.unemp.r12m", "st.unemp.r3m", "TB10Y.r12m", "T10Y3MM", "T10Y3MM.r12m"]

#for experiment F and reimplementation of experiment C:
#cols = ["int.rate", "orig.upb", "fico.score", "dti.r",  "ltv.r", "bal.repaid", "t.act.12m", "t.del.30d.12m", "t.del.60d.12m", "hpi.st.d.t.o", "ppi.c.FRMA", "TB10Y.d.t.o", "FRMA30Y.d.t.o", "ppi.o.FRMA", "hpi.r.st.us", "st.unemp.r12m", "st.unemp.r3m", "TB10Y.r12m", "T10Y3MM", "T10Y3MM.r12m"]

# Unlike "!cp", shutil.copy raises if the dataset is missing, so a stale
# "loandata.csv" is never used by accident.
#for experiments A, B, C:
shutil.copy("/content/drive/My Drive/datasets/loandataVI.csv", "loandata.csv")

#for experiments D, E, F:
#shutil.copy("/content/drive/My Drive/datasets/newdataVI.csv", "loandata.csv")

# NOTE: "time" rebinds the notebook-level name of the imported time module.
(x_dim), (data, time, label), (mask1, mask2) = import_dataset_OWN(norm_mode = 'standard')
EVAL_TIMES = [24, 48, 72]
DATA = (data, time, label)
MASK = (mask1, mask2)

RESULT_COLUMNS = ["24mth c(1)_index", "48mth c(1)_index", "72mth c(1)_index", "24mth c(2)_index", "48mth c(2)_index", "72mth c(2)_index"]

# Supported values of in_parser['active_fn'].
ACTIVATION_FNS = {'relu': tf.nn.relu, 'elu': tf.nn.elu, 'tanh': tf.nn.tanh}

ALL_RSLTS = pd.DataFrame(columns=RESULT_COLUMNS)

# OPTIMAL HYPERPARAMETERS
# Defined regardless of tyn so that the network can also be rebuilt and
# restored in a fresh kernel when training is skipped (tyn = 0).

#for experiments A, B, C:
in_parser = {'mb_size': 128,
             'iteration': 3000,
             'keep_prob': 0.6,
             'lr_train': 1e-4,
             'h_dim_shared': 300,
             'h_dim_CS': 200,
             'num_layers_shared': 3,
             'num_layers_CS':5,
             'active_fn': 'relu',
             'alpha':1.0,
             'beta':5.0,
             'gamma':0 }

#for experiments D, E, F:
#in_parser = {'mb_size': 128,
#             'iteration': 3000, #1000 for experiment E
#             'keep_prob': 0.6,
#             'lr_train': 1e-4,
#             'h_dim_shared': 300,
#             'h_dim_CS': 100,
#             'num_layers_shared': 1,
#             'num_layers_CS':3,
#             'active_fn': 'relu',
#             'alpha':1.0,
#             'beta':1.0,
#             'gamma':0 }

# Not read in this cell; kept as a notebook global in case other code relies on it.
max_valid = 0.

# TRAIN MODEL (only for the first noise setting)
if tyn == 1:
    train_save_deephit(DATA, MASK, in_parser, EVAL_TIMES)

_, num_Event, num_Category  = np.shape(mask1)

# Hyperparameters unpacked into notebook globals as in the original cell;
# this cell itself only reads the four network-shape values below.
mb_size                     = in_parser['mb_size']
iteration                   = in_parser['iteration']
keep_prob                   = in_parser['keep_prob']
lr_train                    = in_parser['lr_train']

h_dim_shared                = in_parser['h_dim_shared']
h_dim_CS                    = in_parser['h_dim_CS']
num_layers_shared           = in_parser['num_layers_shared']
num_layers_CS               = in_parser['num_layers_CS']

# Fail loudly on an unsupported activation instead of printing 'Error!'
# and continuing with an undefined (or stale) active_fn.
if in_parser['active_fn'] not in ACTIVATION_FNS:
    raise ValueError("Unsupported active_fn: " + repr(in_parser['active_fn']))
active_fn                   = ACTIVATION_FNS[in_parser['active_fn']]

initial_W                   = tf.contrib.layers.xavier_initializer()

alpha                       = in_parser['alpha']
beta                        = in_parser['beta']
gamma                       = in_parser['gamma']
parameter_name              = 'a' + str('%02.0f' %(10*alpha)) + 'b' + str('%02.0f' %(10*beta)) + 'c' + str('%02.0f' %(10*gamma))

input_dims                  = { 'x_dim'         : x_dim,
                                'num_Event'     : num_Event,
                                'num_Category'  : num_Category}

network_settings            = { 'h_dim_shared'      : h_dim_shared,
                                'h_dim_CS'          : h_dim_CS,
                                'num_layers_shared' : num_layers_shared,
                                'num_layers_CS'     : num_layers_CS,
                                'active_fn'         : active_fn,
                                'initial_W'         : initial_W }

# Rebuild the network in a fresh graph and load the weights saved on Drive.
tf.reset_default_graph()

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

model = Model_DeepHit(sess, "DeepHit", input_dims, network_settings)
saver = tf.train.Saver()

sess.run(tf.global_variables_initializer())

saver.restore(sess, "/content/drive/My Drive/deephitmodel/model")

# EVALUATE MODEL (test data being recreated)
# The split is seeded, so it is computed once instead of once per variable.
(tr_data,te_data, tr_time,te_time, tr_label,te_label,
 tr_mask1,te_mask1, tr_mask2,te_mask2)  = train_test_split(data, time, label, mask1, mask2, test_size=0.2, random_state=seed)  #for experiment 3_1: test_size=0.01336362

for i in range(x_dim+1):
    # Start every pass from the clean test set so only one column carries noise.
    te_eval = te_data.copy()

    if i != 0:
        # i == 0 is the noise-free baseline; pass i >= 1 perturbs column i-1.
        # The noise comes from NumPy's global RNG, which this cell does not seed.
        noise_col = i-1
        te_eval[:,noise_col] = te_eval[:,noise_col]+np.random.normal(0, ND_STD, te_eval.shape[0])

    pred = model.predict(te_eval)

    # result[k, t]: c-index of event k+1 at EVAL_TIMES[t]
    result = np.zeros([num_Event, len(EVAL_TIMES)])

    for t, t_time in enumerate(EVAL_TIMES):
        eval_horizon = int(t_time)

        if eval_horizon >= num_Category:
            print( 'ERROR: evaluation horizon is out of range')
            # Mark the column as invalid (the original "result[:, t] -1" was a no-op).
            result[:, t] = -1
        else:
            risk = np.sum(pred[:,:,:(eval_horizon+1)], axis=2) #risk score until EVAL_TIMES
            for k in range(num_Event):
                result[k, t] = c_index(risk[:,k], te_time, (te_label[:,0] == k+1).astype(float), eval_horizon)

    # One row per pass: event 1 at 24/48/72 months, then event 2 (row-major flatten).
    RSLTS = pd.DataFrame(result[:2, :].reshape(1, -1), columns=RESULT_COLUMNS)

    ALL_RSLTS = pd.concat([ALL_RSLTS, RSLTS])

# Row 0 is the baseline, row j+1 belongs to variable cols[j].
ALL_RSLTS = ALL_RSLTS.reset_index(drop=True)

# Per-row means: over all six c-indices, and separately per event.
c_mean = ALL_RSLTS.mean(axis=1)
ALL_RSLTS["c(1)_mean"] = ALL_RSLTS.iloc[:, 0:3].mean(axis=1)
ALL_RSLTS["c(2)_mean"] = ALL_RSLTS.iloc[:, 3:6].mean(axis=1)
ALL_RSLTS["c_mean"] = c_mean

row_labels = {0: "No Noise"}
row_labels.update({j + 1: cols[j] for j in range(x_dim)})
ALL_RSLTS = ALL_RSLTS.rename(index=row_labels)

ALL_RSLTS = ALL_RSLTS.multiply(100)

# diff().cumsum() yields each row minus the "No Noise" row (NaN for the baseline itself).
ALL_RSLTSX = ALL_RSLTS.copy()
ALL_RSLTSX = ALL_RSLTSX.diff().cumsum()

# Rank the differences within every column; the NaN baseline row stays NaN.
ALL_RSLTSXX = ALL_RSLTSX.copy()
for rank_col in RESULT_COLUMNS + ['c(1)_mean', 'c(2)_mean', 'c_mean']:
    ALL_RSLTSXX[rank_col] = ALL_RSLTSXX[rank_col].rank(method='min', na_option='keep')

ALL_RSLTS_ALL = pd.concat([ALL_RSLTS,ALL_RSLTSX, ALL_RSLTSXX])
ALL_RSLTS_ALL = ALL_RSLTS_ALL.round(2)

INFO:tensorflow:Restoring parameters from /content/drive/My Drive/deephitmodel/model


In [None]:
### DOWNLOAD RESULTS
# Write the ALL_RSLTS_ALL table to a CSV file and pass that file to files.download.
csv_name = "experiment_A_10_DHT.csv"
ALL_RSLTS_ALL.to_csv(csv_name)
files.download(csv_name)