In [3]:
# -*- coding: utf-8 -*-
"""
Created on Sat Mar  2 16:22:34 2019

@author: vatsal
"""

from sklearn.ensemble import RandomForestClassifier
#other needed classes

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

import pandas as pd
import numpy as np
import math
from imblearn.under_sampling import RandomUnderSampler

In [6]:

df = pd.read_excel("CreditData_RareEvent.xlsx")


In [7]:

# Function for calculating loss and confusion matrix
def binary_loss(y, y_predict, fp_cost, fn_cost, display=True):
    loss = [0, 0] #False Neg Cost, False Pos Cost
    conf_mat = [0, 0, 0, 0] #tn, fp, fn, tp
    for j in range(len(y)):
        if y[j]==0:
            if y_predict[j]==0:
                conf_mat[0] += 1 #True Negative
            else:
                conf_mat[1] += 1 #False Positive
                loss[1] += fp_cost[j]
        else:
            if y_predict[j]==1:
                conf_mat[3] += 1 #True Positive
            else:
                conf_mat[2] += 1 #False Negative
                loss[0] += fn_cost[j]
    if display:
        fn_loss = loss[0]
        fp_loss = loss[1]
        total_loss = fn_loss + fp_loss
        misc = conf_mat[1] + conf_mat[2]
        misc = misc/len(y)
        print("{:.<23s}{:10.4f}".format("Misclassification Rate", misc))
        print("{:.<23s}{:10.0f}".format("False Negative Loss", fn_loss))
        print("{:.<23s}{:10.0f}".format("False Positive Loss", fp_loss))
        print("{:.<23s}{:10.0f}".format("Total Loss", total_loss))
    return loss, conf_mat


Getting the loss using the randomforest algorithm without performing the under sampling of the data.

In [8]:

# Attribute Map for CreditData_RareEvent.xlsx, N=10,500
df = pd.get_dummies(df, columns=["checking", "history", "coapp", "savings", "employed", "installp", "marital", "resident", "property", "other", "housing", "existcr", "job"])
df.shape
df['good_bad'] = df['good_bad'].map({'bad': 0, 'good': 1})
y=np.asarray(df['good_bad'])
X = np.asarray(df.drop(columns='good_bad'))

# Setup false positive and false negative costs for each transaction
fp_cost = np.array(0.1*df['amount'])
fn_cost = np.array(df['amount'])

depth=[]
Recall=[]
Accuracy=[]
Precision=[]
F1=[]
best_depth = 0
max_f = 0
for i in [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]:
    depth = i
    decmodel = RandomForestClassifier(max_depth=i)
    recall = cross_val_score(estimator=decmodel, X=X, y=y, cv=10 ,scoring='recall')
    precision = cross_val_score(estimator=decmodel, X=X, y=y, cv=10 ,scoring='precision')
    f1=cross_val_score(estimator=decmodel, X=X, y=y, cv=10 ,scoring='f1')
    accuracy=cross_val_score(estimator=decmodel, X=X, y=y, cv=10 ,scoring='accuracy')
    decmodel_10 = cross_val_score(decmodel, X, y, scoring='f1', cv=10)
    mean = decmodel_10.mean()
    if mean > max_f:
        max_f = mean
        best_depth = i
        best_lgr = decmodel
answers=pd.DataFrame({'Depth':depth,'Recall':Recall,'Accuracy':Accuracy,'Precision':Precision,'F1':F1})    

print("\nDecision Tree Model using Entire Dataset and depth = ",best_depth)
clf_model = best_lgr.fit(X,y)

Y_pred=clf_model.predict(X)
cm=confusion_matrix(y,Y_pred)
loss,conf_mat = binary_loss(y,best_lgr.predict(X),\
fp_cost,fn_cost)


Decision Tree Model using Entire Dataset and depth =  5
Misclassification Rate.    0.0457
False Negative Loss....         0
False Positive Loss....    190757
Total Loss.............    190757


As it can be seen above we got false positive loss as zero since all the observations were classified as negative as the availability of positive/defaulters of credit card are very less.

Applying the undersampling with different ratios and tuning the hyperparameters to check out best model producing the minimum loss.

In [9]:

#Setup 20 random number seeds for use in creating random samples
np.random.seed(12345)
max_seed = 2**16 - 1
rand_val = np.random.randint(low=1, high=max_seed, size=10)
# Ratios of Majority:Minority Events
ratio = [ '50:50', '60:40', '70:30', '75:25', '80:20', '85:15' ]
# Dictionaries contains number of minority and majority
# events in each ratio sample where n_majority = ratio x n_minority
rus_ratio = ({0:500, 1:500}, {0:500, 1:750}, {0:500, 1:1167}, \
{0:500, 1:1500},{0:500, 1:2000}, {0:500, 1:2834})

# Best model is one that minimizes the loss
c_list = [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
min_loss = 1e64
best_ratio = 0
for k in range(len(rus_ratio)):
    print("\nRandom Forest Model using " + ratio[k] + " RUS")
    best_c = 0
    min_loss_c = 1e64
    for j in range(len(c_list)):
        c = c_list[j]
        fn_loss = np.zeros(len(rand_val))
        fp_loss = np.zeros(len(rand_val))
        misc = np.zeros(len(rand_val))
        for i in range(len(rand_val)):
            rus = RandomUnderSampler(ratio=rus_ratio[k], \
                                     random_state=rand_val[i], \
                                     return_indices=False, \
                                     replacement=False)
            X_rus, y_rus = rus.fit_sample(X, y)
            
            
            decmodel = RandomForestClassifier(max_depth=c)
            decmodel.fit(X_rus, y_rus)
            loss, conf_mat = binary_loss(y, decmodel.predict(X), \
                                                   fp_cost, fn_cost, display=False)
            fn_loss[i] = loss[0]
            fp_loss[i] = loss[1]
            misc[i] = (conf_mat[1] + conf_mat[2])/y.shape[0]
        avg_misc = np.average(misc)
        t_loss = fp_loss+fn_loss
        avg_loss = np.average(t_loss)
        if avg_loss < min_loss_c:
            min_loss_c = avg_loss
            se_loss_c = np.std(t_loss)/math.sqrt(len(rand_val))
            best_c = c
            misc_c = avg_misc
            fn_avg_loss = np.average(fn_loss)
            fp_avg_loss = np.average(fp_loss)
    if min_loss_c < min_loss:
        min_loss = min_loss_c
        se_loss = se_loss_c
        best_ratio = k
        best_reg = best_c
    print("{:.<23s}{:12.2E}".format("Best depth", best_c))
    print("{:.<23s}{:12.4f}".format("Misclassification Rate",misc_c))
    print("{:.<23s} ${:10,.0f}".format("False Negative Loss",fn_avg_loss))
    print("{:.<23s} ${:10,.0f}".format("False Positive Loss",fp_avg_loss))
    print("{:.<23s} ${:10,.0f}{:5s}${:<,.0f}".format("Total Loss", \
             min_loss_c, " +/- ", se_loss_c))
print("")
print("{:.<23s}{:>12s}".format("Best RUS Ratio", ratio[best_ratio]))
print("{:.<23s}{:12.2E}".format("Best Depth", best_reg))
print("{:.<23s} ${:10,.0f}{:5s}${:<,.0f}".format("Lowest Loss", \
min_loss, " +/-", se_loss))


Random Forest Model using 50:50 RUS
Best depth.............    1.20E+01
Misclassification Rate.      0.1294
False Negative Loss.... $ 4,832,932
False Positive Loss.... $     1,124
Total Loss............. $ 4,834,056 +/- $235,374

Random Forest Model using 60:40 RUS
Best depth.............    2.00E+00
Misclassification Rate.      0.0770
False Negative Loss.... $ 2,008,080
False Positive Loss.... $   146,141
Total Loss............. $ 2,154,221 +/- $363,074

Random Forest Model using 70:30 RUS
Best depth.............    2.00E+00
Misclassification Rate.      0.0482
False Negative Loss.... $   260,179
False Positive Loss.... $   188,044
Total Loss............. $   448,224 +/- $79,965

Random Forest Model using 75:25 RUS
Best depth.............    2.00E+00
Misclassification Rate.      0.0474
False Negative Loss.... $         0
False Positive Loss.... $   207,213
Total Loss............. $   207,213 +/- $1,144

Random Forest Model using 80:20 RUS
Best depth.............    4.00E+00
Misclassif

In [10]:

# Ensemble Modeling - Averaging Classification Probabilities
n_obs = len(y)
n_rand = 100
predicted_prob = np.zeros((n_obs,n_rand))
avg_prob = np.zeros(n_obs)
# Setup 100 random number seeds for use in creating random samples
np.random.seed(12345)
max_seed = 2**16 - 1
rand_value = np.random.randint(1, high=max_seed, size=n_rand)
# Model 100 random samples, each with a 85:15 ratio
for i in range(len(rand_value)):
    rus = RandomUnderSampler(ratio=rus_ratio[best_ratio], \
                             random_state=rand_value[i], return_indices=False, \
                             replacement=False)
    X_rus, y_rus = rus.fit_sample(X, y)
    decmodel = RandomForestClassifier(max_depth=c)
    decmodel.fit(X_rus, y_rus)
    predicted_prob[0:n_obs, i] = decmodel.predict_proba(X)[0:n_obs, 0]
for i in range(n_obs):
    avg_prob[i] = np.mean(predicted_prob[i,0:n_rand])
# Set y_pred equal to the predicted classification
y_pred = avg_prob[0:n_obs] < 0.5
y_pred.astype(np.int)
# Calculate loss from using the ensemble predictions
print("\nEnsemble Estimates based on averaging",len(rand_value), "Models")
loss, conf_mat = binary_loss(y, y_pred, fp_cost, fn_cost)



Ensemble Estimates based on averaging 100 Models
Misclassification Rate.    0.0000
False Negative Loss....         0
False Positive Loss....         0
Total Loss.............         0


Finally developing the ensembled model for the selected under sampling ratio leads us with 0 miss-classifications.