## Analyze Lending Club's issued loans with Decision Trees

In [1]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import tree
from sklearn import metrics




In [2]:
# low_memory=False makes pandas parse the file in one pass so every column gets
# a single consistent dtype instead of chunk-by-chunk guesses.
# NOTE(review): the leftover cell output looks like the tail of a pandas
# mixed-dtype warning raised by this read — confirm against a fresh run.
data = pd.read_csv(os.path.join("", "loan_sub.csv"), sep=',', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Show the column names of the loaded frame.
data.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'is_inc_v', 'issue_d',
       'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title',
       'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'not_compliant', 'status', 'inactive_loans', 'bad_loans',
       'emp_length_num', 'grade_num', 'sub_gra

In [4]:
# Recode the label and drop the source column in one chain:
#   bad_loans == 0  ->  safe_loans = +1 (safe)
#   anything else   ->  safe_loans = -1 (risky)
data = (
    data
    .assign(safe_loans=data['bad_loans'].eq(0).map({True: 1, False: -1}))
    .drop('bad_loans', axis=1)
)

Safe loans vs. bad loans percentage

In [5]:
# normalize=True reports each label's share of the rows instead of raw counts.
data['safe_loans'].value_counts(normalize=True)

 1    0.811185
-1    0.188815
Name: safe_loans, dtype: float64

Select features

In [6]:
target = 'safe_loans'
cols = ['grade', 'term', 'home_ownership', 'emp_length']

# Keep only the chosen predictors followed by the label column.
data = data.loc[:, cols + [target]]
data.head()

Unnamed: 0,grade,term,home_ownership,emp_length,safe_loans
0,B,36 months,RENT,10+ years,1
1,C,60 months,RENT,< 1 year,-1
2,C,36 months,RENT,10+ years,1
3,C,36 months,RENT,10+ years,1
4,A,36 months,RENT,3 years,1


In [7]:
# Raw number of rows per label (+1 safe, -1 risky).
data['safe_loans'].value_counts()


 1    99457
-1    23150
Name: safe_loans, dtype: int64

In [9]:

# Balance the classes: keep every risky loan and draw a random subset of the
# safe loans whose size is set by the risky/safe ratio.
is_safe = data[target] == 1
is_risky = data[target] == -1
safe_ones = data.loc[is_safe]
bad_ones = data.loc[is_risky]
percentage = float(len(bad_ones)) / len(safe_ones)

# The fixed random_state makes the drawn subset repeatable.
safe_loans = safe_ones.sample(frac=percentage, random_state=33)
risky_loans = bad_ones

# Stack the two groups row-wise into the balanced working set.
data_set = pd.concat([risky_loans, safe_loans])

Now, let's verify that safe and risky loans each make up nearly 50% of the resulting data set.

In [10]:
# Share of each label in the balanced set built from risky_loans and safe_loans.
data_set[target].value_counts(normalize=True)

-1    0.5
 1    0.5
Name: safe_loans, dtype: float64

Preprocessing features

In [11]:
def dummies(data, columns=['pclass','name_title','embarked', 'sex']):
    """Replace categorical columns with one indicator column per category.

    Every column named in ``columns`` is converted to strings (so a missing
    value becomes the category ``'nan'``), expanded with ``pd.get_dummies``
    and removed from the result. The new columns are named
    ``<column>_<category>`` and appended in the order in which each category
    first appears in the data.

    Args:
        data: DataFrame that contains every column listed in ``columns``.
        columns: Names of the columns to encode. The default list is only
            read, never modified.
            NOTE(review): the default names do not appear among the loan
            columns used in this notebook — confirm the default is wanted.

    Returns:
        A DataFrame with the listed columns replaced by indicator columns.
        The frame passed in as ``data`` is not modified.
    """
    encoded = data
    for col in columns:
        # Build the string version as a detached Series rather than assigning
        # it back, so the caller's frame keeps its original column values.
        labels = encoded[col].map(str)
        ordered_names = [col + '_' + level for level in labels.unique()]
        indicators = pd.get_dummies(labels, prefix=col)[ordered_names]
        encoded = pd.concat([encoded.drop(col, axis=1), indicators], axis=1)
    return encoded

In [12]:
#grade, home_ownership, target
cols = ['grade', 'term','home_ownership', 'emp_length']
data_set = dummies(data_set, columns=cols)
data_set.head()

Unnamed: 0,safe_loans,grade_C,grade_F,grade_B,grade_D,grade_A,grade_E,grade_G,term_ 60 months,term_ 36 months,...,emp_length_3 years,emp_length_10+ years,emp_length_1 year,emp_length_9 years,emp_length_2 years,emp_length_8 years,emp_length_7 years,emp_length_5 years,emp_length_nan,emp_length_6 years
1,-1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,-1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,-1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10,-1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
12,-1,0,0,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0


Split the data into training and testing sets

In [13]:
# Hold out 20% of the rows for testing; random_state is fixed so the same
# split is produced on every run.
train_data, test_data = train_test_split(data_set, test_size=0.2, random_state=33)

## Writing my own Decision Tree code

In [14]:
def count_errors(labels_in_node):
    """Count the mistakes a majority-label prediction makes on one node.

    Args:
        labels_in_node: Labels of the rows in the node, encoded as +1 / -1.
            May be a pandas Series, a NumPy array or a plain sequence.

    Returns:
        The size of the smaller of the two classes as an ``int``; 0 when the
        node is empty.
    """
    # np.asarray gives one code path for Series, arrays and lists, and lets
    # the comparisons run vectorised instead of calling a lambda per element.
    labels = np.asarray(labels_in_node)
    if labels.size == 0:
        return 0

    positive_ones = np.count_nonzero(labels == 1)
    negative_ones = np.count_nonzero(labels == -1)

    return min(positive_ones, negative_ones)


def entropy(y):
    """Binary entropy (in bits) of a label vector.

    Labels equal to 1 form one class; every other value is treated as the
    second class. A node holding a single class, or no rows at all, has
    entropy 0.
    """
    total = len(y)
    positives = (y == 1).sum()

    # Pure (or empty) node: skip the log terms, which are undefined at p = 0.
    if positives in (0, total):
        return 0

    frac_pos = float(positives) / total
    frac_rest = 1 - frac_pos
    return -frac_rest * np.log2(frac_rest) - frac_pos * np.log2(frac_pos)

In [15]:
def best_split(data, features, target):
    """Pick the 0/1 feature whose split gives the lowest classification error.

    For each candidate the rows are divided into those where the feature is 0
    and those where it is 1; each side is scored by its minority-class count
    and the sum is divided by the number of rows in ``data``.

    Returns:
        The name of the first feature with the lowest error, or ``None`` when
        ``features`` is empty.
    """
    total_rows = float(len(data))
    # Any real error rate is at most 1, so 2.0 is always beaten by the first candidate.
    chosen, lowest_error = None, 2.0

    for candidate in features:
        column = data[candidate]
        misses_when_zero = count_errors(data[column == 0][target])
        misses_when_one = count_errors(data[column == 1][target])
        error_rate = (misses_when_zero + misses_when_one) * 1.0 / total_rows

        # Strict comparison: ties keep the feature seen first.
        if error_rate < lowest_error:
            chosen, lowest_error = candidate, error_rate

    return chosen


def best_split_entropy(data, features, target):
    """Pick the 0/1 feature whose split yields the largest information gain.

    Information gain is the entropy of ``data[target]`` minus the
    size-weighted entropy of the two sides (feature == 0 and feature == 1).

    Returns:
        The name of the first feature with the highest gain, or ``None`` when
        ``features`` is empty.
    """
    total_rows = float(len(data))
    parent_entropy = entropy(data[target])
    chosen, highest_gain = None, float('-inf')

    for candidate in features:
        column = data[candidate]
        zeros = data[column == 0]
        ones = data[column == 1]

        # Each side's entropy is weighted by the share of rows it receives.
        weighted_entropy = (len(zeros) / total_rows * entropy(zeros[target])
                            + len(ones) / total_rows * entropy(ones[target]))
        gain = parent_entropy - weighted_entropy

        # Strict comparison: ties keep the feature seen first.
        if gain > highest_gain:
            chosen, highest_gain = candidate, gain

    return chosen
    

In [16]:
class TreeNode:
    """One node of a binary decision tree.

    Attributes:
        is_leaf: Whether the node is a leaf.
        prediction: Label stored on the node.
        split_feature: Name of the feature the node splits on.
        left, right: Child nodes; both start as ``None`` and are attached by
            the caller after construction.
    """

    def __init__(self, is_leaf, prediction, split_feature):
        self.left = self.right = None
        self.split_feature = split_feature
        self.prediction = prediction
        self.is_leaf = is_leaf
        

In [17]:
def create_leaf(target_values):
    """Build a leaf node that predicts the majority label of ``target_values``.

    The leaf predicts +1 only when +1 labels strictly outnumber -1 labels;
    ties, including an empty node, predict -1.
    """
    positives = (target_values == +1).sum()
    negatives = (target_values == -1).sum()
    majority = 1 if positives > negatives else -1
    return TreeNode(True, majority, None)

In [18]:
def create_tree(data, features, target, current_depth = 0, max_depth = 10, min_error=0):
    """Recursively grow a binary decision tree on 0/1 features.

    Args:
        data: DataFrame holding the feature columns and the ``target`` column.
        features: Candidate feature names; must support slicing, ``len`` and
            ``.drop(name)`` (the notebook passes a pandas Index).
        target: Name of the label column (+1 / -1).
        current_depth: Depth of the node being built; the root is 0.
        max_depth: Nodes at this depth become leaves.
        min_error: A node whose minority-class count is at most this value
            becomes a leaf.

    Returns:
        The TreeNode at the root of the (sub)tree. Progress messages are
        printed while the tree is grown.
    """
    candidates = features[:]
    labels = data[target]

    # Stop 1: the node is already (nearly) pure.
    if count_errors(labels) <= min_error:
        print("Termination 1 reached.")
        return create_leaf(labels)

    # Stop 2: no feature is left to split on.
    if len(candidates) == 0:
        print("Termination 2 reached.")
        return create_leaf(labels)

    # Stop 3: the depth limit has been hit.
    if current_depth >= max_depth:
        print("Termination 3 reached.")
        return create_leaf(labels)

    # Choose the split by information gain.
    split_feature = best_split_entropy(data, features, target)

    feature_values = data[split_feature]
    zeros = data[feature_values == 0]
    ones = data[feature_values == 1]

    # The chosen feature is not offered to the children again.
    candidates = candidates.drop(split_feature)
    print("Split on feature %s. (%s, %s)" % (split_feature, len(zeros), len(ones)))

    # When one side received every row the feature did not separate anything,
    # so the node is closed as a leaf (checked for the 0-side first).
    for side in (zeros, ones):
        if len(side) == len(data):
            print("Perfect split!")
            return create_leaf(side[target])

    # Grow the 0-branch first, then the 1-branch.
    node = TreeNode(False, None, split_feature)
    node.left = create_tree(zeros, candidates, target, current_depth + 1, max_depth, min_error)
    node.right = create_tree(ones, candidates, target, current_depth + 1, max_depth, min_error)
    return node

In [20]:
def predict(tree, x, annotate = False):
    """Return the label the tree assigns to a single row.

    Args:
        tree: Root node; nodes expose ``is_leaf``, ``prediction``,
            ``split_feature``, ``left`` and ``right``.
        x: Row indexable by feature name (``x[feature]``).
        annotate: When true, print every split taken and the final leaf.

    Returns:
        The ``prediction`` stored on the leaf that is reached.
    """
    node = tree
    while not node.is_leaf:
        split_feature_value = x[node.split_feature]
        if annotate:
            print("Split on %s = %s" % (node.split_feature, split_feature_value))
        # A value of 0 follows the left branch; any other value goes right.
        node = node.left if split_feature_value == 0 else node.right

    if annotate:
        print("leaf node, predicting %s" % node.prediction)
    return node.prediction

In [21]:
def evaluate_accuracy(tree, data, target='safe_loans'):
    """Fraction of rows in ``data`` that ``tree`` classifies correctly.

    Args:
        tree: Root node of a tree accepted by ``predict``.
        data: DataFrame with the feature columns and the label column.
        target: Name of the label column. Defaults to ``'safe_loans'``, the
            column this function previously had hard-coded, so existing
            two-argument calls behave as before.

    Returns:
        Number of matching predictions divided by the number of rows.
    """
    # Rows are classified one at a time; each row is handed to predict() whole.
    prediction = data.apply(lambda row: predict(tree, row), axis=1)

    accuracy = (prediction == data[target]).sum() * 1.0 / len(data)
    return accuracy

In [22]:

# Every column of the training frame except the label is a candidate feature.
features = train_data.columns.drop(target)

In [23]:
# Grow a tree on the training rows with a depth limit of 10 (min_error keeps
# its default of 0); create_tree prints one line per split or stop.
my_decesion_tree = create_tree(train_data, features, target, current_depth = 0, max_depth = 10)

Split on feature grade_A. (31776, 5264)
Split on feature grade_B. (21587, 10189)
Split on feature grade_C. (12308, 9279)
Split on feature grade_D. (5553, 6755)
Split on feature term_ 60 months. (1743, 3810)
Split on feature grade_E. (459, 1284)
Split on feature emp_length_10+ years. (358, 101)
Split on feature emp_length_6 years. (328, 30)
Split on feature home_ownership_OTHER. (325, 3)
Split on feature emp_length_4 years. (297, 28)
Termination 3 reached.
Termination 3 reached.
Termination 1 reached.
Split on feature home_ownership_MORTGAGE. (23, 7)
Split on feature home_ownership_OWN. (21, 2)
Termination 3 reached.
Termination 3 reached.
Termination 1 reached.
Split on feature grade_F. (25, 76)
Split on feature home_ownership_OWN. (21, 4)
Split on feature home_ownership_RENT. (13, 8)
Termination 3 reached.
Termination 3 reached.
Split on feature grade_G. (0, 4)
Perfect split!
Split on feature home_ownership_RENT. (47, 29)
Split on feature home_ownership_OWN. (39, 8)
Termination 3 reac

Split on feature home_ownership_MORTGAGE. (4700, 4039)
Split on feature emp_length_2 years. (4091, 609)
Split on feature emp_length_6 years. (3795, 296)
Split on feature emp_length_4 years. (3405, 390)
Split on feature home_ownership_OTHER. (3392, 13)
Split on feature emp_length_8 years. (3207, 185)
Termination 3 reached.
Termination 3 reached.
Split on feature emp_length_10+ years. (7, 6)
Termination 3 reached.
Termination 1 reached.
Split on feature home_ownership_RENT. (42, 348)
Split on feature grade_C. (42, 0)
Perfect split!
Split on feature grade_C. (348, 0)
Perfect split!
Split on feature home_ownership_OTHER. (294, 2)
Split on feature home_ownership_RENT. (44, 250)
Split on feature grade_C. (44, 0)
Perfect split!
Split on feature grade_C. (250, 0)
Perfect split!
Termination 1 reached.
Split on feature home_ownership_OWN. (547, 62)
Split on feature home_ownership_RENT. (1, 546)
Termination 1 reached.
Split on feature grade_C. (546, 0)
Perfect split!
Split on feature grade_C. (62

In [24]:
# Share of held-out test rows that the hand-written tree labels correctly.
evaluate_accuracy(my_decesion_tree, test_data)

0.6193304535637149

In [25]:
def count_leaves(tree):
    """Return the number of leaf nodes in the tree rooted at ``tree``."""
    leaves = 0
    pending = [tree]
    # Walk the tree with an explicit stack instead of recursion.
    while pending:
        node = pending.pop()
        if node.is_leaf:
            leaves += 1
        else:
            pending.extend((node.left, node.right))
    return leaves



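Train three trees that differ only in their depth limit, then compare their accuracy and complexity:

- max_depth = 3  
- max_depth = 7  
- max_depth = 15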


In [26]:
# Grow one tree per depth limit, in increasing order of depth.
model_1, model_2, model_3 = [
    create_tree(train_data, features, target, current_depth = 0, max_depth = depth)
    for depth in (3, 7, 15)
]

Split on feature grade_A. (31776, 5264)
Split on feature grade_B. (21587, 10189)
Split on feature grade_C. (12308, 9279)
Termination 3 reached.
Termination 3 reached.
Split on feature term_ 60 months. (9134, 1055)
Termination 3 reached.
Termination 3 reached.
Split on feature emp_length_nan. (5037, 227)
Split on feature home_ownership_RENT. (3142, 1895)
Termination 3 reached.
Termination 3 reached.
Split on feature term_ 60 months. (220, 7)
Termination 3 reached.
Termination 3 reached.
Split on feature grade_A. (31776, 5264)
Split on feature grade_B. (21587, 10189)
Split on feature grade_C. (12308, 9279)
Split on feature grade_D. (5553, 6755)
Split on feature term_ 60 months. (1743, 3810)
Split on feature grade_E. (459, 1284)
Split on feature emp_length_10+ years. (358, 101)
Termination 3 reached.
Termination 3 reached.
Split on feature emp_length_nan. (1223, 61)
Termination 3 reached.
Termination 3 reached.
Split on feature home_ownership_MORTGAGE. (1919, 1891)
Split on feature emp_le

Split on feature grade_F. (25, 76)
Split on feature home_ownership_OWN. (21, 4)
Split on feature home_ownership_RENT. (13, 8)
Split on feature grade_G. (0, 13)
Perfect split!
Split on feature grade_G. (0, 8)
Perfect split!
Split on feature grade_G. (0, 4)
Perfect split!
Split on feature home_ownership_RENT. (47, 29)
Split on feature home_ownership_OWN. (39, 8)
Split on feature grade_G. (39, 0)
Perfect split!
Split on feature grade_G. (8, 0)
Perfect split!
Split on feature grade_G. (29, 0)
Perfect split!
Split on feature emp_length_nan. (1223, 61)
Split on feature emp_length_1 year. (1117, 106)
Split on feature home_ownership_OTHER. (1113, 4)
Split on feature emp_length_2 years. (963, 150)
Split on feature home_ownership_MORTGAGE. (630, 333)
Split on feature emp_length_10+ years. (475, 155)
Split on feature emp_length_9 years. (446, 29)
Split on feature emp_length_7 years. (402, 44)
Split on feature emp_length_8 years. (367, 35)
Termination 3 reached.
Termination 3 reached.
Split on fea

Split on feature grade_F. (211, 0)
Perfect split!
Split on feature home_ownership_RENT. (23, 131)
Split on feature grade_F. (23, 0)
Perfect split!
Split on feature grade_F. (131, 0)
Perfect split!
Split on feature home_ownership_OTHER. (187, 1)
Split on feature home_ownership_RENT. (26, 161)
Split on feature grade_F. (26, 0)
Perfect split!
Split on feature grade_F. (161, 0)
Perfect split!
Termination 1 reached.
Split on feature home_ownership_RENT. (42, 263)
Split on feature home_ownership_OWN. (2, 40)
Termination 1 reached.
Split on feature grade_F. (40, 0)
Perfect split!
Split on feature grade_F. (263, 0)
Perfect split!
Split on feature home_ownership_OTHER. (370, 1)
Split on feature home_ownership_RENT. (32, 338)
Split on feature grade_F. (32, 0)
Perfect split!
Split on feature grade_F. (338, 0)
Perfect split!
Termination 1 reached.
Split on feature home_ownership_RENT. (42, 102)
Split on feature grade_F. (42, 0)
Perfect split!
Split on feature grade_F. (102, 0)
Perfect split!
Split

Split on feature grade_F. (33, 0)
Perfect split!
Split on feature home_ownership_RENT. (10, 22)
Split on feature grade_F. (10, 0)
Perfect split!
Split on feature grade_F. (22, 0)
Perfect split!
Split on feature home_ownership_RENT. (5, 82)
Split on feature grade_F. (5, 0)
Perfect split!
Split on feature grade_F. (82, 0)
Perfect split!
Split on feature home_ownership_RENT. (7, 56)
Split on feature grade_F. (7, 0)
Perfect split!
Split on feature grade_F. (56, 0)
Perfect split!
Split on feature emp_length_2 years. (1263, 73)
Split on feature emp_length_1 year. (1210, 53)
Split on feature emp_length_7 years. (1131, 79)
Split on feature emp_length_3 years. (1047, 84)
Split on feature emp_length_9 years. (970, 77)
Split on feature emp_length_4 years. (893, 77)
Split on feature emp_length_8 years. (811, 82)
Split on feature emp_length_10+ years. (246, 565)
Split on feature emp_length_< 1 year. (171, 75)
Termination 3 reached.
Termination 3 reached.
Split on feature grade_F. (565, 0)
Perfect s

Split on feature home_ownership_OWN. (2430, 351)
Split on feature emp_length_4 years. (2263, 167)
Split on feature emp_length_3 years. (2070, 193)
Split on feature emp_length_6 years. (1900, 170)
Split on feature emp_length_< 1 year. (1724, 176)
Split on feature home_ownership_MORTGAGE. (3, 1721)
Split on feature emp_length_10+ years. (1, 2)
Termination 1 reached.
Split on feature grade_C. (2, 0)
Perfect split!
Split on feature emp_length_9 years. (1605, 116)
Split on feature emp_length_7 years. (1453, 152)
Split on feature emp_length_5 years. (1230, 223)
Split on feature emp_length_10+ years. (283, 947)
Termination 3 reached.
Termination 3 reached.
Split on feature grade_C. (223, 0)
Perfect split!
Split on feature grade_C. (152, 0)
Perfect split!
Split on feature grade_C. (116, 0)
Perfect split!
Split on feature home_ownership_MORTGAGE. (4, 172)
Termination 1 reached.
Split on feature grade_C. (172, 0)
Perfect split!
Split on feature grade_C. (170, 0)
Perfect split!
Split on feature h

In [27]:
# Score the three models on the training rows, then report them in order.
train_scores = [evaluate_accuracy(model, train_data) for model in (model_1, model_2, model_3)]
print("model_1 training accuracy :", train_scores[0])
print("model_2 training accuracy :", train_scores[1])
print("model_3 training accuracy :", train_scores[2])

model_1 training accuracy : 0.6173326133909287
model_2 training accuracy : 0.6229481641468683
model_3 training accuracy : 0.6266198704103672


In [28]:
# Score the three models on the held-out rows, then report them in order.
test_scores = [evaluate_accuracy(model, test_data) for model in (model_1, model_2, model_3)]
print("model_1 testing accuracy :", test_scores[0])
print("model_2 testing accuracy :", test_scores[1])
print("model_3 testing accuracy :", test_scores[2])

model_1 testing accuracy : 0.6173866090712743
model_2 testing accuracy : 0.6206263498920086
model_3 testing accuracy : 0.6187904967602592


In [29]:
# Complexity is measured as the number of leaves in each tree.
leaf_counts = [count_leaves(model) for model in (model_1, model_2, model_3)]
print("model_1 complexity is: ", leaf_counts[0])
print("model_2 complexity is: ", leaf_counts[1])
print("model_3 complexity is: ", leaf_counts[2])

model_1 complexity is:  8
model_2 complexity is:  74
model_3 complexity is:  384
