## Determine the risk of loan using Decision Tree

This notebook uses loan data from [LendingClub](https://www.lendingclub.com/) to predict whether a loan will be fully paid or charged off.

In [238]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import tree
from sklearn import metrics


## 1 - Read the data

In [239]:
# Load the raw loan records. The path is relative, so loan.csv must sit in the
# notebook's working directory.
raw_data = pd.read_csv("loan.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [240]:
raw_data.shape

(887379, 74)

In [241]:
raw_data.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,1077501,1296599,5000.0,5000.0,4975.0,36 months,10.65,162.87,B,B2,...,,,,,,,,,,
1,1077430,1314167,2500.0,2500.0,2500.0,60 months,15.27,59.83,C,C4,...,,,,,,,,,,
2,1077175,1313524,2400.0,2400.0,2400.0,36 months,15.96,84.33,C,C5,...,,,,,,,,,,
3,1076863,1277178,10000.0,10000.0,10000.0,36 months,13.49,339.31,C,C1,...,,,,,,,,,,
4,1075358,1311748,3000.0,3000.0,3000.0,60 months,12.69,67.79,B,B5,...,,,,,,,,,,


In [242]:
raw_data.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
       'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint',
    

In [243]:
raw_data.loan_status.value_counts()

Current                                                601779
Fully Paid                                             207723
Charged Off                                             45248
Late (31-120 days)                                      11591
Issued                                                   8460
In Grace Period                                          6253
Late (16-30 days)                                        2357
Does not meet the credit policy. Status:Fully Paid       1988
Default                                                  1219
Does not meet the credit policy. Status:Charged Off       761
Name: loan_status, dtype: int64

## 2 - Data Processing

### 2.1 - Cleaning the data

Only loans with a final outcome (fully paid or charged off) provide a usable training label, so we keep only those rows below.

In [244]:
# Keep only the loans whose outcome is final; rows with any other status are discarded.
data = raw_data[raw_data['loan_status'].isin(['Fully Paid','Charged Off'])]

### 2.2 - Feature Selection

In [245]:
# Restrict the frame to the target column plus four categorical predictors.
# The explicit copy detaches the subset from the frame it was sliced from, so
# the in-place edits and column assignments in later cells act on an
# independent object instead of a slice (avoids SettingWithCopyWarning).
cols = ['loan_status','grade','term','home_ownership','emp_length']
data = data[cols].copy()
print('data.shape =',data.shape)
data.head()

data.shape = (252971, 5)


Unnamed: 0,loan_status,grade,term,home_ownership,emp_length
0,Fully Paid,B,36 months,RENT,10+ years
1,Charged Off,C,60 months,RENT,< 1 year
2,Fully Paid,C,36 months,RENT,10+ years
3,Fully Paid,C,36 months,RENT,10+ years
5,Fully Paid,A,36 months,RENT,3 years


Check whether there are null values. If so, we simply drop the affected rows for the sake of simplicity.

In [246]:
data.isnull().any()

loan_status       False
grade             False
term              False
home_ownership    False
emp_length         True
dtype: bool

In [247]:
data.emp_length.value_counts(normalize=True,dropna=False)

10+ years    0.303912
2 years      0.093137
< 1 year     0.082563
3 years      0.080563
5 years      0.071388
1 year       0.066632
4 years      0.064027
6 years      0.058319
7 years      0.055678
8 years      0.046839
NaN          0.039123
9 years      0.037819
Name: emp_length, dtype: float64

In [248]:
# Drop every row that has a missing value. Reassigning (instead of
# inplace=True) avoids mutating a slice of another frame.
data = data.dropna()
print('data.shape =',data.shape)
# .all() is the check that proves no null is left: it is True for a column only
# if every value is non-null (.any() would be True as soon as one value is).
data.notnull().all()

data.shape = (243074, 5)


loan_status       True
grade             True
term              True
home_ownership    True
emp_length        True
dtype: bool

In [249]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243074 entries, 0 to 887369
Data columns (total 5 columns):
loan_status       243074 non-null object
grade             243074 non-null object
term              243074 non-null object
home_ownership    243074 non-null object
emp_length        243074 non-null object
dtypes: object(5)
memory usage: 11.1+ MB


### 2.3 Data Preprocessing

We will use one-hot encoding to convert discrete (categorical) values into numerical vectors. The target `loan_status` is instead mapped to a binary label:
- Fully Paid $\to$ 1
- Charged Off $\to$ 0

In [250]:
# Binary target: 1 where the loan was fully paid, 0 for every other status,
# then expose the column under the name 'label'.
data['loan_status'] = (data['loan_status']=='Fully Paid').astype('int64')
data = data.rename(columns={'loan_status': 'label'})

In [251]:
data.head()

Unnamed: 0,label,grade,term,home_ownership,emp_length
0,1,B,36 months,RENT,10+ years
1,0,C,60 months,RENT,< 1 year
2,1,C,36 months,RENT,10+ years
3,1,C,36 months,RENT,10+ years
5,1,A,36 months,RENT,3 years


In [252]:
data.label.value_counts(normalize=True)

1    0.824239
0    0.175761
Name: label, dtype: float64

Note that the dataset is not balanced between good loans and bad loans. To ensure that bad loans and good loans carry the same weight in training, we need to build a balanced dataset. Below we downsample the good loans to achieve this.

In [253]:
# Split the rows by outcome, then downsample the good loans to the number of
# bad loans so both classes carry the same weight.
bad_loan = data[data.label==0]
good_loan = data[data.label==1]
# n= requests the exact row count. The previous frac=len(bad)/len(good) form
# relied on float rounding to land on len(bad_loan).
good_loan = good_loan.sample(n = len(bad_loan),random_state = 99)

data = pd.concat([good_loan,bad_loan],axis=0)
print('data.shape =',data.shape)
print(data.label.value_counts(normalize=True))
data.head()

data.shape = (85446, 5)
1    0.5
0    0.5
Name: label, dtype: float64


Unnamed: 0,label,grade,term,home_ownership,emp_length
134430,1,C,36 months,RENT,5 years
371117,1,E,60 months,RENT,10+ years
798597,1,D,60 months,OWN,10+ years
229928,1,B,36 months,RENT,6 years
848748,1,D,60 months,MORTGAGE,10+ years


In [254]:
data.grade.unique()

array(['C', 'E', 'D', 'B', 'A', 'G', 'F'], dtype=object)

Refer to the [pandas documentation](https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html) for `pandas.get_dummies()`.

In [255]:
# Convert categorical variable into dummy/indicator variables
def dummies(data, cols, random_state=None):
    '''
    One-hot encode the given columns and return a shuffled copy of the frame.

    input:
        data -> DataFrame holding the columns to encode; it is left unmodified,
        cols -> iterable of column names to replace by indicator columns,
        random_state -> optional seed forwarded to DataFrame.sample so that the
                        row shuffle is reproducible (default None: unseeded,
                        as before).
    output: a new DataFrame in which every column of cols is replaced by
            `<col>_<value>` indicator columns appended on the right, with the
            rows shuffled and the index reset to 0..n-1.
    '''
    # Work on a copy: the previous version cast the first column to str
    # directly on the caller's frame.
    encoded = data.copy()

    for col in cols:
        # Stringify first so that every value produces a string column suffix.
        indicators = pd.get_dummies(encoded[col].map(str), prefix=col)
        encoded = pd.concat([encoded.drop(columns=col), indicators], axis=1)

    # A single shuffle at the end is enough; reshuffling after every column
    # only repeated the same permutation step.
    return encoded.sample(frac=1, random_state=random_state).reset_index(drop=True)


# One-hot encode the four categorical predictors.
# NOTE(review): dummies() removes the encoded source columns from its result, so
# this cell cannot be re-run on its own output; re-run the cells above first.
cols = ['grade','term','home_ownership','emp_length']
data = dummies(data,cols)
data.head()

Unnamed: 0,label,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,term_ 36 months,term_ 60 months,...,emp_length_10+ years,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year
0,0,0,0,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,1,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
4,1,0,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0


### 2.4 - Train & Test Split

In [89]:
# Hold out 20% of the rows as a test set; random_state fixes the seed of the split.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=99)

## 3 - Build Binary Decision Tree From Scratch

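- There are two criteria for feature selection: `error` (the number of misclassified rows when each side of the split predicts its majority label) and `entropy` (the size-weighted entropy of the two sides of the split). Gini impurity is not implemented here and remains an open question.
The entropy is defined as $H = - \sum_i p_i \log_2 p_i$, where $p_i$ is the fraction of rows with label $i$.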

In [329]:
def count_errors(y):
    '''
    input: the 0/1 label column (data.label) of the rows under consideration.
    output: the number of rows that would be misclassified if every row were
            predicted as the more frequent label, i.e. the size of the
            minority class.
    '''
    positives = y.sum()  # labels are 0/1, so the sum counts the 1s
    negatives = len(y) - positives

    # The smaller of the two class counts is the number of mistakes.
    return negatives if negatives < positives else positives


def entropy(y):
    '''
    input: the 0/1 label column (data.label) of the rows under consideration.
    output: binary entropy (in bits) of the label distribution; 0 for an empty
            or single-class input.
    '''
    n_rows = len(y)
    if n_rows == 0:
        return 0

    p_good = y.sum()/n_rows
    # A pure set has no uncertainty; returning early also avoids log2(0).
    if p_good == 0 or p_good == 1:
        return 0

    p_bad = 1-p_good
    return -p_good*np.log2(p_good)-p_bad*np.log2(p_bad)
    
    
def best_feature(data, label, features, criterion):
    '''
    Pick the 0/1 feature whose split of `data` gives the lowest impurity.

    input:
        data -> DataFrame,
        label -> our label column,
        features -> remaining features for selection, this is necessary when building the tree,
        criterion -> 'error' or 'entropy'.
    output: the best feature to build the node (the first one on ties), or None
            when `features` or `data` is empty.
    raises: ValueError when criterion is neither 'error' nor 'entropy'.
    '''
    # Fail fast on a bad criterion: printing and returning None only postponed
    # the failure to the caller, which would then index the frame with None.
    if criterion not in ('error', 'entropy'):
        raise ValueError('Error: no such criterion: %r' % (criterion,))

    total = len(data)
    if total == 0:
        # Nothing to split; also avoids dividing by zero in the entropy weights.
        return None

    def split_score(left_labels, right_labels):
        # 'error': total misclassified rows; 'entropy': size-weighted entropy.
        if criterion == 'error':
            return count_errors(left_labels) + count_errors(right_labels)
        return (entropy(left_labels)*len(left_labels)/total
                + entropy(right_labels)*len(right_labels)/total)

    best_score = np.inf
    best = None

    for feature in features:
        column = data[feature]
        # Rows whose value is neither 0 nor 1 take part in neither side.
        left_labels = data.loc[column==0, label]
        right_labels = data.loc[column==1, label]

        score = split_score(left_labels, right_labels)
        # Strict comparison keeps the first feature on ties.
        if score < best_score:
            best = feature
            best_score = score

    return best
    
    
        

1. Define the tree node.
2. Grow the tree recursively until a stopping condition is met.
3. Create a leaf. The stopping conditions include reaching `max_depth` and running out of remaining features.

In [330]:
class TreeNode:
    '''
    Node of the binary decision tree, used for internal nodes and leaves alike.
    Every attribute except is_leaf starts as None and is filled in by the code
    that builds the tree.
    '''

    def __init__(self, is_leaf=False):
        # Children: `left` is followed when the split feature equals 0,
        # `right` otherwise.
        self.left = None
        self.right = None
        # Name of the feature this node splits on.
        self.split_feature = None
        # Label predicted at this node.
        self.prediction = None
        self.is_leaf = is_leaf
        
        
def create_leaf(data_label):
    '''
    input: the 0/1 label column of the rows that reached this leaf.
    output: a leaf TreeNode predicting the majority label; a tie (and an empty
            input) predicts 1.
    '''
    n_good = data_label.sum()
    n_rows = len(data_label)

    leaf = TreeNode(is_leaf=True)
    if n_good >= n_rows/2:
        leaf.prediction = 1
    else:
        leaf.prediction = 0

    return leaf


def create_tree(data,label,remaining_features,criterion, depth, max_depth=5):
    '''
    Recursively grow a binary decision tree over 0/1 feature columns.

    input:
        data -> DataFrame with the rows that reach this node,
        label -> name of the 0/1 label column,
        remaining_features -> features not yet used on this path; must support
                              len() and .drop(name), e.g. a pandas Index,
        criterion -> 'error' or 'entropy', forwarded to best_feature,
        depth -> depth of the node being built (0 for the root),
        max_depth -> depth at which growth stops and a leaf is created.
    output: the root TreeNode of the (sub)tree; never None.
    '''
    # Stopping conditions: no feature left to split on, or depth budget used up.
    # `>=` also covers depth > max_depth, which previously matched no branch and
    # made the function return None.
    if len(remaining_features) == 0 or depth >= max_depth:
        return create_leaf(data[label])

    bestfeature = best_feature(data,label,remaining_features,criterion)
    if bestfeature is None:
        # No usable split (e.g. no rows at all): stop here with a leaf.
        return create_leaf(data[label])

    # define a tree node, is_leaf takes the default value
    node = TreeNode()
    node.split_feature = bestfeature

    # Majority label of the rows at this node (ties predict 1).
    good = data[label].sum()
    node.prediction = 1 if good >= len(data)/2 else 0

    # define the children
    data_left = data[data[bestfeature]==0]
    data_right = data[data[bestfeature]==1]
    remaining_features = remaining_features.drop(bestfeature)

    children = []
    for subset in (data_left, data_right):
        if len(subset) != 0:
            child = create_tree(subset,label,remaining_features,criterion,depth+1,max_depth)
        else:
            # No training row falls on this side. Leaving the child as None
            # breaks prediction for any row that does take this branch, so
            # attach a leaf that repeats this node's majority label.
            child = TreeNode(is_leaf=True)
            child.prediction = node.prediction
        children.append(child)
    node.left, node.right = children

    return node
    
        
    

In [331]:
def predict_row(row,node):
    '''
    Predict the label of a single row by walking the tree down from `node`.

    input:
        row -> mapping from feature name to its value (e.g. one DataFrame row),
        node -> node to start from; it must expose is_leaf, prediction,
                split_feature, left and right.
    output: the prediction stored on the node where the walk ends.
    '''
    while not node.is_leaf:
        # Value 0 goes left, anything else goes right.
        child = node.left if row[node.split_feature]==0 else node.right
        if child is None:
            # A child can be missing when no training row fell on that side of
            # the split; fall back to this node's own stored prediction instead
            # of failing on None.
            return node.prediction
        node = child

    return node.prediction

def predict(data,tree):
    '''
    input:
        data -> DataFrame whose rows are to be classified,
        tree -> root node of the decision tree.
    output: the result of applying predict_row to every row of data.
    '''
    def classify(row):
        return predict_row(row,tree)

    return data.apply(classify, axis=1)

def accuracy(data,label,tree):
    '''
    input:
        data -> DataFrame holding the features and the true labels,
        label -> name of the column holding the true labels,
        tree -> root node of the decision tree.
    output: fraction of rows of data whose prediction equals the true label.
    '''
    pred = predict(data,tree)
    # Use the column named by `label`; the previous version ignored the
    # parameter and always read data.label.
    return (data[label]==pred).sum()/len(data)

In [336]:
# Grow a tree on every column except 'label' with the entropy criterion, then
# predict the rows of `data`.
# NOTE(review): the tree is fitted on `data` and evaluated on the same `data`,
# so the figures below are training accuracy; train_data/test_data are not used here.
# NOTE(review): this rebinds `tree`, shadowing the `sklearn.tree` module imported
# at the top of the notebook.
remaining_features = data.columns.drop('label')
tree = create_tree(data,'label',remaining_features,criterion='entropy', depth=0, max_depth=15)
pred = predict(data,tree)

# Element-wise hit/miss flags; value_counts() tallies correct vs. wrong predictions.
df = pd.Series(pred==data.label.values)
df.value_counts()

True     53831
False    31615
dtype: int64

In [337]:
accuracy(data,'label',tree)

0.6300002340659598