# W3_Implementing Binary Decision Trees

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the LendingClub loans and recode the label: +1 = safe loan, -1 = bad loan.
data = pd.read_csv('lending-club-data.csv')
data['safe_loans'] = data['bad_loans'].apply(lambda bad: -1 if bad != 0 else 1)
data = data.drop(columns=['bad_loans'])

# Keep only the chosen input columns plus the label, then one-hot encode the
# non-numeric columns so every feature becomes a 0/1 indicator.
target = 'safe_loans'
features = ['grade', 'term', 'home_ownership', 'emp_length']
data = pd.get_dummies(data[features + [target]])
data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,safe_loans,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,term_ 36 months,term_ 60 months,...,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year,emp_length_n/a
0,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,-1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [3]:
# The train/test membership is fixed by row positions stored in two JSON files;
# column 0 of each parsed file holds the positions.
train_index, test_index = (
    list(pd.read_json(index_path)[0])
    for index_path in ('module-5-assignment-2-train-idx.json',
                       'module-5-assignment-2-test-idx.json')
)
train, test = data.iloc[train_index], data.iloc[test_index]

In [4]:
# used to decide the best splitting feature
def count_mistakes(labels_in_node):
    """Return the number of errors a majority-vote prediction makes in a node.

    `labels_in_node` must support elementwise `==` and `.sum()` (e.g. a numpy
    array or pandas Series) and hold labels +1 / -1. A majority classifier
    mislabels every point of the minority class, so the result is the size of
    the smaller class.
    """
    positives = (labels_in_node == 1).sum()
    negatives = (labels_in_node == -1).sum()
    # Whichever class is smaller is the one the majority vote gets wrong.
    return min(negatives, positives)

In [5]:
def best_splitting_feature(data, features, target):
    """Pick the feature whose 0/1 split yields the fewest majority-vote mistakes.

    For every candidate in `features`, the rows of `data` are divided into the
    group where the feature equals 0 and the group where it equals 1; the
    mistakes of both groups (per `count_mistakes` on the `target` column) are
    added up. The candidate with the strictly smallest total wins, so ties go
    to the feature that appears first. Returns None when `features` is empty.
    """
    winner = None
    lowest_mistakes = float('inf')

    for candidate in features:
        column = data[candidate]
        labels_when_zero = np.array(data[column == 0][target])
        labels_when_one = np.array(data[column == 1][target])
        mistakes = count_mistakes(labels_when_zero) + count_mistakes(labels_when_one)
        if mistakes < lowest_mistakes:
            winner, lowest_mistakes = candidate, mistakes

    return winner

In [6]:
def create_leaf(target_values):
    """Build a leaf node (a dict) that predicts the majority label.

    `target_values` must support elementwise `==` and `.sum()` and hold labels
    +1 / -1. The leaf predicts +1 unless -1 labels strictly outnumber +1
    labels, so ties (including an empty node) resolve to +1.
    """
    safe_count = (target_values == 1).sum()
    risky_count = (target_values == -1).sum()
    majority_label = -1 if risky_count > safe_count else 1
    return {'splitting_feature': None,
            'left': None,
            'right': None,
            'is_leaf': True,
            'prediction': majority_label}

In [7]:
def create_decision_tree(data, features, target, current_depth=0, max_depth=10, verbose=True):
    """Recursively grow a binary decision tree and return it as nested dicts.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows that reach this node. Rows are routed left when the chosen
        feature equals 0 and right when it equals 1.
    features : iterable of str
        Column names still available for splitting. The argument itself is
        never modified; a private copy is made.
    target : str
        Name of the label column (labels are +1 / -1).
    current_depth : int
        Depth of this node; the root is 0.
    max_depth : int
        Nodes at this depth or deeper become leaves.
    verbose : bool
        When True (the default, matching the previous behaviour) a progress
        line is printed for every node visited.

    Returns
    -------
    dict
        Either a leaf produced by `create_leaf`, or an internal node with keys
        'is_leaf' (False), 'prediction' (None), 'splitting_feature', 'left'
        and 'right'.
    """
    def _log(message):
        if verbose:
            print(message)

    # list(...) both copies the input and accepts any iterable (e.g. a pandas
    # Index), so the .remove() below never touches the caller's object.
    remaining_features = list(features)
    target_values = data[target]

    _log('--------------------------------------------------------------------')
    _log('Subtree, current depth:{}, data point:{}'.format(current_depth, len(data)))

    # stopping condition 1: the node is already pure
    if count_mistakes(target_values) == 0:
        _log('Stopping condition reached: no mistake')
        return create_leaf(target_values)

    # stopping condition 2: nothing left to split on
    if len(remaining_features) == 0:
        _log('Stopping condition reached: all features are used')
        return create_leaf(target_values)

    # stopping condition 3: depth limit. `>=` rather than `==` so that a call
    # starting beyond the limit still terminates here.
    if current_depth >= max_depth:
        _log('Stopping condition reached: maximum depth')
        return create_leaf(target_values)

    splitting_feature = best_splitting_feature(data, remaining_features, target)

    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    remaining_features.remove(splitting_feature)

    # If one side received every row, the split separates nothing.
    if len(left_split) == len(data) or len(right_split) == len(data):
        _log('All feature values are the same. Create leaf node')
        return create_leaf(target_values)

    left_tree = create_decision_tree(left_split, remaining_features, target,
                                     current_depth + 1, max_depth, verbose)
    right_tree = create_decision_tree(right_split, remaining_features, target,
                                      current_depth + 1, max_depth, verbose)

    return {'is_leaf': False,
            'prediction': None,
            'splitting_feature': splitting_feature,
            'left': left_tree,
            'right': right_tree}

In [8]:
# Every column except the label is a candidate splitting feature.
target = 'safe_loans'
features = train.columns.tolist()
features.remove(target)

# build a decision tree
decision_tree = create_decision_tree(
    train,
    features,
    target,
    current_depth=0,
    max_depth=6,
)

--------------------------------------------------------------------
Subtree, current depth:0, data point:37224
--------------------------------------------------------------------
Subtree, current depth:1, data point:9223
--------------------------------------------------------------------
Subtree, current depth:2, data point:9122
--------------------------------------------------------------------
Subtree, current depth:3, data point:8074
--------------------------------------------------------------------
Subtree, current depth:4, data point:5884
--------------------------------------------------------------------
Subtree, current depth:5, data point:3826
--------------------------------------------------------------------
Subtree, current depth:6, data point:1693
Stopping condition reached: maximum depth
--------------------------------------------------------------------
Subtree, current depth:6, data point:2133
Stopping condition reached: maximum depth
---------------------------

In [9]:
def make_predicion(tree, x, annotate=False):
    """Route a single example down the tree and return the leaf's prediction.

    `tree` is a nested-dict node with an 'is_leaf' flag; internal nodes carry
    'splitting_feature', 'left' and 'right', leaves carry 'prediction'. `x` is
    indexed by feature name. A feature value equal to 0 goes left, any other
    value goes right. With `annotate=True` each visited split and the final
    leaf are printed.
    """
    node = tree
    while not node['is_leaf']:
        feature_name = node['splitting_feature']
        feature_value = x[feature_name]
        if annotate:
            print('splitting feature:', feature_name)
            print('splitting feature value:', feature_value)
        node = node['left'] if feature_value == 0 else node['right']

    if annotate:
        print('reach leaf!')
    return node['prediction']

In [10]:
# Predict the label for the first row of the test set with the trained tree.
make_predicion(decision_tree, test.iloc[0])

-1

In [11]:
def compute_classification_error(tree, data, target='safe_loans'):
    """Return the fraction of rows in `data` that `tree` misclassifies.

    Parameters
    ----------
    tree : dict
        Decision-tree node accepted by `make_predicion`.
    data : pandas.DataFrame
        Rows to evaluate; must contain the feature columns the tree splits on
        and the label column named by `target`.
    target : str
        Name of the true-label column. Defaults to 'safe_loans', the column
        this function previously had hard-coded.

    Returns
    -------
    The number of rows whose prediction differs from the label, divided by
    the number of rows.
    """
    # One prediction per row, in row order, so the comparison below is positional.
    predictions = np.array([make_predicion(tree, data.iloc[i])
                            for i in range(len(data))])
    error_num = (predictions != data[target]).sum()
    return error_num / len(data)

In [12]:
# Fraction of test-set rows whose predicted label differs from the true label.
compute_classification_error(decision_tree, test)

0.38377854373115039