In [34]:
import numpy as np
import pandas as pd


In [35]:
loans = pd.read_csv('../data/lending-club-data.csv')

In [36]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : 1 if x ==0 else -1)
loans = loans.drop(['bad_loans'],axis=1)

In [37]:
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home_ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
target = 'safe_loans'

In [38]:
loans = loans[features + [target]]

In [39]:
train_id = list(pd.read_json('../data/module-5-assignment-1-train-idx.json')[0])
validaiton_id = list(pd.read_json('../data/module-5-assignment-1-validation-idx.json')[0])


In [40]:
data_train = loans.iloc[train_id]
data_validation = loans.iloc[validaiton_id]

In [41]:
print "Data train", len(data_train)
print "Data validation", len(data_validation)

Data train 37224
Data validation 9284


# Sub sampling the data to make it balanced

In [42]:
safe_loans_raw = loans[loans[target] == 1]
risky_loans_raw = loans[loans[target] == -1]

percentage = len(risky_loans_raw)/float(len(safe_loans_raw))

safe_loans = safe_loans_raw.sample(frac=percentage,random_state=1)
risky_loans = risky_loans_raw
loans_data = risky_loans.append(safe_loans)

print "Percentage of safe loans : ",len(safe_loans)/float(len(loans_data))
print "Percentage of risky loans : ",len(risky_loans)/float(len(loans_data))
print "Total loan data : ",len(loans_data)

Percentage of safe loans :  0.5
Percentage of risky loans :  0.5
Total loan data :  46300


In [43]:
string = []
for x in features :
    if loans_data[x].dtype == 'object':
        string.append(x)


# Converting string features into binary values

In [44]:
for x in string:
    for y in loans_data[x].unique():
        loans_data[y] = loans_data[x].apply(lambda z : 1 if z == y else 0)

In [45]:
loans_data = loans_data.drop(string,axis=1)

In [46]:
from sklearn import model_selection
train_data,test_data = model_selection.train_test_split(loans_data,random_state = 1)

In [47]:
len(list(loans_data))

26

In [57]:
def intermediate_node_num_mistakes(labels_in_node):
    if len(labels_in_node) == 0:
        return 0
    safe = list(labels_in_node).count(1)
    risky = list(labels_in_node).count(-1)
    if safe > risky:
        return risky
    elif safe < risky:
        return safe
    else :
        return safe
    

In [59]:
# Test case 1
example_labels = np.array([-1, -1, 1, 1, 1])
if intermediate_node_num_mistakes(example_labels) == 2:
    print 'Test passed!'
else:
    print 'Test 1 failed... try again!'

# Test case 2
example_labels = np.array([-1, -1, 1, 1, 1, 1, 1])
if intermediate_node_num_mistakes(example_labels) == 2:
    print 'Test passed!'
else:
    print 'Test 3 failed... try again!'
    
# Test case 3
example_labels = np.array([-1, -1, -1, -1, -1, 1, 1])
if intermediate_node_num_mistakes(example_labels) == 2:
    print 'Test passed!'
else:
    print 'Test 3 failed... try again!'

Test passed!
Test passed!
Test passed!


In [60]:
def best_splitting_features(data,features,target):
    target_value = data[target]
    best_feature = None
    best_error = 10
    num_data_points = float(len(data))
    
    for feature in features:
        left_split = data[data[feature] == 0]
        right_split = data[data[feature] == 1]
        
        left_mistakes = intermediate_node_num_mistakes(left_split[target])
        right_mistakes = intermediate_node_num_mistakes(right_split[target])
        
        error = (left_mistakes + right_mistakes) / num_data_points
        if error < best_error:
            best_feature = feature
    return best_feature

# A function to define the leaf node

In [61]:
def create_leaf(target_values):
    leaf = {'splitting_feature' : None,
            'left' : None,
            'right' : None,
            'is_leaf': True }   ## YOUR CODE HERE 
    num_ones = len(target_values[target_values == +1])
    num_minus_ones = len(target_values[target_values == -1])
    
    if num_ones > num_minus_ones:
        leaf['prediction'] = +1
    else:
        leaf['prediction'] = -1
    return leaf
            

# Create decision tree

In [71]:
def decision_tree_create(data,features,target,current_depth = 0,max_depth = 10):
    remaining_features = features[:]
    target_values = data[target]
    print "-----------------------------------------------------------------------------------------"
    print "Subtree, depth = %s (%s data points)." % (current_depth, len(target_values))
    if intermediate_node_num_mistakes(target_values) == 0:
        print "Stopping condition 1 reached"
        return create_leaf(target_values)
    if remaining_features == []:
        print "Stopping condition 2 reached"
        return create_leaf(target_values)
    if current_depth >= max_depth:
        print "Reached maximum depth. Stopping for now"
        return create_leaf(target_values)
    
    splitting_feature = best_splitting_features(data,remaining_features,target)
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    remaining_features.remove(splitting_feature)
    print "Split on feature %s. (%s, %s)" % (\
                      splitting_feature, len(left_split), len(right_split))
    
    
    if len(left_split) == len(data):
        print "Creating leaf node"
        return create_leaf(left_split[target])
    if len(right_split) == len(data):
        print "Creating leaf node"
        return create_leaf(right_split[target])
    
    left_tree = decision_tree_create(left_split,remaining_features,
                                    target,current_depth+1,max_depth)
    right_tree = decision_tree_create(right_split,remaining_features,
                                     target,current_depth+1,max_depth)
    
    return {
        "is_leaf" : False,
        "prediction" : None,
        'splitting_feature' : splitting_feature,
        'left' : left_tree,
        'right' : right_tree
    }
      
    

In [68]:
features = list(loans_data)
print features.pop(features.index(target))

safe_loans


In [73]:
decision_tree = decision_tree_create(train_data,features,target,max_depth=6)

-----------------------------------------------------------------------------------------
Subtree, depth = 0 (34725 data points).
Split on feature 6 years. (32556, 2169)
-----------------------------------------------------------------------------------------
Subtree, depth = 1 (32556 data points).
Split on feature n/a. (31216, 1340)
-----------------------------------------------------------------------------------------
Subtree, depth = 2 (31216 data points).
Split on feature 5 years. (28516, 2700)
-----------------------------------------------------------------------------------------
Subtree, depth = 3 (28516 data points).
Split on feature 7 years. (26514, 2002)
-----------------------------------------------------------------------------------------
Subtree, depth = 4 (26514 data points).
Split on feature 8 years. (24987, 1527)
-----------------------------------------------------------------------------------------
Subtree, depth = 5 (24987 data points).
Split on feature 2 years

In [158]:
def classify(tree, x, annotate = False):
       # if the node is a leaf node.
    if tree['is_leaf']:
        if annotate:
             print "At leaf, predicting %s" % tree['prediction']
        return tree['prediction']
    else:
        # split on feature.
        splitting_feature = tree['splitting_feature']
        split_feature_value = x[splitting_feature]
        if annotate:
             print "Split on %s = %s" % (tree['splitting_feature'], split_feature_value)
        if split_feature_value == 0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'],x,annotate)




In [159]:
x = test_data.iloc[0]

print 'Predicted class: %s ' % classify(decision_tree, x)


Predicted class: 1 


In [160]:
classify(decision_tree, x, annotate=True)

Split on 6 years = 0
Split on n/a = 0
Split on 5 years = 0
Split on 7 years = 0
Split on 8 years = 0
Split on 2 years = 0
At leaf, predicting 1


1

# Quiz 1)

## Split on 6 years = 0
## Split on n/a = 0
## Split on 5 years = 0
## Split on 7 years = 0
## Split on 8 years = 0
## Split on 2 years = 0
## At leaf, predicting 1

# Prediction = 1

In [165]:
def evaluate_classification(tree,data):
    data['prediction'] = data.apply(lambda z : classify(decision_tree,z),axis = 1)
    error = data[data[target] != data['prediction']]
    print "Error  : ", len(error)/float(len(data))
    return len(error)/float(len(data))
    

In [167]:
train_error = evaluate_classification(decision_tree,train_data)
test_error = evaluate_classification(decision_tree,test_data)
print "Train error : ",train_error
print "Test error : ",test_error

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Error  :  0.486652267819
Error  :  0.490971922246
Train error :  0.486652267819
Test error :  0.490971922246


# Train error :  0.486652267819

# Test error :  0.490971922246

In [170]:
def print_stump(tree, name = 'root'):
    split_name = tree['splitting_feature'] # split_name is something like 'term. 36 months'
    if split_name is None:
        print "(leaf, label: %s)" % tree['prediction']
        return None
    print '                       %s' % name
    print '         |---------------|----------------|'
    print '         |                                |'
    print '         |                                |'
    print '         |                                |'
    print '  [{0} == 0]               [{0} == 1]    '.format(split_name)
    print '         |                                |'
    print '         |                                |'
    print '         |                                |'
    print '    (%s)                         (%s)' \
        % (('leaf, label: ' + str(tree['left']['prediction']) if tree['left']['is_leaf'] else 'subtree'),
           ('leaf, label: ' + str(tree['right']['prediction']) if tree['right']['is_leaf'] else 'subtree'))

In [171]:
print_stump(decision_tree)

                       root
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [6 years == 0]               [6 years == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (leaf, label: -1)


In [172]:
print_stump(decision_tree['left'], 
            decision_tree['splitting_feature'])

                       6 years
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [n/a == 0]               [n/a == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (leaf, label: -1)


In [173]:
print_stump(decision_tree['left']['left'], decision_tree['left']['splitting_feature'])

                       n/a
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [5 years == 0]               [5 years == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (leaf, label: 1)
