In [1]:
import numpy as np
import pandas as pd

In [2]:
loans = pd.read_csv('../data/lending-club-data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


### Adding safe loans

In [3]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : 1 if x == 0 else -1)

In [4]:
loans = loans.drop('bad_loans',axis=1)

### One hot encoding

In [5]:
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
target = 'safe_loans'
loans = loans[features+[target]]

In [6]:
for feature in features:
    for value in loans[feature].unique():
        loans[value] = loans[feature].apply(lambda x : 1 if x == value else 0)

### Removing old features

In [7]:
features = list(set(list(loans))-set(features+[target]))

In [8]:
loans = loans[features + [target]]

In [9]:
loans.head(5)

Unnamed: 0,5 years,n/a,36 months,OWN,10+ years,6 years,9 years,< 1 year,1 year,7 years,...,F,60 months,OTHER,RENT,3 years,4 years,MORTGAGE,8 years,2 years,safe_loans
0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0,0,0,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,-1
2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,1


In [10]:
train_id = pd.read_json('../data/module-8-assignment-1-train-idx.json')
validation_id = pd.read_json('../data/module-8-assignment-1-validation-idx.json')

In [11]:
data_train = loans.iloc[train_id[0]]
data_validation = loans.iloc[validation_id[0]]

In [12]:
print "Data train : ",data_train.shape
print "Data validation : ",data_validation.shape

Data train :  (37219, 26)
Data validation :  (9284, 26)


### Intermediate weighted node mistake

In [13]:
def intermediate_node_weighted_mistake(labels_in_node, data_weights):
    
    total_weight_positive = sum(data_weights[labels_in_node == +1])
    weighted_mistake_all_negative = total_weight_positive
    
    total_weight_negative = sum(data_weights[labels_in_node == -1])
    weighted_mistake_all_positive = total_weight_negative
    
    if weighted_mistake_all_negative < weighted_mistake_all_positive:
        min_weight = weighted_mistake_all_negative
        min_label = -1
    elif weighted_mistake_all_negative > weighted_mistake_all_positive:
        min_weight = weighted_mistake_all_positive
        min_label = +1
    else:
        min_weight = weighted_mistake_all_positive
        min_label = +1
    
    return min_weight,min_label
    

# Quiz 1:

##    Weights of mistake will be equal to classification error

### Testing 

In [14]:
example_labels = np.array([-1, -1, 1, 1, 1])
example_data_weights = np.array([1., 2., .5, 1., 1.])
if intermediate_node_weighted_mistake(example_labels, example_data_weights) == (2.5, -1):
    print 'Test passed!'
else:
    print 'Test failed... try again!'
#total_weight_positive = (example_data_weights[example_labels == +1])
#print total_weight_positive

Test passed!


In [15]:
def best_splitting_feature(data,features,target,data_weights):
    
    best_feature = None
    best_error = float('+inf')
    num_points = float(len(data))
    
    for feature in features :
        left_split = data[data[feature] == 0]
        right_split = data[data[feature] == 1]
        
        left_data_weights = data_weights[data[feature] == 0]
        right_data_weights = data_weights[data[feature] == 1]
        
        left_weighted_mistakes,left_class = intermediate_node_weighted_mistake(left_split[target],left_data_weights)
        right_weighted_mistakes,right_class = intermediate_node_weighted_mistake(right_split[target],right_data_weights)
        
        error = (left_weighted_mistakes + right_weighted_mistakes)/sum(data_weights)
        if error < best_error:
            best_feature = feature
            best_error = error
    return best_feature
        

### Creating a lead node

In [16]:
def create_leaf_node(target_values,data_weights):
    
    leaf = {'splitting_feature':None,'is_leaf':True}
    
    weighted_error,best_class = intermediate_node_weighted_mistake(target_values,data_weights)
    leaf['prediction'] = best_class
    
    return leaf

### Creating a weighted decision tree

In [17]:
def weighted_decision_tree_create(data,features,target,data_weights,current_depth = 1,max_depth = 10):
    remaining_features = features[:]
    target_values = data[target]
    print "--------------------------------------------------------------------"
    print "Subtree, depth = %s (%s data points)." % (current_depth, len(target_values))
    
    #stopping condtion 1 ie. error = 0
    if intermediate_node_weighted_mistake(target_values,data_weights) <= 1e-15:
        print "Stopping Condition 1 reached."
        return create_leaf_node(target_values,data_weights)
        
    #Stopping condition 2 ie. No more features
    if remaining_features == []:
        print "Stopping condition 2 reached"
        return create_leaf_node(target_values,data_weights)
    
    #Stopping condition 3 ie. Max depth reached
    if current_depth > max_depth:
        print "Reached Maximum depth, Stopping for now"
        return create_leaf_node(target_values,data_weights)
    
    splitting_feature = best_splitting_feature(data,features,target,data_weights)
    remaining_features.remove(splitting_feature)
    
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    
    left_data_weights = data_weights[data[splitting_feature] == 0]
    right_data_weights = data_weights[data[splitting_feature] == 1]
    
    print "Split on feature %s. (%s, %s)" % (\
              splitting_feature, len(left_split), len(right_split))
    
    if len(left_split) == len(data):
        print "Creating Leaf node"
        return create_leaf_node(left_split[target],data_weights)
    if len(right_split) == len(data):
        print "Creating Leaf node"
        return create_leaf_node(right_split[target],data_weights)
    
    left_tree = weighted_decision_tree_create(left_split,
                                              remaining_features,target,
                                             left_data_weights,current_depth+1,max_depth)
    right_tree = weighted_decision_tree_create(right_split,
                                              remaining_features,target,
                                              right_data_weights,current_depth+1,
                                              max_depth)
    return {
        'is_leaf':False,
        "prediction":None,
        'splitting_feature':splitting_feature,
        'left':left_tree,
        'right':right_tree
    }
    

### A function to classify

In [22]:
def classify(tree, x, annotate = False):
       # if the node is a leaf node.
    if tree['is_leaf']:
        if annotate:
             print "At leaf, predicting %s" % tree['prediction']
        return tree['prediction']
    else:
        # split on feature.
        splitting_feature = tree['splitting_feature']
        split_feature_value = x[splitting_feature]
        if annotate:
             print "Split on %s = %s" % (tree['splitting_feature'], split_feature_value)
        if split_feature_value == 0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'],x,annotate)

### A function to calculate classification error

In [23]:
def evaluate_classification(tree,data):
    data['prediction'] = data.apply(lambda z : classify(decision_tree,z),axis = 1)
    error = data[data[target] != data['prediction']]
    print "Error  : ", len(error)/float(len(data))
    return len(error)/float(len(data))
    

In [24]:
# Assign weights
example_data_weights = np.array([1.] * 10 + [0.]*(len(data_train) - 20) + [1.] * 10)
# Train a weighted decision tree model.
small_data_decision_tree_subset_20 = weighted_decision_tree_create(data_train, features, target,
                         example_data_weights, max_depth=2)

--------------------------------------------------------------------
Subtree, depth = 1 (37219 data points).
Split on feature B. (26041, 11178)
--------------------------------------------------------------------
Subtree, depth = 2 (26041 data points).
Split on feature C. (16831, 9210)
--------------------------------------------------------------------
Subtree, depth = 3 (16831 data points).
Reached Maximum depth, Stopping for now
--------------------------------------------------------------------
Subtree, depth = 3 (9210 data points).
Reached Maximum depth, Stopping for now
--------------------------------------------------------------------
Subtree, depth = 2 (11178 data points).
Split on feature 5 years. (10271, 907)
--------------------------------------------------------------------
Subtree, depth = 3 (10271 data points).
Reached Maximum depth, Stopping for now
--------------------------------------------------------------------
Subtree, depth = 3 (907 data points).
Reached Maxi

In [25]:
ev

NameError: name 'evaluate_classification_error' is not defined

{'is_leaf': False,
 'left': {'is_leaf': False,
  'left': {'is_leaf': True, 'prediction': 1, 'splitting_feature': None},
  'prediction': None,
  'right': {'is_leaf': True, 'prediction': -1, 'splitting_feature': None},
  'splitting_Feature': 'C'},
 'prediction': None,
 'right': {'is_leaf': False,
  'left': {'is_leaf': True, 'prediction': -1, 'splitting_feature': None},
  'prediction': None,
  'right': {'is_leaf': True, 'prediction': 1, 'splitting_feature': None},
  'splitting_Feature': '5 years'},
 'splitting_Feature': 'B'}