In [1]:
import pandas as pd
import numpy as np

In [2]:
loans=pd.read_csv('lending-club-data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
tr_idx=pd.read_json('module-8-assignment-2-train-idx.json')[0]
tt_idx=pd.read_json('module-8-assignment-2-test-idx.json')[0]

In [4]:
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
loans=loans[loans.columns.drop('bad_loans')]
target = 'safe_loans'
loans = loans[features + [target]]

In [5]:
loans=pd.get_dummies(loans)

In [6]:
train_data=loans.iloc[tr_idx]
test_data=loans.iloc[tt_idx]

In [7]:
def intermediate_node_weighted_mistakes(labels_in_node, data_weights):
    # Sum the weights of all entries with label +1
    total_weight_positive = sum(data_weights[labels_in_node == +1])
    
    # Weight of mistakes for predicting all -1's is equal to the sum above
    ### YOUR CODE HERE
    weighted_mistakes_all_negative = total_weight_positive
    
    # Sum the weights of all entries with label -1
    ### YOUR CODE HERE
    total_weight_negative = sum(data_weights[labels_in_node == -1])
    
    # Weight of mistakes for predicting all +1's is equal to the sum above
    ### YOUR CODE HERE
    weighted_mistakes_all_positive = total_weight_negative
    
    # Return the tuple (weight, class_label) representing the lower of the two weights
    #    class_label should be an integer of value +1 or -1.
    # If the two weights are identical, return (weighted_mistakes_all_positive,+1)
    ### YOUR CODE HERE
    if weighted_mistakes_all_negative < weighted_mistakes_all_positive:
        return (weighted_mistakes_all_negative, -1)
    else: 
        return (weighted_mistakes_all_positive, +1)

In [8]:
def best_splitting_feature(data, features, target, data_weights):
    
    # These variables will keep track of the best feature and the corresponding error
    best_feature = None
    best_error = float('+inf') 
    num_points = float(len(data))

    # Loop through each feature to consider splitting on that feature
    for feature in features:
        
        # The left split will have all data points where the feature value is 0
        # The right split will have all data points where the feature value is 1
        left_split = data[data[feature] == 0]
        right_split = data[data[feature] == 1]
        
        # Apply the same filtering to data_weights to create left_data_weights, right_data_weights
        ## YOUR CODE HERE
        left_data_weights = data_weights[data[feature]==0]
        right_data_weights = data_weights[data[feature]==1]
                    
        # DIFFERENT HERE
        # Calculate the weight of mistakes for left and right sides
        ## YOUR CODE HERE
        left_weighted_mistakes, left_class = intermediate_node_weighted_mistakes(left_split[target], left_data_weights)
        right_weighted_mistakes, right_class = intermediate_node_weighted_mistakes(right_split[target], right_data_weights)
        
        # DIFFERENT HERE
        # Compute weighted error by computing
        #  ( [weight of mistakes (left)] + [weight of mistakes (right)] ) / [total weight of all data points]
        ## YOUR CODE HERE
        error = left_weighted_mistakes + right_weighted_mistakes/sum(data_weights)
        
        # If this is the best error we have found so far, store the feature and the error
        if error < best_error:
            best_feature = feature
            best_error = error
    
    # Return the best feature we found
    return best_feature

In [9]:
def create_leaf(target_values, data_weights):
    
    # Create a leaf node
    leaf = {'splitting_feature' : None,
            'is_leaf': True}
    
    # Computed weight of mistakes.
    # Store the predicted class (1 or -1) in leaf['prediction']
    weighted_error, best_class = intermediate_node_weighted_mistakes(target_values, data_weights)
    leaf['prediction'] = best_class ## YOUR CODE HERE
    
    return leaf

In [10]:
def weighted_decision_tree_create(data, features, target, data_weights, current_depth = 1, max_depth = 10):
    remaining_features = features[:] # Make a copy of the features.
    target_values = data[target]
    print ("--------------------------------------------------------------------")
    print ("Subtree, depth = %s (%s data points)." % (current_depth, len(target_values)))
    
    # Stopping condition 1. Error is 0.
    if intermediate_node_weighted_mistakes(target_values, data_weights)[0] <= 1e-15:
        print ("Stopping condition 1 reached.")                
        return create_leaf(target_values, data_weights)
    
    # Stopping condition 2. No more features.
    if remaining_features == []:
        print ("Stopping condition 2 reached.")                
        return create_leaf(target_values, data_weights)    
    
    # Additional stopping condition (limit tree depth)
    if current_depth > max_depth:
        print ("Reached maximum depth. Stopping for now.")
        return create_leaf(target_values, data_weights)
    
    # If all the datapoints are the same, splitting_feature will be None. Create a leaf
    splitting_feature = best_splitting_feature(data, features, target, data_weights)
    remaining_features.remove(splitting_feature)
        
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    
    left_data_weights = data_weights[data[splitting_feature] == 0]
    right_data_weights = data_weights[data[splitting_feature] == 1]
    
    print ("Split on feature %s. (%s, %s)" % (\
              splitting_feature, len(left_split), len(right_split)))
    
    # Create a leaf node if the split is "perfect"
    if len(left_split) == len(data):
        print ("Creating leaf node.")
        return create_leaf(left_split[target], data_weights)
    if len(right_split) == len(data):
        print ("Creating leaf node.")
        return create_leaf(right_split[target], data_weights)
    
    # Repeat (recurse) on left and right subtrees
    left_tree = weighted_decision_tree_create(
        left_split, remaining_features, target, left_data_weights, current_depth + 1, max_depth)
    right_tree = weighted_decision_tree_create(
        right_split, remaining_features, target, right_data_weights, current_depth + 1, max_depth)
    
    return {'is_leaf'          : False, 
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree, 
            'right'            : right_tree}

In [11]:
def count_nodes(tree):
    if tree['is_leaf']:
        return 1
    return 1 + count_nodes(tree['left']) + count_nodes(tree['right'])

In [12]:
def classify(tree, x, annotate = False):   
    # If the node is a leaf node.
    if tree['is_leaf']:
        if annotate: 
            print ("At leaf, predicting %s" % tree['prediction'])
        return tree['prediction'] 
    else:
        # Split on feature.
        split_feature_value = x[tree['splitting_feature']]
        if annotate: 
            print ("Split on %s = %s" % (tree['splitting_feature'], split_feature_value))
        if split_feature_value == 0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'], x, annotate)

In [13]:
def evaluate_classification_error(tree, data):
    # Apply the classify(tree, x) to each row in your data
    prediction = [classify(tree,data.iloc[i]) for i in range(len(data))]
    
    # Once you've made the predictions, calculate the classification error
    return (prediction != data[target]).sum() / float(len(data))

In [14]:
example_weights=np.append(np.ones(10),np.zeros(len(train_data)-20))

In [15]:
example_weights=np.append(example_weights,np.ones(10))

In [16]:
example_weights_2=np.ones(len(train_data))

In [17]:
train_data.columns

Index(['safe_loans', 'grade_A', 'grade_B', 'grade_C', 'grade_D', 'grade_E',
       'grade_F', 'grade_G', 'term_ 36 months', 'term_ 60 months',
       'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN',
       'home_ownership_RENT', 'emp_length_1 year', 'emp_length_10+ years',
       'emp_length_2 years', 'emp_length_3 years', 'emp_length_4 years',
       'emp_length_5 years', 'emp_length_6 years', 'emp_length_7 years',
       'emp_length_8 years', 'emp_length_9 years', 'emp_length_< 1 year'],
      dtype='object')

In [22]:
small_data_decision_tree_subset_20 = weighted_decision_tree_create(train_data, train_data.columns.drop('safe_loans').tolist(), target,example_weights,current_depth=1, max_depth=2)
                                                             

--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature home_ownership_RENT. (20514, 16710)
--------------------------------------------------------------------
Subtree, depth = 2 (20514 data points).
Split on feature grade_F. (19613, 901)
--------------------------------------------------------------------
Subtree, depth = 3 (19613 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 3 (901 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 2 (16710 data points).
Split on feature grade_D. (13315, 3395)
--------------------------------------------------------------------
Subtree, depth = 3 (13315 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 3 (3395 data points).
Stoppin

In [24]:
subset_20  = train_data.head(10).append(train_data.tail(10))

In [25]:
small_data_decision_tree_subset_20['splitting_feature']

'home_ownership_RENT'

In [26]:
evaluate_classification_error(small_data_decision_tree_subset_20, subset_20)

0.05

In [56]:
from math import log
from math import exp

def adaboost_with_tree_stumps(data, features, target, num_tree_stumps):
    # start with unweighted data
    alpha = np.ones(len(data))
    weights = []
    tree_stumps = []
    target_values = data[target]
    
    for t in range(num_tree_stumps):
        print ('=====================================================')
        print ('Adaboost Iteration %d' % t)
        print ('=====================================================')
        # Learn a weighted decision tree stump. Use max_depth=1
        tree_stump = weighted_decision_tree_create(data, features, target, data_weights=alpha, max_depth=1)
        tree_stumps.append(tree_stump)
        
        # Make predictions
        predictions =[classify(tree_stump,data.iloc[i]) for i in range(len(data))]
        
        # Produce a Boolean array indicating whether
        # each data point was correctly classified
        is_correct = predictions == target_values
        is_wrong   = predictions != target_values
        
        # Compute weighted error
        # YOUR CODE HERE
        weighted_error = np.sum(alpha[is_wrong])/np.sum(alpha)
        
        # Compute model coefficient using weighted error
        # YOUR CODE HERE
        print (weighted_error)
        weight = log(float(1.0-weighted_error)/weighted_error)*0.5
        weights.append(weight)
        
        # Adjust weights on data point
        adjustment = is_correct.apply(lambda is_correct : exp(-weight) if is_correct else exp(weight))
        
        # Scale alpha by multiplying by adjustment
        # Then normalize data points weights
        ## YOUR CODE HERE 
        alpha=np.multiply(alpha,adjustment)
        alpha=alpha/sum(alpha)
    
    return weights, tree_stumps

In [57]:
adaboost_with_tree_stumps(train_data, train_data.columns.drop('safe_loans').tolist(), target, num_tree_stumps=10)

Adaboost Iteration 0
--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 2 (9223 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (28001 data points).
Reached maximum depth. Stopping for now.
0.4216365785514722
Adaboost Iteration 1
--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature grade_A. (32094, 5130)
--------------------------------------------------------------------
Subtree, depth = 2 (32094 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (5130 data points).
Reached maximum depth. Stopping for now.
0.41249824891504494
Adaboost Iterat

([0.15802933659263743,
  0.17682363293635978,
  0.0931188897118565,
  0.07288885525865686,
  0.06706306914162666,
  0.0645691696162263,
  0.05456055779184845,
  0.043510936733712306,
  0.028988711500361326,
  0.01933343817058745],
 [{'is_leaf': False,
   'prediction': None,
   'splitting_feature': 'term_ 36 months',
   'left': {'splitting_feature': None, 'is_leaf': True, 'prediction': -1},
   'right': {'splitting_feature': None, 'is_leaf': True, 'prediction': 1}},
  {'is_leaf': False,
   'prediction': None,
   'splitting_feature': 'grade_A',
   'left': {'splitting_feature': None, 'is_leaf': True, 'prediction': -1},
   'right': {'splitting_feature': None, 'is_leaf': True, 'prediction': 1}},
  {'is_leaf': False,
   'prediction': None,
   'splitting_feature': 'grade_D',
   'left': {'splitting_feature': None, 'is_leaf': True, 'prediction': 1},
   'right': {'splitting_feature': None, 'is_leaf': True, 'prediction': -1}},
  {'is_leaf': False,
   'prediction': None,
   'splitting_feature': 'ho

In [109]:
def predict_adaboost(stump_weights, tree_stumps, data):
    scores = np.zeros(len(data))
    
    for i in range(len(tree_stumps)):
        predictions = [classify(tree_stumps[i],data.iloc[j]) for j in range(len(data))]
        
        # Accumulate predictions on scores array
        # YOUR CODE HERE
        scores+=predictions
    scores[scores>0]=1
    scores[scores<=0]=-1
    return scores

In [105]:
adaboost_30_weights,adaboost_30_stumps=adaboost_with_tree_stumps(train_data, train_data.columns.drop('safe_loans').tolist(), target, num_tree_stumps=30)

Adaboost Iteration 0
--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 2 (9223 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (28001 data points).
Reached maximum depth. Stopping for now.
0.4216365785514722
Adaboost Iteration 1
--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature grade_A. (32094, 5130)
--------------------------------------------------------------------
Subtree, depth = 2 (32094 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (5130 data points).
Reached maximum depth. Stopping for now.
0.41249824891504494
Adaboost Iterat

0.4898027334553226
Adaboost Iteration 14
--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature grade_E. (33815, 3409)
--------------------------------------------------------------------
Subtree, depth = 2 (33815 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (3409 data points).
Reached maximum depth. Stopping for now.
0.4927549622523402
Adaboost Iteration 15
--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature emp_length_4 years. (34593, 2631)
--------------------------------------------------------------------
Subtree, depth = 2 (34593 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (2631 data points).
Reached maximum depth. Stopping for now.
0.491235816

0.4950507541862053
Adaboost Iteration 28
--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 2 (9223 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (28001 data points).
Reached maximum depth. Stopping for now.
0.4942711461288952
Adaboost Iteration 29
--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature grade_C. (27812, 9412)
--------------------------------------------------------------------
Subtree, depth = 2 (27812 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (9412 data points).
Reached maximum depth. Stopping for now.
0.496467315454

In [110]:
error_all = []
for n in range(1, 31):
    predictions = predict_adaboost(adaboost_30_weights[:n], adaboost_30_stumps[:n], train_data)
    error = 1.0 - np.sum(train_data[target]==predictions)/len(predictions)
    error_all.append(error)
    print ("Iteration %s, training error = %s" % (n, error_all[n-1]))

Iteration 1, training error = 0.4216365785514722
Iteration 2, training error = 0.4340479260692026
Iteration 3, training error = 0.4000376101439931
Iteration 4, training error = 0.4050612508059317
Iteration 5, training error = 0.3862024500322373
Iteration 6, training error = 0.3843488072211476
Iteration 7, training error = 0.3842413496668816
Iteration 8, training error = 0.38292499462712226
Iteration 9, training error = 0.38292499462712226
Iteration 10, training error = 0.38292499462712226
Iteration 11, training error = 0.3853965183752418
Iteration 12, training error = 0.3863905007522028
Iteration 13, training error = 0.38730388996346443
Iteration 14, training error = 0.38829787234042556
Iteration 15, training error = 0.4007360842467226
Iteration 16, training error = 0.38829787234042556
Iteration 17, training error = 0.4020524392864818
Iteration 18, training error = 0.38956049860305175
Iteration 19, training error = 0.40465828497743395
Iteration 20, training error = 0.393160326670965
It

In [92]:
sc=np.array([0,0,0,1,0,11,1])

In [93]:
de=lambda x :1 if x>0 else -1
np.fromiter(de(sci) for sci in sc),sc.dtype,count=len(de))

SyntaxError: invalid syntax (<ipython-input-93-2d029050660c>, line 2)

In [99]:
x = np.array([1, 2, 3, 4, 5])
f = lambda x: 1 if x>0 else x=-1
squares = np.fromiter((f(xi) for xi in x), x.dtype, count=len(x))
squares

SyntaxError: can't assign to lambda (<ipython-input-99-e0f904a3fb33>, line 2)

In [None]:
x = np.array([1, 2, 3, 4, 5])
f = (lambda x: True if x>2 else x=0)
squares = np.fromiter((f(xi) for xi in x), x.dtype, count=len(x))