In [24]:
import numpy as np
import pandas as pd

In [25]:
# Read the loan records from the CSV file into a DataFrame.
loans = pd.read_csv(filepath_or_buffer='../data/lending-club-data.csv')

### Adding safe loans

In [26]:
# Re-encode the target column: rows whose 'bad_loans' value is 0 get +1,
# every other row gets -1.  np.where evaluates the comparison on the whole
# column at once instead of calling a Python lambda for each row.
loans['safe_loans'] = np.where(loans['bad_loans'] == 0, 1, -1)

In [27]:
# Remove the 'bad_loans' column; drop() returns a new frame, which is
# rebound to `loans`.
loans = loans.drop(labels=['bad_loans'], axis=1)

### One-hot encoding

In [28]:
# Columns used as model inputs.
features = [
    'grade',           # grade of the loan
    'term',            # the term of the loan
    'home_ownership',  # home ownership status: own, mortgage or rent
    'emp_length',      # number of years of employment
]
# Name of the label column.
target = 'safe_loans'
# Keep only the input columns plus the label.  The explicit copy makes
# `loans` an independent frame, so assigning new columns to it later does
# not raise SettingWithCopyWarning.
loans = loans[features + [target]].copy()

In [29]:
# One-hot encode every feature: for each distinct value of a feature, add a
# 0/1 indicator column named after that value.
# NOTE: the new column is keyed by the raw value alone, so two features that
# share a value would overwrite each other's indicator column.
for feature in features:
    for value in loans[feature].unique():
        # Vectorized comparison over the whole column (no per-row lambda);
        # the boolean mask is stored as int64 0/1.
        loans[value] = (loans[feature] == value).astype(np.int64)

### Removing old features

In [30]:
# The new feature list is every column of `loans` except the previous
# feature columns and the target.  It is built from a set difference, so the
# resulting order is arbitrary.
features = list(set(loans.columns) - set(features + [target]))

In [31]:
# Keep only the columns listed in `features`, followed by the target column.
loans = loans.loc[:, features + [target]]

In [32]:
# Display the first five rows of the frame.
loans.head(n=5)

Unnamed: 0,5 years,n/a,36 months,OWN,10+ years,6 years,9 years,< 1 year,1 year,7 years,...,F,60 months,OTHER,RENT,3 years,4 years,MORTGAGE,8 years,2 years,safe_loans
0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0,0,0,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,-1
2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,1


In [33]:
# Load the train / validation index files.  Column 0 of each resulting frame
# is used for positional row selection further down.
train_id = pd.read_json(path_or_buf='../data/module-8-assignment-1-train-idx.json')
validation_id = pd.read_json(path_or_buf='../data/module-8-assignment-1-validation-idx.json')

In [34]:
# Select rows of `loans` by position, using column 0 of each index frame:
# first the training rows, then the validation rows.
data_train, data_validation = (
    loans.iloc[positions[0]] for positions in (train_id, validation_id)
)

In [35]:
# Report (rows, columns) of each split.  A single pre-formatted argument in
# parentheses prints the same text on Python 2 and Python 3; the two spaces
# after the colon reproduce the output of the old `print a, b` statement.
print("Data train :  {}".format(data_train.shape))
print("Data validation :  {}".format(data_validation.shape))

Data train :  (37219, 26)
Data validation :  (9284, 26)


### Weighted mistake of an intermediate node

In [36]:
def intermediate_node_weighted_mistake(labels_in_node, data_weights):
    """Return the lowest weighted mistake of a constant prediction in a node.

    Two constant predictions are compared: labelling every point -1 (which
    misclassifies all +1 points) and labelling every point +1 (which
    misclassifies all -1 points).  The mistake of each is the total weight of
    the points it gets wrong.

    Parameters
    ----------
    labels_in_node : array-like
        Labels of the points in the node; compared element-wise against +1
        and -1 to build boolean masks.
    data_weights : array-like
        Per-point weights, indexed with the boolean masks above (so it must
        line up element-for-element with ``labels_in_node``).

    Returns
    -------
    tuple
        ``(min_weight, min_label)``: the smaller of the two weighted mistakes
        and the label (-1 or +1) whose constant prediction achieves it.
        Ties are resolved in favour of +1.
    """
    # Predicting -1 everywhere is wrong exactly on the +1 points.
    weighted_mistake_all_negative = np.sum(data_weights[labels_in_node == +1])
    # Predicting +1 everywhere is wrong exactly on the -1 points.
    weighted_mistake_all_positive = np.sum(data_weights[labels_in_node == -1])

    # Only a strictly smaller mistake selects -1; equality falls through to +1.
    if weighted_mistake_all_negative < weighted_mistake_all_positive:
        return weighted_mistake_all_negative, -1
    return weighted_mistake_all_positive, +1
    

# Quiz 1:

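##    With all data weights equal to 1, the weight of mistakes is the number of misclassified points, i.e. N × classification error (it equals the classification error only after dividing by N)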

### Choosing the best splitting feature

In [38]:
# Sanity check: the -1 points weigh 1 + 2 = 3 and the +1 points weigh
# 0.5 + 1 + 1 = 2.5, so predicting -1 everywhere is the cheaper choice and
# the expected result is (2.5, -1).
example_labels = np.array([-1, -1, 1, 1, 1])
example_data_weights = np.array([1., 2., .5, 1., 1.])
# print(...) with one parenthesised argument behaves the same on Python 2 and 3.
if intermediate_node_weighted_mistake(example_labels, example_data_weights) == (2.5, -1):
    print('Test passed!')
else:
    print('Test failed... try again!')

Test passed!
