# Decision Tree Classifier

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

### The Model

In [2]:
def get_num_mistakes(y):
    '''Count the errors made when every input is given the majority label.

    y : numpy array of class labels (any shape; it is flattened first).

    Returns a tuple (num_mistakes, majority_label). When several labels share
    the highest count, the first one in the Counter's iteration order wins.
    An empty y has nothing to misclassify, so (0, None) is returned.
    '''

    y = y.flatten()

    # Guard: max() over the values of an empty Counter raises ValueError.
    if y.size == 0:
        return 0, None

    label_counts = Counter(y)
    max_count = max(label_counts.values())
    # First majority class
    max_count_label = next(label for label, count in label_counts.items()
                           if count == max_count)
    # Every point that does not carry the majority label is a mistake.
    num_mistakes = int(y.size) - max_count
    return num_mistakes, max_count_label


def get_best_threshold(x, y):
    '''Find the binary split threshold on x with the least classification error.

    x : numpy array of real valued inputs
    y : numpy array of class labels aligned with x

    Candidate thresholds are the midpoints between consecutive distinct
    values of x. For each candidate, the points with x < threshold and the
    points with x >= threshold are each assigned their own majority label;
    the error is the fraction of points that end up misclassified.

    Returns (best_threshold, best_error). The first candidate reaching the
    lowest error is kept. If no candidate separates the data into two
    non-empty parts (e.g. fewer than two distinct values), (None, 2.0) is
    returned. It binary classifies the data; for multiclass classification
    the thresholding can be applied again and again.
    '''

    x = x.flatten()
    y = y.flatten()
    # No of Data Points
    m = x.shape[0]
    # Sentinel above any achievable error (errors are fractions in [0, 1]).
    best_error = 2.0
    best_threshold = None

    # Midpoints between equal neighbours add nothing new and, when the
    # duplicate is the minimum, leave the left part empty. Walking the
    # sorted distinct values avoids both.
    distinct_x = np.unique(x)

    for lower, upper in zip(distinct_x[:-1], distinct_x[1:]):

        threshold = (lower + upper) / 2.0
        left = x < threshold
        right = x >= threshold

        # Rounding can push the midpoint onto one of its neighbours; skip
        # any candidate that does not actually split the data.
        if not left.any() or not right.any():
            continue

        error = (get_num_mistakes(y[left])[0] +
                 get_num_mistakes(y[right])[0]) / float(m)

        if error < best_error:

            best_error = error
            best_threshold = threshold

    return best_threshold, best_error
    
    
def get_best_splitting_feature(data, target, features_list, is_categorical_list):
    '''Pick the feature whose split gives the least classification error.

    data : pandas dataframe containing training data
    target : target column name in data
    features_list : a list of column names used as features
    is_categorical_list : list of booleans, parallel to features_list, telling
                          whether the corresponding column is categorical

    Returns (best_feature, best_error, is_categorical_best_feature). If no
    feature scores an error below the starting sentinel of 2.0, the result
    is (None, 2.0, None).
    '''

    total_rows = len(data)
    # (feature, error, is_categorical) of the best candidate seen so far.
    winner = (None, 2.0, None)

    for name, categorical in zip(features_list, is_categorical_list):

        if categorical:
            # Categorical feature: one group per category, each predicting
            # its own majority label.
            mistakes = sum(get_num_mistakes(np.array(group[target]))[0]
                           for _, group in data.groupby(name))
            score = mistakes / float(total_rows)
        else:
            # Continuous feature: only the error of its best threshold is
            # needed here, not the threshold itself.
            score = get_best_threshold(np.array(data[name]),
                                       np.array(data[target]))[1]

        # Strict comparison keeps the earliest feature on ties.
        if score < winner[1]:
            winner = (name, score, categorical)

    return winner

In [3]:
# Keys of the dictionary that represents one tree node (see create_node).
# Child nodes of this node; None until create_decision_tree attaches a list.
children_str = 'children'
# Name of the feature this node splits on.
splitting_feature_str = 'splitting_feature'
# Category value that leads to this node; set on children of categorical splits.
split_value_str = 'split_value'
# Majority class label of the training rows at this node.
label_str = 'label'
# Fraction of the node's training rows that carry the majority label.
probability_str = 'probability'
# Whether the splitting feature is categorical.
is_categorical_str = 'is_categorical'


def create_node(y, children, splitting_feature, is_categorical, split_value):
    '''Build the dictionary describing a single tree node.

    y : numpy array with the class labels of the rows at this node
    children, splitting_feature, is_categorical, split_value : stored in the
        node as given

    The node also records the majority label of y and the fraction of y
    that carries that label.
    '''

    labels = y.flatten()
    mistakes, majority_label = get_num_mistakes(labels)

    node = {}
    node[children_str] = children
    node[splitting_feature_str] = splitting_feature
    node[split_value_str] = split_value
    node[is_categorical_str] = is_categorical
    node[label_str] = majority_label
    # Share of rows the majority label gets right.
    node[probability_str] = 1 - (mistakes / float(len(labels)))
    return node



def create_decision_tree(data, target, features_list, is_categorical_list,
                        current_depth=0, max_depth=10):
    '''Recursively build a decision tree as nested node dictionaries.

    data : pandas dataframe containing training data
    target : target column name in data
    features_list : a list of column names used as features
    is_categorical_list : list of booleans, parallel to features_list, telling
                          whether the corresponding column is categorical
    current_depth : depth of the node being built (0 for the root)
    max_depth : depth at which nodes are no longer split

    Returns the node dictionary for this subtree. A node becomes a leaf when
    all its rows share one label, no features remain, max_depth is reached,
    or no feature yields a split with two non-empty sides.

    Categorical splits create one child per category and drop the feature
    for the subtree. Continuous splits create two children (below / at or
    above the threshold), store the threshold under the 'threshold' key, and
    keep the feature available for deeper splits.
    '''

    # Work on copies so deletions below never leak into the caller's lists.
    current_features = features_list[:]
    current_categorical_list = is_categorical_list[:]
    current_node = create_node(np.array(data[target]), None, None, None, None)

    if current_node[probability_str] == 1 or not current_features or current_depth >= max_depth:

        return current_node

    feature, _, is_categorical = get_best_splitting_feature(data, target, current_features,
                                                            current_categorical_list)

    # No feature produced a usable split: keep this node as a leaf rather
    # than failing on current_features.index(None).
    if feature is None:

        return current_node

    children = []

    if is_categorical:

        ind = current_features.index(feature)
        del current_categorical_list[ind]
        del current_features[ind]

        for category, frame in data.groupby(feature):

            child = create_decision_tree(frame, target, current_features, current_categorical_list,
                                         current_depth + 1, max_depth)
            child[split_value_str] = category
            children.append(child)

    else:

        # get_best_threshold returns (threshold, error); only the threshold
        # is a valid operand for the comparisons below.
        threshold, _ = get_best_threshold(np.array(data[feature]), np.array(data[target]))

        if threshold is None:

            return current_node

        frame1 = data[data[feature] < threshold]
        frame2 = data[data[feature] >= threshold]

        # A one-sided split would recurse on an empty frame; stop here.
        if len(frame1) == 0 or len(frame2) == 0:

            return current_node

        current_node['threshold'] = threshold

        child1 = create_decision_tree(frame1, target, current_features, current_categorical_list,
                                      current_depth + 1, max_depth)
        child2 = create_decision_tree(frame2, target, current_features, current_categorical_list,
                                      current_depth + 1, max_depth)

        children.append(child1)
        children.append(child2)

    # Split metadata is recorded only once the split is known to be valid,
    # so nodes that fell back to a leaf above stay clean.
    current_node[splitting_feature_str] = feature
    current_node[is_categorical_str] = is_categorical
    current_node[children_str] = children

    return current_node


def classify(tree, x):
    '''Predict the class label of one example by walking down the tree.

    tree : node dictionary as built by create_decision_tree
    x : object indexable by feature name (e.g. a pandas Series for one row)

    Returns the label of the leaf that is reached. If a categorical split
    has no child for the example's value, the majority label of the current
    node is returned instead of silently returning None.
    '''

    children = tree[children_str]

    # Leaf: no children were attached (None or empty list).
    if not children:

        return tree[label_str]

    feature = tree[splitting_feature_str]

    if tree[is_categorical_str]:

        for child in children:

            if child[split_value_str] == x[feature]:

                return classify(child, x)

        # No child matches this category value; fall back to the best guess
        # available at this node.
        return tree[label_str]

    # Continuous split: children[0] holds values below the threshold,
    # children[1] the rest.
    if x[feature] < tree['threshold']:

        return classify(children[0], x)

    return classify(children[1], x)
        
        

def display_tree(tree, level=0):
    '''Print the tree, one node per line, indented by one tab per level.

    Each line shows the node's split value, its splitting feature and its
    label. Children are printed right after their parent, one level deeper.
    '''

    indent = '\t' * level
    # The '\b' is a backspace character placed before the colon, presumably
    # to cancel the separator space print() inserts after the split value.
    print(indent, tree[split_value_str], '\b:', tree[splitting_feature_str],
          '->', tree[label_str])

    # A leaf stores None (or an empty list) here, so nothing is iterated.
    for subtree in tree[children_str] or []:
        display_tree(subtree, level + 1)
    

In [4]:
# Columns used as model inputs.
features = [
    'grade',           # grade of the loan
    'term',            # the term of the loan
    'home_ownership',  # home_ownership status: own, mortgage or rent
    'emp_length',      # number of years of employment
]
# Column holding the class label to predict.
target = 'bad_loans'

In [5]:
# Read only the feature columns and the target column from the CSV file.
df = pd.read_csv('data/lending-club-data.csv', usecols=features + [target])
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,term,grade,emp_length,home_ownership,bad_loans
0,36 months,B,10+ years,RENT,0
1,60 months,C,< 1 year,RENT,1
2,36 months,C,10+ years,RENT,0
3,36 months,C,10+ years,RENT,0
4,36 months,A,3 years,RENT,0


In [6]:
# Every feature is flagged as categorical. A plain list of Python bools is
# used because the np.bool alias no longer exists in current NumPy releases.
tree = create_decision_tree(df, target, features, [True] * len(features))

In [7]:
# Predict the label for the row at position 10, passing only its feature columns.
classify(tree, df.iloc[10][features])

0