In [1]:
import numpy as np
import pandas as pd

## Weather Data

This is a toy dataset of weather data and its relationship with playing golf.

In [32]:
data = pd.DataFrame(
{"Outlook":["R","R", "O", "S", "S", "S","O", "R", "R", "S"],
 "Temperature": ["H", "H", "H", "M", "C", "C", "C", "M", "C", "M"],
 "Humidity": ["H", "H", "H", "H", "N", "N", "N", "H", "N", "N"],
 "Windy": ["F", "T", "F", "F", "F", "T", "T", "F", "F", "F"],
 "Play_Golf": ["N", "N", "Y", "Y", "Y", "N", "Y", "N", "Y", "Y"]
}
)

data["Outlook"] = data["Outlook"].map({"R":0, "O": 1, "S": 2})
data["Temperature"] = data["Temperature"].map({"C":0, "M": 1, "H": 2})
data["Humidity"] = data["Humidity"].map({"N":0, "H": 1})
data["Windy"] = data["Windy"].map({"F":0, "T": 1})
data["Play_Golf"] = data["Play_Golf"].map({"N":0, "Y": 1})

data.head(n=10)

Unnamed: 0,Humidity,Outlook,Play_Golf,Temperature,Windy
0,1,0,0,2,0
1,1,0,0,2,1
2,1,1,1,2,0
3,1,2,1,1,0
4,0,2,1,0,0
5,0,2,0,0,1
6,0,1,1,0,1
7,1,0,0,1,0
8,0,0,1,0,0
9,0,2,1,1,0


## Computing Information Gain for Weather Data

Here we would manually compute information gain for different features. We will also validate that our computation is correct by computing information gain for two edge cases (i.e. a pure split, random split). We will see that a pure split gives an `information gain=1` and random split gives `information gain=0`.

In [45]:

def compute_infogain(data, feature, label_col_name):
    """
    This function computes the information gain
    IG(Y|X) = H(Y) - H(Y|X)
    data: A pd.DataFrame container all data
    grouped_data: A groupby object that groups Play_GOLF value counts by some feature
    """

    grouped_data = data.groupby(feature)[label_col_name].value_counts()
    
    h_y = compute_entropy(data[label_col_name])
    h_y_given_x = 0
    #print(grouped_data)
    for k in grouped_data.keys():
        # k is a tuple, which has the feature index followed by label index (groupby object)
        k_f, k_y = k
        h_y_given_x += (grouped_data[k_f][k_y].sum()*1.0/data.shape[0])* compute_entropy_with_counts(grouped_data[k_f])
    
    return h_y - h_y_given_x
    
def compute_entropy(ser):
    """
    This function computes the entropy
    """
    total = 0
    counts = ser.value_counts()
    for k in ser.unique():
        #print(k)
        total += -(counts[k]*1.0/ser.shape[0])*np.log2(counts[k]*1.0/ser.shape[0])
        
    return total

def compute_entropy_with_counts(data_counts):
    """
    This function computes the entropy
    """
    total = 0
    for k in data_counts:
        total += -(k*1.0/data_counts.sum())*np.log2(k*1.0/data_counts.sum())
       
    return total

## Making sure computations are correct

## The answer should be 1.0 because the data will be perfectly split to 0s and 1s
## by splitting according to x
print('Checking information gain with toy data')
toy_data = pd.DataFrame({'x':[0,0,0,0,1,1,1,1], 'y':[0,0,0,0,1,1,1,1]})
print('\tInformation Gain for Toy data is: Actual ({}) Expected ({})'.format(
    compute_infogain(toy_data, 'x', "y"), 1.0
))

## The answer should be 0.0 because the data will be random after splitting according to x
toy_data = pd.DataFrame({'x':[1,0,1,0,1,0,1,0], 'y':[0,0,0,0,1,1,1,1]})
print('\tInformation Gain for Toy data is: Actual ({}) Expected ({})\n'.format(
    compute_infogain(toy_data, 'x', "y"), 0.0
))

print('Checking information gain with actual data')
print('\tEntropy for Play_Golf is: {}\n'.format(compute_entropy(data["Play_Golf"])))

for col in ["Outlook", "Temperature", "Humidity", "Windy"]:
    print('\tInformation gain for feature {} is : {}'.format(col, compute_infogain(data, col, "Play_Golf")))

Checking information gain with toy data
	Information Gain for Toy data is: Actual (1.0) Expected (1.0)
	Information Gain for Toy data is: Actual (0.0) Expected (0.0)

Checking information gain with actual data
	Entropy for Play_Golf is: 0.9709505944546686

	Information gain for feature Outlook is : 0.3219280948873623
	Information gain for feature Temperature is : 0.0954618442383216
	Information gain for feature Humidity is : 0.12451124978365313
	Information gain for feature Windy is : 0.09127744624168


## Computing Tree

Let us now create a decision tree, which uses information gain to find the best split. The algorithm is as follows:

1. Find a feature to split data (based on the information gain)
2. Split/Partition data into `n` sets depending on the unique values
3. For each partition
  * If termination condition is not met
    * Repeat step 1
    * Repeat step 2
    
    
Following code is adopted from [this repository](https://github.com/random-forests/tutorials/blob/master/decision_tree.ipynb). This code is a more general version, where the tree can have an arbitrary number of children.

### Tree related Objects

In [158]:
class DecisionNode:

    def __init__(self, feature, value, data=None, children=None):
        self.feature = feature
        self.value = value
        self.data = data
        self.children = children

    def set_parent(self, parent):
        self.parent = parent

    def set_data(self, data):
        self.data = data
        
class LeafNode:
    
    def __init__(self, data, label_column):
        self.prediction = data[label_column].value_counts().idxmax()

### Tree related helper functions

In [161]:
def partition_data(feature, values, data_to_split):
    """
    Splits the data such that each dataframe in the returned list 
    has the same value for the selected feature
    """
    data_splits = {}
    for v in values:
        true_data = data_to_split.loc[data_to_split[feature]==v,:]
        data_splits[v] = true_data
    return data_splits


def find_best_feature(data_to_split, features, label_column):
    """
    This function finds the best feature to split that maximizes
    the information gain for a given dataframe
    """
    #feature_infogain_tuples = []
    max_feature, max_ig = None, 0.0
    for f in features:
        #partitions = partition_data(f, set(data_to_split[f].tolist()), data_to_split)
        ig = compute_infogain(data_to_split, f, 'Play_Golf')
        if ig > max_ig:
            max_feature = f
            max_ig = ig
        
    return max_feature, max_ig
     

### Recursive tree building method

In [None]:
def build_tree(data_to_split, best_feature=None, value=None, min_leaf_count=3, ig_tol=1e-3):
    """
    This Function computes the sub tree recursively. This is a more general tree model
    where there can be arbitrary number of children for a given node
    """
    
    label_column = "Play_Golf"
    
    # Termination condition (minimum count on leaf)
    if data_to_split.shape[0]<=3:
        print('Too little data. Terminating growth ...')
        children = [LeafNode(data_to_split, label_column)]
        return DecisionNode(best_feature, value, data_to_split, children=children)
    
    # Finding the next best feature to split the data on
    next_best_feature, infogain = find_best_feature(data_to_split, features, "Play_Golf")
    feature_unique_values = list(set(data_to_split[next_best_feature].tolist()))
    
    print('Choosing {} as the best feature with {} information gain'.format(next_best_feature, infogain))
    
    # Termination condition (minimum information gain)
    if infogain < ig_tol:
        print('Too little information gain. Terminating growth ...')
        return DecisionNode(best_feature, value, data_to_split, children=children)
    
    # Partition the data according to the selected features values
    parts_dict = partition_data(next_best_feature, feature_unique_values, data_to_split)
    
    # For each partition create a child, where child recursively calls build_tree
    children = []
    for attr, p in parts_dict.items():
        print('\tCreating child node {}={} having {} data points...'.format(next_best_feature, attr, p.shape[0]))
        children.append(build_tree(p, next_best_feature, attr, min_leaf_count, ig_tol))
        
    # Return the node
    return DecisionNode(best_feature, value, data_to_split, children=children)

## Running, building the tree

In [162]:
# Returns the root node
my_tree = build_tree(data)

Choosing Outlook as the best feature with 0.3219280948873623 information gain
	Creating child node Outlook=0 having 4 data points...
Choosing Temperature as the best feature with 0.8112781244591328 information gain
	Creating child node Temperature=0 having 1 data points...
Too little data. Terminating growth ...
	Creating child node Temperature=1 having 1 data points...
Too little data. Terminating growth ...
	Creating child node Temperature=2 having 2 data points...
Too little data. Terminating growth ...
	Creating child node Outlook=1 having 2 data points...
Too little data. Terminating growth ...
	Creating child node Outlook=2 having 4 data points...
Choosing Windy as the best feature with 0.8112781244591328 information gain
	Creating child node Windy=0 having 3 data points...
Too little data. Terminating growth ...
	Creating child node Windy=1 having 1 data points...
Too little data. Terminating growth ...


## Printing the tree

In [166]:
print('Printing the decision tree ...')
def print_tree(node, spacing=''):
    """
    This function recursively prints the tree with indentation
    """
    spacing += ' '
    
    if isinstance(node, LeafNode):
        print(spacing, 'Leaf Prediction: ', node.prediction)
        return 
    
    
    print(spacing, "Node",node.feature, '=', node.value)
    for c in node.children:
        print_tree(c, spacing)


print_tree(my_tree)

Printing the decision tree ...
  Node None = None
   Node Outlook = 0
    Node Temperature = 0
     Leaf Prediction:  1
    Node Temperature = 1
     Leaf Prediction:  0
    Node Temperature = 2
     Leaf Prediction:  0
   Node Outlook = 1
    Leaf Prediction:  1
   Node Outlook = 2
    Node Windy = 0
     Leaf Prediction:  1
    Node Windy = 1
     Leaf Prediction:  0
