# Decision Trees

## Materials: 
- For comparison between different ensemble model approaches, ie. bagging, boosting and stacking: https://towardsdatascience.com/ensemble-methods-bagging-boosting-and-stacking-c9214a10a205

In [2]:
from typing import NamedTuple, Optional

class Candidate(NamedTuple):
    leve: str
    lang: str
    tweets: bool
    phd: bool
    did_well: Optional[bool] = None

inputs = [Candidate('Senior', 'Java',   False, False, False),
          Candidate('Senior', 'Java',   False, True,  False),
          Candidate('Mid',    'Python', False, False, True),
          Candidate('Junior', 'Python', False, False, True),
          Candidate('Junior', 'R',      True,  False, True),
          Candidate('Junior', 'R',      True,  True,  False),
          Candidate('Mid',    'R',      True,  True,  True),
          Candidate('Senior', 'Python', False, False, False),
          Candidate('Senior', 'R',      True,  False, True),
          Candidate('Junior', 'Python', True,  False, True),
          Candidate('Senior', 'Python', True,  True,  True),
          Candidate('Mid',    'Python', False, True,  True),
          Candidate('Mid',    'Java',   True,  False, True),
          Candidate('Junior', 'Python', False, True,  False)
         ]

In [32]:
# define the model data structure and how to use the model (predict method)

from typing import NamedTuple, Union, Any


class NonLeaf(NamedTuple):
    attribute: str
    children: dict
    default_value: Any = None

class Leaf(NamedTuple):
    value: Any

Model = Union[Leaf, NonLeaf]

def predict(input:Candidate, model: Model) -> Any:
    if isinstance(model, Leaf):
        return model.value
    else:
        key = getattr(input, model.attribute)
        if key in model.children:
            sub_model = model.children[key]
            return predict(input, sub_model)
        else:
            return model.default_value

In [51]:
# train and construct the model
from typing import List, Dict, Set
from collections import Counter
import math

def partition(inputs:List[Candidate], attri:str) -> Dict[Any, List[Candidate]]:
    partitions = {}
    for candidate in inputs:
        attri_value = getattr(candidate, attri)
        if attri_value in partitions:
            partitions[attri_value].append(candidate)
        else:
            partitions[attri_value] = [candidate]
    return partitions

def entropy_of_data(inputs:List[Candidate], label:str) -> float:
    count = len(inputs)
    counter = Counter([getattr(candidate, label) for candidate in inputs])
    return sum([c/count*math.log(c/count) for c in [item[1] for item in counter.items()]])

def entropy_of_partitions(partitions:Dict[Any, List[Candidate]], label:str) -> float:
    count = sum([len(l) for _,l in partitions.items()])
    return sum([len(l)/count * entropy_of_data(l, label) for _, l in partitions.items()])

def inner_train(inputs:List[Candidate], label: str, ignored_key: Set[str]) -> Model:
    label_counts = Counter([getattr(candidate, label) for candidate in inputs])
    most_common = label_counts.most_common()[0][0]
    if len(label_counts) == 1:
        return Leaf(most_common)
    
    attributes = [attri for attri in Candidate._fields if (attri not in ignored_key)]
    lowest_entropy_sofar = 2 # entroy will never > 1
    winner = None
    winner_partitions = None
    for attri in attributes:
        partitions = partition(inputs, attri)
        entropy = entropy_of_partitions(partitions, label)
        if entropy < lowest_entropy_sofar:
            lowest_entropy_sofar = entropy
            winner = attri
            winner_partitions = partitions

    if winner:
        # we got a further partition
        updated_ignored_keys = ignored_key + [winner]
        children = {k: inner_train(v, label, updated_ignored_keys) for k,v in winner_partitions.items()}
        return NonLeaf(winner, children, most_common)
    else:
        # no further partiton, return most common labels
        return Leaf(most_common)

def train(inputs:List[Candidate], label:str) -> Model:
    return inner_train(inputs, label, ['did_well'])
    
    

In [52]:
model = train(inputs, 'did_well')
print(model)

NonLeaf(attribute='phd', children={False: NonLeaf(attribute='lang', children={'Java': NonLeaf(attribute='leve', children={'Senior': Leaf(value=False), 'Mid': Leaf(value=True)}, default_value=False), 'Python': NonLeaf(attribute='tweets', children={False: NonLeaf(attribute='leve', children={'Mid': Leaf(value=True), 'Junior': Leaf(value=True), 'Senior': Leaf(value=False)}, default_value=True), True: Leaf(value=True)}, default_value=True), 'R': Leaf(value=True)}, default_value=True), True: NonLeaf(attribute='tweets', children={False: NonLeaf(attribute='lang', children={'Java': Leaf(value=False), 'Python': NonLeaf(attribute='leve', children={'Mid': Leaf(value=True), 'Junior': Leaf(value=False)}, default_value=True)}, default_value=False), True: NonLeaf(attribute='lang', children={'R': NonLeaf(attribute='leve', children={'Junior': Leaf(value=False), 'Mid': Leaf(value=True)}, default_value=False), 'Python': Leaf(value=True)}, default_value=True)}, default_value=False)}, default_value=True)


In [53]:
assert not predict(Candidate('Junior', 'Java', True, False), model)
assert predict(Candidate('Intern', 'Java', True, True), model)