# Decision Tree


$${H} (X)=-\sum _{i=1}^{n}{{P} (x_{i})\log _{b} {P} (x_{i})}$$


In [9]:
import collections
from math import log

'''
Calculate Shannon Entropy

The higher the entropy, the more mixed up the data is.
'''
def calcShannonEnt(dataSet):
    """Return the base-2 Shannon entropy of the class labels in dataSet.

    The class label of each record is taken from its last element. An
    empty data set has no labels to tally and yields 0.0.
    """
    total = len(dataSet)
    # Tally how many records carry each class label.
    tally = collections.Counter(record[-1] for record in dataSet)
    entropy = 0.0
    for count in tally.values():
        p = count / float(total)
        entropy -= p * log(p, 2)
    return entropy
    

def splitDataSet(dataSet, axis, value):
    """Select the records whose element at index axis equals value.

    Each selected record is returned as a new list with the element at
    axis removed; the records in dataSet are left unmodified.
    """
    return [
        record[:axis] + record[axis + 1:]
        for record in dataSet
        if record[axis] == value
    ]
        

def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    Every record in dataSet is a list whose last element is the class
    label; all preceding elements are feature values. For each feature the
    data set is partitioned by that feature's distinct values, and the
    information gain is the entropy of the whole set minus the
    size-weighted entropy of the partitions.

    Returns -1 when dataSet is empty, has no feature columns, or no
    feature yields a strictly positive information gain.
    """
    if not dataSet:
        return -1
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    total = float(len(dataSet))
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = {example[i] for example in dataSet}
        # Weighted average entropy of the subsets produced by splitting on i.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / total
            newEntropy += prob * calcShannonEnt(subDataSet)
        # Compare only after every value of the feature has been accounted
        # for; a partial sum would overstate the gain.
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
        

def createDataSet():
    """Build the small sample data set used in the examples.

    Returns a (dataSet, labels) tuple: dataSet is a list of records, each
    holding two feature values followed by a class label, and labels holds
    the names of the two feature columns. Fresh lists are created on every
    call.
    """
    featureNames = ['no surfacing', 'flippers']
    rows = (
        (1, 1, 'yes'),
        (1, 1, 'yes'),
        (1, 0, 'no'),
        (0, 1, 'no'),
        (0, 1, 'no'),
    )
    return [list(row) for row in rows], featureNames

In [10]:
# Build the sample data set and evaluate the Shannon entropy of its class
# labels (the last element of every record).
myDat, labels = createDataSet()

calcShannonEnt(myDat)

In [15]:
# Five records with five different class labels: the labels are as mixed as
# they can be, so the entropy reaches log2(5).
# Note that this also binds the module-level name dataSet.
myDat = dataSet = [[1,1,'maybe'],
              [1,1,'yes'],
              [1,0,'no'],
              [0,1,'ok'],
              [0,1,'jo']]
calcShannonEnt(myDat)

2.321928094887362

In [17]:
# Four 'yes' labels and one 'no': the labels are far less mixed, so the
# entropy is lower than in the five-label example.
# Note that this also rebinds the module-level name dataSet.
myDat = dataSet = [[1,1,'yes'],
              [1,1,'yes'],
              [1,0,'yes'],
              [0,1,'yes'],
              [0,1,'no']]
calcShannonEnt(myDat)

0.7219280948873623

In [22]:
# Step through the opening lines of chooseBestFeatureToSplit by hand on a
# data set in which every feature value is distinct.
dataSet1 = [[1,2,'yes'],
              [3,4,'yes'],
              [5,6,'yes'],
              [7,8,'yes'],
              [9,10,'no']]
# Derive everything from dataSet1 itself rather than from a dataSet variable
# left over from an earlier cell.
numFeatures = len(dataSet1[0]) -1
baseEntropy = calcShannonEnt(dataSet1)
bestInfoGain = 0.0
bestFeature = -1
for i in range(numFeatures):
    # Column i of the data set; after the loop featList holds the values of
    # the last feature column.
    featList = [example[i] for example in dataSet1]

In [23]:
featList


[2, 4, 6, 8, 10]