https://goo.gl/UdZoNr

In [1]:
import numpy as np

In [2]:
# Toy fruit dataset: every row holds four feature values followed by its label.
data = [
    ['Green',  3, 2, 0.01, 'Apple'],
    ['Yellow', 3, 1, 0.07, 'Apple'],
    ['Red',    1, 0, 0.02, 'Grape'],
    ['Red',    1, 5, 0.03, 'Grape'],
    ['Yellow', 3, 2, 0.01, 'Lemon'],
    ['White',  2, 1, 0.09, 'Guava'],
    ['Blue',   1, 4, 0.07, 'Berry'],
    ['Black',  1, 3, 0.02, 'Berry'],
]

# Column names, positionally aligned with the entries of each row in `data`.
header = ['Color', 'Dimention', 'Wideth', 'Size', 'Label']

print(header)
data

['Color', 'Dimention', 'Wideth', 'Size', 'Label']


[['Green', 3, 2, 0.01, 'Apple'],
 ['Yellow', 3, 1, 0.07, 'Apple'],
 ['Red', 1, 0, 0.02, 'Grape'],
 ['Red', 1, 5, 0.03, 'Grape'],
 ['Yellow', 3, 2, 0.01, 'Lemon'],
 ['White', 2, 1, 0.09, 'Guava'],
 ['Blue', 1, 4, 0.07, 'Berry'],
 ['Black', 1, 3, 0.02, 'Berry']]

In [3]:
def getUnique(rows, col):
    """Return the set of distinct values found at index `col` across `rows`."""
    return {row[col] for row in rows}
    
    
def labelCounts(rows):
    """Tally how often each label occurs in `rows`.

    The label of a row is its last element. Returns a dict mapping each
    label to its number of occurrences; keys appear in first-seen order.
    """
    tally = {}
    for row in rows:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    return tally


def isNumeric(value):
    """Return True when `value` is an instance of `int` or `float`."""
    return isinstance(value, (int, float))

In [4]:
class Question:
    """A yes/no test applied to a single column of a row.

    `col` is the index of the column the question inspects and `value`
    is the reference value the row's entry is compared against.
    """

    def __init__(self, col, value):
        self.col, self.value = col, value

    def match(self, row):
        """Return whether `row` satisfies this question.

        A numeric entry in the row is tested with `>=` against the
        reference value; any other entry is tested with `==`.
        """
        candidate = row[self.col]
        if not isNumeric(candidate):
            return candidate == self.value
        return candidate >= self.value

    def __repr__(self):
        # The column name is looked up in the module-level `header` list.
        operator = '>=' if isNumeric(self.value) else '=='
        return "Is %s %s %s?" % (header[self.col], operator, str(self.value))

In [5]:
def partition(rows, question):
    """Split `rows` by the answer of `question.match(row)`.

    Returns a `(true_rows, false_rows)` tuple of lists; each list keeps the
    rows in their original relative order.
    """
    matched, unmatched = [], []
    for row in rows:
        bucket = matched if question.match(row) else unmatched
        bucket.append(row)
    return matched, unmatched


def gini(rows):
    """Compute the Gini impurity of `rows`: 1 minus the sum of squared label shares.

    Label shares come from `labelCounts(rows)`. With no rows there are no
    labels to subtract, so the result is the starting value 1.
    """
    total = float(len(rows))
    impurity = 1
    for occurrences in labelCounts(rows).values():
        share = occurrences / total
        impurity -= share * share
    return impurity


def informationGain(left, right, currentUncertinity):
    """Return how much a split reduces impurity.

    The result is `currentUncertinity` minus the Gini impurity of the two
    partitions, each weighted by its share of the combined row count.
    """
    weight_left = len(left) / float(len(left) + len(right))
    weight_right = 1 - weight_left
    weighted_impurity = (weight_left * gini(left)) + (weight_right * gini(right))
    return currentUncertinity - weighted_impurity

In [6]:
def Buble_sorting(all_QuestionGain):
    """Return the (question, gain) pairs ordered from highest to lowest gain.

    Parameters
    ----------
    all_QuestionGain : iterable of (question, gain) pairs

    Returns
    -------
    list of (question, gain) tuples sorted by descending gain. Pairs with
    equal gain keep their original relative order.

    The input is not modified.
    """
    # The built-in sort is stable even with reverse=True, so ties stay in input
    # order. Sorting whole pairs keeps each question attached to its own gain;
    # looking questions up by equality can pair them with the wrong gain when
    # two questions compare equal.
    ranked = sorted(all_QuestionGain, key=lambda pair: pair[1], reverse=True)
    return [(question, gain) for question, gain in ranked]

In [7]:
def findBestSplit(rows):
    """Evaluate every candidate question on `rows` and pick the best one.

    A candidate is built for each distinct value of each feature column
    (every column except the last, which holds the label). Candidates that
    leave one side of the partition empty are discarded.

    Returns
    -------
    (bestGain, bestQuestion, Question_Gain)
        `Question_Gain` is the list of (question, gain) pairs as ordered by
        `Buble_sorting`; `bestQuestion` / `bestGain` are its first entry.
        When there is no usable candidate (including when `rows` is empty)
        the result is `(0, None, [])`.
    """
    # An empty dataset has nothing to split on; return the "no split" result
    # instead of failing on rows[0] below.
    if len(rows) == 0:
        return 0, None, []

    Question_Gain = []
    currentUncertinity = gini(rows)
    # The last column is the label, so it is not a feature.
    no_feature = len(rows[0]) - 1

    for col in range(no_feature):
        for val in {row[col] for row in rows}:
            question = Question(col, val)
            trueRows, falseRows = partition(rows, question)

            # A question that does not separate the rows is useless.
            if len(trueRows) == 0 or len(falseRows) == 0:
                continue

            gain = informationGain(trueRows, falseRows, currentUncertinity)
            Question_Gain.append((question, gain))

    if not Question_Gain:
        return 0, None, Question_Gain

    Question_Gain = Buble_sorting(Question_Gain)
    bestQuestion, bestGain = Question_Gain[0]
    return bestGain, bestQuestion, Question_Gain

In [8]:
class Leaf:
    """Terminal tree node that stores the label tally of the rows it received."""

    def __init__(self, rows):
        # `predict` is the dict produced by labelCounts: label -> row count.
        label_tally = labelCounts(rows)
        self.predict = label_tally
        
        
class DecisionNode:
    """Internal tree node: a question plus the subtrees for its two answers."""

    def __init__(self, question, trueBranch, falseBranch):
        # trueBranch / falseBranch are the subtrees followed when the question
        # matches / does not match a row.
        self.question, self.trueBranch, self.falseBranch = (
            question,
            trueBranch,
            falseBranch,
        )

In [9]:
def buildTree(rows):
    """Recursively grow a decision tree for `rows`.

    Returns a `Leaf` when the best available split has zero gain; otherwise
    a `DecisionNode` whose branches are built from the two partitions
    produced by the best question.
    """
    gain, question, _ = findBestSplit(rows)
    if gain == 0:
        return Leaf(rows)

    matched, unmatched = partition(rows, question)
    # Arguments are evaluated left to right: true branch first, then false.
    return DecisionNode(question, buildTree(matched), buildTree(unmatched))

In [10]:
def printTrees(node, spacing=''):
    """Print the tree rooted at `node`, one line per question / answer / leaf.

    `spacing` is the prefix accumulated from the ancestors; each level adds
    '|' before the question, '|-' before the answer headers and '|---' before
    the child subtree.
    """
    if isinstance(node, Leaf):
        print(spacing + 'Predict', node.predict)
        return

    question_prefix = spacing + '|'
    answer_prefix = question_prefix + '-'
    child_prefix = answer_prefix + '--'

    print(question_prefix + str(node.question))

    print(answer_prefix + '> True:')
    printTrees(node.trueBranch, child_prefix)

    print(answer_prefix + '> False:')
    printTrees(node.falseBranch, child_prefix)


In [11]:
# Score every candidate question on the full dataset and display the ranked list.
bestGain, bestQuestion, Question_Gain = findBestSplit(data)
Question_Gain

[(Is Color == Red?, 0.23958333333333348),
 (Is Dimention >= 2?, 0.21875),
 (Is Dimention >= 3?, 0.21458333333333335),
 (Is Wideth >= 3?, 0.16458333333333341),
 (Is Color == White?, 0.1383928571428571),
 (Is Size >= 0.09?, 0.1383928571428571),
 (Is Color == Yellow?, 0.11458333333333326),
 (Is Size >= 0.02?, 0.11458333333333326),
 (Is Color == Green?, 0.1026785714285714),
 (Is Color == Black?, 0.1026785714285714),
 (Is Color == Blue?, 0.1026785714285714),
 (Is Wideth >= 1?, 0.1026785714285714),
 (Is Wideth >= 5?, 0.1026785714285714),
 (Is Wideth >= 2?, 0.08125000000000016),
 (Is Size >= 0.07?, 0.08125000000000016),
 (Is Wideth >= 4?, 0.07291666666666674),
 (Is Size >= 0.03?, 0.03125)]

In [12]:
# Grow the tree from the full dataset, then print its structure.
myTree = buildTree(data)
printTrees(myTree)

|Is Color == Red?
|-> True:
|---Predict {'Grape': 2}
|-> False:
|---|Is Dimention >= 2?
|---|-> True:
|---|---|Is Color == White?
|---|---|-> True:
|---|---|---Predict {'Guava': 1}
|---|---|-> False:
|---|---|---|Is Color == Yellow?
|---|---|---|-> True:
|---|---|---|---|Is Wideth >= 2?
|---|---|---|---|-> True:
|---|---|---|---|---Predict {'Lemon': 1}
|---|---|---|---|-> False:
|---|---|---|---|---Predict {'Apple': 1}
|---|---|---|-> False:
|---|---|---|---Predict {'Apple': 1}
|---|-> False:
|---|---Predict {'Berry': 2}


In [13]:
def classify(row, node):
    """Walk `row` down the tree from `node` and return the reached leaf's `predict` dict.

    At each decision node the true branch is followed when the node's
    question matches the row, otherwise the false branch.
    """
    current = node
    while not isinstance(current, Leaf):
        if current.question.match(row):
            current = current.trueBranch
        else:
            current = current.falseBranch
    return current.predict
    
    
def printLeaf(counts):
    """Convert a label -> count dict into a label -> percentage-string dict.

    Each percentage is the label's share of the summed counts, rendered as
    `str(share * 100) + '%'`.
    """
    grand_total = sum(counts.values())
    return {
        label: str((occurrences / grand_total) * 100) + '%'
        for label, occurrences in counts.items()
    }

In [14]:
# Classify a complete feature row (Color, Dimention, Wideth, Size).
# Question.match indexes the row by column, so a row that omits features raises
# IndexError as soon as the tree asks about a missing column.
printLeaf(classify(['Green', 3, 2, 0.01], myTree))

{'Apple': '100.0%'}