https://goo.gl/UdZoNr

In [1]:
import pickle
import random
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
def getUnique(rows, col):
    """Return the set of distinct values found at position ``col`` of each row."""
    return {record[col] for record in rows}
    
    
def labelCounts(rows):
    """Tally how often each label occurs in ``rows``.

    The label of a row is its last element. The result is a plain dict
    mapping label -> number of rows carrying it, keyed in first-seen order.
    """
    tally = {}
    for record in rows:
        key = record[-1]
        tally[key] = tally.get(key, 0) + 1
    return tally


def isNumeric(value):
    """Return True when ``value`` is an ``int`` or ``float`` instance, else False."""
    return isinstance(value, (int, float))

In [3]:
class Question:
    """A yes/no test applied to one column of a row.

    ``col`` is the position of the tested column and ``value`` the reference
    value. Numeric row values are tested with ``>=``, everything else with
    ``==``.
    """

    def __init__(self, col, value):
        self.col = col
        self.value = value

    def match(self, row):
        """Return whether ``row`` answers this question with "yes"."""
        candidate = row[self.col]
        # The comparison operator is chosen from the type of the row's value.
        if not isNumeric(candidate):
            return candidate == self.value
        return candidate >= self.value

    def __repr__(self):
        # The column name comes from the module-level ``header`` list; the
        # displayed operator is chosen from the type of the stored value.
        condition = '>=' if isNumeric(self.value) else '=='
        return "Is %s %s %s?" % (header[self.col], condition, str(self.value))

In [4]:
def partition(rows, question):
    """Split ``rows`` by ``question``.

    Returns ``(true_rows, false_rows)``: the rows for which
    ``question.match(row)`` is truthy, and the remaining rows. Input order is
    preserved inside each list.
    """
    matched, unmatched = [], []
    for record in rows:
        bucket = matched if question.match(record) else unmatched
        bucket.append(record)
    return matched, unmatched


def gini(rows):
    """Return the Gini impurity of ``rows``: 1 minus the sum of squared label shares.

    The label of a row is its last element (see ``labelCounts``). With no
    rows there is nothing to subtract, so the result is 1.
    """
    total = float(len(rows))
    impurity = 1
    for occurrences in labelCounts(rows).values():
        share = occurrences / total
        # Subtract term by term so the floating-point result is stable.
        impurity -= share * share
    return impurity


def informationGain(left, right, currentUncertinity):
    """Return how much a split reduces impurity.

    The result is ``currentUncertinity`` minus the Gini impurity of the two
    child row sets, each weighted by its share of the combined row count.
    """
    n_left = len(left)
    weight_left = n_left / float(n_left + len(right))
    weight_right = 1 - weight_left
    children_impurity = (weight_left * gini(left)) + (weight_right * gini(right))
    return currentUncertinity - children_impurity


def sort_QuestionGain(all_QuestionGain):
    """Rank (question, gain) pairs by gain, keeping one question per distinct gain.

    Args:
        all_QuestionGain: iterable of ``(question, gain)`` pairs.

    Returns:
        A list of ``(question, gain)`` pairs sorted by gain in descending
        order. When several questions share the same gain, only the first one
        encountered is kept.
    """
    # A dict gives O(1) "seen this gain?" checks and lookups, replacing the
    # repeated list scans; setdefault keeps the first question for each gain.
    first_question_by_gain = {}
    for question, gain in all_QuestionGain:
        first_question_by_gain.setdefault(gain, question)

    return [
        (first_question_by_gain[gain], gain)
        for gain in sorted(first_question_by_gain, reverse=True)
    ]

In [5]:
def findBestSplit(rows):
    """Evaluate every candidate question on ``rows`` and rank them by gain.

    Every distinct value of every feature column (all columns except the
    last, which holds the label) is tried as a question. Questions that leave
    one side of the split empty are ignored.

    Returns:
        ``(bestGain, bestQuestion, Question_Gain)`` where ``Question_Gain`` is
        the list produced by ``sort_QuestionGain``. When no question splits
        the rows, the result is ``(0, None, [])``.
    """
    baseline = gini(rows)
    feature_count = len(rows[0]) - 1  # the last column is the label

    candidates = []
    for col in range(feature_count):
        for val in {row[col] for row in rows}:
            question = Question(col, val)
            matched, unmatched = partition(rows, question)
            if not matched or not unmatched:
                # This question does not actually divide the rows.
                continue
            candidates.append(
                (question, informationGain(matched, unmatched, baseline))
            )

    if not candidates:
        return 0, None, candidates

    ranked = sort_QuestionGain(candidates)
    top_question, top_gain = ranked[0]
    return top_gain, top_question, ranked

In [6]:
class Leaf:
    """Terminal tree node holding the label counts of the rows that reached it."""

    def __init__(self, rows):
        label_tally = labelCounts(rows)
        # ``predict`` is the label -> count dict returned by ``classify``.
        self.predict = label_tally
        
        
class DecisionNode:
    """Internal tree node: a question plus the subtrees for its two answers."""

    def __init__(self, question, trueBranch, falseBranch):
        self.question, self.trueBranch, self.falseBranch = (
            question,
            trueBranch,
            falseBranch,
        )

In [7]:
def buildTree(rows, nTree):
    """Recursively grow a decision tree over ``rows``.

    Args:
        rows: data rows whose last element is the label.
        nTree: when ``>= 0``, the root splits on the question ranked at that
            position in the gain-sorted candidate list instead of the best
            one. All deeper levels always use the best question.

    Returns:
        A ``Leaf`` when no question yields any gain, otherwise a
        ``DecisionNode``.
    """
    bestGain, bestQuestion, Question_Gain = findBestSplit(rows)
    if bestGain == 0:
        return Leaf(rows)

    # Only the first call may override the split; recursion passes -1.
    splitter = Question_Gain[nTree][0] if nTree >= 0 else bestQuestion

    matched, unmatched = partition(rows, splitter)
    return DecisionNode(
        splitter,
        buildTree(matched, -1),
        buildTree(unmatched, -1),
    )

In [8]:
def printTrees(node, spacing=''):
    """Print the tree rooted at ``node`` as indented text.

    ``spacing`` is the prefix accumulated from the enclosing levels. Each
    level prints its question, then the true branch, then the false branch.
    """
    if isinstance(node, Leaf):
        print(spacing + 'Predict', node.predict)
        return

    question_prefix = spacing + '|'
    branch_prefix = question_prefix + '-'
    child_prefix = branch_prefix + '--'  # both subtrees share this indent

    print(question_prefix + str(node.question))

    print(branch_prefix + '> True:')
    printTrees(node.trueBranch, child_prefix)

    print(branch_prefix + '> False:')
    printTrees(node.falseBranch, child_prefix)


In [9]:
def classify(row, node):
    """Walk ``row`` down the tree from ``node`` and return the reached leaf's counts.

    At each decision node the row follows the true branch when it matches the
    node's question and the false branch otherwise. The result is the
    ``predict`` dict of the ``Leaf`` where the walk ends.
    """
    current = node
    while not isinstance(current, Leaf):
        if current.question.match(row):
            current = current.trueBranch
        else:
            current = current.falseBranch
    return current.predict
    
    
def printLeaf(counts):
    """Convert a label -> count dict into a label -> percentage-string dict.

    Each value becomes the label's share of the total count, times 100,
    rendered with ``str`` and suffixed with ``'%'``. Nothing is printed.
    """
    grand_total = sum(counts.values())
    return {
        label: str((occurrences / grand_total) * 100) + '%'
        for label, occurrences in counts.items()
    }

In [10]:
def My_DecisionTree(X, Y, nTree, Tree_print=False):
    """Build a decision tree from features ``X`` and labels ``Y``.

    ``X`` and ``Y`` are merged into labelled rows with ``add_XY`` and handed
    to ``buildTree`` together with ``nTree``. When ``Tree_print`` is truthy
    the finished tree is also printed. Returns the tree's root node.
    """
    tree = buildTree(add_XY(X, Y), nTree)
    if Tree_print:
        printTrees(tree)
    return tree


def make_predict(x, tree):
    """Predict the label(s) for ``x`` using ``tree``.

    Args:
        x: a single feature row, or a sequence of feature rows (anything for
            which ``np.array(x).ndim > 1``).
        tree: root node as returned by ``buildTree`` / ``My_DecisionTree``.

    Returns:
        A list with one prediction per row when ``x`` holds several rows,
        otherwise a single prediction. A prediction is the label with the
        highest count in the reached leaf (the earliest-inserted label wins
        ties). Labels that parse as integers are returned as ``int``; any
        other label is returned unchanged.
    """
    def _predict_one(row):
        counts = classify(row, tree)
        # Pick the majority label of the leaf, not merely its first key.
        winner = max(counts, key=counts.get)
        try:
            return int(winner)
        except ValueError:
            # Non-numeric labels such as 'Apple' are returned as they are.
            return winner

    if np.array(x).ndim > 1:
        return [_predict_one(row) for row in x]
    return _predict_one(x)


def performance_report(y_test, y_pred, label_names=None, index=None, columns=None):
    """Report classification quality and return the accuracy.

    Args:
        y_test: true labels.
        y_pred: predicted labels.
        label_names: optional display names passed to
            ``classification_report`` as ``target_names``; when empty or
            omitted no report is printed.
        index: optional row labels for the confusion-matrix heatmap.
        columns: optional column labels for the confusion-matrix heatmap.

    Returns:
        The value of ``accuracy_score(y_test, y_pred)``.

    The heatmap is drawn when ``index`` or ``columns`` is non-empty. An
    axis left empty is passed to ``DataFrame`` as ``None`` instead of an
    empty list.
    """
    # None defaults avoid sharing mutable default lists between calls.
    label_names = [] if label_names is None else label_names
    index = [] if index is None else index
    columns = [] if columns is None else columns

    cMatrix = confusion_matrix(y_test, y_pred)

    has_index = len(index) > 0
    has_columns = len(columns) > 0  # previously never checked
    if has_index or has_columns:
        cm_df = pd.DataFrame(
            cMatrix,
            index=index if has_index else None,
            columns=columns if has_columns else None,
        )

        plt.figure(figsize=(5, 4))
        sns.heatmap(cm_df, annot=True)
        plt.title('Confusion Matrix')
        plt.ylabel('Actual Values')
        plt.xlabel('Predicted Values')
        plt.show()

    if len(label_names) > 0:
        print(classification_report(y_test, y_pred, target_names=label_names))

    return accuracy_score(y_test, y_pred)


def add_XY(X,Y):
    """Merge features and labels into rows of the form ``[*features, str(label)]``.

    Row ``i`` of the result holds the items of ``X[i]`` followed by
    ``str(Y[i])``. New lists are built; ``X`` and ``Y`` are not modified.
    """
    return [[*X[pos], str(Y[pos])] for pos in range(len(X))]

In [11]:
# Toy training set: each row is [color, dimension].
X_train = [
    ['Green', 3],
    ['Yellow', 3],
    ['Red', 1],
    ['Red', 1],
    ['Yellow', 3],
    ['White', 2],
    ['Blue', 1],
    ['Black', 1]
    ]

# One label per row of X_train, in the same order.
Y_train = ['Apple', 'Apple', 'Grape', 'Grape', 'Lemon', 'Guava', 'Berry', 'Berry']

# Column names; Question.__repr__ reads this global to print a question.
header = ['Color', 'Dimention', 'Label']
# NOTE(review): 'Mango' does not occur in Y_train and 'Berry' is missing here;
# confirm this list before passing it to performance_report as label_names.
label_names = ['Apple','Grape','Mango', 'Lemon', 'Guava']


# Displayed as the cell result in the notebook: (rows, features per row).
np.array(X_train).shape

(8, 2)

In [12]:
# Keep only the third result of findBestSplit: the gain-sorted list of
# (question, gain) pairs for the first split of the toy dataset.
_,_,all_information_gain = findBestSplit(add_XY(X_train, Y_train))

# Displayed as the cell result in the notebook.
all_information_gain

[(Is Color == Red?, 0.23958333333333348),
 (Is Dimention >= 2?, 0.21875),
 (Is Dimention >= 3?, 0.21458333333333335),
 (Is Color == White?, 0.1383928571428571),
 (Is Color == Yellow?, 0.11458333333333326),
 (Is Color == Blue?, 0.1026785714285714)]

In [13]:
# Build one tree per ranked first-split question: tree i uses the i-th entry
# of all_information_gain at the root and is printed as it is built.
trees = []
for i in range(len(all_information_gain)):
    print('========================================= Tree ', i+1, '=========')
    print('===========================================================')
    trees.append(My_DecisionTree(X_train, Y_train, i, True))
    

|Is Color == Red?
|-> True:
|---Predict {'Grape': 2}
|-> False:
|---|Is Dimention >= 2?
|---|-> True:
|---|---|Is Color == White?
|---|---|-> True:
|---|---|---Predict {'Guava': 1}
|---|---|-> False:
|---|---|---|Is Color == Yellow?
|---|---|---|-> True:
|---|---|---|---Predict {'Apple': 1, 'Lemon': 1}
|---|---|---|-> False:
|---|---|---|---Predict {'Apple': 1}
|---|-> False:
|---|---Predict {'Berry': 2}
|Is Dimention >= 2?
|-> True:
|---|Is Color == White?
|---|-> True:
|---|---Predict {'Guava': 1}
|---|-> False:
|---|---|Is Color == Yellow?
|---|---|-> True:
|---|---|---Predict {'Apple': 1, 'Lemon': 1}
|---|---|-> False:
|---|---|---Predict {'Apple': 1}
|-> False:
|---|Is Color == Red?
|---|-> True:
|---|---Predict {'Grape': 2}
|---|-> False:
|---|---Predict {'Berry': 2}
|Is Dimention >= 3?
|-> True:
|---|Is Color == Yellow?
|---|-> True:
|---|---Predict {'Apple': 1, 'Lemon': 1}
|---|-> False:
|---|---Predict {'Apple': 1}
|-> False:
|---|Is Color == Red?
|---|-> True:
|---|---Predict