In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sc
from numpy.random import randn
import seaborn as sns
from scipy import stats

%matplotlib inline

In [2]:
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, cross_val_score

from sklearn.metrics import accuracy_score 

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [3]:
import os
import os.path

In [17]:
import math

In [18]:
def entropy_func(cls_cnt, all_cnt):
    return -(cls_cnt*1.0 /all_cnt)*math.log(cls_cnt*1.0 /all_cnt,2)

In [19]:
def get_entropy_cls(cl1_cnt,cl2_cnt):
    if cl1_cnt == 0 or cl2_cnt == 0:
        return 0
    return entropy_func(cl1_cnt, cl2_cnt + cl1_cnt) + entropy_func(cl2_cnt, cl2_cnt + cl1_cnt)

In [20]:
def get_entropy_one_division(division):
    n = len(division)
    classes = set(division)
    s = 0
    for c in classes:
        cl_cnt = np.sum(division == c)
        s += cl_cnt * 1.0/ n * get_entropy_cls(np.sum(division == c),np.sum(division != c))
    return s, n

In [21]:
def get_entropy(y_pred, y):
    
    if len(y_pred) != len(y):
        print("y_pred and y must be the same length")
        return None
    n = len(y)
    s_true, n_true = get_entropy_one_division(y[y_pred])
    s_false, n_false  = get_entropy_one_division(y[~y_pred])
    s = n_true * 1.0 / n * s_true + n_false * 1.0 / n * s_false
    return s

In [43]:
class DecisionTreeClassifier(object):
    def __init__(self, max_depth, min_node_records=10):
        self.depth = 0
        self.max_depth = max_depth
        self.min_node_records = min_node_records
        
    def find_best_split(self, col, y):
        min_entropy = 10
        for value in col:
            y_pred = col < value
            entr = get_entropy(y_pred,y)
            if entr < min_entropy:
                min_entropy = entr
                cutoff = value
        return min_entropy, cutoff
    
    def find_best_split_all(self, x, y):
        min_entr = 1
        col = None
        cutoff = None
        
        for i, c in enumerate(x.T):
            if len(c) < self.min_node_records:
                print("Column have fewer than min node records")
                return None,0,0            
            entr, cur_cutoff = self.find_best_split(c, y)
            if entr == 0:
                return i, cur_cutoff, entr
            if entr < min_entr:
                min_entr = entr
                cutoff = cur_cutoff
                col = i
        return col, cutoff, entr
    
    def fit(self, x, y, par_node = {}, depth = 0):
        if par_node is None:
            return None
        elif len(y) == 0:
            return None
        elif depth > self.max_depth:
            return None
        elif self.all_same(y):
            return {'val':y[0]}
        else:
            col, cutoff, entr = self.find_best_split_all(x,y)
            if not col: 
                return None
            y_left = y[x[:, col] < cutoff]
            y_right = y[x[:,col] >= cutoff]
            par_node = {'col': iris.feature_names[col], 'index_col':col,
                    'cutoff':cutoff,
                   'val': np.round(np.mean(y))}   
        
        par_node['left'] = self.fit(x[x[:, col] < cutoff], y_left, {}, depth+1)   
        
        par_node['right'] = self.fit(x[x[:, col] >= cutoff], y_right, {}, depth+1)  
        self.depth += 1  
        self.trees = par_node  
        return par_node
    
    def all_same(self, items):
        return all(x == items[0] for x in items)
    
    def predict(self,x):
        res = np.array([0]*len(x))
        
        for i,row in enumerate(x):
            res[i] = self._predict(row)
            
        return res
    
    def _predict(self, row):
        tree = self.trees
        while tree.get('cutoff'):
            if row[tree['index_col']] < tree['cutoff']:
                if not tree['left']:
                    return tree.get('val')
                tree = tree['left']
                 
            else:
                if not tree['right']:
                    return tree.get('val')
                tree = tree['right']
        else:   
            return tree.get('val')
            
            
    
                

In [44]:
from sklearn.datasets import load_iris
from pprint import pprint
from sklearn.model_selection import train_test_split


iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split( iris.data, iris.target, test_size=0.3, random_state=121)


clf = DecisionTreeClassifier(max_depth=7)
m = clf.fit(X_train, y_train)

pprint(m)

Column have fewer than min node records
Column have fewer than min node records
{'col': 'petal length (cm)',
 'cutoff': 3.0,
 'index_col': 2,
 'left': {'val': 0},
 'right': {'col': 'petal width (cm)',
           'cutoff': 1.8,
           'index_col': 3,
           'left': {'col': 'petal length (cm)',
                    'cutoff': 5.6,
                    'index_col': 2,
                    'left': {'col': 'petal width (cm)',
                             'cutoff': 1.5,
                             'index_col': 3,
                             'left': {'val': 1},
                             'right': {'col': 'sepal width (cm)',
                                       'cutoff': 2.7,
                                       'index_col': 1,
                                       'left': None,
                                       'right': {'val': 1},
                                       'val': 1.0},
                             'val': 1.0},
                    'right': {'val': 2},
          

In [45]:
y_pred = clf.predict(X_test)

In [49]:
def simple_accuracy(y_pred, y_test):
    return sum(y_pred == y_test) / len(y_pred)

In [50]:
simple_accuracy(y_pred, y_test)

0.98