In [1]:
import numpy as np 

In [15]:
class Node:
    '''
    One node of a binary decision tree.

    Every attribute starts out unset; the tree builder fills them in while
    it grows the tree.
    '''

    def __init__(self):
        # Child links; both remain None for a node that has not been split.
        self.right = self.left = None

        # Split rule: the feature column and the threshold it is compared against.
        self.column = self.threshold = None

        # Per-class probabilities for the samples that reach this node.
        self.probas = None

        # Depth of this node inside the tree.
        self.depth = None

        # True once the node is a leaf, i.e. it will not be split any further.
        self.is_terminal = False

In [21]:
class DecisionTreeClassifier2:
    '''
    Binary decision tree classifier grown greedily on Gini-impurity reduction.

    Each internal node sends a sample to the right child when
    ``x[column] > threshold`` and to the left child otherwise. Tree nodes are
    instances of the notebook's ``Node`` class.

    Parameters
    ----------
    max_depth : int
        A node whose depth has reached this value is not split. The root is
        created with depth 1.
    min_samples_leaf : int
        The chosen split is rejected (the node becomes a leaf) when either
        child would hold fewer samples than this.
    min_samples_split : int
        A node holding fewer samples than this becomes a leaf.
    '''

    # Information gain reported when a column (or a node) offers no usable split.
    _NO_SPLIT_GAIN = -999

    def __init__(self, max_depth = 3, min_samples_leaf = 1, min_samples_split = 2):

        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split

        # Sorted unique labels seen by fit(); probability vectors follow this order.
        self.classes = None

        # Root Node of the fitted tree (None until fit() has been called).
        self.Tree = None

    def nodeProbas(self, y):
        '''
        Return the fraction of samples in ``y`` that belong to each class.

        The result is a 1-D array aligned with ``self.classes``. An empty
        ``y`` raises ZeroDivisionError when ``self.classes`` is non-empty.
        '''
        y = np.asarray(y)
        n_samples = y.shape[0]
        return np.asarray([np.count_nonzero(y == one_class) / n_samples
                           for one_class in self.classes])

    def gini(self, probas):
        '''
        Return the Gini impurity ``1 - sum(p_k ** 2)`` of a probability vector.
        '''
        probas = np.asarray(probas)
        return 1 - np.sum(probas**2)

    def getImpurity_faster(self, target):
        '''
        Return the Gini impurity of the labels in ``target``.
        '''
        return self.gini(self.nodeProbas(target))

    def getInfoGainForColumn(self, x_col, y, impurityBefore):
        '''
        Find the best threshold for a single feature column.

        Parameters
        ----------
        x_col : array-like
            All values of one feature (``X[:, col_idx]``), duplicates included.
        y : array-like
            Labels aligned with ``x_col``.
        impurityBefore : float
            Impurity of ``y`` before splitting.

        Returns
        -------
        (bestInfoGain, bestThresh)
            The largest impurity reduction and the threshold achieving it.
            When the column has fewer than two distinct values the result is
            ``(-999, None)``.
        '''
        x_col = np.asarray(x_col)
        y = np.asarray(y)
        n_samples = y.shape[0]

        bestThresh = None
        bestInfoGain = self._NO_SPLIT_GAIN

        # np.unique returns sorted values, so consecutive entries are neighbours
        # and their midpoints are the candidate thresholds.
        unique_value = np.unique(x_col)
        for front, back in zip(unique_value[:-1], unique_value[1:]):
            threshold = (front + back) / 2

            # Same comparison as the real split in calcBestSplit (<= goes left).
            y_left = y[x_col <= threshold]
            y_right = y[x_col > threshold]

            if y_right.shape[0] == 0 or y_left.shape[0] == 0:
                continue

            impurityRight = self.getImpurity_faster(y_right)
            impurityLeft = self.getImpurity_faster(y_left)

            # information gain = impurity before - size-weighted child impurity
            infoGain = impurityBefore
            infoGain -= (impurityLeft * y_left.shape[0] / n_samples) + (impurityRight * y_right.shape[0] / n_samples)

            # strict comparison keeps the first threshold among ties
            if bestInfoGain < infoGain:
                bestInfoGain = infoGain
                bestThresh = threshold

        return bestInfoGain, bestThresh

    def calcBestSplit(self, X, y):
        '''
        Calculates the best possible split for the concrete node of the tree.

        Parameters
        ----------
        X : array-like, 2-D
            Feature matrix of the samples in the node.
        y : array-like
            Labels aligned with the rows of ``X``.

        Returns
        -------
        (bestSplitCol, bestThresh, x_left, y_left, x_right, y_right)
            Six ``None`` values when no column can be split.
        '''
        X = np.asarray(X)

        bestSplitCol = None
        bestThresh = None
        bestInfoGain = self._NO_SPLIT_GAIN

        # impurity of the current (unsplit) label distribution
        impurityBefore = self.getImpurity_faster(y)

        for col in range(X.shape[1]):
            # all rows, only the column `col`
            x_col = X[:, col]
            infoGain, threshold = self.getInfoGainForColumn(x_col, y, impurityBefore)

            if infoGain > bestInfoGain:
                bestSplitCol = col
                bestThresh = threshold
                bestInfoGain = infoGain

        # no column had two distinct values, so nothing can be split
        if bestSplitCol is None:
            return None, None, None, None, None, None

        # making the best split
        x_col = X[:, bestSplitCol]
        go_left = x_col <= bestThresh
        go_right = x_col > bestThresh
        x_left, x_right = X[go_left, :], X[go_right, :]
        y_left, y_right = y[go_left], y[go_right]

        return bestSplitCol, bestThresh, x_left, y_left, x_right, y_right

    def buildDT(self, X, y, node):
        '''
        Recursively builds decision tree from the top to bottom.

        ``node`` must already carry its ``depth``; it is either marked
        terminal or given a split rule and two child nodes.
        '''

        # checking for the terminal conditions
        if node.depth >= self.max_depth:
            node.is_terminal = True
            return

        if X.shape[0] < self.min_samples_split:
            node.is_terminal = True
            return

        if np.unique(y).shape[0] == 1:
            node.is_terminal = True
            return

        # calculating current split
        splitCol, thresh, x_left, y_left, x_right, y_right = self.calcBestSplit(X, y)

        # No usable split (e.g. every feature is constant): stop here instead
        # of dereferencing the None children returned by calcBestSplit.
        if splitCol is None:
            node.is_terminal = True
            return

        if x_left.shape[0] < self.min_samples_leaf or x_right.shape[0] < self.min_samples_leaf:
            node.is_terminal = True
            return

        node.column = splitCol
        node.threshold = thresh

        # creating left and right child nodes
        node.left = Node()
        node.left.depth = node.depth + 1
        node.left.probas = self.nodeProbas(y_left)

        node.right = Node()
        node.right.depth = node.depth + 1
        node.right.probas = self.nodeProbas(y_right)

        # splitting recursively
        self.buildDT(x_right, y_right, node.right)
        self.buildDT(x_left, y_left, node.left)

    def fit(self, X, y):
        '''
        Build the tree from training data.

        Parameters
        ----------
        X : array-like, 2-D
            Feature columns only (pandas DataFrame or anything ``np.asarray``
            accepts).
        y : array-like
            One label per row of ``X`` (Series, single-column DataFrame,
            ndarray or list).

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``y``.
        '''
        # np.asarray handles DataFrames and ndarrays alike, so no pandas
        # import is needed here.
        X = np.asarray(X)
        y = np.asarray(y).ravel()

        if X.ndim != 2:
            raise ValueError('X must be 2-dimensional (n_samples, n_features)')
        if X.shape[0] != y.shape[0]:
            raise ValueError('X and y must contain the same number of samples')
        if y.shape[0] == 0:
            raise ValueError('cannot fit a tree on an empty dataset')

        self.classes = np.unique(y)

        # root node creation
        self.Tree = Node()
        self.Tree.depth = 1

        # class probabilities over the whole training set belong to the root
        self.Tree.probas = self.nodeProbas(y)

        # grow the rest of the tree starting from the hand-made root
        self.buildDT(X, y, self.Tree)

    def predictSample(self, x, node):
        '''
        Passes one object through decision tree and return the probability of it to belong to each class.

        Starting at ``node``, follow the split rules down to a terminal node
        and return that node's ``probas``.
        '''
        while not node.is_terminal:
            if x[node.column] > node.threshold:
                node = node.right
            else:
                node = node.left

        return node.probas

    def predict(self, X):
        '''
        Returns the labels for each X.

        The label is the entry of ``self.classes`` with the highest
        probability in the reached leaf; ties go to the earliest class.
        '''
        X = np.asarray(X)

        predictions = []
        for x in X:
            best = np.argmax(self.predictSample(x, self.Tree))
            # map the argmax position back to the actual class label
            predictions.append(self.classes[best])

        return np.asarray(predictions)

In [17]:
class Node:
    '''
    One node of a binary decision tree.

    NOTE(review): an identical ``Node`` class is defined in an earlier cell of
    this notebook; whichever cell runs last is the definition in effect.
    '''

    def __init__(self):
        # Everything starts unset, in this order:
        #   right, left       - links to the child nodes
        #   column, threshold - the splitting rule of this node
        #   probas            - per-class probabilities of samples in the node
        #   depth             - depth of the node in the tree
        for attr in ('right', 'left', 'column', 'threshold', 'probas', 'depth'):
            setattr(self, attr, None)

        # Flipped to True when the node becomes a leaf.
        self.is_terminal = False

In [2]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

# Load Iris and gather the features plus the label into one DataFrame.
data = load_iris()
column_names = data['feature_names']
y = data['target']
X = pd.DataFrame(data['data'], columns = column_names)
X['target'] = y

In [3]:
# Shape of the raw Iris feature matrix.
data["data"].shape

(150, 4)

In [4]:
# Separate the label column from the features (both read the combined frame).
# Not idempotent: once 'target' has been dropped, re-running raises KeyError.
y = X['target']
X = X.drop(columns = 'target')

from sklearn.model_selection import train_test_split
# Hold out a validation split with a fixed random_state.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 44)

In [46]:

def bootstrap(df, nboot, random_state=None):
    '''
    Stack ``nboot`` bootstrap resamples of ``df`` into a single DataFrame.

    Each resample draws ``len(df)`` rows with replacement, so the result has
    ``nboot * len(df)`` rows and a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to resample; whole rows are drawn, so columns stay aligned.
    nboot : int
        Number of resamples to stack.
    random_state : None, int or numpy.random.Generator, optional
        Seed (or generator) for reproducible draws. With the default ``None``
        the global NumPy RNG is used, so ``np.random.seed`` still applies.

    Returns
    -------
    pandas.DataFrame
    '''
    n_rows = df.shape[0]
    shape = (nboot, n_rows)

    if random_state is None:
        # Legacy path: keep consuming the global RNG exactly as before.
        idx = np.random.randint(n_rows, size = shape)
    else:
        idx = np.random.default_rng(random_state).integers(n_rows, size = shape)

    idx_flat = np.ravel(idx)

    return df.iloc[idx_flat, :].reset_index(drop=True)
# Rebuild the combined Iris frame (features + 'target') so it can be resampled.
data = load_iris()
X, y, column_names = data['data'], data['target'], data['feature_names']
X = pd.DataFrame(X, columns = column_names)
X['target'] = y

# Inflate the dataset by stacking bootstrap resamples of the 150-row frame.
# NOTE(review): nboot = 100000 gives 150 * 100000 rows, which does not match the
# 1,500,000 rows implied by the shapes printed further down; those outputs look
# like they come from an earlier run with a different nboot. Confirm and re-run.
bootstappedDF = bootstrap(X, 100000)
bigX, bigy = bootstappedDF.drop(columns = 'target'), bootstappedDF['target']

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(bigX,bigy, random_state = 44)

In [36]:
# Display the training features.
X_train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
1306633,4.6,3.2,1.4,0.2
539452,5.1,3.8,1.9,0.4
568261,5.6,2.5,3.9,1.1
288382,6.7,3.1,4.7,1.5
735171,6.3,2.5,4.9,1.5
...,...,...,...,...
1445972,5.1,3.5,1.4,0.3
641120,6.1,2.6,5.6,1.4
49723,6.9,3.1,5.1,2.3
156845,6.3,2.5,5.0,1.9


In [37]:
# Print the shape of every split, one per line.
print(*(part.shape for part in (X_train, X_val, y_train, y_val)), sep='\n')

(1125000, 4)
(375000, 4)
(1125000,)
(375000,)


In [48]:
%%time

# Fit the hand-written tree on the training split (the cell is timed).
model = DecisionTreeClassifier2(max_depth = 8, min_samples_leaf=1, min_samples_split=2)
model.fit(X_train, y_train)

CPU times: user 47.8 s, sys: 2.92 s, total: 50.7 s
Wall time: 50.7 s


In [39]:
from sklearn.metrics import accuracy_score
# Score the hand-written tree on the held-out validation split.
y_pred = model.predict(X_val)
print(f'Accuracy for self built model {accuracy_score(y_val, y_pred)}')

Accuracy for self built model 1.0


In [40]:
from sklearn.tree import DecisionTreeClassifier

In [47]:
%%time

# Fit scikit-learn's tree with the same hyperparameters for a timing comparison.
dt_clf = DecisionTreeClassifier(max_depth = 8, min_samples_leaf=1, min_samples_split=2)
dt_clf.fit(X_train, y_train)

CPU times: user 14.2 s, sys: 127 ms, total: 14.4 s
Wall time: 14.4 s


DecisionTreeClassifier(max_depth=8)

In [42]:
# Predict validation labels with the scikit-learn tree.
dt_prediction = dt_clf.predict(X_val)

In [43]:
# Show the predicted labels.
dt_prediction

array([0, 1, 0, ..., 1, 2, 2])

In [44]:
from sklearn.metrics import classification_report, confusion_matrix

In [45]:
# Confusion matrix of the scikit-learn predictions against the true validation labels.
confusion_matrix(y_val, dt_prediction)

array([[125416,      0,      0],
       [     0, 124681,      0],
       [     0,      0, 124903]])