In [5]:
import numpy as np
import pandas as pd
import os

In [6]:
BASE_DIR = os.getcwd()
csv_path = "sample_data/drug200.csv"

df = pd.read_csv(os.path.join(BASE_DIR, csv_path))
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [7]:
class Tree:
    def __init__(self):
        self.prediction = None
        self.feature = None
        self.threshold = None
        self.right = None
        self.left = None

    def create_child(self, leaf_side):
        if leaf_side == 'right':
            self.right = Tree()
            return self.right
        elif leaf_side == 'left':
            self.left = Tree()
            return self.left
        else:
            print('To use `set_vertex` method you must specify which leaf to add: `right` or `left`')

    def set_threshold(self, threshold, feature):
        self.threshold = threshold
        self.feature = feature

        self.prediction = None

    def set_prediction(self, prediction):
        self.prediction = prediction

        self.left, self.right = None, None

    def get_prediction(self):
        return self.prediction

    def is_leaf(self):
        if self.prediction is not None:
            return True
        elif self.right and self.left and self.feature and self.threshold:
            return False
        print('Not a leaf and not a predicate')
        return None

In [83]:
class Decision_tree:
    def __init__(self):
        self.classes = {}
        self.tree = Tree()

    def _find_classes_amount(self, y):
        classes_amount = np.zeros(len(self.classes))

        for i in y:
            classes_amount[self.classes[i]] += 1

        return classes_amount

    def _impurity_entropy(self, amount):
        impurity = 0
        probability_vector = amount / np.sum(amount)
        for i in probability_vector:
            if i != 0:
                impurity += i * np.log(i)

        return -impurity

    def _impurity_Gini_criterion(self, amount):
        impurity = 0
        probability_vector = amount / np.sum(amount)
        for i in probability_vector:
            impurity += i * (1 - i)

        return impurity

    def _find_best_split(self, feature, y):
        best_separator, best_gain = None, 0

        if isinstance(feature[0], (int, float)):
            index_sorted = np.argsort(feature)

            y = y[index_sorted]
            amount = self._find_classes_amount(y)
            feature = feature[index_sorted]
            n = len(feature)

            main_impurity = self._impurity_entropy(amount)

            left, right = amount, np.zeros(len(amount))

            index = 0
            while index < n - 1:
                label = y[index]
                left[self.classes[label]] -= 1
                right[self.classes[label]] += 1

                if index != 0 and feature[index-1] == feature[index]:
                    index += 1
                    continue

                left_impurity = self._impurity_entropy(left)
                right_impurity = self._impurity_entropy(right)

                gain = n * main_impurity - ((n - index - 1) * left_impurity + (index + 1) * right_impurity)

                if gain > best_gain:
                    best_gain = gain
                    if index == 0: threshold = feature[index]
                    else: threshold = np.mean(feature[index - 1], feature[index])
                    best_separator = [index, threshold]

                index += 1
        return best_separator

    def _build_tree(self, X, y):
        for feature in X.T:
            self._find_best_split(feature, y)

    def train(self, X, y):
        index = 0
        for i in np.unique(y):
            self.classes[i] = index
            index += 1

        self._build_tree(X, y)

    def predict(self):
        pass

In [84]:
train_data = df.to_numpy()
m, n = train_data.shape

X_train, y_train = train_data[:, :n-1], train_data[:, -1]

a = Decision_tree()
a.train(X_train, y_train)


0.7904684261238231
1.274205194681656
1.639027155552128
1.92683416735332
2.591345596915801
1.3106770336170257
1.6061425720913576
1.7726478702752502
2.235394543113898
2.7950273261709526
3.1314376058149946
3.2642700641246165
3.8403762279356783
4.770748129855917
4.772560524382072
4.913009418024217
6.065611792403217
6.907951031547896
6.981888761509595
7.678857626125762
7.645260677334704
8.430100150664657
9.318013198151334
11.176811915969324
11.419973523439637
11.652306282679035
12.58155026469467
12.950953573158529
14.22638122557106
15.549606429592586
15.444785666626444
17.523712857239047
20.750877685506396
23.508541482408873
22.783959357842576
22.723363614439194
19.799555330090584
19.813164774822155
17.3317804887227
17.379774709903018
15.636800021564511
15.587920271468818
13.628199313976154
12.240824794511866
7.886874407379253
7.598861441606118
7.5183737175695455
7.556795826469454
7.659019288302034
6.354528366792181
6.556120490176056
5.477742362282584
4.16775152580766
3.2651246548804806
1.