In [4]:
import numpy as np
import pandas as pd
import os

In [5]:
# Location of the dataset, expressed relative to the current working directory.
csv_path = "sample_data/drug200.csv"
BASE_DIR = os.getcwd()
full_csv_path = os.path.join(BASE_DIR, csv_path)

# Read the CSV into a DataFrame; the bare name on the last line makes the
# notebook render the table as the cell's output.
df = pd.read_csv(full_csv_path)
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [14]:
class Tree:
    """A single vertex of a binary decision tree.

    A vertex is exactly one of:

    * a predicate (internal node), built from ``feature`` and ``threshold``;
    * a leaf, built from ``prediction`` only.

    Attributes
    ----------
    feature, threshold :
        Split description for a predicate vertex; ``None`` on a leaf.
    prediction :
        Payload of a leaf; ``None`` on a predicate vertex.
    is_leaf : bool
        ``True`` for a leaf, ``False`` for a predicate. This is a plain
        attribute (read it as ``node.is_leaf``). The former ``is_leaf()``
        method was shadowed by this attribute on every instance and could
        never be called, so it has been removed.
    left, right :
        Child vertices, initialised to ``None`` and filled in by the caller.

    Raises
    ------
    ValueError
        If the arguments describe neither a predicate nor a leaf.
    """

    def __init__(self, threshold=None, feature=None, prediction=None):
        # Compare against None explicitly: feature index 0 and threshold 0.0
        # are valid values but are falsy, so a truthiness test rejects them.
        has_full_split = threshold is not None and feature is not None
        has_no_split = threshold is None and feature is None

        if has_full_split and prediction is None:
            self.is_leaf = False
        elif prediction is not None and has_no_split:
            self.is_leaf = True
        else:
            # Fail loudly instead of printing and returning a vertex that has
            # neither a split nor a prediction.
            raise ValueError(
                'Vertex in decision tree must be either a predicate '
                '(only feature and threshold) or a leaf (only prediction)'
            )

        # Define every attribute on every vertex so that reading one of them
        # never raises AttributeError.
        self.feature = feature
        self.threshold = threshold
        self.prediction = prediction

        self.right = None
        self.left = None

In [12]:
class Decision_tree:
    """Classification tree grown greedily with entropy-scored threshold splits.

    Only numeric feature columns are considered for splitting; columns whose
    first value is not a number are skipped.

    Parameters
    ----------
    max_depth : int
        A node at this depth (root is depth 0) is turned into a leaf.
    min_impurity_decrease : float
        Tunes the minimum gain a split must reach, see ``_build_tree``.
    min_samples_leaf : int
        A node holding fewer samples than this is turned into a leaf.

    Attributes
    ----------
    classes : dict
        Maps each class label seen by ``train`` to a column index used by the
        class-count vectors.
    tree : Tree or None
        Root vertex of the fitted tree, ``None`` before ``train`` is called.
    """

    def __init__(self, max_depth=4, min_impurity_decrease=0.05, min_samples_leaf=5):
        self.classes = {}
        self.min_impurity_decrease = min_impurity_decrease
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.tree = None

    def _find_classes_amount(self, y):
        """Return a vector with the number of samples of every known class in ``y``."""
        classes_amount = np.zeros(len(self.classes))

        for label in y:
            classes_amount[self.classes[label]] += 1

        return classes_amount

    def _impurity_entropy(self, amount):
        """Return the Shannon entropy (natural log) of a class-count vector.

        An all-zero count vector has entropy 0 instead of dividing by zero.
        """
        amount = np.asarray(amount, dtype=float)
        total = amount.sum()
        if total == 0:
            return 0.0

        probability_vector = amount / total
        # 0 * log(0) is taken as 0, so empty classes are dropped beforehand.
        nonzero = probability_vector[probability_vector != 0]
        # "0.0 - x" (rather than "-x") avoids returning -0.0 for a pure node.
        return float(0.0 - np.sum(nonzero * np.log(nonzero)))

    def _impurity_Gini_criterion(self, amount):
        """Return the Gini impurity of a class-count vector.

        An all-zero count vector has impurity 0 instead of dividing by zero.
        """
        amount = np.asarray(amount, dtype=float)
        total = amount.sum()
        if total == 0:
            return 0.0

        probability_vector = amount / total
        return float(np.sum(probability_vector * (1 - probability_vector)))

    @staticmethod
    def _is_numeric_feature(feature):
        """Tell whether a feature column holds numbers, judged by its first value."""
        # NumPy integer scalars are not subclasses of int, so list them explicitly.
        return len(feature) > 0 and isinstance(
            feature[0], (int, float, np.integer, np.floating)
        )

    def _find_best_split(self, X, y):
        """Search all numeric features for the threshold with the highest gain.

        Returns
        -------
        main_impurity : float
            Entropy of ``y`` before splitting.
        best_gain : float
            ``n * main_impurity`` minus the sample-weighted sum of the two
            children's entropies for the best split; 0 when no split improves
            on the parent.
        best_separator : list or None
            ``[feature_index, index, threshold]`` where ``index`` is the
            position, in the order sorted by that feature, of the last sample
            on the low side and ``threshold`` is the midpoint between that
            sample's value and the next one. ``None`` when no split was found.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n = len(y)

        total_amount = self._find_classes_amount(y)
        main_impurity = self._impurity_entropy(total_amount)
        best_gain, best_separator = 0, None

        if n < 2:
            return main_impurity, best_gain, best_separator

        for feature_index, feature in enumerate(X.T):
            if not self._is_numeric_feature(feature):
                continue

            values = feature.astype(float)
            order = np.argsort(values, kind="stable")
            values = values[order]
            # Sort into fresh locals: rebinding ``y`` here would leave the
            # labels permuted (and misaligned with X) for the next feature.
            class_ids = [self.classes[label] for label in y[order]]

            # Running class counts of the low side (positions 0..index) and
            # of the high side (positions index+1..n-1).
            low = np.zeros(len(total_amount))
            high = total_amount.copy()

            for index in range(n - 1):
                low[class_ids[index]] += 1
                high[class_ids[index]] -= 1

                # A threshold cannot separate two equal values, so only cut
                # between distinct neighbours.
                if values[index] == values[index + 1]:
                    continue

                n_low = index + 1
                gain = n * main_impurity - (
                    n_low * self._impurity_entropy(low)
                    + (n - n_low) * self._impurity_entropy(high)
                )

                if gain > best_gain:
                    best_gain = gain
                    threshold = np.mean([values[index], values[index + 1]])
                    best_separator = [feature_index, index, threshold]

        return main_impurity, best_gain, best_separator

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``X`` / ``y``.

        Samples whose split feature is ``<= threshold`` go to ``left``, the
        others to ``right``. A leaf stores the labels of the samples that
        reached it as its ``prediction``.
        """
        stop_conditions = [
            len(y) < self.min_samples_leaf,
            depth >= self.max_depth,
            # Pure (or empty) node: nothing left to separate.
            np.unique(y).size <= 1,
        ]

        if any(stop_conditions):
            return Tree(prediction=y)

        main_impurity, gain, split_criteria = self._find_best_split(X, y)

        # NOTE(review): ``gain`` is a sample-weighted total while the bound is
        # a multiple of the per-sample impurity; the comparison is kept as in
        # the original - confirm this is the intended stopping rule.
        if split_criteria is None or gain < (1 + self.min_impurity_decrease) * main_impurity:
            return Tree(prediction=y)

        feature_index, _, threshold = split_criteria

        # Partition by value, not by position: X and y are not sorted by the
        # chosen feature, so slicing by the sorted index would mix both sides.
        goes_left = X[:, feature_index].astype(float) <= threshold
        if goes_left.all() or not goes_left.any():
            return Tree(prediction=y)

        root = Tree(feature=feature_index, threshold=threshold)
        root.left = self._build_tree(X[goes_left], y[goes_left], depth + 1)
        root.right = self._build_tree(X[~goes_left], y[~goes_left], depth + 1)

        return root

    def train(self, X, y):
        """Fit the tree on a 2-D feature array ``X`` and a label vector ``y``.

        Returns
        -------
        Decision_tree
            The instance itself, with ``classes`` and ``tree`` populated.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError('X must be a 2-D array of shape (samples, features)')
        if len(X) != len(y):
            raise ValueError('X and y must contain the same number of samples')

        # Rebuild the mapping from scratch so labels from an earlier call to
        # train() do not linger.
        self.classes = {label: index for index, label in enumerate(np.unique(y))}

        self.tree = self._build_tree(X, y)
        return self

    def predict(self):
        """Placeholder: prediction is not implemented yet, returns ``None``."""
        pass

In [15]:
# Convert the DataFrame to a NumPy array and record its dimensions.
train_data = df.to_numpy()
m, n = train_data.shape

# The last column is the label; every column before it is a feature.
y_train = train_data[:, -1]
X_train = train_data[:, :n-1]

# train() returns the estimator itself, so `tree` and `a` name the same object.
a = Decision_tree()
tree = a.train(X_train, y_train)