In [1]:
import numpy as np
import pandas as pd

In [2]:
class Node():
    def __init__(self , feature=None , threshold=None , left=None , right=None , gain=None , value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.gain = gain
        self.value = value


In [4]:
class DecisionTree:
    def __init__(self, min_samples, max_depth):
        self.min_samples = min_samples
        self.max_depth = max_depth

    def split_data(self, dataset, feature, threshold):
        left_dataset = dataset[dataset[:, feature] <= threshold]
        right_dataset = dataset[dataset[:, feature] > threshold]
        return left_dataset, right_dataset

    def entropy(self, y):
        labels, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log(probabilities))

    def information_gain(self, parent, left, right):
        parent_entropy = self.entropy(parent)
        total_samples = len(left) + len(right)

        weighted_entropy = (len(left) / total_samples) * self.entropy(left) + \
                           (len(right) / total_samples) * self.entropy(right)

        return parent_entropy - weighted_entropy

    def best_split(self, dataset, num_features):
        best_split = {'gain': -1, 'feature': None, 'threshold': None}

        for idx in range(num_features):
            feature_values = dataset[:, idx]
            thresholds = np.unique(feature_values)
            for threshold in thresholds:
                left_dataset, right_dataset = self.split_data(
                    dataset, idx, threshold)

                if len(left_dataset) > 0 and len(right_dataset) > 0:
                    parent, left, right = dataset[:, -1], left_dataset[:, -1], right_dataset[:, -1]
                    gain = self.information_gain(parent, left, right)

                    if gain > best_split["gain"]:
                        best_split.update({
                            "feature": idx,
                            "threshold": threshold,
                            "left_dataset": left_dataset,
                            "right_dataset": right_dataset,
                            "gain": gain
                        })

        return None if best_split["gain"] == -1 else best_split

    def calculate_leaf_value(self, y):
        y = list(y)
        return max(y, key=y.count)

    def build_tree(self, dataset, current_depth=0):
        X, y = dataset[:, :-1], dataset[:, -1]
        n_samples, n_features = X.shape

        if n_samples >= self.min_samples and current_depth <= self.max_depth:
            best_split = self.best_split(dataset, n_features)
            if best_split is not None:
                left_node = self.build_tree(
                    best_split["left_dataset"], current_depth + 1)
                right_node = self.build_tree(
                    best_split["right_dataset"], current_depth + 1)
                return Node(feature=best_split["feature"], threshold=best_split["threshold"],
                            left=left_node, right=right_node)

        # Create a leaf node
        leaf_value = self.calculate_leaf_value(y)
        return Node(value=leaf_value)

    def fit(self, X, y):
        dataset = np.concatenate((X, y.reshape(-1, 1)), axis=1)
        self.root = self.build_tree(dataset)

    def make_prediction(self, x, node):
        if node.value is not None:
            return node.value

        feature_value = x[node.feature]
        if feature_value <= node.threshold:
            return self.make_prediction(x, node.left)
        else:
            return self.make_prediction(x, node.right)

    def predict(self, X):
        return np.array([self.make_prediction(x, self.root) for x in X])


In [5]:
df = pd.read_csv("Iris.csv")

In [6]:
print(df.isnull().sum())

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [7]:
X = df[["SepalLengthCm" , "SepalWidthCm" , "PetalLengthCm" , "PetalWidthCm"]].values
y = df["Species"].values

In [8]:
X = (X - X.mean()) / X.std()


In [9]:
idx = int(0.8 * df.shape[0])
train_X , train_y = X[:idx] , y[:idx]
test_X , test_y = X[idx:] , y[idx:]

In [10]:
model = DecisionTree(min_samples=30 , max_depth=10)
model.fit(train_X , train_y)

In [11]:
y_pred =  model.predict(test_X)

np.unique(y_pred)


accuracy = (np.sum(y_pred == test_y) / len(test_y)) * 100


In [12]:
accuracy

np.float64(83.33333333333334)