# In [1]:
import numpy as np
import pandas as pd

# Calculate entropy
def entropy(y):
    """Return the Shannon entropy, in bits, of the class labels in ``y``.

    Args:
        y: Array-like of class labels. Any label type that ``np.unique``
            can sort is accepted (integers of either sign, strings, ...).

    Returns:
        The entropy ``-sum(p * log2(p))`` over the observed label
        frequencies, as a NumPy float. An empty or single-class input
        yields ``0.0``.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        return np.float64(0.0)

    # np.unique only reports labels that actually occur, so every
    # probability is strictly positive and log2 is always defined.
    _, class_counts = np.unique(labels, return_counts=True)
    probabilities = class_counts / labels.size

    # Adding 0.0 turns the -0.0 produced for a pure node into +0.0.
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0

# Calculate information gain
def information_gain(X, y, feature_index, threshold):
    """Return the information gain of splitting on one feature at a threshold.

    Samples whose value in column ``feature_index`` is ``<= threshold`` go to
    the left child; samples whose value is ``> threshold`` go to the right.

    Args:
        X: 2-D NumPy array of samples (rows) by features (columns).
        y: 1-D NumPy array of class labels, one per row of ``X``.
        feature_index: Column of ``X`` to split on.
        threshold: Value that separates the left and right children.

    Returns:
        The parent entropy minus the size-weighted entropy of the two
        children, or ``0`` when either child would be empty.
    """
    feature_values = X[:, feature_index]
    left_indices = feature_values <= threshold
    right_indices = feature_values > threshold

    # np.count_nonzero counts the mask in C; the builtin sum() would iterate
    # over the boolean array element by element in Python.
    n_left = np.count_nonzero(left_indices)
    n_right = np.count_nonzero(right_indices)
    if n_left == 0 or n_right == 0:
        return 0

    # Weighted entropy of the child nodes
    n = len(y)
    left_entropy = entropy(y[left_indices])
    right_entropy = entropy(y[right_indices])
    child_entropy = (n_left / n) * left_entropy + (n_right / n) * right_entropy

    return entropy(y) - child_entropy

# Decision Tree Node
class DecisionNode:
    """A single node of the decision tree.

    A node created with a ``value`` is a leaf holding the predicted class.
    Otherwise it describes a split: the feature column to test, the
    threshold to compare against, and the ``left`` / ``right`` child nodes.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        # Split rule (unused on leaves)
        self.feature_index, self.threshold = feature_index, threshold
        # Child nodes (unused on leaves)
        self.left, self.right = left, right
        # Predicted class; stays None on internal nodes
        self.value = value

# Decision Tree Classifier
class DecisionTree:
    """Binary decision-tree classifier that splits on information gain.

    Each internal node sends a sample left when its feature value is
    ``<= threshold`` and right otherwise. Leaves predict the most frequent
    training label that reached them. Labels must be non-negative integers
    because leaf values are computed with ``np.bincount``.

    Args:
        max_depth: Depth at which nodes are forced to become leaves.
        min_samples_split: Nodes with fewer samples than this become leaves.
    """

    def __init__(self, max_depth=5, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def _build_tree(self, X, y, depth):
        """Recursively build and return the subtree for the samples ``X, y``."""
        n_samples, n_features = X.shape
        if n_samples < self.min_samples_split or depth >= self.max_depth or len(set(y)) == 1:
            return DecisionNode(value=np.bincount(y).argmax())

        # Find best split
        best_gain = -np.inf
        best_feature_index, best_threshold = None, None
        for feature_index in range(n_features):
            # Splitting at the largest value would send every sample left and
            # leave the right child empty, so it is never a usable threshold.
            thresholds = np.unique(X[:, feature_index])[:-1]
            for threshold in thresholds:
                gain = information_gain(X, y, feature_index, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature_index = feature_index
                    best_threshold = threshold

        # No feature offers two distinct values: nothing left to split on.
        if best_feature_index is None:
            return DecisionNode(value=np.bincount(y).argmax())

        left_indices = X[:, best_feature_index] <= best_threshold
        right_indices = X[:, best_feature_index] > best_threshold

        # Defensive guard (e.g. NaN feature values match neither mask): never
        # recurse into an empty child, which has no majority label.
        if not left_indices.any() or not right_indices.any():
            return DecisionNode(value=np.bincount(y).argmax())

        # Recursively build left and right subtrees
        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)
        return DecisionNode(best_feature_index, best_threshold, left_subtree, right_subtree)

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of samples (rows) by features (columns).
            y: 1-D array-like of non-negative integer class labels.

        Raises:
            ValueError: If ``y`` is empty.
        """
        # Accept lists, DataFrames, etc.; the builder relies on NumPy indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        self.root = self._build_tree(X, y, 0)

    def _predict(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x``; return its value."""
        while node.value is None:
            if x[node.feature_index] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def predict(self, X):
        """Return an array with the predicted class for each row of ``X``."""
        return np.array([self._predict(x, self.root) for x in np.asarray(X)])

# Toy dataset: six samples with two binary features each.
# In this data the label always equals the first feature.
X = np.array(
    [
        [1, 1],
        [1, 0],
        [0, 1],
        [0, 0],
        [1, 1],
        [0, 0],
    ]
)
y = np.array([1, 1, 0, 0, 1, 0])

# Fit a shallow tree on the toy data
tree = DecisionTree(max_depth=2)
tree.fit(X, y)

# Predict on the training samples and show the result
predictions = tree.predict(X)
print("Predictions:", predictions)


# Output:
# Predictions: [1 1 0 0 1 0]
