In [2]:
import numpy as np
from collections import Counter

In [3]:


def entropy(y):
    """Calculate the Shannon entropy (in bits) of a collection of class labels.

    Args:
        y: 1-D array-like of class labels. Any label type that ``np.unique``
            can sort is accepted (non-negative ints, negative ints,
            strings, ...).

    Returns:
        ``-sum(p * log2(p))`` over the classes present in ``y``, or ``0.0``
        when ``y`` is empty.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        # No samples means no uncertainty; also avoids dividing by zero.
        return 0.0
    # np.unique only reports classes that actually occur, so every
    # probability is strictly positive and log2 is always defined.
    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / labels.size
    # Adding 0.0 turns the "-0.0" produced for a pure set into plain 0.0.
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0

def conditional_entropy(x, y):
    """Return the entropy of ``y`` that remains after grouping by ``x``.

    For every distinct value in ``x`` the entropy of the matching labels in
    ``y`` is computed and weighted by the fraction of samples in that group;
    the weighted terms are then summed.
    """
    total = len(y)
    # One label subset per distinct value of x, in np.unique's sorted order.
    groups = (y[x == level] for level in np.unique(x))
    return sum(((len(group) / total) * entropy(group) for group in groups), 0)

def information_gain(X, y, feature_index):
    """Return how much splitting on one column of ``X`` reduces the entropy of ``y``.

    The result is ``entropy(y)`` minus the conditional entropy of ``y`` given
    column ``feature_index`` of ``X``.
    """
    baseline = entropy(y)
    column = X[:, feature_index]
    remaining = conditional_entropy(column, y)
    return baseline - remaining

def find_root_node(X, y):
    """Return the column index of ``X`` with the highest information gain.

    Ties are resolved by ``np.argmax``, i.e. the lowest index wins.
    """
    gains = []
    for column in range(X.shape[1]):
        gains.append(information_gain(X, y, column))
    return np.argmax(gains)


In [4]:
def bin_continuous_values(x, n_bins=3, strategy="width"):
    """Bin continuous values into integer category codes.

    Args:
        x: 1-D array-like of numeric values.
        n_bins: Number of bins to produce.
        strategy: ``"width"`` for equal-width bins between ``min(x)`` and
            ``max(x)``, or ``"frequency"`` for bins bounded by equally
            spaced quantiles of ``x``.

    Returns:
        Integer array with the same shape as ``x`` holding bin indices in
        ``0 .. n_bins - 1``.

    Raises:
        ValueError: If ``strategy`` is not ``"width"`` or ``"frequency"``.
    """
    if strategy == "width":
        # Equal width binning
        edges = np.linspace(np.min(x), np.max(x), n_bins + 1)
    elif strategy == "frequency":
        # Equal frequency binning (quantiles)
        edges = np.quantile(x, np.linspace(0, 1, n_bins + 1))
    else:
        raise ValueError("Unsupported binning strategy")
    # Digitize against the interior edges only. Including the outermost
    # edges would place the maximum value in an extra bin with index
    # n_bins, yielding n_bins + 1 categories instead of n_bins.
    return np.digitize(x, edges[1:-1])


In [5]:
class SimpleDecisionTree:
    """Single-split decision tree over categorical features.

    ``fit`` selects the feature with the highest information gain and stores
    the majority class for each value of that feature; ``predict`` looks
    those majority classes up.
    """

    def __init__(self):
        # Column index of the feature chosen as the split (None until fit).
        self.root = None
        # Maps each training value of the root feature to its majority class
        # (None until fit).
        self.predictions = None

    def fit(self, X, y, n_bins=3, bin_strategy="width"):
        """Choose the root feature and record the majority class per value.

        Args:
            X: 2-D NumPy array of shape (samples, features). If its dtype is
                a floating-point type, every column is binned and the
                binned codes are written back into ``X`` in place.
            y: 1-D NumPy array of class labels aligned with the rows of ``X``.
            n_bins: Number of bins passed to ``bin_continuous_values``.
            bin_strategy: Binning strategy passed to ``bin_continuous_values``.
        """
        # Bin continuous values if necessary. np.issubdtype is used because
        # the np.float alias no longer exists in current NumPy releases.
        if np.issubdtype(X.dtype, np.floating):
            for i in range(X.shape[1]):
                X[:, i] = bin_continuous_values(X[:, i], n_bins, bin_strategy)

        # Find the root node using Information Gain
        self.root = find_root_node(X, y)

        # Store class predictions based on the selected feature for simplicity
        root_column = X[:, self.root]
        self.predictions = {}
        for value in np.unique(root_column):
            subset_y = y[root_column == value]
            self.predictions[value] = Counter(subset_y).most_common(1)[0][0]

    def predict(self, X):
        """Predict a class for every row of ``X`` from the root feature only.

        Args:
            X: 2-D NumPy array of already-binned categorical values.

        Returns:
            NumPy array with one prediction per row; rows whose root-feature
            value was not seen during ``fit`` get ``None``.

        Raises:
            ValueError: If ``X`` has a floating-point dtype.
        """
        if np.issubdtype(X.dtype, np.floating):
            raise ValueError("Prediction data must be binned categorical values")

        # Predict based on the majority class of the split by the root feature
        return np.array([self.predictions.get(x[self.root], None) for x in X])

