In [7]:
import numpy as np
import pandas as pd

class DecisionTreeRootFeatureSelector:
    def __init__(self):
        self.root_feature = None
        self.root_info_gain = None

    def calculate_information_gain(self, feature_values, labels):
        # Calculate entropy of the entire dataset
        total_entropy = self.calculate_entropy(labels)

        # Calculate weighted entropy for each feature value
        unique_values = np.unique(feature_values)
        weighted_entropy = 0
        for value in unique_values:
            value_indices = np.where(feature_values == value)[0]
            subset_labels = labels[value_indices]
            subset_entropy = self.calculate_entropy(subset_labels)
            weighted_entropy += (len(value_indices) / len(labels)) * subset_entropy

        # Calculate information gain
        information_gain = total_entropy - weighted_entropy
        return information_gain

    def calculate_entropy(self, labels):
        # Calculate entropy of a set of labels
        _, counts = np.unique(labels, return_counts=True)
        probabilities = counts / len(labels)
        entropy = -np.sum(probabilities * np.log2(probabilities + 1e-10))  # Add a small value to avoid log(0)
        return entropy

    def select_root_feature(self, features, labels):
        max_info_gain = -1
        best_feature = None

        for i in range(features.shape[1]):  # Iterate over columns
            feature_values = features[:, i]
            info_gain = self.calculate_information_gain(feature_values, labels)
            if info_gain > max_info_gain:
                max_info_gain = info_gain
                best_feature = i + 1  # Adjust index to start from 1

        self.root_feature = best_feature
        self.root_info_gain = max_info_gain

        return best_feature, max_info_gain

# Load data from CSV file
csv_file_path = r"C:\Users\SUNIL KUMAR REDDY\Music\machine-learning\Malayalam_Char_Gabor.csv"
data = pd.read_csv(csv_file_path)

# Separate features and labels
features = data.iloc[:, :-1].values
labels = data.iloc[:, -1].values

# Create an instance of the feature selector
selector = DecisionTreeRootFeatureSelector()

# Select the root feature
root_feature, info_gain = selector.select_root_feature(features, labels)

print("Root Feature Index:", root_feature)
print("Information Gain:", info_gain)



Root Feature Index: 28
Information Gain: 9.935423237429495


In [13]:
import numpy as np

def binning(feature_values, binning_type='equal_width', num_bins=None):
    if binning_type == 'equal_width':
        return equal_width_binning(feature_values, num_bins)
    elif binning_type == 'frequency':
        return frequency_binning(feature_values, num_bins)
    else:
        raise ValueError("Invalid binning type. Please choose 'equal_width' or 'frequency'.")

def equal_width_binning(feature_values, num_bins=None, default_num_bins=10):
    if num_bins is None:
        num_bins = default_num_bins
    min_value = np.min(feature_values)
    max_value = np.max(feature_values)
    bin_edges = np.linspace(min_value, max_value, num_bins + 1)
    binned_values = np.digitize(feature_values, bin_edges)
    return binned_values

def frequency_binning(feature_values, num_bins=None, default_num_bins=10):
    if num_bins is None:
        num_bins = default_num_bins
    _, bin_edges = np.histogram(feature_values, bins=num_bins)
    binned_values = np.digitize(feature_values, bin_edges)
    return binned_values

# Example usage:
# Assuming feature_values is your continuous-valued feature array
feature_values = np.array([1.2, 2.5, 3.8, 4.1, 5.6, 6.2, 7.3, 8.9, 9.4, 10.0])

# Equal width binning with 5 bins
binned_values_equal_width = binning(feature_values, binning_type='equal_width', num_bins=5)
print("Equal width binning with 5 bins:", binned_values_equal_width)

# Frequency binning with default number of bins (10 bins)
binned_values_frequency = binning(feature_values, binning_type='frequency')
print("Frequency binning with default number of bins (10 bins):", binned_values_frequency)

# Alternatively, you can use default parameters:
# Equal width binning with default number of bins (10 bins)
binned_values_equal_width_default = binning(feature_values)
print("Equal width binning with default number of bins (10 bins):", binned_values_equal_width_default)

# Frequency binning with 7 bins
binned_values_frequency_custom = binning(feature_values, binning_type='frequency', num_bins=7)
print("Frequency binning with 7 bins:", binned_values_frequency_custom)


Equal width binning with 5 bins: [1 1 2 2 3 3 4 5 5 6]
Frequency binning with default number of bins (10 bins): [ 1  2  3  4  5  6  7  9 10 11]
Equal width binning with default number of bins (10 bins): [ 1  2  3  4  5  6  7  9 10 11]
Frequency binning with 7 bins: [1 2 3 3 4 4 5 7 7 8]


In [14]:
import numpy as np

class TreeNode:
    def __init__(self, feature_index=None, threshold=None, value=None, left=None, right=None):
        self.feature_index = feature_index  # Index of feature to split on
        self.threshold = threshold          # Threshold value for binary split
        self.value = value                  # Class label for leaf node
        self.left = left                    # Left child node
        self.right = right                  # Right child node

class DecisionTreeClassifier:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def entropy(self, y):
        unique_labels, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def information_gain(self, X, y, feature_idx, threshold):
        total_entropy = self.entropy(y)
        left_indices = np.where(X[:, feature_idx] <= threshold)[0]
        right_indices = np.where(X[:, feature_idx] > threshold)[0]
        left_entropy = self.entropy(y[left_indices])
        right_entropy = self.entropy(y[right_indices])
        weighted_entropy = (len(left_indices) / len(X)) * left_entropy + (len(right_indices) / len(X)) * right_entropy
        information_gain = total_entropy - weighted_entropy
        return information_gain

    def find_best_split(self, X, y):
        num_features = X.shape[1]
        best_information_gain = -np.inf
        best_feature_idx = None
        best_threshold = None
        for feature_idx in range(num_features):
            thresholds = np.unique(X[:, feature_idx])
            for threshold in thresholds:
                current_information_gain = self.information_gain(X, y, feature_idx, threshold)
                if current_information_gain > best_information_gain:
                    best_information_gain = current_information_gain
                    best_feature_idx = feature_idx
                    best_threshold = threshold
        return best_feature_idx, best_threshold

    def build_tree(self, X, y, depth=0):
        if depth == self.max_depth or len(np.unique(y)) == 1:
            leaf_value = np.argmax(np.bincount(y))
            return TreeNode(value=leaf_value)
        
        feature_idx, threshold = self.find_best_split(X, y)
        if feature_idx is None:
            leaf_value = np.argmax(np.bincount(y))
            return TreeNode(value=leaf_value)
        
        left_indices = np.where(X[:, feature_idx] <= threshold)[0]
        right_indices = np.where(X[:, feature_idx] > threshold)[0]

        left_subtree = self.build_tree(X[left_indices], y[left_indices], depth+1)
        right_subtree = self.build_tree(X[right_indices], y[right_indices], depth+1)

        return TreeNode(feature_index=feature_idx, threshold=threshold, left=left_subtree, right=right_subtree)

    def fit(self, X, y):
        self.tree = self.build_tree(X, y)

    def predict_sample(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self.predict_sample(x, node.left)
        else:
            return self.predict_sample(x, node.right)

    def predict(self, X):
        if self.tree is None:
            raise ValueError("Tree not fitted.")
        predictions = []
        for x in X:
            prediction = self.predict_sample(x, self.tree)
            predictions.append(prediction)
        return np.array(predictions)

if __name__ == "__main__":
    X = np.array([[1, 0], [1, 1], [0, 0], [0, 1]])
    y = np.array([1, 1, 0, 0])

    tree = DecisionTreeClassifier(max_depth=1)
    tree.fit(X, y)

    print("Tree structure:")
    print("Root feature index:", tree.tree.feature_index)
    print("Root threshold:", tree.tree.threshold)

    test_samples = np.array([[1, 0], [0, 1]])
    predictions = tree.predict(test_samples)
    print("Predictions for test samples:", predictions)


Tree structure:
Root feature index: 0
Root threshold: 0
Predictions for test samples: [1 0]
