In [1]:
import numpy as np
from collections import Counter
from math import log2

def load_data(features_file, labels_file):
    """
    Load features and labels from the provided files.
    """
    extracted_features = np.load(features_file)
    labels = np.load(labels_file)
    return extracted_features, labels

def convert_to_categorical(features, num_bins=5):
    """
    Convert numerical features to categorical by binning.
    """
    categorical_features = []
    for feature in features.T:  # Transpose to loop through columns
        bins = np.linspace(min(feature), max(feature), num_bins + 1)
        digitized = np.digitize(feature, bins)
        categorical_features.append(digitized)
    return np.array(categorical_features).T

def calculate_entropy(labels):
    """
    Calculate the entropy of a set of labels.
    """
    label_counts = Counter(labels)
    total_samples = len(labels)
    
    entropy = 0
    for label in label_counts:
        prob = label_counts[label] / total_samples
        entropy -= prob * log2(prob)
    
    return entropy

def calculate_information_gain(feature_column, labels):
    """
    Calculate Information Gain for a feature.
    """
    # Calculate total entropy of the dataset
    total_entropy = calculate_entropy(labels)
    
    # Calculate the weighted entropy of each unique value in the feature
    unique_values = np.unique(feature_column)
    weighted_entropy = 0
    for value in unique_values:
        subset_indices = np.where(feature_column == value)[0]
        subset_labels = labels[subset_indices]
        weight = len(subset_labels) / len(labels)
        subset_entropy = calculate_entropy(subset_labels)
        weighted_entropy += weight * subset_entropy
    
    # Calculate Information Gain
    information_gain = total_entropy - weighted_entropy
    return information_gain

def find_best_feature(extracted_features, labels):
    """
    Find the best feature for the root node based on Information Gain.
    """
    num_features = extracted_features.shape[1]
    best_feature_index = -1
    best_information_gain = -1
    
    for i in range(num_features):
        feature_column = extracted_features[:, i]
        information_gain = calculate_information_gain(feature_column, labels)
        
        if information_gain > best_information_gain:
            best_information_gain = information_gain
            best_feature_index = i
    
    return best_feature_index, best_information_gain

def main():
    # Load data
    features_file = "D:\SEM-4\ML\CODES\Machine-Learning\Lab04\extracted_features.npy"
    labels_file = "D:\SEM-4\ML\CODES\Machine-Learning\Lab04\labels.npy"
    extracted_features, labels = load_data(features_file, labels_file)
    
    # Convert features to categorical if needed
    categorical_features = convert_to_categorical(extracted_features)
    
    # Find the best feature for the root node
    best_feature_index, best_information_gain = find_best_feature(categorical_features, labels)
    
    print("Best Feature Index:", best_feature_index)
    print("Best Information Gain:", best_information_gain)

if __name__ == "__main__":
    main()

Best Feature Index: 254
Best Information Gain: 0.14431896891734253


In [2]:
import numpy as np
from collections import Counter
from math import log2

def load_data(features_file, labels_file):
    """
    Load features and labels from the provided files.
    """
    extracted_features = np.load(features_file)
    labels = np.load(labels_file)
    return extracted_features, labels

def convert_to_categorical(features, binning_type=None, num_bins=None):
   
    if binning_type is None:
        binning_type = 'equal_width'
    if num_bins is None:
        num_bins = 5
    
    categorical_features = []
    for feature in features.T:  # Transpose to loop through columns
        if binning_type == 'equal_width':
            bins = np.linspace(min(feature), max(feature), num_bins + 1)
        elif binning_type == 'equal_frequency':
            bins = np.quantile(feature, np.linspace(0, 1, num_bins + 1))
        else:
            raise ValueError("Invalid binning type. Use 'equal_width' or 'equal_frequency'.")
        
        digitized = np.digitize(feature, bins)
        categorical_features.append(digitized)
    
    return np.array(categorical_features).T

def calculate_entropy(labels):
    """
    Calculate the entropy of a set of labels.
    """
    label_counts = Counter(labels)
    total_samples = len(labels)
    
    entropy = 0
    for label in label_counts:
        prob = label_counts[label] / total_samples
        entropy -= prob * log2(prob)
    
    return entropy

def calculate_information_gain(feature_column, labels):
    """
    Calculate Information Gain for a feature.
    """
    # Calculate total entropy of the dataset
    total_entropy = calculate_entropy(labels)
    
    # Calculate the weighted entropy of each unique value in the feature
    unique_values = np.unique(feature_column)
    weighted_entropy = 0
    for value in unique_values:
        subset_indices = np.where(feature_column == value)[0]
        subset_labels = labels[subset_indices]
        weight = len(subset_labels) / len(labels)
        subset_entropy = calculate_entropy(subset_labels)
        weighted_entropy += weight * subset_entropy
    
    # Calculate Information Gain
    information_gain = total_entropy - weighted_entropy
    return information_gain

def find_best_feature(extracted_features, labels):
    """
    Find the best feature for the root node based on Information Gain.
    """
    num_features = extracted_features.shape[1]
    best_feature_index = -1
    best_information_gain = -1
    
    for i in range(num_features):
        feature_column = extracted_features[:, i]
        information_gain = calculate_information_gain(feature_column, labels)
        
        if information_gain > best_information_gain:
            best_information_gain = information_gain
            best_feature_index = i
    
    return best_feature_index, best_information_gain

def main():
    # Load data
    features_file = "D:\SEM-4\ML\CODES\Machine-Learning\Lab04\extracted_features.npy"
    labels_file = "D:\SEM-4\ML\CODES\Machine-Learning\Lab04\labels.npy"
    extracted_features, labels = load_data(features_file, labels_file)
    
    # Convert features to categorical with default parameters
    categorical_features_equal_width = convert_to_categorical(extracted_features)
    
    # Find the best feature for the root node
    best_feature_index, best_information_gain = find_best_feature(categorical_features_equal_width, labels)
    
    print("Best Feature Index (Equal Width Binning):", best_feature_index)
    print("Best Information Gain (Equal Width Binning):", best_information_gain)
    
    # Convert features to categorical with equal frequency binning, 7 bins
    categorical_features_equal_freq = convert_to_categorical(extracted_features, binning_type='equal_frequency', num_bins=7)
    
    # Find the best feature for the root node
    best_feature_index_freq, best_information_gain_freq = find_best_feature(categorical_features_equal_freq, labels)
    
    print("\nBest Feature Index (Equal Frequency Binning):", best_feature_index_freq)
    print("Best Information Gain (Equal Frequency Binning):", best_information_gain_freq)

if __name__ == "__main__":
    main()


Best Feature Index (Equal Width Binning): 254
Best Information Gain (Equal Width Binning): 0.14431896891734253

Best Feature Index (Equal Frequency Binning): 254
Best Information Gain (Equal Frequency Binning): 0.1590929509346024


In [3]:
import numpy as np
from collections import Counter
from math import log2

class Node:
    """
    Node class for the Decision Tree.
    """
    def __init__(self, feature_index=None, value=None, is_leaf=False, label=None):
        self.feature_index = feature_index  # Index of the feature
        self.value = value  # Value of the split
        self.is_leaf = is_leaf  # True if it's a leaf node
        self.label = label  # Predicted label if it's a leaf
        self.children = {}  # Dictionary to store children nodes

def load_data(features_file, labels_file):
    """
    Load features and labels from the provided files.
    """
    extracted_features = np.load(features_file)
    labels = np.load(labels_file)
    return extracted_features, labels

def convert_to_categorical(features, binning_type=None, num_bins=None):

    if binning_type is None:
        binning_type = 'equal_width'
    if num_bins is None:
        num_bins = 5
    
    categorical_features = []
    for feature in features.T:  # Transpose to loop through columns
        if binning_type == 'equal_width':
            bins = np.linspace(min(feature), max(feature), num_bins + 1)
        elif binning_type == 'equal_frequency':
            bins = np.quantile(feature, np.linspace(0, 1, num_bins + 1))
        else:
            raise ValueError("Invalid binning type. Use 'equal_width' or 'equal_frequency'.")
        
        digitized = np.digitize(feature, bins)
        categorical_features.append(digitized)
    
    return np.array(categorical_features).T

def calculate_entropy(labels):
    """
    Calculate the entropy of a set of labels.
    """
    label_counts = Counter(labels)
    total_samples = len(labels)
    
    entropy = 0
    for label in label_counts:
        prob = label_counts[label] / total_samples
        entropy -= prob * log2(prob)
    
    return entropy

def calculate_information_gain(feature_column, labels):
    """
    Calculate Information Gain for a feature.
    """
    # Calculate total entropy of the dataset
    total_entropy = calculate_entropy(labels)
    
    # Calculate the weighted entropy of each unique value in the feature
    unique_values = np.unique(feature_column)
    weighted_entropy = 0
    for value in unique_values:
        subset_indices = np.where(feature_column == value)[0]
        subset_labels = labels[subset_indices]
        weight = len(subset_labels) / len(labels)
        subset_entropy = calculate_entropy(subset_labels)
        weighted_entropy += weight * subset_entropy
    
    # Calculate Information Gain
    information_gain = total_entropy - weighted_entropy
    return information_gain

def find_best_feature(extracted_features, labels):
    """
    Find the best feature for the root node based on Information Gain.
    """
    num_features = extracted_features.shape[1]
    best_feature_index = -1
    best_information_gain = -1
    
    for i in range(num_features):
        feature_column = extracted_features[:, i]
        information_gain = calculate_information_gain(feature_column, labels)
        
        if information_gain > best_information_gain:
            best_information_gain = information_gain
            best_feature_index = i
    
    return best_feature_index, best_information_gain

def build_tree(extracted_features, labels, max_depth=None, min_samples_split=2):
   
    if max_depth is None:
        max_depth = float('inf')
    
    if len(np.unique(labels)) == 1 or len(labels) < min_samples_split or max_depth == 0:
        # Create a leaf node
        return Node(is_leaf=True, label=Counter(labels).most_common(1)[0][0])
    
    best_feature_index, best_information_gain = find_best_feature(extracted_features, labels)
    
    if best_information_gain == 0:
        # Create a leaf node
        return Node(is_leaf=True, label=Counter(labels).most_common(1)[0][0])
    
    best_feature_values = np.unique(extracted_features[:, best_feature_index])
    
    root = Node(feature_index=best_feature_index)
    
    for value in best_feature_values:
        subset_indices = np.where(extracted_features[:, best_feature_index] == value)[0]
        subset_features = extracted_features[subset_indices]
        subset_labels = labels[subset_indices]
        
        child = build_tree(subset_features, subset_labels, max_depth - 1, min_samples_split)
        root.children[value] = child
    
    return root

def print_tree(node, depth=0):
    """
    Print the Decision Tree.
    """
    if node.is_leaf:
        print("  " * depth, "Predicted Label:", node.label)
    else:
        print("  " * depth, "Feature", node.feature_index)
        for value, child_node in node.children.items():
            print("  " * (depth + 1), "Value", value, "->", end=" ")
            print_tree(child_node, depth + 2)

def main():
    # Load data
    features_file = "D:\SEM-4\ML\CODES\Machine-Learning\Lab04\extracted_features.npy"
    labels_file = "D:\SEM-4\ML\CODES\Machine-Learning\Lab04\labels.npy"
    extracted_features, labels = load_data(features_file, labels_file)
    
    # Convert features to categorical with default parameters
    categorical_features = convert_to_categorical(extracted_features)
    
    # Build the Decision Tree
    decision_tree = build_tree(categorical_features, labels, max_depth=3, min_samples_split=5)
    
    # Print the Decision Tree
    print("Decision Tree:")
    print_tree(decision_tree)

if __name__ == "__main__":
    main()


Decision Tree:
 Feature 254
   Value 1 ->      Feature 315
       Value 1 ->          Feature 494
           Value 1 ->              Predicted Label: 6
           Value 2 ->              Predicted Label: 1
           Value 3 ->              Predicted Label: 3
           Value 4 ->              Predicted Label: 0
           Value 5 ->              Predicted Label: 9
       Value 2 ->          Feature 15
           Value 1 ->              Predicted Label: 8
           Value 2 ->              Predicted Label: 1
           Value 3 ->              Predicted Label: 1
           Value 4 ->              Predicted Label: 1
           Value 5 ->              Predicted Label: 4
           Value 6 ->              Predicted Label: 1
       Value 3 ->          Feature 276
           Value 1 ->              Predicted Label: 8
           Value 2 ->              Predicted Label: 5
           Value 3 ->              Predicted Label: 1
           Value 4 ->              Predicted Label: 1
           Valu