In [None]:
#A1 Decision tree

import numpy as np
import pandas as pd

class DecisionTree:
    """ID3-style decision tree over categorical (or pre-binned) features.

    A fitted tree is stored in ``self.tree`` as nested dicts whose keys have
    the form ``"x_<column index> = <value>"``; leaves are class labels.
    """

    def __init__(self):
        # Set by build_tree(); stays None until a tree has been built.
        self.tree = None

    def entropy(self, labels):
        """Return the Shannon entropy (base 2) of a sequence of labels.

        An empty sequence has entropy 0.
        """
        labels = np.asarray(labels)
        if labels.size == 0:
            return 0.0
        _, label_counts = np.unique(labels, return_counts=True)
        probabilities = label_counts / labels.size
        # np.unique only reports labels that occur, so every probability is
        # strictly positive and log2 never sees zero.
        return -np.sum(probabilities * np.log2(probabilities))

    def information_gain(self, data, feature_values, target_values):
        """Return the information gain of splitting the targets on one feature.

        Args:
            data: Unused; kept so existing positional callers keep working.
            feature_values: Feature value per sample (sequence, array or Series).
            target_values: Class label per sample, aligned by position.

        Returns:
            Entropy of ``target_values`` minus the size-weighted entropy of the
            subsets obtained by grouping samples on equal feature value.
        """
        # Work on plain arrays so selection is positional; indexing a pandas
        # Series with positions would be interpreted as index labels instead.
        feature_values = np.asarray(feature_values)
        target_values = np.asarray(target_values)
        n_samples = len(target_values)
        if n_samples == 0:
            return 0.0

        total_entropy = self.entropy(target_values)

        weighted_entropy = 0.0
        # NaN never compares equal to itself, so samples whose feature is NaN
        # end up in no subset and add nothing to the weighted entropy.
        for value in set(feature_values.tolist()):
            subset_targets = target_values[feature_values == value]
            weight = len(subset_targets) / n_samples
            weighted_entropy += weight * self.entropy(subset_targets)

        return total_entropy - weighted_entropy

    def root_node_feature(self, features, target_values):
        """Return the column name with the highest information gain.

        Ties keep the first column encountered; returns None when
        ``features`` has no columns.
        """
        max_info_gain = -1
        root_feature = None
        for feature in features.columns:
            column = features[feature]
            gain = self.information_gain(column, column, target_values)
            if gain > max_info_gain:
                max_info_gain = gain
                root_feature = feature
        return root_feature

    def load_data(self, excel_path, binning_type=None, num_bins=None,
                  target_column='Disease'):
        """Read an Excel sheet and split it into features and labels.

        Args:
            excel_path: Path handed to ``pd.read_excel``.
            binning_type: ``'equal_width'`` or ``'frequency'``; binning is
                applied only when both this and ``num_bins`` are truthy.
            num_bins: Number of bins requested per numeric column.
            target_column: Name of the label column (default ``'Disease'``).

        Returns:
            ``(features, labels)``: a DataFrame without the target column and
            the target column as a Series. Numeric feature columns are
            replaced by integer bin indices when binning is requested.
        """
        df = pd.read_excel(excel_path)
        features = df.drop(target_column, axis=1)
        labels = df[target_column]

        if binning_type and num_bins:
            for col in features.columns:
                if pd.api.types.is_numeric_dtype(features[col]):
                    features[col] = self._discretize(features[col], num_bins, binning_type)

        return features, labels

    def _discretize(self, column, num_bins, binning_type):
        """Map a numeric Series to integer bin indices using ``binning`` edges."""
        edges = np.asarray(self.binning(column, num_bins, binning_type), dtype=float)
        edges = np.unique(edges[~np.isnan(edges)])
        if len(edges) < 2:
            # Constant (or all-missing) column: there is nothing to separate,
            # so every present value goes to bin 0 and NaN stays NaN.
            return column.where(column.isna(), 0)
        # include_lowest=True keeps the column minimum inside the first bin;
        # without it the minimum lies outside (min, ...] and becomes NaN.
        return pd.cut(column, bins=edges, labels=False, include_lowest=True)

    def is_pure(self, s):
        """Return True when ``s`` contains exactly one distinct value."""
        return len(set(s)) == 1

    def most_common(self, a):
        """Return the most frequent value in ``a``.

        Ties resolve to the smallest value because np.unique returns sorted
        values and np.argmax picks the first maximum. Raises ValueError for
        empty input (from np.argmax).
        """
        values, counts = np.unique(a, return_counts=True)
        return values[np.argmax(counts)]

    def recursive_split(self, x, y):
        """Build the (sub)tree for feature matrix ``x`` and labels ``y``.

        Args:
            x: 2-D array-like, one row per sample and one column per feature.
            y: 1-D array-like of class labels aligned with the rows of ``x``.

        Returns:
            None when ``y`` is empty, a class label when the node is a leaf,
            otherwise a dict mapping ``"x_<col> = <value>"`` to subtrees.
        """
        x = np.asarray(x)
        y = np.asarray(y)

        # Nothing to learn from: most_common() cannot pick a label here.
        if len(y) == 0:
            return None
        if self.is_pure(y):
            return self.most_common(y)

        gain = np.array([self.information_gain(y, x_attr, y) for x_attr in x.T])

        # No feature separates the labels any further: emit a majority leaf.
        if np.all(gain < 1e-6):
            return self.most_common(y)

        selected_attr = np.argmax(gain)
        sets = self.partition(x[:, selected_attr])

        res = {}
        for key, value in sets.items():
            # A NaN key matches no rows (NaN != NaN); skip empty partitions.
            if len(value) == 0:
                continue
            res["x_%d = %s" % (selected_attr, key)] = self.recursive_split(
                x.take(value, axis=0), y.take(value, axis=0)
            )

        return res

    def partition(self, a):
        """Map each distinct value of ``a`` to the row indices where it occurs."""
        return {c: (a == c).nonzero()[0] for c in np.unique(a)}

    def print_tree(self, d, depth=0):
        """Print a tree produced by ``recursive_split``, one space per level.

        ``d`` may also be a bare leaf (the tree is a single label, or None
        when nothing was built), in which case it is printed on its own line.
        """
        indent = ' ' * depth
        if not isinstance(d, dict):
            print(f"{indent}{d}")
            return
        for key, value in d.items():
            if isinstance(value, dict):
                print(f"{indent}{key}:")
                self.print_tree(value, depth + 1)
            else:
                print(f"{indent}{key}: {value}")

    def build_tree(self, features, labels):
        """Fit the tree on ``features``/``labels`` and store it in ``self.tree``."""
        self.tree = self.recursive_split(np.asarray(features), np.asarray(labels))

    def binning(self, data, num_bins, binning_type='equal_width'):
        """Return bin edges for ``data``.

        Args:
            data: Numeric Series (anything exposing ``min``/``max``).
            num_bins: Number of bins requested.
            binning_type: ``'equal_width'`` for evenly spaced edges between the
                minimum and maximum, ``'frequency'`` for quantile edges.

        Returns:
            A 1-D array of edges. ``'equal_width'`` yields ``num_bins + 1``
            edges; ``'frequency'`` may yield fewer because repeated quantiles
            are collapsed.

        Raises:
            ValueError: If ``binning_type`` is not one of the two options.
        """
        if binning_type == 'equal_width':
            return np.linspace(data.min(), data.max(), num_bins + 1)
        if binning_type == 'frequency':
            # Quantile edges put roughly the same number of samples in each
            # bin; pd.cut with an integer bin count would be equal-width.
            quantiles = pd.Series(data).quantile(np.linspace(0, 1, num_bins + 1))
            return np.unique(quantiles.to_numpy())
        raise ValueError("Invalid binning type. Choose 'equal_width' or 'frequency'.")

# Instantiate the tree builder.
dt = DecisionTree()

# Read the spreadsheet, requesting equal-width binning with 5 bins.
features, labels = dt.load_data(
    "C:/Users/vishn/Downloads/extracted_features.xlsx",
    binning_type='equal_width',
    num_bins=5,
)

# Fit the tree on the loaded features and labels.
dt.build_tree(features, labels)

# Write the fitted tree to standard output.
dt.print_tree(dt.tree)


In [1]:
#A2

import numpy as np
import pandas as pd

def entropy(labels):
    """
    Return the Shannon entropy (base 2) of the given labels.
    """
    # Only the per-label counts matter; the distinct labels themselves do not.
    _, counts = np.unique(labels, return_counts=True)
    p = counts / len(labels)
    return -np.sum(p * np.log2(p))

def information_gain(data, feature_values, target_values):
    """
    Return the information gain of splitting the targets on one feature.

    Args:
        data: Unused; kept so existing positional callers keep working.
        feature_values: Feature value per sample (sequence, array or Series).
        target_values: Class label per sample, aligned by position.

    Returns:
        Entropy of ``target_values`` minus the size-weighted entropy of the
        subsets obtained by grouping samples on equal feature value.
    """
    # Work on plain arrays so selection is positional; indexing a pandas
    # Series with positions would be interpreted as index labels instead.
    feature_values = np.asarray(feature_values)
    target_values = np.asarray(target_values)
    n_samples = len(target_values)
    if n_samples == 0:
        return 0.0

    total_entropy = entropy(target_values)

    weighted_entropy = 0.0
    # NaN never compares equal to itself, so samples whose feature is NaN
    # end up in no subset and add nothing to the weighted entropy.
    for value in set(feature_values.tolist()):
        subset_targets = target_values[feature_values == value]
        weight = len(subset_targets) / n_samples
        weighted_entropy += weight * entropy(subset_targets)

    return total_entropy - weighted_entropy

def root_node_feature(features, target_values):
    """
    Pick the column whose split yields the largest information gain.

    The first column wins ties; None is returned if there are no columns.
    """
    best_name, best_gain = None, -1
    for name in features.columns:
        column = features[name]
        candidate = information_gain(column, column, target_values)
        if candidate > best_gain:
            best_name, best_gain = name, candidate
    return best_name

def load_data(excel_path, binning_type=None, num_bins=None, target_column='Disease'):
    """
    Read an Excel sheet and split it into features and labels.

    Args:
        excel_path: Path handed to ``pd.read_excel``.
        binning_type: Any truthy value enables binning; this function always
            applies equal-width binning via ``binning_equal_width``.
        num_bins: Number of bins per numeric column; binning is skipped when
            this is falsy.
        target_column: Name of the label column (default ``'Disease'``).

    Returns:
        ``(features, labels)``: a DataFrame without the target column and the
        target column as a Series. Numeric feature columns are replaced by
        integer bin indices when binning is requested.
    """
    df = pd.read_excel(excel_path)
    features = df.drop(target_column, axis=1)
    labels = df[target_column]

    if binning_type and num_bins:
        for col in features.columns:
            column = features[col]
            if not pd.api.types.is_numeric_dtype(column):
                continue
            if column.nunique(dropna=True) <= 1:
                # A constant column would give identical bin edges, which
                # pd.cut rejects; put every present value in bin 0 instead.
                features[col] = column.where(column.isna(), 0)
                continue
            bins = binning_equal_width(column, num_bins)
            # include_lowest=True keeps the column minimum inside the first
            # bin; without it the minimum lies outside (min, ...] and is NaN.
            features[col] = pd.cut(column, bins=bins, labels=False, include_lowest=True)

    return features, labels

def binning_equal_width(feature_values, num_bins):
    """
    Return ``num_bins + 1`` evenly spaced bin edges spanning the data range.

    The last edge is the exact maximum rather than a computed value.
    """
    lo, hi = feature_values.min(), feature_values.max()
    step = (hi - lo) / num_bins
    # Left edges of every bin, followed by the exact upper bound.
    return [lo + k * step for k in range(num_bins)] + [hi]

def is_pure(s):
    """Return True when ``s`` holds exactly one distinct value."""
    distinct = set(s)
    return len(distinct) == 1

def most_common(a):
    """
    Return the most frequent value in ``a``.

    np.unique yields sorted values and argmax takes the first maximum, so
    ties resolve to the smallest value.
    """
    distinct, tallies = np.unique(a, return_counts=True)
    return distinct[tallies.argmax()]

def recursive_split(x, y):
    """
    Build the (sub)tree for feature matrix ``x`` and labels ``y``.

    Args:
        x: 2-D array-like, one row per sample and one column per feature.
        y: 1-D array-like of class labels aligned with the rows of ``x``.

    Returns:
        None when ``y`` is empty, a class label when the node is a leaf,
        otherwise a dict mapping ``"x_<col> = <value>"`` to subtrees.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Nothing to learn from: most_common() cannot pick a label here.
    if len(y) == 0:
        return None
    if is_pure(y):
        return most_common(y)

    gain = np.array([information_gain(y, x_attr, y) for x_attr in x.T])

    # No feature separates the labels any further: emit a majority leaf.
    if np.all(gain < 1e-6):
        return most_common(y)

    selected_attr = np.argmax(gain)
    sets = partition(x[:, selected_attr])

    res = {}
    for key, value in sets.items():
        # A NaN key matches no rows (NaN != NaN); skip empty partitions.
        if len(value) == 0:
            continue
        res["x_%d = %s" % (selected_attr, key)] = recursive_split(
            x.take(value, axis=0), y.take(value, axis=0)
        )

    return res


def partition(a):
    """Map each distinct value of ``a`` to the indices where it occurs."""
    groups = {}
    for distinct_value in np.unique(a):
        groups[distinct_value] = np.nonzero(a == distinct_value)[0]
    return groups

def print_tree(d, depth=0):
    """
    Print a nested-dict decision tree, indenting one space per level.

    Args:
        d: Mapping of branch description to subtree or leaf label. A bare
            leaf (any non-dict value) is also accepted and is printed on its
            own line, so a tree that consists of a single label does not
            raise.
        depth: Current nesting level; controls the indentation width.
    """
    indent = ' ' * depth
    if not isinstance(d, dict):
        print(f"{indent}{d}")
        return
    for key, value in d.items():
        if isinstance(value, dict):
            print(f"{indent}{key}:")
            print_tree(value, depth + 1)
        else:
            print(f"{indent}{key}: {value}")

# Read the spreadsheet, requesting equal-width binning with 5 bins.
features, labels = load_data(
    "C:/Users/vishn/Downloads/extracted_features.xlsx",
    binning_type='equal_width',
    num_bins=5,
)

# Column with the highest information gain; the remaining statements of this
# cell do not read it.
root_feature = root_node_feature(features, labels)

# Grow the decision tree from the raw arrays.
d = recursive_split(features.to_numpy(), labels.to_numpy())

# Write the resulting tree to standard output.
print_tree(d)


In [2]:
#A3  Expand the above functions to build your own Decision Tree module.

import numpy as np
import pandas as pd

class DecisionTree:
    """ID3-style decision tree over categorical (or pre-binned) features.

    A fitted tree is stored in ``self.tree`` as nested dicts whose keys have
    the form ``"x_<column index> = <value>"``; leaves are class labels.
    """

    def __init__(self):
        # Set by build_tree(); stays None until a tree has been built.
        self.tree = None

    def entropy(self, labels):
        """Return the Shannon entropy (base 2) of a sequence of labels.

        An empty sequence has entropy 0.
        """
        labels = np.asarray(labels)
        if labels.size == 0:
            return 0.0
        _, label_counts = np.unique(labels, return_counts=True)
        probabilities = label_counts / labels.size
        # np.unique only reports labels that occur, so every probability is
        # strictly positive and log2 never sees zero.
        return -np.sum(probabilities * np.log2(probabilities))

    def information_gain(self, data, feature_values, target_values):
        """Return the information gain of splitting the targets on one feature.

        Args:
            data: Unused; kept so existing positional callers keep working.
            feature_values: Feature value per sample (sequence, array or Series).
            target_values: Class label per sample, aligned by position.

        Returns:
            Entropy of ``target_values`` minus the size-weighted entropy of the
            subsets obtained by grouping samples on equal feature value.
        """
        # Work on plain arrays so selection is positional; indexing a pandas
        # Series with positions would be interpreted as index labels instead.
        feature_values = np.asarray(feature_values)
        target_values = np.asarray(target_values)
        n_samples = len(target_values)
        if n_samples == 0:
            return 0.0

        total_entropy = self.entropy(target_values)

        weighted_entropy = 0.0
        # NaN never compares equal to itself, so samples whose feature is NaN
        # end up in no subset and add nothing to the weighted entropy.
        for value in set(feature_values.tolist()):
            subset_targets = target_values[feature_values == value]
            weight = len(subset_targets) / n_samples
            weighted_entropy += weight * self.entropy(subset_targets)

        return total_entropy - weighted_entropy

    def root_node_feature(self, features, target_values):
        """Return the column name with the highest information gain.

        Ties keep the first column encountered; returns None when
        ``features`` has no columns.
        """
        max_info_gain = -1
        root_feature = None
        for feature in features.columns:
            column = features[feature]
            gain = self.information_gain(column, column, target_values)
            if gain > max_info_gain:
                max_info_gain = gain
                root_feature = feature
        return root_feature

    def load_data(self, excel_path, binning_type=None, num_bins=None,
                  target_column='Disease'):
        """Read an Excel sheet and split it into features and labels.

        Args:
            excel_path: Path handed to ``pd.read_excel``.
            binning_type: ``'equal_width'`` or ``'frequency'``; binning is
                applied only when both this and ``num_bins`` are truthy.
            num_bins: Number of bins requested per numeric column.
            target_column: Name of the label column (default ``'Disease'``).

        Returns:
            ``(features, labels)``: a DataFrame without the target column and
            the target column as a Series. Numeric feature columns are
            replaced by integer bin indices when binning is requested.
        """
        df = pd.read_excel(excel_path)
        features = df.drop(target_column, axis=1)
        labels = df[target_column]

        if binning_type and num_bins:
            for col in features.columns:
                if pd.api.types.is_numeric_dtype(features[col]):
                    features[col] = self._discretize(features[col], num_bins, binning_type)

        return features, labels

    def _discretize(self, column, num_bins, binning_type):
        """Map a numeric Series to integer bin indices using ``binning`` edges."""
        edges = np.asarray(self.binning(column, num_bins, binning_type), dtype=float)
        edges = np.unique(edges[~np.isnan(edges)])
        if len(edges) < 2:
            # Constant (or all-missing) column: there is nothing to separate,
            # so every present value goes to bin 0 and NaN stays NaN.
            return column.where(column.isna(), 0)
        # include_lowest=True keeps the column minimum inside the first bin;
        # without it the minimum lies outside (min, ...] and becomes NaN.
        return pd.cut(column, bins=edges, labels=False, include_lowest=True)

    def is_pure(self, s):
        """Return True when ``s`` contains exactly one distinct value."""
        return len(set(s)) == 1

    def most_common(self, a):
        """Return the most frequent value in ``a``.

        Ties resolve to the smallest value because np.unique returns sorted
        values and np.argmax picks the first maximum. Raises ValueError for
        empty input (from np.argmax).
        """
        values, counts = np.unique(a, return_counts=True)
        return values[np.argmax(counts)]

    def recursive_split(self, x, y):
        """Build the (sub)tree for feature matrix ``x`` and labels ``y``.

        Args:
            x: 2-D array-like, one row per sample and one column per feature.
            y: 1-D array-like of class labels aligned with the rows of ``x``.

        Returns:
            None when ``y`` is empty, a class label when the node is a leaf,
            otherwise a dict mapping ``"x_<col> = <value>"`` to subtrees.
        """
        x = np.asarray(x)
        y = np.asarray(y)

        # Nothing to learn from: most_common() cannot pick a label here.
        if len(y) == 0:
            return None
        if self.is_pure(y):
            return self.most_common(y)

        gain = np.array([self.information_gain(y, x_attr, y) for x_attr in x.T])

        # No feature separates the labels any further: emit a majority leaf.
        if np.all(gain < 1e-6):
            return self.most_common(y)

        selected_attr = np.argmax(gain)
        sets = self.partition(x[:, selected_attr])

        res = {}
        for key, value in sets.items():
            # A NaN key matches no rows (NaN != NaN); skip empty partitions.
            if len(value) == 0:
                continue
            res["x_%d = %s" % (selected_attr, key)] = self.recursive_split(
                x.take(value, axis=0), y.take(value, axis=0)
            )

        return res

    def partition(self, a):
        """Map each distinct value of ``a`` to the row indices where it occurs."""
        return {c: (a == c).nonzero()[0] for c in np.unique(a)}

    def print_tree(self, d, depth=0):
        """Print a tree produced by ``recursive_split``, one space per level.

        ``d`` may also be a bare leaf (the tree is a single label, or None
        when nothing was built), in which case it is printed on its own line.
        """
        indent = ' ' * depth
        if not isinstance(d, dict):
            print(f"{indent}{d}")
            return
        for key, value in d.items():
            if isinstance(value, dict):
                print(f"{indent}{key}:")
                self.print_tree(value, depth + 1)
            else:
                print(f"{indent}{key}: {value}")

    def build_tree(self, features, labels):
        """Fit the tree on ``features``/``labels`` and store it in ``self.tree``."""
        self.tree = self.recursive_split(np.asarray(features), np.asarray(labels))

    def binning(self, data, num_bins, binning_type='equal_width'):
        """Return bin edges for ``data``.

        Args:
            data: Numeric Series (anything exposing ``min``/``max``).
            num_bins: Number of bins requested.
            binning_type: ``'equal_width'`` for evenly spaced edges between the
                minimum and maximum, ``'frequency'`` for quantile edges.

        Returns:
            A 1-D array of edges. ``'equal_width'`` yields ``num_bins + 1``
            edges; ``'frequency'`` may yield fewer because repeated quantiles
            are collapsed.

        Raises:
            ValueError: If ``binning_type`` is not one of the two options.
        """
        if binning_type == 'equal_width':
            return np.linspace(data.min(), data.max(), num_bins + 1)
        if binning_type == 'frequency':
            # Quantile edges put roughly the same number of samples in each
            # bin; pd.cut with an integer bin count would be equal-width.
            quantiles = pd.Series(data).quantile(np.linspace(0, 1, num_bins + 1))
            return np.unique(quantiles.to_numpy())
        raise ValueError("Invalid binning type. Choose 'equal_width' or 'frequency'.")

# Instantiate the tree builder.
dt = DecisionTree()

# Read the spreadsheet, requesting equal-width binning with 5 bins.
features, labels = dt.load_data(
    "C:/Users/vishn/Downloads/extracted_features.xlsx",
    binning_type='equal_width',
    num_bins=5,
)

# Fit the tree on the loaded features and labels.
dt.build_tree(features, labels)

# Write the fitted tree to standard output.
dt.print_tree(dt.tree)
