<a href="https://colab.research.google.com/github/vanadhisivakumar-source/Machine-learning-projects/blob/main/Decision%20tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

def entropy(target_col):
    """Return the base-2 Shannon entropy of the labels in ``target_col``.

    Args:
        target_col: Array-like of class labels accepted by ``np.unique``.

    Returns:
        The entropy in bits as a NumPy float; zero when the input is empty
        or contains a single distinct label.
    """
    _, label_counts = np.unique(target_col, return_counts=True)
    shares = label_counts / label_counts.sum()
    # Only strictly positive shares contribute, so log2 is never taken at 0.
    positive = shares[shares > 0]
    return np.sum(-positive * np.log2(positive))

def InfoGain(data, split_attribute_name, target_name="class"):
    """Return the information gain from splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the weighted average
    of the target entropies inside each distinct value of the split column,
    where each weight is that value's share of the rows.

    Args:
        data: pandas DataFrame containing the split and target columns.
        split_attribute_name: Name of the column to split on.
        target_name: Name of the target (label) column.

    Returns:
        The information gain in bits as a NumPy float.
    """
    total_entropy = entropy(data[target_name])
    split_column = data[split_attribute_name]
    vals, counts = np.unique(split_column, return_counts=True)
    total_count = np.sum(counts)

    # Select each partition with a boolean mask. The previous
    # ``data.where(...).dropna()`` also discarded matching rows that had a
    # missing value in any *other* column (while ``counts`` still included
    # them) and upcast integer columns to float.
    weighted_entropy = np.sum([
        (count / total_count) * entropy(data.loc[split_column == val, target_name])
        for val, count in zip(vals, counts)
    ])
    return total_entropy - weighted_entropy

def _majority_class(target_col):
    """Return the most frequent label in ``target_col`` (first in sorted order on ties)."""
    labels, counts = np.unique(target_col, return_counts=True)
    return labels[np.argmax(counts)]


def ID3(data, originaldata, features, target_attribute_name="class", parent_node_class=None):
    """Build a decision tree with the ID3 algorithm.

    Args:
        data: pandas DataFrame with the rows belonging to the current node.
        originaldata: The full, unsplit DataFrame; its majority class is the
            fallback label when ``data`` is empty.
        features: List of column names still available for splitting.
        target_attribute_name: Name of the target (label) column.
        parent_node_class: Majority class of the parent node, used as the
            leaf label when no features remain.

    Returns:
        Either a leaf label, or a nested dict of the form
        ``{feature: {value: subtree_or_label, ...}}``.
    """
    # The empty check must come first: on empty data the purity check below
    # would index into an empty ``np.unique`` result and raise IndexError.
    if len(data) == 0:
        return _majority_class(originaldata[target_attribute_name])

    target_col = data[target_attribute_name]
    labels = np.unique(target_col)
    # Pure node: every row carries the same label.
    if len(labels) <= 1:
        return labels[0]

    # No features left to split on: fall back to the parent's majority class.
    if len(features) == 0:
        if parent_node_class is None:
            # Root-level call with no features: use this node's own majority
            # rather than returning None as a leaf.
            return _majority_class(target_col)
        return parent_node_class

    # Majority class of this node, handed down to the children.
    parent_node_class = _majority_class(target_col)

    # Pick the feature with the highest information gain.
    gains = [InfoGain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(gains)]

    tree = {best_feature: {}}
    remaining_features = [f for f in features if f != best_feature]

    # One branch per observed value of the chosen feature.
    best_column = data[best_feature]
    for value in np.unique(best_column):
        # Boolean-mask selection keeps every matching row and the original
        # dtypes; ``where(...).dropna()`` dropped rows with a missing value in
        # any other column and turned integer columns into floats.
        sub_data = data[best_column == value]
        tree[best_feature][value] = ID3(
            sub_data, originaldata, remaining_features,
            target_attribute_name, parent_node_class,
        )

    return tree

# Script entry: load the play-tennis data, grow an ID3 tree, and print it.
# Column labels are supplied here rather than read from the file.
column_names = ['outlook', 'temperature', 'humidity', 'wind', 'class']
dataset = pd.read_csv('playtennis.csv', names=column_names)

# Every column other than the 'class' target is a candidate split feature.
initial_features = [column for column in dataset.columns if column != 'class']

# Grow the tree; the full dataset also serves as the "original" reference data.
tree = ID3(dataset, dataset, initial_features, target_attribute_name='class')

# Show the result.
print('\nDisplay Tree\n', tree)