In [None]:
import pandas as pd
import numpy as np

def read_dataset(file_path):
    """
    Loads a dataset from a comma-separated text file into a DataFrame.

    The first row of the file is treated as the header (the pandas
    `read_csv` defaults are used unchanged).
    """
    dataset = pd.read_csv(file_path)
    return dataset

def display_dataset(data):
    """
    Prints a "Dataset:" heading (preceded by a blank line) followed by the data.
    """
    # One call, newline-separated: same output as printing heading then data
    print("\nDataset:", data, sep="\n")

def entropy(target_col):
    """
    Computes the Shannon entropy (in bits) of the values in a target column.

    Each distinct value contributes -p * log2(p), where p is its relative
    frequency in the column. An empty column yields 0.
    """
    # Only the frequencies matter; the distinct values themselves are unused
    _, frequencies = np.unique(target_col, return_counts=True)
    shares = frequencies / frequencies.sum()

    # Every share comes from a non-zero count, so log2 never receives 0
    bits = 0
    for share in shares:
        bits = bits - share * np.log2(share)
    return bits

def gini_index(target_col):
    """
    Calculates the Gini index (Gini impurity) of a target column.

    The index is 1 - sum(p_i ** 2), where p_i is the relative frequency of
    the i-th distinct value in the column.

    Args:
        target_col: Array-like of class labels (anything np.unique accepts).

    Returns:
        The impurity as a float. A column with a single distinct value, or
        an empty column, has impurity 0.0.
    """
    # Only the counts are needed; the distinct values themselves are unused
    _, counts = np.unique(target_col, return_counts=True)

    total = counts.sum()

    # With no observations the formula would give 1 - 0 = 1.0 (maximum
    # impurity); an empty column carries no impurity, so report 0.0 instead
    if total == 0:
        return 0.0

    probabilities = counts / total
    return 1.0 - np.sum(probabilities**2)

def info_gain(data, split_attribute_name, target_name):
    """
    Calculates the Information Gain for a given split attribute.

    Information Gain is the entropy of the target column minus the
    size-weighted average entropy of the target within each group of rows
    that share a value of the split attribute.

    Args:
        data: DataFrame holding both the split attribute and the target.
        split_attribute_name: Name of the column to split on.
        target_name: Name of the target (class label) column.

    Returns:
        The Information Gain of splitting `data` on `split_attribute_name`.
    """
    # Pull both columns out once instead of re-indexing the frame per value
    attribute_col = data[split_attribute_name]
    target_col = data[target_name]

    # Entropy of the target before any split
    total_entropy = entropy(target_col)

    # Distinct attribute values and how many rows carry each of them
    vals, counts = np.unique(attribute_col, return_counts=True)

    # The row total is loop-invariant, so compute it once up front
    total_count = np.sum(counts)

    weighted_entropy = 0
    for value, count in zip(vals, counts):
        # Target labels of the rows where the attribute equals this value
        subset_target = target_col[attribute_col == value]

        # Weight the subset's entropy by its share of the rows
        weighted_entropy += (count / total_count) * entropy(subset_target)

    return total_entropy - weighted_entropy

def gini_gain(data, split_attribute_name, target_name):
    """
    Calculates the Gini Gain for a given split attribute.

    Gini Gain is the Gini index of the target column minus the size-weighted
    average Gini index of the target within each group of rows that share a
    value of the split attribute.

    Args:
        data: DataFrame holding both the split attribute and the target.
        split_attribute_name: Name of the column to split on.
        target_name: Name of the target (class label) column.

    Returns:
        The Gini Gain of splitting `data` on `split_attribute_name`.
    """
    # Pull both columns out once instead of re-indexing the frame per value
    attribute_col = data[split_attribute_name]
    target_col = data[target_name]

    # Gini index of the target before any split
    total_gini = gini_index(target_col)

    # Distinct attribute values and how many rows carry each of them
    vals, counts = np.unique(attribute_col, return_counts=True)

    # The row total is loop-invariant, so compute it once up front
    total_count = np.sum(counts)

    weighted_gini = 0
    for value, count in zip(vals, counts):
        # Target labels of the rows where the attribute equals this value
        subset_target = target_col[attribute_col == value]

        # Weight the subset's Gini index by its share of the rows
        weighted_gini += (count / total_count) * gini_index(subset_target)

    return total_gini - weighted_gini

def calculate_impurity_measures(data, target_name):
    """
    Scores every non-target column with both split criteria.

    Returns a pair of dictionaries, (Information Gain, Gini Gain), each
    keyed by feature name.
    """
    # Every column other than the target is treated as a candidate feature
    feature_names = data.columns.drop([target_name])

    information_by_feature = {
        name: info_gain(data, name, target_name) for name in feature_names
    }
    gini_by_feature = {
        name: gini_gain(data, name, target_name) for name in feature_names
    }

    return information_by_feature, gini_by_feature

def find_root_node(data, target_name):
    """
    Finds the root node of the decision tree under each split criterion.

    Args:
        data: DataFrame holding the feature columns and the target column.
        target_name: Name of the target (class label) column.

    Returns:
        A tuple (root_node_info, root_node_gini): the feature with the
        highest Information Gain and the feature with the highest Gini Gain.
        On ties, the feature that comes first in column order wins.

    Raises:
        ValueError: If `data` has no feature columns besides the target.
    """
    # Calculate Information Gain and Gini Gain for each feature
    info_gains, gini_gains = calculate_impurity_measures(data, target_name)

    # Both dictionaries share the same keys, so checking one is enough.
    # max() on an empty dict raises an opaque ValueError; raise the same
    # exception type with a message that names the actual problem.
    if not info_gains:
        raise ValueError(
            f"Cannot choose a root node: the dataset has no feature columns "
            f"besides the target {target_name!r}."
        )

    # Pick the feature with the largest score under each criterion
    root_node_info = max(info_gains, key=info_gains.get)
    root_node_gini = max(gini_gains, key=gini_gains.get)

    return root_node_info, root_node_gini

def menu():
    """
    Runs a menu-driven console interface for the dataset and decision tree calculations.

    Prompts for a dataset path, loads it, and treats the last column as the
    target. It then loops over a menu that can display the dataset, list the
    Information Gain or Gini Gain of every feature, or report the root node
    chosen by each criterion, until the user selects Exit.

    If the dataset cannot be read, an error message is printed and the
    function returns without showing the menu.
    """
    # Surrounding whitespace is never part of the intended path
    file_path = input("Enter the path to the dataset (e.g., 'data.csv'): ").strip()

    # Report an unreadable or unparsable file instead of crashing with a traceback
    try:
        df = read_dataset(file_path)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as error:
        print(f"Could not read dataset '{file_path}': {error}")
        return

    # Automatically get the last column as the target column
    target_name = df.columns[-1]

    while True:
        # Display the menu options
        print("\nMenu:")
        print("1. Display Dataset")
        print("2. Calculate Information Gain for Each Feature")
        print("3. Calculate Gini Gain for Each Feature")
        print("4. Find Root Node")
        print("5. Exit")

        # Tolerate stray spaces around the typed choice
        choice = input("Enter your choice: ").strip()

        if choice == '1':
            # Option 1: Display the dataset
            display_dataset(df)

        elif choice == '2':
            # Option 2: Calculate and display Information Gain for each feature
            info_gains, _ = calculate_impurity_measures(df, target_name=target_name)
            print("\nInformation Gain for each feature:")
            for feature, gain in info_gains.items():
                print(f"{feature}: {gain}")

        elif choice == '3':
            # Option 3: Calculate and display Gini Gain for each feature.
            # The values are gains, so the heading says "Gini Gain" to match
            # the menu entry rather than "Gini Index".
            _, gini_gains = calculate_impurity_measures(df, target_name=target_name)
            print("\nGini Gain for each feature:")
            for feature, gain in gini_gains.items():
                print(f"{feature}: {gain}")

        elif choice == '4':
            # Option 4: Find and display the root node under each criterion.
            # A dataset without feature columns makes the lookup raise
            # ValueError; report it and keep the menu running.
            try:
                root_node_info, root_node_gini = find_root_node(df, target_name=target_name)
            except ValueError as error:
                print(f"\nCannot determine a root node: {error}")
            else:
                print(f"\nRoot node based on Information Gain: {root_node_info}")
                print(f"Root node based on Gini Index: {root_node_gini}")

        elif choice == '5':
            # Option 5: Exit the program
            print("Exiting the program.")
            break

        else:
            # Handle invalid choices
            print("Invalid choice. Please try again.")

# Run the menu-driven program only when this code is executed directly
# (as a script or in a notebook cell), not when it is imported as a module
if __name__ == "__main__":
    menu()


In [2]:
import pandas as pd
import numpy as np

def read_dataset(file_path):
    """
    Loads a dataset from a comma-separated text file into a DataFrame.

    The first row of the file is treated as the header (the pandas
    `read_csv` defaults are used unchanged).
    """
    loaded = pd.read_csv(file_path)
    return loaded

def display_dataset(data):
    """
    Prints a "Dataset:" heading (preceded by a blank line) followed by the data.
    """
    # One call, newline-separated: same output as printing heading then data
    print("\nDataset:", data, sep="\n")

def entropy(target_col):
    """
    Computes the Shannon entropy (in bits) of the values in a target column.

    Each distinct value contributes -p * log2(p), where p is its relative
    frequency in the column. An empty column yields 0.
    """
    # Only the frequencies matter; the distinct values themselves are unused
    _, frequencies = np.unique(target_col, return_counts=True)
    shares = frequencies / frequencies.sum()

    # Every share comes from a non-zero count, so log2 never receives 0
    bits = 0
    for share in shares:
        bits = bits - share * np.log2(share)
    return bits

def gini_index(target_col):
    """
    Calculates the Gini index (Gini impurity) of a target column.

    The index is 1 - sum(p_i ** 2), where p_i is the relative frequency of
    the i-th distinct value in the column.

    Args:
        target_col: Array-like of class labels (anything np.unique accepts).

    Returns:
        The impurity as a float. A column with a single distinct value, or
        an empty column, has impurity 0.0.
    """
    # Only the counts are needed; the distinct values themselves are unused
    _, counts = np.unique(target_col, return_counts=True)

    total = counts.sum()

    # With no observations the formula would give 1 - 0 = 1.0 (maximum
    # impurity); an empty column carries no impurity, so report 0.0 instead
    if total == 0:
        return 0.0

    probabilities = counts / total
    return 1.0 - np.sum(probabilities**2)

def info_gain(data, split_attribute_name, target_name):
    """
    Calculates the Information Gain for a given split attribute.

    Information Gain is the entropy of the target column minus the
    size-weighted average entropy of the target within each group of rows
    that share a value of the split attribute.

    Args:
        data: DataFrame holding both the split attribute and the target.
        split_attribute_name: Name of the column to split on.
        target_name: Name of the target (class label) column.

    Returns:
        The Information Gain of splitting `data` on `split_attribute_name`.
    """
    # Pull both columns out once instead of re-indexing the frame per value
    attribute_col = data[split_attribute_name]
    target_col = data[target_name]

    # Entropy of the target before any split
    total_entropy = entropy(target_col)

    # Distinct attribute values and how many rows carry each of them
    vals, counts = np.unique(attribute_col, return_counts=True)

    # The row total is loop-invariant, so compute it once up front
    total_count = np.sum(counts)

    weighted_entropy = 0
    for value, count in zip(vals, counts):
        # Target labels of the rows where the attribute equals this value
        subset_target = target_col[attribute_col == value]

        # Weight the subset's entropy by its share of the rows
        weighted_entropy += (count / total_count) * entropy(subset_target)

    return total_entropy - weighted_entropy

def gini_gain(data, split_attribute_name, target_name):
    """
    Calculates the Gini Gain for a given split attribute.

    Gini Gain is the Gini index of the target column minus the size-weighted
    average Gini index of the target within each group of rows that share a
    value of the split attribute.

    Args:
        data: DataFrame holding both the split attribute and the target.
        split_attribute_name: Name of the column to split on.
        target_name: Name of the target (class label) column.

    Returns:
        The Gini Gain of splitting `data` on `split_attribute_name`.
    """
    # Pull both columns out once instead of re-indexing the frame per value
    attribute_col = data[split_attribute_name]
    target_col = data[target_name]

    # Gini index of the target before any split
    total_gini = gini_index(target_col)

    # Distinct attribute values and how many rows carry each of them
    vals, counts = np.unique(attribute_col, return_counts=True)

    # The row total is loop-invariant, so compute it once up front
    total_count = np.sum(counts)

    weighted_gini = 0
    for value, count in zip(vals, counts):
        # Target labels of the rows where the attribute equals this value
        subset_target = target_col[attribute_col == value]

        # Weight the subset's Gini index by its share of the rows
        weighted_gini += (count / total_count) * gini_index(subset_target)

    return total_gini - weighted_gini

def calculate_impurity_measures(data, target_name):
    """
    Scores every non-target column with both split criteria.

    Returns a pair of dictionaries, (Information Gain, Gini Gain), each
    keyed by feature name.
    """
    # Every column other than the target is treated as a candidate feature
    feature_names = data.columns.drop([target_name])

    information_by_feature = {
        name: info_gain(data, name, target_name) for name in feature_names
    }
    gini_by_feature = {
        name: gini_gain(data, name, target_name) for name in feature_names
    }

    return information_by_feature, gini_by_feature

def find_root_node(data, target_name):
    """
    Finds the root node of the decision tree under each split criterion.

    Args:
        data: DataFrame holding the feature columns and the target column.
        target_name: Name of the target (class label) column.

    Returns:
        A tuple (root_node_info, root_node_gini): the feature with the
        highest Information Gain and the feature with the highest Gini Gain.
        On ties, the feature that comes first in column order wins.

    Raises:
        ValueError: If `data` has no feature columns besides the target.
    """
    info_gains, gini_gains = calculate_impurity_measures(data, target_name)

    # Both dictionaries share the same keys, so checking one is enough.
    # max() on an empty dict raises an opaque ValueError; raise the same
    # exception type with a message that names the actual problem.
    if not info_gains:
        raise ValueError(
            f"Cannot choose a root node: the dataset has no feature columns "
            f"besides the target {target_name!r}."
        )

    # Pick the feature with the largest score under each criterion
    root_node_info = max(info_gains, key=info_gains.get)
    root_node_gini = max(gini_gains, key=gini_gains.get)
    return root_node_info, root_node_gini

def menu():
    """
    Runs a menu-driven console interface for the dataset and decision tree calculations.

    Prompts for a dataset path, loads it, and treats the last column as the
    target. It then loops over a menu that can display the dataset, list the
    Information Gain or Gini Gain of every feature, or report the root node
    chosen by each criterion, until the user selects Exit.

    If the dataset cannot be read, an error message is printed and the
    function returns without showing the menu.
    """
    # Surrounding whitespace is never part of the intended path
    file_path = input("Enter the path to the dataset (e.g., 'data.csv'): ").strip()

    # Report an unreadable or unparsable file instead of crashing with a traceback
    try:
        df = read_dataset(file_path)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as error:
        print(f"Could not read dataset '{file_path}': {error}")
        return

    # The last column is taken as the target column
    target_name = df.columns[-1]

    while True:
        print("\nMenu:")
        print("1. Display Dataset")
        print("2. Calculate Information Gain for Each Feature")
        print("3. Calculate Gini Gain for Each Feature")
        print("4. Find Root Node")
        print("5. Exit")

        # Tolerate stray spaces around the typed choice
        choice = input("Enter your choice: ").strip()

        if choice == '1':
            display_dataset(df)

        elif choice == '2':
            info_gains, _ = calculate_impurity_measures(df, target_name=target_name)
            print("\nInformation Gain for each feature:")
            for feature, gain in info_gains.items():
                print(f"{feature}: {gain}")

        elif choice == '3':
            # The values are gains, so the heading says "Gini Gain" to match
            # the menu entry rather than "Gini Index"
            _, gini_gains = calculate_impurity_measures(df, target_name=target_name)
            print("\nGini Gain for each feature:")
            for feature, gain in gini_gains.items():
                print(f"{feature}: {gain}")

        elif choice == '4':
            # A dataset without feature columns makes the lookup raise
            # ValueError; report it and keep the menu running
            try:
                root_node_info, root_node_gini = find_root_node(df, target_name=target_name)
            except ValueError as error:
                print(f"\nCannot determine a root node: {error}")
            else:
                print(f"\nRoot node based on Information Gain: {root_node_info}")
                print(f"Root node based on Gini Index: {root_node_gini}")

        elif choice == '5':
            print("Exiting the program.")
            break

        else:
            print("Invalid choice. Please try again.")

# Run the menu-driven program only when this code is executed directly
# (as a script or in a notebook cell), not when it is imported as a module
if __name__ == "__main__":
    menu()


Enter the path to the dataset (e.g., 'data.csv'):  tennis_dataset.csv



Menu:
1. Display Dataset
2. Calculate Information Gain for Each Feature
3. Calculate Gini Gain for Each Feature
4. Find Root Node
5. Exit


Enter your choice:  1



Dataset:
     Outlook  Temp Humidity    Wind Play Tennis
0      Sunny   Hot     High    Weak          No
1      Sunny   Hot     High  Strong          No
2   Overcast   Hot     High    Weak         Yes
3       Rain  Mild     High    Weak         Yes
4       Rain  Cool   Normal    Weak         Yes
5       Rain  Cool   Normal  Strong          No
6   Overcast  Cool   Normal  Strong         Yes
7      Sunny  Mild     High    Weak          No
8      Sunny  Cool   Normal    Weak         Yes
9       Rain  Mild   Normal    Weak         Yes
10     Sunny  Mild   Normal  Strong         Yes
11  Overcast  Mild     High  Strong         Yes
12  Overcast   Hot   Normal    Weak         Yes
13      Rain  Mild     High  Strong          No

Menu:
1. Display Dataset
2. Calculate Information Gain for Each Feature
3. Calculate Gini Gain for Each Feature
4. Find Root Node
5. Exit


Enter your choice:  2



Information Gain for each feature:
Outlook: 0.24674981977443933
Temp: 0.02922256565895487
Humidity: 0.15183550136234159
Wind: 0.04812703040826949

Menu:
1. Display Dataset
2. Calculate Information Gain for Each Feature
3. Calculate Gini Gain for Each Feature
4. Find Root Node
5. Exit


Enter your choice:  3



Gini Index for each feature:
Outlook: 0.11632653061224485
Temp: 0.018707482993197244
Humidity: 0.09183673469387743
Wind: 0.030612244897959162

Menu:
1. Display Dataset
2. Calculate Information Gain for Each Feature
3. Calculate Gini Gain for Each Feature
4. Find Root Node
5. Exit


Enter your choice:  4



Root node based on Information Gain: Outlook
Root node based on Gini Index: Outlook

Menu:
1. Display Dataset
2. Calculate Information Gain for Each Feature
3. Calculate Gini Gain for Each Feature
4. Find Root Node
5. Exit


Enter your choice:  5


Exiting the program.
