In [1]:
import math
import pandas as pd


# Step 1: Entropy Function

def entropy(data, target_column):
    values, counts = data[target_column].value_counts().items()
    total = len(data[target_column])
    return -sum((count/total) * math.log2(count/total) for count in counts)

# -------------------------
# Step 2: Information Gain
# -------------------------
def information_gain(data, split_attribute, target_column):
    total_entropy = entropy(data, target_column)
    values, counts = data[split_attribute].value_counts().items()
    weighted_entropy = 0
    
    for v, count in zip(values, counts):
        subset = data[data[split_attribute] == v]
        weighted_entropy += (count/len(data)) * entropy(subset, target_column)
    
    return total_entropy - weighted_entropy

# -------------------------
# Step 3: ID3 Algorithm
# -------------------------
def id3(data, target_column, features):
    # If all labels are the same → return that label
    if len(data[target_column].unique()) == 1:
        return data[target_column].iloc[0]
    
    # If no features left → return most common label
    if len(features) == 0:
        return data[target_column].mode()[0]
    
    # Find best feature to split
    gains = [information_gain(data, f, target_column) for f in features]
    best_feature = features[gains.index(max(gains))]
    
    tree = {best_feature: {}}
    
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        new_features = [f for f in features if f != best_feature]
        subtree = id3(subset, target_column, new_features)
        tree[best_feature][value] = subtree
    
    return tree

# -------------------------
# Step 4: Example Dataset
# -------------------------
data = pd.DataFrame({
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 
                'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 
                    'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 
                 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 
             'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 
                   'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
})

features = list(data.columns[:-1])  # exclude target
target = 'PlayTennis'

# Build Decision Tree
tree = id3(data, target, features)
print("Decision Tree (ID3):")
print(tree)


TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [3]:
import math
import pandas as pd


# Step 1: Entropy Function

def entropy(data, target_column):
    counts = data[target_column].value_counts()
    total = len(data[target_column])
    return -sum((count/total) * math.log2(count/total) for count in counts)


# Step 2: Information Gain

def information_gain(data, split_attribute, target_column):
    total_entropy = entropy(data, target_column)
    values = data[split_attribute].unique()
    weighted_entropy = 0
    
    for v in values:
        subset = data[data[split_attribute] == v]
        weighted_entropy += (len(subset)/len(data)) * entropy(subset, target_column)
    
    return total_entropy - weighted_entropy


# Step 3: ID3 Algorithm

def id3(data, target_column, features):
    # If all labels are the same → return that label
    if len(data[target_column].unique()) == 1:
        return data[target_column].iloc[0]
    
    # If no features left → return most common label
    if len(features) == 0:
        return data[target_column].mode()[0]
    
    # Find best feature to split
    gains = [information_gain(data, f, target_column) for f in features]
    best_feature = features[gains.index(max(gains))]
    
    tree = {best_feature: {}}
    
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        new_features = [f for f in features if f != best_feature]
        subtree = id3(subset, target_column, new_features)
        tree[best_feature][value] = subtree
    
    return tree


# Step 4: Example Dataset

data = pd.DataFrame({
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 
                'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 
                    'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 
                 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 
             'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 
                   'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
})

features = list(data.columns[:-1])  # exclude target
target = 'PlayTennis'

# Build Decision Tree
tree = id3(data, target, features)
print("Decision Tree (ID3):")
print(tree)


Decision Tree (ID3):
{'Outlook': {'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}, 'Overcast': 'Yes', 'Rain': {'Wind': {'Weak': 'Yes', 'Strong': 'No'}}}}


In [4]:
import pandas as pd
import math

# --------- Entropy Function ----------
def entropy(data, target_column):
    counts = data[target_column].value_counts()
    total = len(data[target_column])
    return -sum((count/total) * math.log2(count/total) for count in counts)

# --------- Split Information ----------
def split_info(data, split_attribute):
    values = data[split_attribute].unique()
    total = len(data)
    si = 0
    for v in values:
        subset = data[data[split_attribute] == v]
        ratio = len(subset)/total
        si -= ratio * math.log2(ratio)
    return si

# --------- Gain Ratio (C4.5) ----------
def gain_ratio(data, split_attribute, target_column):
    total_entropy = entropy(data, target_column)
    values = data[split_attribute].unique()
    weighted_entropy = 0
    
    for v in values:
        subset = data[data[split_attribute] == v]
        weighted_entropy += (len(subset)/len(data)) * entropy(subset, target_column)
    
    info_gain = total_entropy - weighted_entropy
    si = split_info(data, split_attribute)
    if si == 0:   # to avoid divide by zero
        return 0
    return info_gain / si

# --------- C4.5 Algorithm ----------
def c45(data, target_column, features):
    if len(data[target_column].unique()) == 1:
        return data[target_column].iloc[0]  # Pure → Leaf
    
    if len(features) == 0:
        return data[target_column].mode()[0]  # Majority
    
    # Choose best feature by Gain Ratio
    gains = [gain_ratio(data, f, target_column) for f in features]
    best_feature = features[gains.index(max(gains))]
    
    tree = {best_feature: {}}
    
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        if subset.empty:
            tree[best_feature][value] = data[target_column].mode()[0]
        else:
            new_features = [f for f in features if f != best_feature]
            subtree = c45(subset, target_column, new_features)
            tree[best_feature][value] = subtree
    
    return tree

# --------- Example Dataset (Play Tennis) ----------
data = pd.DataFrame({
    'Outlook': ['Sunny','Sunny','Overcast','Rain','Rain','Rain','Overcast','Sunny',
                'Sunny','Rain','Sunny','Overcast','Overcast','Rain'],
    'Temperature': ['Hot','Hot','Hot','Mild','Cool','Cool','Cool','Mild',
                    'Cool','Mild','Mild','Mild','Hot','Mild'],
    'Humidity': ['High','High','High','High','Normal','Normal','Normal','High',
                 'Normal','Normal','Normal','High','Normal','High'],
    'Wind': ['Weak','Strong','Weak','Weak','Weak','Strong','Strong','Weak',
             'Weak','Weak','Strong','Strong','Weak','Strong'],
    'PlayTennis': ['No','No','Yes','Yes','Yes','No','Yes','No',
                   'Yes','Yes','Yes','Yes','Yes','No']
})

features = list(data.columns[:-1])
target = 'PlayTennis'

# Build tree using C4.5
tree = c45(data, target, features)
print("\nDecision Tree (C4.5):\n", tree)



Decision Tree (C4.5):
 {'Outlook': {'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}, 'Overcast': 'Yes', 'Rain': {'Wind': {'Weak': 'Yes', 'Strong': 'No'}}}}


In [2]:
import pandas as pd
import math

# ------------------- Gini Index -------------------
def gini_index(data, target_column):
    counts = data[target_column].value_counts()
    total = len(data[target_column])
    return 1 - sum((count/total)**2 for count in counts)

# ------------------- Gini Gain -------------------
def gini_gain(data, split_attribute, target_column):
    total_gini = gini_index(data, target_column)
    values = data[split_attribute].unique()
    weighted_gini = 0
    for v in values:
        subset = data[data[split_attribute] == v]
        weighted_gini += (len(subset)/len(data)) * gini_index(subset, target_column)
    return total_gini - weighted_gini

# ------------------- CART Algorithm -------------------
def cart(data, target_column, features):
    # If all examples are same class → return class
    if len(data[target_column].unique()) == 1:
        return data[target_column].iloc[0]
    # If no more features → return majority class
    if len(features) == 0:
        return data[target_column].mode()[0]

    # Choose best feature using Gini Gain
    gains = [gini_gain(data, f, target_column) for f in features]
    best_feature = features[gains.index(max(gains))]

    # Build tree as dictionary
    tree = {best_feature: {}}
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        if subset.empty:
            tree[best_feature][value] = data[target_column].mode()[0]
        else:
            new_features = [f for f in features if f != best_feature]
            tree[best_feature][value] = cart(subset, target_column, new_features)
    return tree

# ------------------- Example Dataset (Play Tennis) -------------------
data = pd.DataFrame({
    'Outlook': ['Sunny','Sunny','Overcast','Rain','Rain','Rain','Overcast','Sunny',
                'Sunny','Rain','Sunny','Overcast','Overcast','Rain'],
    'Temperature': ['Hot','Hot','Hot','Mild','Cool','Cool','Cool','Mild',
                    'Cool','Mild','Mild','Mild','Hot','Mild'],
    'Humidity': ['High','High','High','High','Normal','Normal','Normal','High',
                 'Normal','Normal','Normal','High','Normal','High'],
    'Wind': ['Weak','Strong','Weak','Weak','Weak','Strong','Strong','Weak',
             'Weak','Weak','Strong','Strong','Weak','Strong'],
    'PlayTennis': ['No','No','Yes','Yes','Yes','No','Yes','No',
                   'Yes','Yes','Yes','Yes','Yes','No']
})

features = list(data.columns[:-1])
target = 'PlayTennis'

# ------------------- Build CART Tree -------------------
tree_cart = cart(data, target, features)

print("\nDecision Tree (CART):\n", tree_cart)



Decision Tree (CART):
 {'Outlook': {'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}, 'Overcast': 'Yes', 'Rain': {'Wind': {'Weak': 'Yes', 'Strong': 'No'}}}}
