# Dataset

In [2]:
import pandas as pd

# Weather observations, one tuple per day, in the column order given below
columns = ['Outlook', 'Temp', 'Humidity', 'Wind', 'Decision']
records = [
    ('Sunny',    85, 85, 'Weak',   'No'),
    ('Sunny',    80, 90, 'Strong', 'No'),
    ('Overcast', 83, 78, 'Weak',   'Yes'),
    ('Rain',     70, 96, 'Weak',   'Yes'),
    ('Rain',     68, 80, 'Weak',   'Yes'),
    ('Rain',     65, 70, 'Strong', 'No'),
    ('Overcast', 64, 65, 'Strong', 'Yes'),
    ('Sunny',    72, 95, 'Weak',   'No'),
    ('Sunny',    69, 70, 'Weak',   'Yes'),
    ('Rain',     75, 80, 'Weak',   'Yes'),
    ('Sunny',    75, 70, 'Strong', 'Yes'),
    ('Overcast', 72, 90, 'Strong', 'Yes'),
    ('Overcast', 81, 75, 'Weak',   'Yes'),
    ('Rain',     71, 80, 'Strong', 'No'),
]

# Transpose the rows into the column-oriented mapping the DataFrame is built from
data = {name: list(column) for name, column in zip(columns, zip(*records))}

df = pd.DataFrame(data)
df


Unnamed: 0,Outlook,Temp,Humidity,Wind,Decision
0,Sunny,85,85,Weak,No
1,Sunny,80,90,Strong,No
2,Overcast,83,78,Weak,Yes
3,Rain,70,96,Weak,Yes
4,Rain,68,80,Weak,Yes
5,Rain,65,70,Strong,No
6,Overcast,64,65,Strong,Yes
7,Sunny,72,95,Weak,No
8,Sunny,69,70,Weak,Yes
9,Rain,75,80,Weak,Yes


# C4.5 Algorithm Implementation

In [3]:
import numpy as np

def calculate_entropy(y):
    """Calculate the Shannon entropy (in bits) of the labels.

    Args:
        y: Array-like of class labels (list, NumPy array or pandas Series).

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label. A single-class input yields exactly 0, and so does
        an empty input.
    """
    total = len(y)
    if total == 0:
        return np.float64(0.0)  # No labels: nothing to be uncertain about

    value_counts = np.unique(y, return_counts=True)[1]
    probabilities = value_counts / total
    # np.unique only reports labels that actually occur, so every probability
    # is strictly positive and log2 needs no epsilon. (An epsilon here biases
    # the result and makes a pure label set come out slightly negative.)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy + 0.0  # Turns the -0.0 of a pure label set into +0.0

def calculate_gain_ratio(df, feature, target):
    """Calculate the Gain Ratio obtained by splitting ``df`` on ``feature``.

    Every distinct value of ``feature`` forms its own partition, so numeric
    columns are handled exactly like categorical ones (no threshold search).

    Args:
        df: DataFrame holding both the feature column and the target column.
        feature: Name of the column to split on.
        target: Name of the label column.

    Returns:
        Information gain divided by split information. Returns 0 when ``df``
        is empty or when the feature takes a single value (split information
        is zero, so the ratio is undefined and the split is useless).
    """
    n_rows = len(df)
    if n_rows == 0:
        return 0  # Nothing to split

    total_entropy = calculate_entropy(df[target])

    # Distinct feature values and the share of rows that falls into each one
    values, counts = np.unique(df[feature], return_counts=True)
    weights = counts / n_rows

    weighted_entropy = 0
    for value, weight in zip(values, weights):
        subset = df[df[feature] == value]
        weighted_entropy += weight * calculate_entropy(subset[target])

    # Counts from np.unique are >= 1, so every weight is > 0 and log2 is safe
    # without an epsilon. With an epsilon a constant feature would produce a
    # tiny negative split info and slip past the zero guard below.
    split_info = -np.sum(weights * np.log2(weights))

    gain = total_entropy - weighted_entropy

    if split_info <= 0:
        return 0  # Avoid division by zero

    return gain / split_info

def find_best_feature(df, target):
    """Return the name of the column with the highest Gain Ratio.

    Every column except ``target`` is scored. On ties the earliest column
    wins; ``None`` is returned when there is nothing to score or no score
    exceeds -1.
    """
    candidates = df.columns.drop(target)
    ratios = [calculate_gain_ratio(df, name, target) for name in candidates]

    winner, top_ratio = None, -1
    for name, ratio in zip(candidates, ratios):
        if ratio > top_ratio:
            winner, top_ratio = name, ratio

    return winner

# Recursive function to build the decision tree
def build_tree(df, target):
    """Build a decision tree by recursively splitting on the best Gain Ratio.

    Each distinct value of the chosen feature becomes its own branch, so
    numeric columns are treated like categorical ones.

    Args:
        df: DataFrame with the feature columns and the target column.
        target: Name of the label column.

    Returns:
        * ``None`` when ``df`` has no rows.
        * A single label when all rows share it, or when no remaining feature
          can tell the rows apart (the most frequent label is used then).
        * Otherwise a nested dict ``{feature: {value: subtree, ...}}``.
    """
    if df.empty:
        return None  # Return None if there are no examples

    labels = df[target]
    if len(np.unique(labels)) == 1:
        return labels.values[0]  # Return the label if all examples have the same label

    # Only a feature with at least two distinct values can shrink the data.
    # Splitting on a constant column would recurse on the same rows forever
    # whenever identical feature rows carry conflicting labels.
    candidates = [
        column for column in df.columns.drop(target)
        if len(np.unique(df[column])) > 1
    ]
    if not candidates:
        return labels.value_counts().idxmax()  # Majority vote

    # Find the best feature to split on
    best_feature = find_best_feature(df[candidates + [target]], target)
    if best_feature is None:
        return labels.value_counts().idxmax()  # No usable split was scored

    tree = {best_feature: {}}
    for value in np.unique(df[best_feature]):
        subset = df[df[best_feature] == value]
        tree[best_feature][value] = build_tree(subset, target)

    return tree

# Grow the C4.5 tree on the full dataset and show it under a heading
decision_tree_c45 = build_tree(df, 'Decision')
print("Decision Tree (C4.5):", decision_tree_c45, sep="\n")


Decision Tree (C4.5):
{'Temp': {64: 'Yes', 65: 'No', 68: 'Yes', 69: 'Yes', 70: 'Yes', 71: 'No', 72: {'Outlook': {'Overcast': 'Yes', 'Sunny': 'No'}}, 75: 'Yes', 80: 'No', 81: 'Yes', 83: 'Yes', 85: 'No'}}


# CART Algorithm Implementation

In [4]:
def gini_impurity(y):
    """Gini impurity of a label collection: one minus the sum of squared class shares."""
    _, class_counts = np.unique(y, return_counts=True)
    class_shares = class_counts / len(y)
    return 1 - np.sum(np.square(class_shares))

def calculate_gini_gain(df, feature, target):
    """Reduction in Gini impurity achieved by partitioning ``df`` on ``feature``.

    Each distinct value of ``feature`` forms one partition; the partitions'
    impurities are averaged, weighted by their share of the rows, and
    subtracted from the impurity of the unsplit labels.
    """
    parent_gini = gini_impurity(df[target])
    n_rows = len(df)

    def partition_cost(value):
        # Impurity of the rows matching `value`, scaled by their share of df
        part_labels = df.loc[df[feature] == value, target]
        return (len(part_labels) / n_rows) * gini_impurity(part_labels)

    children_gini = sum(partition_cost(value) for value in np.unique(df[feature]))
    return parent_gini - children_gini

def find_best_feature_cart(df, target):
    """Return the name of the column with the largest Gini Gain.

    All columns other than ``target`` compete. The first column reaching the
    top score wins ties; ``None`` comes back when no column scores above -1
    (for instance when there are no feature columns at all).
    """
    winner = None
    top_gain = -1

    for candidate in df.columns.drop(target):
        gain = calculate_gini_gain(df, candidate, target)
        if gain > top_gain:
            winner, top_gain = candidate, gain

    return winner

# Recursive function to build the decision tree using CART
def build_tree_cart(df, target):
    """Build a decision tree by recursively splitting on the best Gini Gain.

    Each distinct value of the chosen feature becomes its own branch (a
    multiway split), so numeric columns are treated like categorical ones.

    Args:
        df: DataFrame with the feature columns and the target column.
        target: Name of the label column.

    Returns:
        * ``None`` when ``df`` has no rows.
        * A single label when all rows share it, or when no remaining feature
          can tell the rows apart (the most frequent label is used then).
        * Otherwise a nested dict ``{feature: {value: subtree, ...}}``.
    """
    if df.empty:
        return None  # Return None if there are no examples

    labels = df[target]
    if len(np.unique(labels)) == 1:
        return labels.values[0]  # Return the label if all examples have the same label

    # Only a feature with at least two distinct values can shrink the data.
    # Splitting on a constant column would recurse on the same rows forever
    # whenever identical feature rows carry conflicting labels.
    candidates = [
        column for column in df.columns.drop(target)
        if len(np.unique(df[column])) > 1
    ]
    if not candidates:
        return labels.value_counts().idxmax()  # Majority vote

    # Find the best feature to split on
    best_feature = find_best_feature_cart(df[candidates + [target]], target)
    if best_feature is None:
        return labels.value_counts().idxmax()  # No usable split was scored

    tree = {best_feature: {}}
    for value in np.unique(df[best_feature]):
        subset = df[df[best_feature] == value]
        tree[best_feature][value] = build_tree_cart(subset, target)

    return tree

# Grow the CART tree on the full dataset and show it under a heading
decision_tree_cart = build_tree_cart(df, 'Decision')
print("\nDecision Tree (CART):", decision_tree_cart, sep="\n")



Decision Tree (CART):
{'Temp': {64: 'Yes', 65: 'No', 68: 'Yes', 69: 'Yes', 70: 'Yes', 71: 'No', 72: {'Outlook': {'Overcast': 'Yes', 'Sunny': 'No'}}, 75: 'Yes', 80: 'No', 81: 'Yes', 83: 'Yes', 85: 'No'}}


# Classifying a new sample

In [5]:
def classify(tree, sample):
    """Walk the decision tree with ``sample`` and return the label it reaches.

    ``tree`` is either a leaf (any non-dict value, returned unchanged) or a
    dict ``{feature: {value: subtree}}``. ``sample`` maps feature names to
    values. ``None`` is returned when the sample's value for a tested
    feature has no branch in the tree.
    """
    node = tree
    while isinstance(node, dict):
        feature = next(iter(node))  # A node tests a single feature: its first key
        branches = node[feature]
        observed = sample[feature]
        if observed not in branches:
            return None  # Value never seen on this path while building the tree
        node = branches[observed]
    return node  # Reached a leaf

# Unseen day to run through both trees
new_sample = dict(Outlook='Sunny', Temp=75, Humidity=70, Wind='Strong')

# Query each tree first, then report the two answers together
decision_c45 = classify(decision_tree_c45, new_sample)
decision_cart = classify(decision_tree_cart, new_sample)

print(f"\nClassification result for new sample (C4.5): {decision_c45}")
print(f"Classification result for new sample (CART): {decision_cart}")



Classification result for new sample (C4.5): Yes
Classification result for new sample (CART): Yes
