In [5]:
import pandas as pd 
import math


In [2]:
# The toy "play football" dataset, written one observation per row so each
# day can be checked at a glance against the printed table.
COLUMN_NAMES = ["Day", "Weather", "Temperature", "Humidity", "Wind", "PlayFootball"]
OBSERVATIONS = [
    ("day1", "Sunny", "Hot", "High", "Weak", "No"),
    ("day2", "Sunny", "Hot", "High", "Strong", "No"),
    ("day3", "Cloudy", "Hot", "High", "Weak", "Yes"),
    ("day4", "Rain", "Mild", "High", "Weak", "Yes"),
    ("day5", "Rain", "Cool", "Normal", "Weak", "Yes"),
    ("day6", "Rain", "Cool", "Normal", "Strong", "No"),
    ("day7", "Cloudy", "Cool", "Normal", "Strong", "Yes"),
    ("day8", "Sunny", "Mild", "High", "Weak", "No"),
    ("day9", "Sunny", "Cool", "Normal", "Weak", "Yes"),
    ("day10", "Rain", "Mild", "Normal", "Weak", "Yes"),
    ("day11", "Sunny", "Mild", "Normal", "Strong", "Yes"),
    ("day12", "Cloudy", "Mild", "High", "Strong", "Yes"),
    ("day13", "Cloudy", "Hot", "Normal", "Weak", "Yes"),
    ("day14", "Rain", "Mild", "High", "Strong", "No"),
]

# Transpose the rows into a {column name: list of values} mapping, which is
# the shape later cells pass to pd.DataFrame.
data = {
    name: list(values)
    for name, values in zip(COLUMN_NAMES, zip(*OBSERVATIONS))
}

df = pd.DataFrame(data)
print(df)


      Day Weather Temperature Humidity    Wind PlayFootball
0    day1   Sunny         Hot     High    Weak           No
1    day2   Sunny         Hot     High  Strong           No
2    day3  Cloudy         Hot     High    Weak          Yes
3    day4    Rain        Mild     High    Weak          Yes
4    day5    Rain        Cool   Normal    Weak          Yes
5    day6    Rain        Cool   Normal  Strong           No
6    day7  Cloudy        Cool   Normal  Strong          Yes
7    day8   Sunny        Mild     High    Weak           No
8    day9   Sunny        Cool   Normal    Weak          Yes
9   day10    Rain        Mild   Normal    Weak          Yes
10  day11   Sunny        Mild   Normal  Strong          Yes
11  day12  Cloudy        Mild     High  Strong          Yes
12  day13  Cloudy         Hot   Normal    Weak          Yes
13  day14    Rain        Mild     High  Strong           No


In [3]:
# Rebuild the frame from `data` (so this cell can be re-run safely) and drop
# "Day": it is a per-row label, and every remaining non-target column is
# later offered to ID3 as a candidate feature.
df = pd.DataFrame(data).drop(columns=["Day"])

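Function for calculating entropy: for a set of labels $S$ whose classes occur with proportions $p_i$, $H(S) = -\sum_i p_i \log_2 p_i$ (measured in bits).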

In [6]:
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the labels in ``target_col``.

    Parameters
    ----------
    target_col : pandas.Series
        Class labels. Missing labels (NaN/None) are ignored.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct labels that actually occur,
        where ``p`` is a label's share of the non-missing entries. ``0.0``
        when there are no labels to count or only one distinct label.
    """
    counts = target_col.value_counts()

    # value_counts() leaves out missing labels, so normalise by the number of
    # counted labels rather than len(target_col); otherwise the probabilities
    # would not sum to 1 when the column contains NaN/None.
    total = int(counts.sum())
    if total == 0:
        return 0.0

    ent = 0.0
    for count in counts:
        # Categorical columns report unused categories with a count of 0;
        # their contribution is 0 by convention and log2(0) would raise.
        if count == 0:
            continue
        p = count / total
        ent -= p * math.log2(p)
    return ent

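Information gain: the reduction in the target's entropy obtained by splitting the examples $S$ on a feature $A$, $IG(S, A) = H(S) - \sum_{v \in \mathrm{values}(A)} \frac{|S_v|}{|S|} H(S_v)$.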

In [7]:
def information_gain(data, feature, target="PlayFootball"):
    """Return how much splitting ``data`` on ``feature`` reduces target entropy.

    Parameters
    ----------
    data : pandas.DataFrame
        Examples; must contain the ``feature`` and ``target`` columns.
    feature : str
        Column whose distinct values define the split.
    target : str, default "PlayFootball"
        Column holding the class labels.

    Returns
    -------
    float
        Entropy of ``data[target]`` minus the size-weighted sum of the target
        entropies of each partition induced by ``feature``.
    """
    parent_entropy = entropy(data[target])
    n_rows = len(data)
    feature_column = data[feature]

    def weighted_branch_entropy(value):
        # Target entropy of the rows where feature == value, scaled by the
        # fraction of all rows that fall into that branch.
        branch = data[feature_column == value]
        return (len(branch) / n_rows) * entropy(branch[target])

    # Distinct values are enumerated in value_counts() order.
    split_entropy = sum(
        weighted_branch_entropy(value)
        for value in feature_column.value_counts().index
    )
    return parent_entropy - split_entropy

ID3(examples, target_attribute, features):

    1. Create a new node for the tree

    2. If all examples have the same value for the target_attribute:
            return that value (make it a leaf node)

    3. If features list is empty:
            return the most common value of target_attribute in examples

    4. For each feature in features:
            compute InformationGain(examples, feature)

    5. best_feature ← feature with highest InformationGain

    6. Make best_feature the decision attribute for this node

    7. For each value v of best_feature:
            - Let subset ← examples where best_feature = v

            - If subset is empty:
                    attach a leaf node with most common target value in examples

            - Else:
                    child ← ID3(subset, target_attribute, features − {best_feature})
                    connect child as subtree below value v

    8. Return the node


In [8]:
def id3(data, features, target="PlayFootball"):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Examples; must contain ``target`` and every column in ``features``.
    features : list of str
        Columns still available for splitting.
    target : str, default "PlayFootball"
        Column holding the class labels.

    Returns
    -------
    label or dict
        A bare class label for a leaf, otherwise
        ``{best_feature: {value: subtree_or_label, ...}}``. Branches are
        created only for the values of ``best_feature`` that occur in ``data``.
    """
    class_counts = data[target].value_counts()

    # Leaf: every example carries the same label.
    if len(class_counts) == 1:
        return class_counts.index[0]

    # Leaf: nothing left to split on, so fall back to the majority label.
    if len(features) == 0:
        return class_counts.idxmax()

    # Split on the feature with the highest information gain (the first such
    # feature in `features` wins ties).
    best_feature = max(
        features,
        key=lambda name: information_gain(data, name, target),
    )
    # The chosen feature is not reused further down this path.
    remaining_features = [name for name in features if name != best_feature]

    branches = {}
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        if subset.empty:
            # No example matches this value: use the majority label here.
            branches[value] = class_counts.idxmax()
        else:
            branches[value] = id3(subset, remaining_features, target)

    return {best_feature: branches}


Building Decision Tree

In [9]:
# Every column except the class label is a candidate split feature.
target_column = "PlayFootball"
features = df.columns.tolist()
features.remove(target_column)

decision_tree = id3(df, features, target=target_column)

# Nested dicts: {feature: {value: subtree or class label}}.
print("Decision Tree (ID3):", decision_tree, sep="\n")

Decision Tree (ID3):
{'Weather': {'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}, 'Cloudy': 'Yes', 'Rain': {'Wind': {'Weak': 'Yes', 'Strong': 'No'}}}}
