In [16]:
import pandas as pd
import numpy as np
from math import log2
from collections import Counter
from pprint import pprint
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold

In [17]:
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in ``target_col``.

    Every distinct value with relative frequency ``p`` contributes
    ``-p * log2(p)``; a column with a single distinct value (or no values
    at all) therefore has zero entropy.
    """
    _, frequencies = np.unique(target_col, return_counts=True)
    total = np.sum(frequencies)

    terms = []
    for frequency in frequencies:
        share = frequency / total
        terms.append(share * log2(share))

    return -np.sum(terms)

def info_gain(data, split_attribute, target_attribute):
    """Return the information gain of splitting ``data`` on ``split_attribute``.

    The gain is the entropy of ``target_attribute`` over all rows minus the
    size-weighted average entropy of ``target_attribute`` inside each
    partition induced by the distinct values of ``split_attribute``.
    """
    split_column = data[split_attribute]
    vals, counts = np.unique(split_column, return_counts=True)
    n_rows = np.sum(counts)

    partition_terms = []
    for val, count in zip(vals, counts):
        partition_labels = data[split_column == val][target_attribute]
        partition_terms.append((count / n_rows) * entropy(partition_labels))

    return entropy(data[target_attribute]) - np.sum(partition_terms)

def split_info(data, split_attribute):
    """Return the split information (intrinsic value) of ``split_attribute``.

    This is the entropy, in bits, of the distribution of rows across the
    distinct values of ``split_attribute``; ``c45`` divides information gain
    by it to obtain the gain ratio.
    """
    _, sizes = np.unique(data[split_attribute], return_counts=True)
    n_rows = np.sum(sizes)
    fractions = [size / n_rows for size in sizes]
    return -np.sum([fraction * log2(fraction) for fraction in fractions])


In [18]:
def id3(data, original_data, features, target_attribute, parent_class=None):
    """Recursively build an ID3 decision tree using information gain.

    Parameters
    ----------
    data : DataFrame-like
        Rows reaching the current node.
    original_data : DataFrame-like
        Rows of the parent node; its majority class is the fallback label
        when ``data`` is empty.
    features : list
        Column names still available for splitting.
    target_attribute : str
        Name of the label column.
    parent_class : optional
        Majority class of the parent node. Only used as a last resort when
        both ``data`` and ``original_data`` are empty.

    Returns
    -------
    A leaf label, or a nested dict of the form
    ``{feature: {value: subtree_or_label, ...}}``.
    """
    # The empty check has to come first: asking an empty partition for its
    # first unique label would raise IndexError.
    if len(data) == 0:
        fallback_labels, fallback_counts = np.unique(
            original_data[target_attribute], return_counts=True
        )
        if len(fallback_labels) == 0:
            return parent_class
        return fallback_labels[np.argmax(fallback_counts)]

    labels, counts = np.unique(data[target_attribute], return_counts=True)

    # Pure node: every row carries the same label.
    if len(labels) == 1:
        return labels[0]

    majority_class = labels[np.argmax(counts)]

    # No attribute left to split on: label the leaf with the majority class
    # of the rows that actually reached this node.
    if len(features) == 0:
        return majority_class

    gains = [info_gain(data, feature, target_attribute) for feature in features]
    best_feature = features[np.argmax(gains)]
    remaining_features = [f for f in features if f != best_feature]

    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = id3(
            sub_data, data, remaining_features, target_attribute, majority_class
        )
    return tree

In [19]:
def c45(data, original_data, features, target_attribute, parent_class=None):
    """Recursively build a C4.5-style decision tree using the gain ratio.

    Identical in structure to ``id3`` except that each candidate attribute is
    scored by ``info_gain / split_info`` (0 when the split information is 0).

    Parameters
    ----------
    data : DataFrame-like
        Rows reaching the current node.
    original_data : DataFrame-like
        Rows of the parent node; its majority class is the fallback label
        when ``data`` is empty.
    features : list
        Column names still available for splitting.
    target_attribute : str
        Name of the label column.
    parent_class : optional
        Majority class of the parent node. Only used as a last resort when
        both ``data`` and ``original_data`` are empty.

    Returns
    -------
    A leaf label, or a nested dict of the form
    ``{feature: {value: subtree_or_label, ...}}``.
    """
    # The empty check has to come first: asking an empty partition for its
    # first unique label would raise IndexError.
    if len(data) == 0:
        fallback_labels, fallback_counts = np.unique(
            original_data[target_attribute], return_counts=True
        )
        if len(fallback_labels) == 0:
            return parent_class
        return fallback_labels[np.argmax(fallback_counts)]

    labels, counts = np.unique(data[target_attribute], return_counts=True)

    # Pure node: every row carries the same label.
    if len(labels) == 1:
        return labels[0]

    majority_class = labels[np.argmax(counts)]

    # No attribute left to split on: label the leaf with the majority class
    # of the rows that actually reached this node.
    if len(features) == 0:
        return majority_class

    gain_ratios = []
    for feature in features:
        ig = info_gain(data, feature, target_attribute)
        si = split_info(data, feature)
        # A single-valued attribute has zero split information; score it 0
        # instead of dividing by zero.
        gain_ratios.append(ig / si if si != 0 else 0)

    best_feature = features[np.argmax(gain_ratios)]
    remaining_features = [f for f in features if f != best_feature]

    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = c45(
            sub_data, data, remaining_features, target_attribute, majority_class
        )
    return tree

In [20]:
def _leaf_labels(tree):
    """Return every leaf label beneath ``tree`` (a nested dict or a bare label)."""
    if not isinstance(tree, dict):
        return [tree]
    labels = []
    for branches in tree.values():
        for subtree in branches.values():
            labels.extend(_leaf_labels(subtree))
    return labels


def predict(tree, sample):
    """Classify ``sample`` by walking ``tree``.

    Parameters
    ----------
    tree : dict or label
        Either a leaf label or a nested dict of the form
        ``{feature: {value: subtree_or_label, ...}}``.
    sample : mapping
        Maps feature names to the sample's values (e.g. a DataFrame row).

    Returns
    -------
    The label of the leaf that is reached. If the sample's value for the
    current feature has no branch, the most frequent leaf label beneath the
    current node is returned (``None`` if the node has no leaves at all).
    """
    if not isinstance(tree, dict):
        return tree

    feature = next(iter(tree))
    branches = tree[feature]
    value = sample[feature]
    if value in branches:
        return predict(branches[value], sample)

    # Unseen attribute value -> majority guess over all leaves below this
    # node. Nested subtrees are searched too, so a node whose branches are
    # all subtrees still yields a prediction.
    leaf_counts = Counter(_leaf_labels(tree))
    if not leaf_counts:
        return None
    return leaf_counts.most_common(1)[0][0]


In [21]:
df = pd.read_csv('Data/playCricket.csv')

# Temperature and Humidity hold text labels, not continuous measurements.
# Collapse each into a binary flag: Hot -> 1 (Mild/Cool -> 0) and
# High -> 1 (Normal -> 0).
# NOTE(review): Series.map yields NaN for any label missing from the
# mapping — confirm the CSV contains only the labels listed here.
binary_encodings = {
    'Temperature': {'Hot': 1, 'Mild': 0, 'Cool': 0},
    'Humidity': {'High': 1, 'Normal': 0},
}
for column, encoding in binary_encodings.items():
    df[column] = df[column].map(encoding)

# Store the remaining text columns, including the target, as pandas categoricals
for column in ('Outlook', 'Wind', 'PlayCricket'):
    df[column] = df[column].astype('category')

# 'Day' is dropped so it is not offered to the trees as a feature
df = df.drop(columns=['Day'])

In [25]:
# 5-fold cross-validation of both tree builders on the same shuffled splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
X = df.drop('PlayCricket', axis=1)
y = df['PlayCricket']

# Candidate split attributes: every column except the target. The tree
# builders never mutate this list, so it is built once outside the loop.
features = list(X.columns)

# One [accuracy, precision, recall, f1] row per fold, per algorithm.
id3_metrics, c45_metrics = [], []

for train_index, test_index in kf.split(df):
    train_data, test_data = df.iloc[train_index], df.iloc[test_index]

    # After the loop, id3_tree / c45_tree hold the trees of the LAST fold.
    id3_tree = id3(train_data, train_data, features, 'PlayCricket')
    c45_tree = c45(train_data, train_data, features, 'PlayCricket')

    y_true = test_data['PlayCricket'].values
    y_pred_id3 = [predict(id3_tree, row) for _, row in test_data.iterrows()]
    y_pred_c45 = [predict(c45_tree, row) for _, row in test_data.iterrows()]

    # Compute metrics with 'Yes' as the positive class. zero_division=0 is
    # passed to all three ratio metrics so a fold with no positive rows (or
    # no positive predictions) scores 0 without an UndefinedMetricWarning.
    for pred, metric_list in ((y_pred_id3, id3_metrics), (y_pred_c45, c45_metrics)):
        acc = accuracy_score(y_true, pred)
        prec = precision_score(y_true, pred, pos_label='Yes', zero_division=0)
        rec = recall_score(y_true, pred, pos_label='Yes', zero_division=0)
        f1 = f1_score(y_true, pred, pos_label='Yes', zero_division=0)
        metric_list.append([acc, prec, rec, f1])

In [23]:
def avg_metrics(metrics):
    """Average per-fold metric rows column-wise.

    ``metrics`` is a sequence of equal-length rows (one per fold); the result
    holds the mean of each column across those rows.
    """
    column_means = np.mean(metrics, axis=0)
    return column_means

id3_avg = avg_metrics(id3_metrics)
c45_avg = avg_metrics(c45_metrics)

# Cross-validated averages, one header line and one metrics line per algorithm.
for header, (accuracy, precision, recall, f1) in (
    ("=== ID3 Decision Tree ===", id3_avg),
    ("\n=== C4.5 Decision Tree ===", c45_avg),
):
    print(header)
    print(f"Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

# The trees shown are the ones left in id3_tree / c45_tree by the evaluation cell.
for title, fitted_tree in (
    ("\nSample ID3 Tree:", id3_tree),
    ("\nSample C4.5 Tree:", c45_tree),
):
    print(title)
    pprint(fitted_tree)

=== ID3 Decision Tree ===
Accuracy: 0.933, Precision: 0.933, Recall: 1.000, F1: 0.960

=== C4.5 Decision Tree ===
Accuracy: 0.600, Precision: 0.667, Recall: 0.700, F1: 0.653

Sample ID3 Tree:
{'Outlook': {'Overcast': 'Yes',
             'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
             'Sunny': {'Humidity': {np.int64(0): 'Yes', np.int64(1): 'No'}}}}

Sample C4.5 Tree:
{'Humidity': {np.int64(0): {'Wind': {'Strong': {'Outlook': {'Rain': 'No',
                                                            'Sunny': 'Yes'}},
                                     'Weak': 'Yes'}},
              np.int64(1): {'Outlook': {'Overcast': 'Yes',
                                        'Rain': 'No',
                                        'Sunny': 'No'}}}}
