#Name: Vedant Bhutada
#Batch: A4
#Roll: 69
#Aim: Write a program to implement Decision Tree algorithm


In [None]:
import pandas as pd
import numpy as np
# Load the training set from a hard-coded absolute path into a DataFrame.
train_data_m=pd.read_csv("/content/PlayTennis.csv")
# Bare expression: displays the frame when it is the last line of a notebook cell.
train_data_m

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [None]:
def total_entropy(df, label, clas):
    """Return the Shannon entropy (in bits) of the ``label`` column of ``df``.

    Args:
        df: DataFrame holding the samples.
        label: Name of the column whose class distribution is measured.
        clas: Iterable of class values to count in that column.

    Returns:
        The sum over ``clas`` of ``-p * log2(p)``, where ``p`` is the fraction
        of rows whose ``label`` equals the class. Classes with no rows
        contribute 0, and an empty frame yields 0.
    """
    rows = len(df)
    if rows == 0:
        # No samples: there is no uncertainty to measure (and nothing to divide by).
        return 0

    entropy = 0
    for c in clas:
        class_cnt = len(df[df[label] == c])
        if class_cnt == 0:
            # lim p->0 of p*log2(p) is 0; evaluating it directly gives 0 * -inf = NaN.
            continue
        probability = class_cnt / rows
        entropy += -probability * np.log2(probability)
    return entropy

In [None]:
def calc_total_entropy(train_data, label, class_list):
    """Return the Shannon entropy (in bits) of ``label`` over all of ``train_data``.

    Args:
        train_data: DataFrame holding the samples.
        label: Name of the target column.
        class_list: Iterable of class values to count in the target column.

    Returns:
        The sum over ``class_list`` of ``-p * log2(p)``, where ``p`` is the
        fraction of rows belonging to the class. Classes that do not occur
        contribute 0, and an empty frame yields 0.
    """
    total_row = train_data.shape[0]
    if total_row == 0:
        # Nothing to measure; also avoids dividing by zero below.
        return 0

    total_entr = 0
    for c in class_list:
        total_class_count = train_data[train_data[label] == c].shape[0]
        if total_class_count == 0:
            # A class missing from this subset must add 0, not 0 * log2(0) = NaN,
            # otherwise every information gain derived from it becomes NaN.
            continue
        probability = total_class_count / total_row
        total_entr += -probability * np.log2(probability)

    return total_entr

def calc_entropy(feature_value_data, label, class_list):
    """Return the entropy (in bits) of ``label`` within ``feature_value_data``.

    Args:
        feature_value_data: DataFrame subset to measure.
        label: Name of the column whose class distribution is measured.
        class_list: Iterable of class values to count.

    Returns:
        The sum of ``-p * log2(p)`` over the classes that occur in the subset;
        classes with no rows are skipped.
    """
    n_rows = feature_value_data.shape[0]
    result = 0

    for cls in class_list:
        matches = feature_value_data[feature_value_data[label] == cls].shape[0]
        if matches == 0:
            # Absent classes contribute nothing.
            continue
        p = matches / n_rows
        result += - p * np.log2(p)

    return result

def calc_info_gain(feature_name, train_data, label, class_list):
    """Return the information gain obtained by splitting on ``feature_name``.

    Args:
        feature_name: Column to split on.
        train_data: DataFrame holding the samples.
        label: Name of the target column.
        class_list: Iterable of class values of the target column.

    Returns:
        The entropy of ``label`` over ``train_data`` minus the size-weighted
        entropy of ``label`` within each distinct value of ``feature_name``.
    """
    n_rows = train_data.shape[0]
    weighted_entropy = 0.0

    for value in train_data[feature_name].unique():
        subset = train_data[train_data[feature_name] == value]
        weight = subset.shape[0] / n_rows
        weighted_entropy += weight * calc_entropy(subset, label, class_list)

    baseline = calc_total_entropy(train_data, label, class_list)
    return baseline - weighted_entropy

def find_most_informative_feature(train_data, label, class_list):
    """Return the feature column with the highest information gain.

    Args:
        train_data: DataFrame holding the samples.
        label: Name of the target column (excluded from the candidates).
        class_list: Iterable of class values of the target column.

    Returns:
        The name of the first column whose gain is strictly greater than every
        earlier candidate's, or ``None`` if no candidate's gain exceeds -1
        (including when there are no candidate columns).
    """
    best_feature, best_gain = None, -1

    for candidate in train_data.columns.drop(label):
        gain = calc_info_gain(candidate, train_data, label, class_list)
        # Strict comparison keeps the earliest column on ties.
        if gain > best_gain:
            best_feature, best_gain = candidate, gain

    return best_feature

def generate_sub_tree(feature_name, train_data, label, class_list):
    """Build one level of the tree for ``feature_name``.

    Args:
        feature_name: Column being split on.
        train_data: DataFrame holding the samples for this node.
        label: Name of the target column.
        class_list: Iterable of class values of the target column.

    Returns:
        A ``(branches, remaining)`` tuple. ``branches`` maps each value of the
        feature to a class when all of its rows share that class, otherwise to
        the placeholder ``"?"``. ``remaining`` is ``train_data`` without the
        rows of the values that were resolved to a class.
    """
    branches = {}
    counts_by_value = train_data[feature_name].value_counts(sort=False).to_dict()

    for value, n_rows in counts_by_value.items():
        subset = train_data[train_data[feature_name] == value]
        outcome = "?"  # stays a placeholder unless some class covers every row

        for cls in class_list:
            n_in_class = int((subset[label] == cls).sum())
            if n_in_class == n_rows:
                outcome = cls
                # Resolved rows need no further splitting.
                train_data = train_data[train_data[feature_name] != value]

        branches[value] = outcome

    return branches, train_data

def make_tree(root, prev_feature_value, train_data, label, class_list):
    """Recursively grow the decision tree in place inside ``root``.

    Args:
        root: Dict to attach the new sub-tree to (mutated in place).
        prev_feature_value: Branch value in ``root`` under which the sub-tree
            is attached, or ``None`` when building the top of the tree.
        train_data: DataFrame with the samples that reach this node.
        label: Name of the target column.
        class_list: Iterable of class values of the target column.

    Returns:
        None. The result is written into ``root``.
    """
    if train_data.shape[0] == 0:
        return

    if train_data.columns.drop(label).empty:
        # Every feature has already been used on this path but the classes are
        # still mixed (contradictory samples): settle for the most frequent class.
        if prev_feature_value is not None:
            root[prev_feature_value] = train_data[label].value_counts().idxmax()
        return

    max_info_feature = find_most_informative_feature(train_data, label, class_list)
    tree, train_data = generate_sub_tree(max_info_feature, train_data, label, class_list)

    if prev_feature_value is not None:
        root[prev_feature_value] = {max_info_feature: tree}
        next_root = root[prev_feature_value][max_info_feature]
    else:
        root[max_info_feature] = tree
        next_root = root[max_info_feature]

    # Iterate over a snapshot because the recursive calls rewrite next_root's values.
    for node, branch in list(next_root.items()):
        if branch == "?":
            feature_value_data = train_data[train_data[max_info_feature] == node]
            # The split feature is constant inside this subset, so it can never
            # separate it further. Dropping it guarantees the recursion ends
            # instead of re-selecting the same feature on the same rows forever.
            make_tree(
                next_root,
                node,
                feature_value_data.drop(columns=max_info_feature),
                label,
                class_list,
            )

def id3(train_data_m, label):
    """Build a decision tree for ``label`` from ``train_data_m``.

    Args:
        train_data_m: DataFrame with the training samples (copied, not mutated).
        label: Name of the target column.

    Returns:
        The tree as nested dicts, filled in by ``make_tree``.
    """
    working_copy = train_data_m.copy()
    decision_tree = {}
    make_tree(decision_tree, None, working_copy, label, working_copy[label].unique())
    return decision_tree

# Build the decision tree from the loaded data, with 'Play Tennis' as the target column.
tree = id3(train_data_m, 'Play Tennis')

def predict(tree, instance):
    """Classify ``instance`` by walking ``tree`` from the top.

    Args:
        tree: Nested dict of the form ``{feature: {value: subtree_or_class}}``,
            or a bare class value.
        instance: Mapping (or Series) from feature name to feature value.

    Returns:
        The class stored at the leaf that is reached, or ``None`` when the
        instance has a feature value with no branch in the tree.
    """
    node = tree
    while isinstance(node, dict):
        feature = next(iter(node))  # each internal node holds a single feature key
        value = instance[feature]
        branches = node[feature]
        if value not in branches:
            return None
        node = branches[value]
    return node

def evaluate(tree, test_data_m, label):
    """Return the fraction of rows in ``test_data_m`` that ``tree`` classifies correctly.

    Args:
        tree: Decision tree in the nested-dict form accepted by ``predict``.
        test_data_m: DataFrame with the samples to score.
        label: Name of the column holding the true class.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when there are no rows.
    """
    total = test_data_m.shape[0]
    if total == 0:
        # No samples to score; avoid dividing by zero.
        return 0.0

    correct_predict = 0
    # Use the row yielded by iterrows directly: its companion index is a
    # *label*, which only coincides with a position on a default RangeIndex,
    # so feeding it to .iloc picks the wrong row (or raises) otherwise.
    for _, row in test_data_m.iterrows():
        if predict(tree, row) == row[label]:
            correct_predict += 1

    return correct_predict / total

def display_tree(tree, depth=0, parent_key=None, is_last_child=True):
    """Print ``tree`` as an indented outline.

    Args:
        tree: Nested dict tree, or a leaf value.
        depth: Current nesting level; each level indents by two spaces.
        parent_key: Key under which ``tree`` hangs, or ``None`` at the top.
        is_last_child: Kept for backward compatibility. The connector of each
            key is now derived from that key's own position among its
            siblings, so this value is only forwarded, never read.

    Returns:
        None. Output is written with ``print``.
    """
    if not isinstance(tree, dict):
        # Leaf: printed one level deeper than its key, marked with " - ".
        prefix = "  " * (depth + 1) + " - " if parent_key is not None else ""
        print(prefix + str(tree))
        return

    last_position = len(tree) - 1
    for position, (key, value) in enumerate(tree.items()):
        is_last = position == last_position
        print("  " * depth, end="")

        if parent_key is not None:
            # "└──" closes a group of siblings, "├──" marks that more follow.
            print("└── " if is_last else "├── ", end="")

        print(str(key))
        display_tree(value, depth + 1, parent_key=key, is_last_child=is_last)
# Show the learned tree as a raw nested dict.
print(tree)

{'Outlook': {'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}, 'Overcast': 'Yes', 'Rain': {'Wind': {'Weak': 'Yes', 'Strong': 'No'}}}}


In [None]:
# Print the learned tree as an indented outline.
display_tree(tree)

Outlook
  └── Sunny
    ├── Humidity
      └── High
           - No
      └── Normal
           - Yes
  └── Overcast
       - Yes
  └── Rain
    └── Wind
      └── Weak
           - Yes
      └── Strong
           - No


In [None]:
# NOTE: each call passes the attribute itself as `label`, so the printed number
# is the entropy of that attribute's own value distribution over the whole
# data set, not the entropy of 'Play Tennis' conditioned on the attribute.
print('Entropy(S,Outlook):',calc_entropy(train_data_m, 'Outlook', ['Sunny','Overcast','Rain']))
print('Entropy(S,Temperature):',calc_entropy(train_data_m, 'Temperature', ['Hot','Mild','Cool']))
print('Entropy(S,Humidity):',calc_entropy(train_data_m, 'Humidity', ['High','Normal']))
print('Entropy(S,Wind):',calc_entropy(train_data_m, 'Wind', ['Weak','Strong']))

Entropy(S,Outlook): 1.5774062828523454
Entropy(S,Temperature): 1.5566567074628228
Entropy(S,Humidity): 1.0
Entropy(S,Wind): 0.9852281360342515


In [None]:
# calc_info_gain takes (feature_name, train_data, label, class_list). The
# attribute being evaluated is the feature, 'Play Tennis' is the label, and
# class_list holds the label's classes. Calling it with the two swapped only
# gave the right numbers because mutual information is symmetric.
play_classes = train_data_m['Play Tennis'].unique()
print('Information Gain(S,Outlook):',calc_info_gain('Outlook', train_data_m, 'Play Tennis', play_classes))
print('Information Gain(S,Temperature):',calc_info_gain('Temperature', train_data_m, 'Play Tennis', play_classes))
print('Information Gain(S,Humidity):',calc_info_gain('Humidity', train_data_m, 'Play Tennis', play_classes))
print('Information Gain(S,Wind):',calc_info_gain('Wind', train_data_m, 'Play Tennis', play_classes))

Information Gain(S,Outlook): 0.24674981977443933
Information Gain(S,Temperature): 0.029222565658954647
Information Gain(S,Humidity): 0.15183550136234147
Information Gain(S,Wind): 0.04812703040826938
