In [1]:
import pandas as pd 
import numpy as np 

In [2]:
DT =pd.read_csv("PlayTennis.csv") 
DT

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [3]:
def calc_total_entropy(DT,label):
    entropy = 0
    DT_count = len(DT)
    for _class in DT[label].unique():
        class_count = len(DT[DT[label]==_class])
        entropy+= - (class_count/DT_count) * np.log2(class_count/DT_count)
    return entropy

In [4]:
calc_total_entropy(DT,'Play Tennis')

0.9402859586706311

In [5]:
def calc_entropy(DT, feature_label, class_label):
    entropy = {}
    uniqued_feature = DT[feature_label].unique()
    uniqued_class = DT[class_label].unique()

    for feature in uniqued_feature:
        entropy_sum = 0
        feature_class_entropy = 0
        feature_count = len(DT[DT[feature_label]==feature])
        feature_p = feature_count / len(DT[feature_label])
        
        for _class in uniqued_class:
            feature_class_count = len(DT[ (DT[feature_label]==feature) & (DT[class_label]==_class) ][feature_label])            
            if feature_class_count != 0:
                feature_class_probability = feature_class_count/feature_count 
                feature_class_entropy = - feature_class_probability * np.log2(feature_class_probability) 
            entropy_sum += feature_class_entropy
        entropy[feature] = {'P':feature_p,'H':entropy_sum}
        
    return entropy
# calc_entropy(DT,'Outlook','Play Tennis')

In [6]:
def calc_gain(DT,class_label):
    import operator
    features = DT.columns.difference([class_label])
    total_entropy = calc_total_entropy(DT,class_label)
    
    gains = {}
    for feature in features:
        I = 0
        entropies = calc_entropy(DT,feature,class_label)
        for item in entropies.keys() :
            I += entropies[item]['P'] * entropies[item]['H']
        gains[feature] = total_entropy - I
    best_feature = max(gains.items(), key=operator.itemgetter(1))[0]
    return {
        'gains' : gains,
        'best' : {
            'key' : best_feature,
            'value' : gains[best_feature],
        }
    }
calc_gain(DT,'Play Tennis')

{'gains': {'Humidity': 0.15183550136234159,
  'Outlook': 0.24674981977443933,
  'Temperature': 0.02922256565895487,
  'Wind': 0.04812703040826949},
 'best': {'key': 'Outlook', 'value': 0.24674981977443933}}

In [7]:
def sub_tree_maker(best_feature, DT, target):
    class_list = DT[target].unique()
    tree = {} 
    best_feature_unique_list = DT[best_feature].value_counts(sort=False)
    
    for best_feature_item_name, best_feature_count in best_feature_unique_list.iteritems():
        best_feature_data = DT[DT[best_feature] == best_feature_item_name]
        branch_node = False 
        for _class in class_list: 
            class_count = len(best_feature_data[best_feature_data[target] == _class])
            if class_count == best_feature_count:
                tree[best_feature_item_name] = _class 
                DT = DT[DT[best_feature] != best_feature_item_name] 
                branch_node = True
            if not branch_node: 
                tree[best_feature_item_name] = "Branch"
    return tree, DT
# sub_tree_maker('Outlook',DT,'Play Tennis')

In [8]:
def tree_maker(DT, class_label, class_unique_value, root, node=None):

    if len(DT) == 0: return False
    
    best_feature = calc_gain(DT, class_label)['best']['key'] 
    sub_tree, DT = sub_tree_maker(best_feature, DT, class_label) 
    
    next_root = None 
    if node != None: 
        root[node] = {}
        root[node][best_feature] = sub_tree
        next_root = root[node][best_feature]
    else: 
        root[best_feature] = sub_tree
        next_root = root[best_feature]      
    for node, branch in list(next_root.items()): 
        if branch == "Branch": 
            feature_value_data = DT[DT[best_feature] == node] 
            tree_maker(feature_value_data, class_label, class_unique_value, next_root, node)

In [9]:
def predict(tree, new_instance):
    if not isinstance(tree, dict): 
        return tree 
    else:
        root_node = next(iter(tree))
        feature_value = new_instance[root_node] 
        if feature_value in tree[root_node]: 
            return predict(tree[root_node][feature_value], new_instance) 
        else:
            return None

In [10]:
def metrics_report(DTTEST,tree):
    correct_count = 0
    wrong_count = 0

    for index,r in DTTEST.iterrows(): 
        result = predict(tree, DTTEST.iloc[index]) 
        if result == DTTEST[target].iloc[index]: 
            correct_count += 1 
        else:
            wrong_count+= 1 
    accuracy =  correct_count / ( correct_count + wrong_count) 
    return accuracy

# __main__

In [11]:
class_uniqued_value = DT[class_label].unique()
tree = {} 
class_label = 'Play Tennis'

tree_maker(DT, class_label, class_uniqued_value, tree) 
display(root)

NameError: name 'class_label' is not defined

In [None]:
DTTEST = pd.read_csv("PlayTennis_Test.csv")
metrics_report(DTTEST,tree)