In [47]:
import pandas as pd 
from pandas import DataFrame
from sklearn import tree
df_tennis = pd.read_csv("tennis.csv") 
df_tennis.head() 

Unnamed: 0,Outlook,Temperature,Humidity,Windy,PlayTennis
0,Sunny,Hot,High,False,No
1,Sunny,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Rainy,Mild,High,False,Yes
4,Rainy,Cool,Normal,False,Yes


In [40]:
def entropy(probs): 
    import math 
    return sum( [-prob*math.log(prob, 2) for prob in probs] )

In [43]:
def entropy_of_list(a_list): 
    from collections import Counter     
    cnt = Counter(x for x in a_list)
    print("No and Yes Classes:",a_list.name,cnt)     
    num_instances = len(a_list)*1.0     
    probs = [x / num_instances for x in cnt.values()]     
    return entropy(probs) # Call Entropy: 
total_entropy = entropy_of_list(df_tennis['PlayTennis']) 
print("Entropy of given PlayTennis Data Set:",total_entropy) 

No and Yes Classes: PlayTennis Counter({'Yes': 9, 'No': 5})
Entropy of given PlayTennis Data Set: 0.9402859586706309


In [44]:
def information_gain(df, split_attribute_name, target_attribute_name, trace=0): 
    df_split = df.groupby(split_attribute_name) 
    for name,group in df_split:         
        print(name)         
        print(group)    
        nobs = len(df.index) * 1.0
        df_agg_ent = df_split.agg({target_attribute_name : [entropy_of_list, lambda x: len(x)/nobs] })[target_attribute_name] 
        df_agg_ent.columns = ['Entropy', 'PropObservations'] 
        new_entropy = sum(df_agg_ent['Entropy'] * df_agg_ent['PropObservations'] )
        old_entropy = entropy_of_list(df[target_attribute_name])  
    return old_entropy - new_entropy


print('Info-gain for Outlook is :'+str( information_gain(df_tennis, 'Outlook', 'PlayTennis')),"\n") 
print('\n Info-gain for Humidity is: ' + str( information_gain(df_tennis, 'Humidity', 'PlayTennis')),"\n") 
print('\n Info-gain for Wind is:' + str( information_gain(df_tennis, 'Windy', 'PlayTennis')),"\n") 
print('\n Info-gain for Temperature is:' + str( information_gain(df_tennis , 'Temperature','PlayTennis')),"\n")

Overcast
     Outlook Temperature Humidity  Windy PlayTennis
2   Overcast         Hot     High  False        Yes
6   Overcast        Cool   Normal   True        Yes
11  Overcast        Mild     High   True        Yes
12  Overcast         Hot   Normal  False        Yes
No and Yes Classes: PlayTennis Counter({'Yes': 4})
No and Yes Classes: PlayTennis Counter({'Yes': 3, 'No': 2})
No and Yes Classes: PlayTennis Counter({'No': 3, 'Yes': 2})
No and Yes Classes: PlayTennis Counter({'Yes': 9, 'No': 5})
Rainy
   Outlook Temperature Humidity  Windy PlayTennis
3    Rainy        Mild     High  False        Yes
4    Rainy        Cool   Normal  False        Yes
5    Rainy        Cool   Normal   True         No
9    Rainy        Mild   Normal  False        Yes
13   Rainy        Mild     High   True         No
No and Yes Classes: PlayTennis Counter({'Yes': 4})
No and Yes Classes: PlayTennis Counter({'Yes': 3, 'No': 2})
No and Yes Classes: PlayTennis Counter({'No': 3, 'Yes': 2})
No and Yes Classes: Pla

In [49]:
def id3(df, target_attribute_name, attribute_names, default_class=None):
    from collections import Counter     
    cnt = Counter(x for x in df[target_attribute_name])
    if len(cnt) == 1:         
        return next(iter(cnt)) 
    elif df.empty or (not attribute_names): 
             return default_class 
    else:
        gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names] 
        index_of_max = gainz.index(max(gainz)) 
        best_attr = attribute_names[index_of_max] 
        tree = {best_attr:{}}
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        for attr_val, data_subset in df.groupby(best_attr): 
            subtree = id3(data_subset,
                          target_attribute_name,                         
                          remaining_attribute_names,                         
                          default_class)
            tree[best_attr][attr_val] = subtree 
        return tree


attribute_names = list(df_tennis.columns)
print("List of Attributes:", attribute_names) 
#attribute_names.remove('id')
attribute_names.remove('PlayTennis') 
print("Predicting Attributes:", attribute_names)


from pprint import pprint
tree.
tree = id3(df_tennis,'PlayTennis',attribute_names)
print("\n\nThe Resultant Decision Tree is :\n") 
pprint(tree)

List of Attributes: ['Outlook', 'Temperature', 'Humidity', 'Windy', 'PlayTennis']
Predicting Attributes: ['Outlook', 'Temperature', 'Humidity', 'Windy']
Overcast
     Outlook Temperature Humidity  Windy PlayTennis
2   Overcast         Hot     High  False        Yes
6   Overcast        Cool   Normal   True        Yes
11  Overcast        Mild     High   True        Yes
12  Overcast         Hot   Normal  False        Yes
No and Yes Classes: PlayTennis Counter({'Yes': 4})
No and Yes Classes: PlayTennis Counter({'Yes': 3, 'No': 2})
No and Yes Classes: PlayTennis Counter({'No': 3, 'Yes': 2})
No and Yes Classes: PlayTennis Counter({'Yes': 9, 'No': 5})
Rainy
   Outlook Temperature Humidity  Windy PlayTennis
3    Rainy        Mild     High  False        Yes
4    Rainy        Cool   Normal  False        Yes
5    Rainy        Cool   Normal   True         No
9    Rainy        Mild   Normal  False        Yes
13   Rainy        Mild     High   True         No
No and Yes Classes: PlayTennis Counter({'