In [11]:
import pandas as pd
from pandas import DataFrame

In [12]:
df_tennis=pd.read_csv('data3.csv')

In [13]:
def entropy(probs):
    import math
    return sum( [-prob*math.log(prob,2) for prob in probs] )

In [14]:
def entropy_of_list(a_list):
    from collections import Counter
    cnt=Counter(x for x in a_list)
    num_instances=len(a_list)*1.0
    probs=[x/num_instances for x in cnt.values()]
    return entropy(probs)

In [15]:
def information_gain(df,split_attribute_name,target_attribute_name,trace=0):
    df_split=df.groupby(split_attribute_name)
    nobs=len(df.index)*1.0
    df_agg_ent=df_split.agg({target_attribute_name : [entropy_of_list,lambda x:len(x)/nobs]})[target_attribute_name]
    df_agg_ent.columns=['Entropy','PropObservations']
    new_entropy=sum(df_agg_ent['Entropy']*df_agg_ent['PropObservations'])
    old_entropy=entropy_of_list(df[target_attribute_name])
    return old_entropy-new_entropy

In [16]:
def id3(df,target_attribute_name,attribute_names,default_class=None):
    from collections import Counter
    cnt=Counter(x for x in df[target_attribute_name])
    if len(cnt) ==1:
        return next(iter(cnt))
    elif df.empty or(not attribute_names):
        return default_class
    else:
        default_class=max(cnt.keys())
        gainz=[information_gain(df,attr,target_attribute_name)for attr in attribute_names]
        index_of_max=gainz.index(max(gainz))
        best_attr=attribute_names[index_of_max]
        tree={best_attr:{}}
        remaining_attribute_names=[i for i in attribute_names if i != best_attr]
        for attr_val,data_subset in df.groupby(best_attr):
            subtree=id3(data_subset,target_attribute_name,remaining_attribute_names,default_class)
            tree[best_attr][attr_val]=subtree
        return tree

In [17]:
def classify(instance, tree, default=None):
    attribute=next(iter(tree))
    if instance[attribute] in tree[attribute].keys():
        result=tree[attribute][instance[attribute]]
        if isinstance(result,dict):
            return classify(instance,result)
        else:
            return result
    else:
        return default

In [18]:
attribute_names=list(df_tennis.columns)
attribute_names.remove('PlayTennis')

In [19]:
from pprint import pprint
tree=id3(df_tennis,'PlayTennis',attribute_names)
print("\n\nThe Resultant Decision Tree is:\n")
pprint(tree)



The Resultant Decision Tree is:

{'Day': {'d1': 'no',
         'd10': 'yes',
         'd11': 'yes',
         'd12': 'yes',
         'd13': 'yes',
         'd14': 'no',
         'd2': 'no',
         'd3': 'yes',
         'd4': 'yes',
         'd5': 'yes',
         'd6': 'no',
         'd7': 'yes',
         'd8': 'no',
         'd9': 'yes'}}


In [14]:
df_tennis['predicted']=df_tennis.apply(classify,axis=1,args=(tree,'No'))
print('Accuracy is:' +str( sum(df_tennis['Play Tennis']==df_tennis['predicted']) / (1.0*len(df_tennis.index))))
df_tennis[['PlayTennis','predicted']]

Accuracy is:1.0


Unnamed: 0,Play Tennis,predicted
0,no,no
1,no,no
2,yes,yes
3,yes,yes
4,yes,yes
5,no,no
6,yes,yes
7,no,no
8,yes,yes
9,yes,yes


In [15]:
training_data = df_tennis.iloc[1:-4]
test_data = df_tennis.iloc[-4:]
train_tree = id3(training_data,'Play Tennis',attribute_names)
test_data['predicted2'] = test_data.apply(classify,axis=1,args=(train_tree,'Yes'))
print('\n\n Accuracy is:'+ str( sum(test_data['Play Tennis']==test_data['predicted2']) / (1.0*len(test_data.index))))



 Accuracy is:0.75


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
