In [24]:
import pandas as pd
from pandas import DataFrame


In [25]:
df_tennis=pd.read_csv('dd3.csv')


In [26]:
df_tennis

Unnamed: 0,outlook,temperature,humidity,wind,playtennis
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,overcast,hot,high,weak,yes
3,rain,mild,high,weak,yes
4,rain,cool,normal,strong,no
5,rain,cool,normal,strong,yes
6,overcast,cool,normal,strong,yes
7,sunny,mild,high,weak,no
8,sunny,cool,normal,weak,yes
9,rain,mild,normal,weak,yes


In [28]:
def entropy(probs):
    import math
    return sum([-prob*math.log(prob,2) for prob in probs])

def entropy_of_list(a_list):
    from collections import Counter
    cnt = Counter(x for x in a_list)
    print("yes and no classes:",a_list.name,cnt)
    num_instances=len(a_list)*1.0
    probs=[x / num_instances for x in cnt.values()]
    return entropy(probs)
total_entropy=entropy_of_list(df_tennis['playtennis'])
print("entropy of given play tennis data set:",total_entropy)

yes and no classes: playtennis Counter({'yes': 9, 'no': 5})
entropy of given play tennis data set: 0.9402859586706309


In [80]:
def information_gain(df,split_attribute_name,target_attribute_name,trace=0):
    print("information gain calculation of",split_attribute_name)
    
    df_split = df.groupby(split_attribute_name)
    
    for name,group in df_split:
        print(name)
        print(group)
        
    nobs = len(df.index) * 1.0
    df_agg_ent = df_split.agg({target_attribute_name : [entropy_of_list,lambda x: len(x)/nobs]}) [target_attribute_name]
    df_agg_ent.columns=['entropy','propObservations']
    new_entropy = sum(df_agg_ent['entropy']*df_agg_ent['propObservations'])
    old_entropy = entropy_of_list(df[target_attribute_name])
    return old_entropy - new_entropy

print('info-gain for outlook is :'+str(information_gain(df_tennis,'outlook','playtennis')),"\n")
print('\n info-gain for humidity is :'+str(information_gain(df_tennis,'humidity','playtennis')),"\n")
print('\n info-gain for wind is :'+str(information_gain(df_tennis,'wind','playtennis')),"\n")
print('\n info-gain for temperatue is :'+str(information_gain(df_tennis,'temperature','playtennis')),"\n")

information gain calculation of outlook
overcast
     outlook temperature humidity    wind playtennis
2   overcast         hot     high    weak        yes
6   overcast        cool   normal  strong        yes
11  overcast        mild     high  strong        yes
12  overcast         hot   normal    weak        yes
rain
   outlook temperature humidity    wind playtennis
3     rain        mild     high    weak        yes
4     rain        cool   normal  strong         no
5     rain        cool   normal  strong        yes
9     rain        mild   normal    weak        yes
13    rain        mild     high  strong         no
sunny
   outlook temperature humidity    wind playtennis
0    sunny         hot     high    weak         no
1    sunny         hot     high  strong         no
7    sunny        mild     high    weak         no
8    sunny        cool   normal    weak        yes
10   sunny        mild   normal  strong        yes
yes and no classes: playtennis Counter({'yes': 4})
yes and no c

In [86]:
def id3(df,target_attribute_name,attribute_names,default_class=None):
    
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute_name])
    
    if len(cnt)==1:
        return next(iter(cnt))
    
    elif df.empty or (not attribute_names):
        return default_class
    
    else:
        default_class = max(cnt.keys())
        gainz =[information_gain(df,attr,target_attribute_name) for  attr in attribute_names]
        index_of_max = gainz.index(max(gainz))
        best_attr = attribute_names[index_of_max]
        tree = {best_attr:{}}
        remaining_attribute_names = [i for i in attribute_names if i !=best_attr]
        
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,target_attribute_name,remaining_attribute_names,default_class)
            tree[best_attr][attr_val]=subtree
        return tree    
                

In [87]:
attribute_names=list(df_tennis.columns)
print("list of attributes:",attribute_names)
attribute_names.remove('playtennis')
print("predictting attributes:",attribute_names)

list of attributes: ['outlook', 'temperature', 'humidity', 'wind', 'playtennis']
predictting attributes: ['outlook', 'temperature', 'humidity', 'wind']


In [88]:
from pprint import pprint
tree=id3(df_tennis,'playtennis',attribute_names)
print("\n\n the resultant decision tree is:\n")
pprint(tree)

information gain calculation of outlook
overcast
     outlook temperature humidity    wind playtennis
2   overcast         hot     high    weak        yes
6   overcast        cool   normal  strong        yes
11  overcast        mild     high  strong        yes
12  overcast         hot   normal    weak        yes
rain
   outlook temperature humidity    wind playtennis
3     rain        mild     high    weak        yes
4     rain        cool   normal  strong         no
5     rain        cool   normal  strong        yes
9     rain        mild   normal    weak        yes
13    rain        mild     high  strong         no
sunny
   outlook temperature humidity    wind playtennis
0    sunny         hot     high    weak         no
1    sunny         hot     high  strong         no
7    sunny        mild     high    weak         no
8    sunny        cool   normal    weak        yes
10   sunny        mild   normal  strong        yes
yes and no classes: playtennis Counter({'yes': 4})
yes and no c

In [91]:
def classify(instance, tree, default=None):
    attribute = next(iter(tree))
    if instance[attribute] in  tree[attribute].keys():
        result=tree[attribute][instance[attribute]]
        if isinstance(result,dict):
            return classify(instance,result)
        else:
            return result
    else:
            return default
        
df_tennis['predicted']=df_tennis.apply(classify,axis=1,args=(tree,'no'))
print('accuracy is:'+str(sum(df_tennis['playtennis']==df_tennis['predicted'])/(1.0*len(df_tennis.index))))
df_tennis[['playtennis','predicted']]
        
        

accuracy is:0.9285714285714286


Unnamed: 0,playtennis,predicted
0,no,no
1,no,no
2,yes,yes
3,yes,yes
4,no,yes
5,yes,yes
6,yes,yes
7,no,no
8,yes,yes
9,yes,yes


In [92]:
training_data=df_tennis.iloc[1: -4]
test_data=df_tennis.iloc[-4:]
train_tree=id3(training_data,'playtennis',attribute_names)
test_data['predicted2']=test_data.apply(classify,axis=1,args=(train_tree,'yes'))
print("\n\n accuracy is:"+str(sum(test_data['playtennis']==test_data['predicted2'])/(1.0*len(test_data.index))))

information gain calculation of outlook
overcast
    outlook temperature humidity    wind playtennis predicted
2  overcast         hot     high    weak        yes       yes
6  overcast        cool   normal  strong        yes       yes
rain
  outlook temperature humidity    wind playtennis predicted
3    rain        mild     high    weak        yes       yes
4    rain        cool   normal  strong         no       yes
5    rain        cool   normal  strong        yes       yes
9    rain        mild   normal    weak        yes       yes
sunny
  outlook temperature humidity    wind playtennis predicted
1   sunny         hot     high  strong         no        no
7   sunny        mild     high    weak         no        no
8   sunny        cool   normal    weak        yes       yes
yes and no classes: playtennis Counter({'yes': 2})
yes and no classes: playtennis Counter({'yes': 3, 'no': 1})
yes and no classes: playtennis Counter({'no': 2, 'yes': 1})
yes and no classes: playtennis Counter({'ye

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
