Implement Decision Tree Classifier for classification of EnjoySport dataset

In [1]:
import numpy as np
import pandas as pd


a. Importing Dataset :

In [2]:
dataset = pd.read_csv("EnjoySport.csv")

print( dataset.head())
print("Dataset Length : ", len(dataset))
print("Dataset Shape : ", dataset.shape)
dataset.target = dataset['play']
dataset.target

   Day   outlook  temp   humid  windy play
0    1     sunny   hot    high  False   no
1    2     sunny   hot    high   True   no
2    3  overcast   hot    high  False  yes
3    4     rainy  mild    high  False  yes
4    5     rainy  cool  normal  False  yes
Dataset Length :  14
Dataset Shape :  (14, 6)


  dataset.target = dataset['play']


Unnamed: 0,play
0,no
1,no
2,yes
3,yes
4,yes
5,no
6,yes
7,no
8,yes
9,yes


b. Calculating Entropy of System :

In [3]:
def Entropy(data):
  d = data.iloc[:,-1]
  d = d.value_counts()
  s=0

  for v in d.keys():
    p = d[v]/sum(d)
    s -= p*np.log2(p)

  return(s)



In [4]:

print("Entropy is : ",Entropy(dataset))

Entropy is :  0.9402859586706311


Attributes :

In [5]:
def values(attr):
  l=[]
  for i in attr:
    if i not in l:
      l.append(i)
  return l

In [6]:
values(dataset['outlook'])

['sunny', 'overcast', 'rainy']

c. Calculate Information Gain :

In [7]:
def IG(data,A):
  Es = Entropy(data)
  # print("Entropy of System : ",Es)
  val = values(data[A])
  # print("Values of Attribute : ",val)
  s_c = data[A].value_counts(sort=False)
  # print("Count of Values : ",s_c)
  s_v = []

  # print(len(val))

  for i in range(len(val)):
    ds = data[data[A]==val[i]]

    s=0
    for res in values(data.iloc[:,-1]):


      ds_res = ds[ds.iloc[:,-1]==res]
      # print(ds_res)
      # print("Count : ",len(ds_res))
      p = len(ds_res)/len(ds)

      if p==0 or p==1:
        s-=0
      else:
        s -= p*np.log2(p)



    s_v.append(s)


  for i in range(len(val)):
    # print("S_v : ",s_v[i])
    # print("S_c : ",s_c[val[i]])
    # print(s_c[val[i]])
    Es = Es - s_c[val[i]]*s_v[i]/np.sum(s_c)
  return Es


In [19]:
ig = [IG(dataset,'temp'),IG(dataset,'humid'),IG(dataset,'windy'),IG(dataset,'outlook')]
ig = [round(i, 4) for i in ig]
print("InformationGain : ",ig)

InformationGain :  [0.0292, 0.1518, 0.0481, 0.2467]


##Creating Decision Tree

In [9]:
class Node():

    def __init__(self, name=None, attr=None):
        self.name = name
        self.attr = attr

    def call(self):
        return self.name

def DTNode(data, features_used):
    node = Node()
    IGmax = 0; vbest = None

    val_list = [v for v in values(data)[:-1] if v not in features_used]

    if val_list != []:

        for v in val_list:

            if IG(data, v) > IGmax:
                IGmax = IG(data, v)
                v_best = v

        if v_best:
            features_used.append(v_best)
            node.name = v_best
            node.attr = values(data[v_best])

            return (node)
        else:
          return (None)
    return (None)



def DTClassifier(data, features_used):

    root = DTNode(data, features_used)

    DT_dict = {}

    if root != None:
        item = []
        for attr in root.attr:
            dataN = data[data[root.name] == attr]
            if Entropy(dataN) == 0:
                item.append((attr, values(dataN.iloc[:, -1])[0]))
            else:
                dt = DTClassifier(dataN, features_used)
                item.append((attr, dt))

        DT_dict[root.name] = item

    return (DT_dict)


In [14]:
DTClassifier(dataset, [])

{'Day': [(1, 'no'),
  (2, 'no'),
  (3, 'yes'),
  (4, 'yes'),
  (5, 'yes'),
  (6, 'no'),
  (7, 'yes'),
  (8, 'no'),
  (9, 'yes'),
  (10, 'yes'),
  (11, 'yes'),
  (12, 'yes'),
  (13, 'yes'),
  (14, 'no')]}