## ENTROPY

In [2]:
# Entropy measures the randomness (uncertainty) of a random variable:
# H(Y) = -sum_i p_i * log2(p_i). See the course notes for the derivation.

In [3]:
import numpy as np

In [6]:
Y = np.array([1,0,0,1,0,1,0,1]) # sample of a binary random variable Y: four 1s and four 0s

In [13]:
def entropy(var):
    """Return the Shannon entropy, in bits, of the values in ``var``.

    The entropy is ``-sum(p * log2(p))`` over the relative frequency ``p`` of
    every distinct value found in ``var``.

    Parameters
    ----------
    var : array-like
        Observed values (NumPy array, pandas Series, list, ...). The input is
        flattened by ``np.unique``, so every element counts as one observation.

    Returns
    -------
    float
        Entropy in bits. ``0.0`` when ``var`` is empty or holds a single
        distinct value.
    """
    _, counts = np.unique(np.asarray(var), return_counts=True)

    # Normalise by the number of counted observations rather than shape[0],
    # so the probabilities sum to 1 for inputs of any dimensionality.
    total = counts.sum()
    if total == 0:
        return 0.0

    probs = counts / total
    # np.unique never reports a zero count, so log2 is always finite here.
    ent = float(-np.sum(probs * np.log2(probs)))

    # Adding 0.0 turns the IEEE negative zero produced by a pure sample
    # (-(1 * log2(1)) == -0.0) into a plain 0.0.
    return ent + 0.0
    
    

In [15]:
values , counts = np.unique(Y,return_counts = True)

In [17]:
counts

array([4, 4], dtype=int64)

In [19]:
entropy(Y)

1.0

In [21]:
# Every observation has the same value, so there is no uncertainty: the entropy is zero.
X = np.array([1,1,1,1,1,1,1])
entropy(X)

-0.0

## SPLIT DATA

In [16]:
import pandas as pd
import numpy as np


In [18]:
# NOTE(review): absolute, machine-specific Windows path - this cell only runs
# where that exact file exists. Consider a path relative to a configurable
# data directory so the notebook can be re-run elsewhere.
file_path = r"C:\Users\ChanakyaY\Downloads\golf.csv"
df = pd.read_csv(file_path)

In [20]:
df

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [22]:
def divide_data(data, feature):
    """Partition ``data`` by the distinct values of the column ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to split.
    feature : str
        Name of the column whose values define the partitions.

    Returns
    -------
    dict
        Maps each distinct feature value to ``{'data': DataFrame, 'len': int}``.
        ``'data'`` holds the matching rows with a fresh 0..n-1 index and the
        same columns as ``data``; ``'len'`` is the number of those rows. Keys
        appear in the order produced by ``value_counts()`` (most frequent
        first).

    Notes
    -----
    Rows whose feature value is missing (NaN) are not placed in any partition,
    because ``value_counts()`` skips them.
    """
    column = data[feature]
    DATA = {}
    # One boolean-mask selection per distinct value; this avoids growing each
    # partition with a separate concat per row and keeps the column dtypes.
    for val, count in column.value_counts().items():
        subset = data[column == val].reset_index(drop=True)
        DATA[val] = {'data': subset, 'len': int(count)}
    return DATA
        

        

In [34]:
list(df['Outlook'].value_counts().index)
# wrap the index in list() to get a plain Python list

['sunny', 'rainy', 'overcast']

In [36]:
list(df['Outlook'].value_counts()) 

[5, 5, 4]

In [32]:
divide_data(df,'Outlook')

{'sunny': {'data':   Outlook Temperature Humidity  Windy Play
  0   sunny         hot     high  False   no
  1   sunny         hot     high   True   no
  2   sunny        mild     high  False   no
  3   sunny        cool   normal  False  yes
  4   sunny        mild   normal   True  yes,
  'len': 5},
 'rainy': {'data':   Outlook Temperature Humidity  Windy Play
  0   rainy        mild     high  False  yes
  1   rainy        cool   normal  False  yes
  2   rainy        cool   normal   True   no
  3   rainy        mild   normal  False  yes
  4   rainy        mild     high   True   no,
  'len': 5},
 'overcast': {'data':     Outlook Temperature Humidity  Windy Play
  0  overcast         hot     high  False  yes
  1  overcast        cool   normal   True  yes
  2  overcast        mild     high   True  yes
  3  overcast         hot   normal  False  yes,
  'len': 4}}

In [38]:
def information_gain(data, feature, target='Play'):
    """Return the information gain obtained by splitting ``data`` on ``feature``.

    The gain is the entropy of the ``target`` column minus the weighted
    average entropy of the ``target`` column inside each partition returned by
    ``divide_data(data, feature)``; each partition is weighted by its share of
    the rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the ``feature`` and the ``target`` columns.
    feature : str
        Column to split on.
    target : str, default 'Play'
        Column holding the class labels.

    Returns
    -------
    float
        Information gain in bits; ``0.0`` for an empty table.
    """
    examples = data.shape[0]
    if examples == 0:
        # Nothing to split, hence nothing to gain.
        return 0.0

    parent_entropy = entropy(data[target])
    partitions = divide_data(data, feature)

    ent_of_children = sum(
        (part['len'] / examples) * entropy(part['data'][target])
        for part in partitions.values()
    )
    return parent_entropy - ent_of_children

In [40]:
information_gain(df,'Outlook')

0.24674981977443933

In [42]:
information_gain(df,'Windy')

0.04812703040826949

## CODE: BUILDING A DECISION TREE

In [110]:
class DecisionTree:
    """A node of a multi-way decision tree that splits on information gain.

    Every node is itself a ``DecisionTree``. ``train`` chooses the feature
    with the highest ``information_gain`` and creates one child per observed
    value of that feature; ``predict`` walks from a node down to a leaf.

    Attributes are documented in ``__init__``.
    """

    def __init__(self, depth=0, max_depth=5):
        """Create an untrained node.

        Parameters
        ----------
        depth : int, default 0
            Depth of this node in the tree (0 for the root).
        max_depth : int, default 5
            Nodes at this depth or deeper are never split.
        """
        self.children = {}          # feature value -> child DecisionTree
        self.key = None             # feature with the highest gain at this node
        self.max_depth = max_depth
        self.depth = depth
        self.target = None          # majority class label at this node

    def train(self, data):
        """Fit this node, and recursively its children, on ``data``.

        Parameters
        ----------
        data : pandas.DataFrame
            Training rows. Every column except the last one is treated as a
            candidate feature; the class labels are read from the ``'Play'``
            column.

        Notes
        -----
        A node stays a leaf when any of the following holds:

        * ``data`` has no rows, no labels, or no feature columns;
        * all rows carry the same label (pure node);
        * ``depth`` has reached ``max_depth``;
        * the best feature takes fewer than two distinct values in ``data``.
        """
        # Start clean so that re-training never keeps stale branches.
        self.children = {}

        if data.shape[0] == 0:
            return

        # Majority label: the prediction returned when the walk stops here.
        label_counts = data['Play'].value_counts()
        if label_counts.empty:
            return
        self.target = label_counts.idxmax()

        features = list(data.columns[:-1])
        if not features:
            return

        # Best feature = highest information gain (first one wins on ties).
        info_gain = [information_gain(data, f) for f in features]
        self.key = features[int(np.argmax(info_gain))]

        #### STOPPING CONDITIONS --- BASE CASES
        # 1. Pure node: only one class is present, so splitting cannot help.
        if int((label_counts > 0).sum()) < 2:
            return
        # 2. Early stop: the depth limit has been reached.
        if self.depth >= self.max_depth:
            return

        # 3. The chosen feature must actually separate the rows.
        DATA = divide_data(data, self.key)
        populated = {val: part for val, part in DATA.items() if part['len'] > 0}
        if len(populated) < 2:
            return

        print("\t"*self.depth + "Making tree with -",self.key)
        # Recursively train one child per observed feature value, handing the
        # depth limit down so it applies to the whole tree.
        for val, part in populated.items():
            child = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
            self.children[val] = child
            child.train(part['data'])
        return

    def predict(self, test):
        """Return the predicted label for the first row of ``test``.

        Parameters
        ----------
        test : pandas.DataFrame
            Frame containing the feature columns used by the tree; only its
            first row (by position) is read.

        Returns
        -------
        The ``target`` of the leaf reached. If the row holds a feature value
        that has no branch at some node, that node's own ``target`` is
        returned instead.
        """
        if not self.children:
            return self.target

        # Positional access: the row does not have to be labelled 0.
        value = test[self.key].iloc[0]
        child = self.children.get(value)
        if child is None:
            # Value never seen during training: fall back to the majority label.
            return self.target
        return child.predict(test)
        
            
        
        

In [112]:
list(df.columns[:-1])

['Outlook', 'Temperature', 'Humidity', 'Windy']

In [114]:
labels = list(df['Play'].value_counts().index)
labels

['yes', 'no']

In [116]:
freq = list(df['Play'].value_counts())
freq

[9, 5]

## Explore the model

In [119]:
model = DecisionTree()

In [121]:
model.train(df)

Making tree with - Outlook
	Making tree with - Humidity
	Making tree with - Windy


In [123]:
model

<__main__.DecisionTree at 0x28b4661bfb0>

In [125]:
model.target

'yes'

In [127]:
model.key


'Outlook'

In [129]:
model.children

{'sunny': <__main__.DecisionTree at 0x28b4b71f7d0>,
 'rainy': <__main__.DecisionTree at 0x28b4b7041d0>,
 'overcast': <__main__.DecisionTree at 0x28b4b76d5b0>}

In [131]:
model.children['sunny'].key

'Humidity'

In [133]:
model.children['sunny'].children

{'high': <__main__.DecisionTree at 0x28b4b76ec90>,
 'normal': <__main__.DecisionTree at 0x28b4b76dd30>}

In [135]:
x_test = pd.DataFrame([['overcast','hot','high',False]],columns= list(df.columns.values[:-1]))

In [137]:
x_test

Unnamed: 0,Outlook,Temperature,Humidity,Windy
0,overcast,hot,high,False


In [140]:
model.predict(x_test)

'yes'