In [1]:
import requests
from io import StringIO
import pandas as pd
import numpy as np
import math

In [2]:
# Download the golf dataset and parse it into a DataFrame.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors (404, 500, ...) immediately
# instead of letting an error page be parsed as if it were CSV.
r = requests.get(
    'https://raw.githubusercontent.com/serengil/decision-trees-for-ml/master/dataset/golf.txt',
    timeout=30,
)
r.raise_for_status()
s = r.content.decode('utf-8')  # the payload is decoded explicitly as UTF-8
data = StringIO(s)  # wrap the text in a file-like object for read_csv
df = pd.read_csv(data)

In [3]:
# Display the DataFrame parsed from the downloaded CSV as this cell's output.
df

Unnamed: 0,Outlook,Temp.,Humidity,Wind,Decision
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


### Decision Tree
The entropy of the dataset $D$ is given by
$$S(D) = -\sum_{i=1}^{N} p(x_i)\log_2 p(x_i)$$
where $N$ is the number of classes and $p(x_i)$ is the fraction of rows that belong to class $x_i$.

The conditional entropy of the dataset, given that column $c$ takes the value $j$, is
$$S(D|c=j) = -\sum_{i=1}^{N} p(x_i|c=j)\log_2 p(x_i|c=j)$$

The information gain of column $c$ is then
$$G(c) = S(D)-\sum_{j=1}^{M} p(c=j)\,S(D|c=j)$$
where $M$ is the number of distinct values in column $c$.

Split the dataset into sub-datasets, one per value of the column with the highest gain (the dominant factor), drop that column, and repeat the process on each sub-dataset until no column is left.

In [4]:
# Base-2 entropy of the "Decision" column, accumulated one class at a time.
classes = df['Decision'].unique()
shape = df.shape  # (rows, cols)
n_rows = shape[0]
S = 0
for label in classes:
    # Fraction of rows whose decision equals this class label.
    p = (df["Decision"] == label).sum() / n_rows
    S = S - p * math.log(p, 2)
print("Entropy: ", S)

Entropy:  0.9402859586706309


#### General 

In [8]:
class Decision_Tree:
    """Entropy and information-gain calculator for a categorical DataFrame.

    This class does not build a tree; it computes the base-2 entropy of the
    label column and the information gain of every other column, which is
    the split criterion used when growing a decision tree.

    Attributes:
        df: The DataFrame being analysed.
        label_col: Name of the column holding the class labels.
        shape: ``df.shape`` captured at construction time.
        classes: Unique values of the label column in ``df``.
        S: Entropy of the label column over the whole of ``df``.
    """

    def __init__(self, df, label_col="Decision"):
        """Store the frame and compute its label entropy.

        Args:
            df: DataFrame containing the feature columns and the label column.
            label_col: Name of the label column.
        """
        self.df = df
        self.label_col = label_col
        self.shape = df.shape
        self.S = self.ds_entropy()

    def _entropy(self, labels):
        """Return the base-2 entropy of ``labels`` over ``self.classes``."""
        n_rows = len(labels)
        if n_rows == 0:
            # An empty group carries no information and must not divide by zero.
            return 0
        entropy = 0
        for label in self.classes:
            p = (labels == label).sum() / n_rows
            # By convention 0 * log(0) contributes 0; skipping also avoids
            # math.log raising on p == 0.
            if p > 0:
                entropy += -p * math.log(p, 2)
        return entropy

    def ds_entropy(self):
        """Compute, print and return the entropy of the label column.

        Also sets ``self.classes`` to the unique labels found in ``self.df``
        (this instance's own frame, not any notebook-level variable).
        """
        self.classes = self.df[self.label_col].unique()
        S = self._entropy(self.df[self.label_col])

        print("Entropy: ", S)
        return S

    def gain(self, col_name="Humidity"):
        """Compute, print and return the information gain of one column.

        The gain is the dataset entropy minus the size-weighted entropy of
        the labels within each distinct value of ``col_name``.

        Args:
            col_name: Name of the column to evaluate.

        Returns:
            The information gain of ``col_name``.
        """
        labels = self.df[self.label_col]
        # Numbers are immutable, so no copy is needed before accumulating.
        G = self.S
        for value in self.df[col_name].unique():
            in_group = self.df[col_name] == value
            group_size = in_group.sum()
            S_condition = self._entropy(labels[in_group])
            G += -group_size / self.shape[0] * S_condition
        print(f"{col_name} Gain: ", G)
        return G

    def rank_gain(self):
        """Rank the feature columns by information gain, best first.

        Returns:
            An integer array of positions into ``self.df.columns``, ordered
            from highest to lowest gain, so ``self.df.columns[result]`` gives
            the ranked feature names wherever the label column sits. When the
            label column is the last column, these positions coincide with
            indices into the list of feature columns.
        """
        feature_positions = []
        gains = []
        for position, col_name in enumerate(self.df.columns):
            if col_name == self.label_col:
                continue
            feature_positions.append(position)
            gains.append(self.gain(col_name=col_name))
        order = np.argsort(np.array(gains))[::-1]
        # Map ranks within the gain list back to positions in df.columns.
        ranks = np.array(feature_positions, dtype=np.intp)[order]
        return ranks

In [9]:
# Rank the feature columns of the full dataset by information gain and show
# both the rank indices and the column labels they select.
dt = Decision_Tree(df)
ranks = np.array(dt.rank_gain())
ranked_columns = dt.df.columns[ranks]
print(ranks, ranked_columns)

Entropy:  0.9402859586706309
Outlook Gain:  0.2467498197744391
Temp. Gain:  0.029222565658954647
Humidity Gain:  0.15183550136234136
Wind Gain:  0.04812703040826932
[0 2 3 1] Index(['Outlook', 'Humidity', 'Wind', 'Temp.'], dtype='object')


In [7]:
# Show the column labels of df.
df.columns

Index(['Outlook', 'Temp.', 'Humidity', 'Wind', 'Decision'], dtype='object')

In [10]:
# Keep only the rows where Outlook is "Sunny", then drop the Outlook column,
# which is constant within this subset.
sunny_rows = df["Outlook"] == "Sunny"
sub_df = df[sunny_rows].drop(columns=["Outlook"])
sub_df

Unnamed: 0,Temp.,Humidity,Wind,Decision
0,Hot,High,Weak,No
1,Hot,High,Strong,No
7,Mild,High,Weak,No
8,Cool,Normal,Weak,Yes
10,Mild,Normal,Strong,Yes


In [11]:
# Repeat the gain ranking on the subset stored in sub_df.
dt2 = Decision_Tree(sub_df)
ranks2 = np.array(dt2.rank_gain())
ranked_columns2 = dt2.df.columns[ranks2]
print(ranks2, ranked_columns2)

Entropy:  0.9709505944546686
Temp. Gain:  0.5709505944546686
Humidity Gain:  0.9709505944546686
Wind Gain:  0.01997309402197489
[1 0 2] Index(['Humidity', 'Temp.', 'Wind'], dtype='object')


### References
1. https://medium.com/analytics-vidhya/mathematics-behind-decision-tree-73ee2ef82164