In [1]:
import pandas as pd
import numpy as np

Day | Outlook | Temperature | Humidity | Wind | Play Golf |
----|---------|-------------|----------|------|-----------|
D1 | Sunny | Hot | High | Weak | No |
D2 | Sunny | Hot | High | Strong | No | 
D3 | Overcast | Hot | High | Weak | Yes
D4 | Rain | Mild | High | Weak | Yes | 
D5 | Rain | Cool | Normal | Weak | Yes |
D6 | Rain | Cool | Normal | Weak | No |
D7 | Overcast | Cool | Normal | Strong | Yes |
D8 | Sunny | Mild | High | Weak | No |
D9 | Sunny | Cool | Normal | Weak | Yes |
D10 | Rain | Mild | Normal | Weak | Yes |
D11 | Sunny | Mild | Normal | Strong | Yes |
D12 | Overcast | Mild | High | Strong | Yes |
D13 | Overcast | Hot | Normal | Weak | Yes |
D14 | Rain | Mild | High | Strong | No |

In [2]:
# Load the play-golf dataset; 'Tennis.csv' is a relative path, so the file must
# sit in the notebook's working directory.
df = pd.read_csv('Tennis.csv')

In [4]:
df.head(14)

Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,Play Golf
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
5,D6,Rain,Cool,Normal,Weak,No
6,D7,Overcast,Cool,Normal,Strong,Yes
7,D8,Sunny,Mild,High,Weak,No
8,D9,Sunny,Cool,Normal,Weak,Yes
9,D10,Rain,Mild,Normal,Weak,Yes


### Information Gain
- To pick which feature to split on, we need a way of measuring how good a split is. This is where information gain and entropy come in.

$\text{Shannon entropy: } H(X) = -\sum\limits_{i = 1}^{n} P(x_{i})\log_{2}P(x_{i})$ \
$P(x_{i}) = $ probability of occurrence of value $x_{i}$
- High entropy: All the classes are nearly equally likely
- Low entropy: A few classes are likely; most of the classes are rarely observed
- Assume $0 \log_{2} 0 = 0$
- For a completely homogeneous dataset (all True or all False): entropy is 0
- If the dataset is equally divided (same number of True and False): entropy is 1


In [5]:
df.columns

Index(['Day', 'Outlook', 'Temperature', 'Humidity', 'Wind', 'Play Golf'], dtype='object')

In [7]:
# Distinct labels of the target column. The column is addressed by name
# (as the other cells do) rather than by position, so a change in column
# order cannot silently select a different column.
df['Play Golf'].unique()

array(['No', 'Yes'], dtype=object)

In [9]:
df[df['Play Golf'] == 'Yes']

Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,Play Golf
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
6,D7,Overcast,Cool,Normal,Strong,Yes
8,D9,Sunny,Cool,Normal,Weak,Yes
9,D10,Rain,Mild,Normal,Weak,Yes
10,D11,Sunny,Mild,Normal,Strong,Yes
11,D12,Overcast,Mild,High,Strong,Yes
12,D13,Overcast,Hot,Normal,Weak,Yes


In [10]:
# Shape of the target column, addressed by name instead of by position so a
# change in column order cannot silently select a different column.
df['Play Golf'].shape

(14,)

In [11]:
# Total number of observations in the target column. Addressed by name
# instead of by position so a change in column order cannot silently select
# a different column.
PG = df['Play Golf'].shape[0]

In [12]:
PG

14

In [17]:
# Number of days on which golf was NOT played.
# The label is matched explicitly instead of taking .unique()[0]: the order of
# .unique() follows first appearance in the data, so a differently ordered CSV
# would silently turn this into the 'Yes' count.
PG_No = int((df['Play Golf'] == 'No').sum())

In [18]:
PG_No

5

In [19]:
# Number of days on which golf WAS played.
# The label is matched explicitly instead of taking .unique()[1]: the order of
# .unique() follows first appearance in the data, so a differently ordered CSV
# would silently turn this into the 'No' count.
PG_Yes = int((df['Play Golf'] == 'Yes').sum())

In [20]:
PG_Yes

9

$\text{Shannon entropy: } H(X) = -\sum\limits_{i = 1}^{n} P(x_{i})\log_{2}P(x_{i})$

In [41]:
# WILL BE ON EXAM

In [21]:
PG_Yes/PG

0.6428571428571429

In [22]:
# Shannon entropy of the Play Golf label over the whole dataset:
# H = -p(Yes) * log2 p(Yes) - p(No) * log2 p(No)
p_play_yes = PG_Yes / PG
p_play_no = PG_No / PG
H_PG = -p_play_yes * np.log2(p_play_yes) - p_play_no * np.log2(p_play_no)

In [24]:
H_PG
# entropy of the overall Play Golf outcome (Yes vs. No)

0.9402859586706311

In [25]:
df.columns

Index(['Day', 'Outlook', 'Temperature', 'Humidity', 'Wind', 'Play Golf'], dtype='object')

In [26]:
# Total number of rows in the Outlook column; used later as the denominator
# for the per-value weights in the information-gain formula.
Outlook = len(df['Outlook'])
Outlook

14

In [27]:
df.groupby('Outlook')['Play Golf'].value_counts()

Outlook   Play Golf
Overcast  Yes          4
Rain      Yes          3
          No           2
Sunny     No           3
          Yes          2
Name: Play Golf, dtype: int64

In [29]:
# Number of Sunny days: tabulate (Outlook, Play Golf) counts, take the Sunny
# slice and add up its Yes/No counts.
outlook_play_counts = df.groupby('Outlook')['Play Golf'].value_counts()
sunny_counts = outlook_play_counts.loc['Sunny']
Sunny = sunny_counts.sum()
Sunny

5

In [30]:
# Number of Overcast days: tabulate (Outlook, Play Golf) counts, take the
# Overcast slice and add up its counts.
outlook_play_counts = df.groupby('Outlook')['Play Golf'].value_counts()
overcast_counts = outlook_play_counts.loc['Overcast']
Overcast = overcast_counts.sum()
Overcast

4

In [31]:
# Number of Rain days: tabulate (Outlook, Play Golf) counts, take the Rain
# slice and add up its Yes/No counts.
outlook_play_counts = df.groupby('Outlook')['Play Golf'].value_counts()
rain_counts = outlook_play_counts.loc['Rain']
Rain = rain_counts.sum()
Rain

5

In [32]:
# Total number of 'Yes' days, summed across every Outlook value.
outlook_play_counts = df.groupby('Outlook')['Play Golf'].value_counts()
yes_by_outlook = outlook_play_counts.loc[:, 'Yes']
Outlook_Y = yes_by_outlook.sum()
Outlook_Y

9

In [33]:
# Total number of 'No' days, summed across every Outlook value that has any.
outlook_play_counts = df.groupby('Outlook')['Play Golf'].value_counts()
no_by_outlook = outlook_play_counts.loc[:, 'No']
Outlook_N = no_by_outlook.sum()
Outlook_N

5

In [57]:
# Per-outlook Yes/No day counts.
# The (Outlook, Play Golf) count table is built once and reused, instead of
# re-running the same groupby for every single lookup.
outlook_play_counts = df.groupby('Outlook')['Play Golf'].value_counts()
sunny_counts = outlook_play_counts.loc['Sunny']
overcast_counts = outlook_play_counts.loc['Overcast']
rain_counts = outlook_play_counts.loc['Rain']

# .get(label, 0) falls back to 0 when a label never occurs for that outlook.
Sunny_Y = sunny_counts.get('Yes', 0)
Sunny_N = sunny_counts.get('No', 0)
Overcast_Y = overcast_counts.get('Yes', 0)
# Overcast has no 'No' rows in the count table, so this is the 0 fallback;
# defined so that every outlook value has both counts available.
Overcast_N = overcast_counts.get('No', 0)
Rain_Y = rain_counts.get('Yes', 0)
Rain_N = rain_counts.get('No', 0)

In [58]:
Sunny_Y

2

In [59]:
Sunny_N

3

In [60]:
Overcast_Y

4

In [61]:
Rain_Y

3

In [62]:
Rain_N

2

In [63]:
# Entropy of the Play Golf label over all days, i.e. the entropy before
# splitting on Outlook: -p(Yes) * log2 p(Yes) - p(No) * log2 p(No)
p_all_yes = Outlook_Y / Outlook
p_all_no = Outlook_N / Outlook
Ent_Outlook = -p_all_yes * np.log2(p_all_yes) - p_all_no * np.log2(p_all_no)

In [64]:
Ent_Outlook

0.9402859586706311

In [65]:
# Entropy of the Play Golf label restricted to Sunny days.
p_sunny_yes = Sunny_Y / Sunny
p_sunny_no = Sunny_N / Sunny
Ent_Sunny = -p_sunny_yes * np.log2(p_sunny_yes) - p_sunny_no * np.log2(p_sunny_no)
Ent_Sunny

0.9709505944546686

In [66]:
# Entropy of the Play Golf label restricted to Overcast days.
# Every Overcast day is a 'Yes' in this dataset (Overcast_Y == Overcast == 4), so
# the 'No' term is left out: its probability is 0 and 0*log2(0) is taken as 0
# (evaluating it directly would give nan).
# What remains is -1 * log2(1) = -1 * 0.0, which floating point displays as -0.0;
# numerically that is an entropy of 0 (a perfectly pure subset).
Ent_Overcast = - Overcast_Y/Overcast * np.log2(Overcast_Y/Overcast)
Ent_Overcast

-0.0

In [67]:
# Entropy of the Play Golf label restricted to Rain days.
p_rain_yes = Rain_Y / Rain
p_rain_no = Rain_N / Rain
Ent_Rain = -p_rain_yes * np.log2(p_rain_yes) - p_rain_no * np.log2(p_rain_no)
Ent_Rain

0.9709505944546686

### ID3 (Iterative Dichotomiser 3)
- The ID3 algorithm is used to build the decision tree
- It utilizes entropy and information gain to build the tree
- Uses Information Theory (Entropy) to split on the attribute that gives the highest information gain
- It is a top-down greedy search of possible branches

0.9709505944546686
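$Gain(S, Outlook) = Entropy(S) - \sum\limits_{v \in \{Sunny, Overcast, Rain\}} \frac{|S_{v}|}{|S|} Entropy(S_{v})$, where $S$ is the full dataset, $S_{v}$ is the subset of days with $Outlook = v$, and $Entropy(S)$ is the value stored in `Ent_Outlook`.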

In [68]:
# Information gain of splitting on Outlook: the pre-split entropy minus each
# subset's entropy weighted by the share of days that fall into that subset.
weighted_sunny = Sunny / Outlook * Ent_Sunny
weighted_overcast = Overcast / Outlook * Ent_Overcast
weighted_rain = Rain / Outlook * Ent_Rain
Gain_Outlook = Ent_Outlook - weighted_sunny - weighted_overcast - weighted_rain
Gain_Outlook

0.24674981977443933

In [69]:
df.columns

Index(['Day', 'Outlook', 'Temperature', 'Humidity', 'Wind', 'Play Golf'], dtype='object')