In [269]:
import pandas as pd
# Toy training set: one entry per student for each of the two categorical
# features and for the Yes/No target.
classification = ['Freshman', 'Freshman', 'Sophomore',
                  'Junior', 'Freshman', 'Sophomore']
hour_of_practice = ['>2h', '>2h', '>2h',
                    '<2h', '>2h', '<2h']
pass_the_quiz = ['Yes', 'Yes', 'Yes',
                 'Yes', 'No', 'No']

# Pair each column label with its list of values, in display order.
df = pd.DataFrame(dict(zip(
    ['Classification', 'hour of practice', 'Pass the quiz'],
    [classification, hour_of_practice, pass_the_quiz],
)))
df

Unnamed: 0,Classification,hour of practice,Pass the quiz
0,Freshman,>2h,Yes
1,Freshman,>2h,Yes
2,Sophomore,>2h,Yes
3,Junior,<2h,Yes
4,Freshman,>2h,No
5,Sophomore,<2h,No


## Gini Impurity & Gini Index

**Gini Impurity**
$$\text{Gini($K$)}=\sum_{i\in N}P_{i,K}(1-P_{i,K})=1-\sum_{i\in N}P_{i,K}^2$$
* $N$ is the list of classes (In this case $N=\{\text{'Yes'},\text{'No'}\}$)
* $K$ is the category
* $P_{i,K}$ is the probability of category $K$ having class $i$ 

$\text{Gini}(F)=1-(P_{\text{No}}^2+P_{\text{Yes}}^2) =1-((\frac{1}{3})^2 +(\frac{2}{3})^2)=\frac{4}{9}$

$\text{Gini}(S)=1-((\frac{1}{2})^2 +(\frac{1}{2})^2)=0.5$

$\text{Gini}(J)=1-(0^2+1^2)=0$

**Gini Index**
$$I_{\text{Gini}}(a)=\sum_{k\in M}P_{k,a}\cdot\text{Gini($k$)}$$
* $a$ is the feature
* $M$ is the list of all categories in feature $a$
* $P_{k,a}$ is the fraction of observations that fall in category $k$ of feature $a$

Let $a$ = "Classification". Then
$I_{\text{Gini}}(a)=P_{ \text{F},a}\cdot \text{Gini}(\text{F})+P_{\text{S},a}\cdot \text{Gini}(\text{S})+P_{\text{ J},a}\cdot \text{Gini}(\text{J})$
$I_{\text{Gini}}(a)=\frac{1}{2}\cdot\frac{4}{9}+\frac{1}{3}\cdot 0.5+\frac{1}{6}\cdot 0=\frac{7}{18}$

In [276]:
# Input Format(df, feature name, category name ,target name, list of all classes)
# where df is the current node observation

def gini(df,feature,category,target,classes_list):
    """Gini impurity of the rows of ``df`` whose ``feature`` equals ``category``.

    Computes ``1 - sum(p_i ** 2)``, where ``p_i`` is the share of the selected
    rows whose ``target`` value is class ``i``, for every ``i`` in
    ``classes_list``.
    """
    # Keep only the observations that belong to the requested category.
    subset = df[df[feature] == category]

    squared_shares = []
    for label in classes_list:
        share = len(subset[subset[target] == label]) / len(subset)
        squared_shares.append(share ** 2)

    return 1 - sum(squared_shares)

# Input Format(df, feature name, target name, list of all classes)
def Gini_index(df,feature,target,classes_list):
    """Weighted Gini impurity of ``target`` after splitting ``df`` on ``feature``.

    Each distinct value of ``feature`` contributes its Gini impurity (from
    ``gini``) multiplied by the fraction of rows of ``df`` that carry that value.
    """
    weighted_impurity = 0
    for value in df[feature].unique():
        node_impurity = gini(df, feature, value, target, classes_list)
        # Fraction of all rows that fall into this category.
        share = len(df[df[feature] == value]) / len(df)
        weighted_impurity += node_impurity * share
    return weighted_impurity

# Report the weighted Gini impurity obtained by splitting on each feature.
print(
    "Gini Index of Classification",
    Gini_index(df, feature="Classification", target="Pass the quiz",
               classes_list=['Yes', 'No']),
)
print(
    "Gini Index of hour of practice",
    Gini_index(df, feature="hour of practice", target="Pass the quiz",
               classes_list=['Yes', 'No']),
)

Gini Index of Classification 0.38888888888888884
Gini Index of hour of practice 0.41666666666666663


# Test

In [277]:
# Integer-encode every column of the toy dataset; `assign` returns a new
# frame, so `df` itself is left untouched.
df_new = df.assign(**{
    'Classification': df['Classification'].map(
        {'Freshman': 1, 'Sophomore': 2, 'Junior': 3}),
    'hour of practice': df['hour of practice'].map({'>2h': 1, '<2h': 0}),
    'Pass the quiz': df['Pass the quiz'].map({'Yes': 1, 'No': 0}),
})
df_new

Unnamed: 0,Classification,hour of practice,Pass the quiz
0,1,1,1
1,1,1,1
2,2,1,1
3,3,0,1
4,1,1,0
5,2,0,0


In [278]:
from sklearn import tree

# Feature matrix: the two integer-encoded columns; target: the encoded quiz result.
X = df_new[['Classification','hour of practice']].values
Y = df_new['Pass the quiz'].values
# scikit-learn permutes the features at every split, so when several splits
# improve the criterion equally the fitted tree can differ between runs.
# Pinning random_state makes the fit (and the predictions below) reproducible.
clf = tree.DecisionTreeClassifier(min_samples_split=2, random_state=0)
clf = clf.fit(X, Y)

In [279]:
# Inputs are [Classification, hour of practice] using the integer encoding of
# df_new: 1=Freshman, 2=Sophomore, 3=Junior and 1='>2h', 0='<2h'.
# The first query is a Freshman with LESS than 2h of practice ([1, 0]), so its
# label must read '<2h'.
print("Freshman <2h", clf.predict([[1, 0]]))
print("Sophomore >2h", clf.predict([[2, 1]]))
print("Junior >2h", clf.predict([[3, 1]]))

Freshman >2h [0]
Sophomore >2h [1]
Junior >2h [1]


## Entropy & Information Gain

**Entropy**

$$H(E)=-\sum_{i\in N,\ P_{i,E}\neq 0}P_{i,E}\log_2P_{i,E}$$

Classes with $P_{i,E}=0$ are skipped, following the convention $0\cdot\log_2 0=0$. In particular, $H(E)=0$ when every observation in $E$ belongs to a single class.

* N is the list of all classes

* $P_{i,E}$ is the probability of event $E$ having class $i$ 

$H(F)=-(P_{\text{Yes,F}}\cdot \log_2P_{\text{Yes,F}}+P_{\text{No,F}}\cdot \log_2P_{\text{No,F}})=-(\frac{2}{3}\log_2\frac{2}{3}+\frac{1}{3}\log_2\frac{1}{3})=0.9183$
$H(S)=-(\frac{1}{2}\log_2\frac{1}{2}+\frac{1}{2}\log_2\frac{1}{2})=1$
Since $P_{\text{No,J}}=0$, 
$H(J)=0$  

**Info gain**
$$IG(T,a)=H(T)-H(T|a)=H(T)-\sum_{i\in K} P_{i,a} \cdot H(i)$$
* $T$ is the sample space
* $a$ is the feature
* $H(T|a)$ can be understood as the weighted sum of the entropies of all categories of $a$

Let $a$ = "Classification". We have
$H(T)=-(P_{\text{Yes,T}}\cdot \log_2P_{\text{Yes,T}}+P_{\text{No,T}}\cdot \log_2P_{\text{No,T}})=-(\frac{2}{3}\log_2\frac{2}{3}+\frac{1}{3}\log_2\frac{1}{3})=0.918$

$\sum_{i\in K} P_{i,a} \cdot H(i)=P_{F,a}\cdot H(F)+P_{S,a}\cdot H(S)+P_{J,a}\cdot H(J)=\frac{1}{2}\cdot 0.918+\frac{1}{3}\cdot 1+\frac{1}{6}\cdot 0=0.792$

Hence $IG(T,a)=H(T)-H(T|a)=0.918-0.792=0.126$

In [282]:
# Input Format(df, feature name, category name, target name, list of all classes)
# Pass None for both feature and category to get the entropy of the whole observation
import math
def entropy(df,feature,category,target,classes_list):
    """Return the base-2 Shannon entropy of ``target`` over a set of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations of the current node.
    feature, category
        When both are given, only the rows where ``df[feature] == category``
        are used. When either one is ``None`` the whole of ``df`` is used.
    target : str
        Name of the column holding the class labels.
    classes_list : list
        Class labels to sum over.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the classes whose proportion ``p`` in the
        selected rows is non-zero.

    Raises
    ------
    ZeroDivisionError
        If no rows are selected while ``classes_list`` is non-empty.
    """
    # Filter only when BOTH arguments are supplied. The previous `|` test also
    # filtered when just one was given, which selected no rows (or looked up a
    # column named None) and crashed instead of falling back to the whole frame.
    if feature is not None and category is not None:
        df = df[df[feature] == category]

    result = 0
    for class_ in classes_list:
        proportion = len(df[df[target] == class_]) / len(df)
        # 0 * log(0) is taken to be 0, so absent classes contribute nothing.
        if proportion != 0:
            # Subtracting term by term avoids returning -0.0 for a pure node.
            result -= proportion * math.log(proportion, 2)
    return result


# Input Format(df, feature name, target name, list of all classes)
def InfoGain(df,feature,target,classes_list):
    """Information gain of ``target`` obtained by splitting ``df`` on ``feature``.

    This is the entropy of the whole of ``df`` minus the weighted sum of the
    entropies of each category of ``feature``, where each weight is the
    fraction of rows of ``df`` that fall in that category.
    """
    # Entropy before the split (no feature/category filter).
    baseline = entropy(df, None, None, target, classes_list)

    conditional = 0
    for value in df[feature].unique():
        weight = len(df[df[feature] == value]) / len(df)
        conditional += weight * entropy(df, feature, value, target, classes_list)

    return baseline - conditional

# Report the information gain obtained by splitting on each feature.
print(
    "Information Gain of Classification",
    InfoGain(df, feature='Classification', target='Pass the quiz',
             classes_list=['Yes', 'No']),
)
print(
    "Information Gain of hour of practice",
    InfoGain(df, feature='hour of practice', target='Pass the quiz',
             classes_list=['Yes', 'No']),
)

Information Gain of Classification 0.12581458369391152
Information Gain of hour of practice 0.044110417748401076


In [281]:
# for category in df['Classification'].unique():
#     print(category+" Entropy:",
#           entropy(df,'Classification',category,'Pass the quiz',['Yes','No']))