In [1]:
import numpy as np
import pandas as pd

In [2]:
# Weather toy dataset: categorical feature columns plus the 'Play' label.
WEATHER_CSV_URL = 'https://raw.githubusercontent.com/bitstorm/decision-tree-example/master/src/main/java/org/bitstorm/wekaexample/weather.csv'
data = pd.read_csv(WEATHER_CSV_URL)
data.head()

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [3]:
def _calculate_weighted_entropy(data, target):
    """
    It calculates weighted entropy of dataframe and returns entropy
    for each attribute.
    
    :param data: Dataframe
        Pandas dataframe.
    :param target: str
        Target column name
    :return features_entropy: dict()
        Returns entropy of each attribute in a dictionary.
    """
    
    features_entropy = {}
    features = list(data.columns)[:-1]
    classes = list(data[target].unique())
    
    #This loop iterates over every feature in dataset..
    for feature in features:
        feature_categories = list(data[feature].unique())
        
        entropy_sum = 0.0
        #This loop calculates weighted_entropy...
        for category in feature_categories:
            prob_category = len(data[data[feature] == category])/len(data[feature])
            
            entropy_category = 0.0
            #This loop calculates entropy...
            for _class in classes:
                prob_subcategory = len(data[(data[feature] == category) & (data[data.columns[-1]] == _class)])/len(data[data[feature] == category])
                entropy_category += -1.0 * (prob_subcategory * np.log2(prob_subcategory+1e-323))
            
            entropy_sum += prob_category * entropy_category
        
        features_entropy[feature] = entropy_sum
     
    return features_entropy

In [4]:
#_calculate_weighted_entropy(data, 'Play')

In [5]:
#_calculate_weighted_entropy(data, 'Play')

In [6]:
def _calculate_weighted_gini(data, target):
    """
    It calculates weighted gini of dataframe and returns entropy
    for each attribute.
    
    :param data: Dataframe
        Pandas dataframe.
    :param target: str
        Target column name
    :return features_entropy: dict()
        Returns gini of each attribute in a dictionary.
    """
    
    features_gini = {}
    features = list(data.columns)[:-1]
    classes = list(data[target].unique())
    
    #This loop iterates over every feature in dataset..
    for feature in features:
        feature_categories = list(data[feature].unique())
        
        gini_sum = 0.0
        #This loop calculates weighted_gini...
        for category in feature_categories:
            prob_category = len(data[data[feature] == category])/len(data[feature])
            
            gini_category = 0.0
            #This loop calculates gini...
            for _class in classes:
                prob_subcategory = len(data[(data[feature] == category) & (data[data.columns[-1]] == _class)])/len(data[data[feature] == category])
                gini_category += prob_subcategory**2
            
            gini_category = 1 - gini_category
            gini_sum += prob_category * gini_category
        
        features_gini[feature] = gini_sum
     
    return features_gini

In [7]:
def _entropy_sample(target):
    """
    It calculates the entropy of target.
    
    :param target: Series
        Pandas Series of target feature in dataset.
    :return entropy: float
        Returns entropy of target variable.
    """
    
    classes = list(target.unique())
    
    entropy = 0.0
    for _class in classes:
        prob = len(target[target == _class])/len(target)
        entropy += prob * np.log2(prob+1e-323)
    
    entropy = -1.0 * entropy
    return entropy

In [8]:
def _gini_sample(target):
    """
    It calculates the gini of target.
    
    :param target: Series
        Pandas Series of target feature in dataset.
    :return entropy: float
        Returns gini of target variable.
    """
    
    classes = list(target.unique())
    
    gini = 0.0
    for _class in classes:
        prob = len(target[target == _class])/len(target)
        gini += prob**2
    
    gini = 1.0 - gini
    return gini

In [9]:
def _info_gain(impurity_sample, features_impurity):
    """
    It takes impurity(entropy or gini) of whole dataset and
    impurity of features and calculates Information Gain.
    
    :param impurity_sample: float
        Entropy or Gini calculated for target variable.
    :param features_impurity: dict()
        Calculated impurity(entropy or gini) for each feature variable.
    :return info_gain: dict()
        Returns dictionary containing I.G for each variable.
    """
    
    info_gain = {}
    for feature in list(features_impurity.keys()):
        info_gain[feature] = impurity_sample - features_impurity[feature]
    
    info_gain = {k: v for k, v in sorted(info_gain.items(), key=lambda item: item[1], reverse=True)}
    return info_gain

In [10]:
def IUFS(dataset, target, measure, k):
    """
    It calculates I.G for features based on measure either
    gini or entropy and returns names of k no of features
    along with their information gain.

    :param dataset: Dataframe
        Pandas Dataframe(dataset).
    :param target: str
        Target variable name.
    :param measure: str
        'entropy' or 'gini'
    :param k: int
        No of top features to retrieve.
    :return features_gain: dict()
        I.G of the k best variables in descending order.
    :raises AssertionError:
        If ``target`` is not a column of ``dataset``, ``measure`` is neither
        'entropy' nor 'gini', or ``k`` exceeds the number of feature columns.
    """

    # AssertionError is raised explicitly (rather than through `assert`) so the
    # validation is not stripped when Python runs with -O.
    if target not in dataset.columns:
        raise AssertionError("Invalid target value!")

    # Strings are compared by value; `is` only tests object identity and is
    # unreliable for strings that are not interned literals.
    if measure == 'entropy':
        weighted_impurity, sample_impurity = _calculate_weighted_entropy, _entropy_sample
    elif measure == 'gini':
        weighted_impurity, sample_impurity = _calculate_weighted_gini, _gini_sample
    else:
        raise AssertionError("Invalid measure parameter!")

    # Validate k before doing any computation.
    if not k <= len(dataset.columns) - 1:
        raise AssertionError("No of k can't be greater than total no of features!")

    features_impurity = weighted_impurity(dataset, target)
    # The sample impurity is taken from the dataset passed in, not from a
    # notebook-level global.
    features_gain = _info_gain(sample_impurity(dataset[target]), features_impurity)

    # _info_gain returns the gains best-first; keep only the first k entries.
    features_gain = {
        key: gain
        for i, (key, gain) in enumerate(features_gain.items())
        if i < k
    }

    return features_gain

In [11]:
# Rank the features by Gini-based information gain and show the top 4.
IUFS(dataset=data, target='Play', measure='gini', k=4)

{'Outlook': 0.11632653061224485,
 'Humidity': 0.09183673469387743,
 'Wind': 0.030612244897959162,
 'Temperature': 0.018707482993197244}

In [12]:
# Scratch check of operator precedence: `and` binds tighter than `or`, so the
# expression groups as (A and B) or (C and D). On bools, `|` and `&` give the
# same truth values as `or` and `and`, so it reduces to
# (True and True) or (False and False), which is True.
(True | False) and (True or False) or (True & False) and (True and False)

True

In [13]:
# Sanity check: 1e-323 is far below float64 resolution at 10, so
# 10 + 1e-323 == 10 and both logarithms are identical. An epsilon of this
# size only changes a log2 result when the argument is exactly 0.
np.log2(10) == np.log2(10+1e-323)

True