In [10]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

### Read the dataset

In [11]:
# Load the dataset from careval.csv (path is relative to the working directory).
edf = pd.read_csv("careval.csv")
# edf.head()
# Print a summary of the columns: names, non-null counts and dtypes.
edf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying        1728 non-null object
maint         1728 non-null object
doors         1728 non-null object
persons       1728 non-null object
lug_boot      1728 non-null object
safety        1728 non-null object
evaluation    1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


In [12]:
# Preview the first rows of the dataset.
edf.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,evaluation
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


You will create a method called IUFS (impurity-based univariate feature selection), which selects the most informative features using a univariate feature selection scheme. The method takes as input the dataset, the name of the target variable, the number of features to select (k), and the measure of impurity, and outputs the names of the k best features ranked by information gain. You are expected to implement the information gain, entropy, and Gini index functions. Note that this is a univariate selection, which means that each feature must be evaluated individually.

In [13]:
# entropy (H)


def entropy(feature, dataset, target=None):
    """Calculates the weighted (conditional) entropy of the target after splitting on a feature.

    For every category of ``feature`` the entropy of the target classes among
    the rows of that category is computed; the per-category entropies are then
    averaged, weighted by the share of rows that fall into each category.

    Parameters
    ----------
    feature: str
        name of the feature to split on
    dataset: pd.DataFrame
        dataframe for the dataset
    target: str, optional
        name of the target variable; defaults to the last column of ``dataset``
    Returns
    -------
    float
        weighted entropy (in bits) of the target given the feature;
        0.0 for an empty dataset
    """

    assert feature in dataset.columns, "Invalid feature value!"
    if target is None:
        target = dataset.columns[-1]
    assert target in dataset.columns, "Invalid target value!"

    total = len(dataset)
    if total == 0:
        return 0.0

    # Work on plain arrays so no index alignment is involved; this also keeps
    # the call valid when feature and target name the same column.
    categories = dataset[feature].to_numpy()
    labels = pd.Series(dataset[target].to_numpy())

    entropy_sum = 0.0
    # Single pass over the data: each group holds the target labels of one
    # feature category (groupby skips rows whose feature value is NaN).
    for _, group in labels.groupby(categories, sort=False):
        # value_counts only reports classes that actually occur, so every
        # probability is > 0 and log2 is always defined (no epsilon needed).
        probs = group.value_counts().to_numpy() / len(group)
        entropy_category = -np.sum(probs * np.log2(probs))
        entropy_sum += (len(group) / total) * entropy_category

    return float(entropy_sum)


# Example: weighted entropy of the last column ("evaluation") after splitting on "buying".
entropy("buying", edf)

1.1092920008425613

In [14]:
# gini index (Gini)


def gini(feature, dataset, target=None):
    """Calculates the weighted Gini index of the target after splitting on a feature.

    For every category of ``feature`` the Gini impurity of the target classes
    among the rows of that category is computed; the per-category impurities
    are then averaged, weighted by the share of rows in each category.

    Parameters
    ----------
    feature: str
        name of the feature to split on
    dataset: pd.DataFrame
        dataframe for the dataset
    target: str, optional
        name of the target variable; defaults to the last column of ``dataset``
    Returns
    -------
    float
        weighted Gini index of the target given the feature;
        0.0 for an empty dataset
    """

    assert feature in dataset.columns, "Invalid feature value!"
    if target is None:
        target = dataset.columns[-1]
    assert target in dataset.columns, "Invalid target value!"

    total = len(dataset)
    if total == 0:
        return 0.0

    # Work on plain arrays so no index alignment is involved; this also keeps
    # the call valid when feature and target name the same column.
    categories = dataset[feature].to_numpy()
    labels = pd.Series(dataset[target].to_numpy())

    gini_sum = 0.0
    # Single pass over the data: each group holds the target labels of one
    # feature category (groupby skips rows whose feature value is NaN).
    for _, group in labels.groupby(categories, sort=False):
        # Class probabilities inside this category.
        probs = group.value_counts().to_numpy() / len(group)
        gini_category = 1.0 - np.sum(probs ** 2)
        gini_sum += (len(group) / total) * gini_category

    return float(gini_sum)


# Example: weighted Gini index of the last column ("evaluation") after splitting on "buying".
gini("buying", edf)

0.4429976851851852

In [15]:
# information gain (IG)


def IG(feature, target, dataset, measure):
    """Calculates the information gain of a feature for a given target variable and a dataset.

    The gain is the impurity of the target over the whole dataset minus the
    weighted impurity of the target after splitting on ``feature``.

    Parameters
    ----------
    feature: str
        name of the feature
    target: str
        name of the target variable
    dataset: pd.DataFrame
        dataframe for the dataset
    measure: str ('entropy' or 'gini')
        measure of impurity to be used
    Returns
    -------
    float
        information gain for the feature in the dataset for a given target variable
    Raises
    ------
    AssertionError
        if feature/target are not columns of the dataset or measure is unknown
    """

    assert feature in dataset.columns, "Invalid feature value!"
    assert target in dataset.columns, "Invalid target value!"
    assert measure in ["entropy", "gini"], "Invalid measure value!"

    def _class_probabilities(labels):
        # Relative frequency of every class that actually occurs in `labels`;
        # zero counts are dropped so log2 below is always defined.
        counts = labels.value_counts().to_numpy()
        return counts[counts > 0] / len(labels)

    def _entropy_sample(labels):
        # Entropy (in bits) of the class distribution before any split.
        probs = _class_probabilities(labels)
        return -np.sum(probs * np.log2(probs))

    def _gini_sample(labels):
        # Gini impurity of the class distribution before any split.
        probs = _class_probabilities(labels)
        return 1.0 - np.sum(probs ** 2)

    # entropy()/gini() score against the LAST column of the frame they are
    # given, so make sure the requested target sits there; otherwise the two
    # terms of the gain would refer to different variables.
    if dataset.columns[-1] != target:
        reordered = [column for column in dataset.columns if column != target]
        dataset = dataset[reordered + [target]]

    # Compare by value: `is` checks object identity and only works for
    # strings that happen to be interned.
    if measure == "entropy":
        return _entropy_sample(dataset[target]) - entropy(feature, dataset)
    elif measure == "gini":
        return _gini_sample(dataset[target]) - gini(feature, dataset)
    else:
        raise AssertionError("Invalid measure value!")


# Example: Gini-based information gain of "buying" with respect to "evaluation".
IG("buying", "evaluation", edf, "gini")

0.014286077889231918

In [16]:
def IUFS(target, dataset, k, measure="entropy"):
    """Finds k most informative features in the given dataset based on the target variable
        using information gain with the selected measure.

    Every column other than ``target`` is scored on its own (univariate
    selection) and the k highest-scoring feature names are returned.

    Parameters
    ----------
    target: str
        name of the target variable
    dataset: pd.DataFrame
        dataframe for the dataset
    k: int
        number of features to return; must satisfy 0 < k < len(dataset.columns),
        i.e. at most the number of descriptive features in the dataset.
    measure: str, 'entropy' or 'gini'
        measure of impurity
    Returns
    -------
    list
        k feature names ordered from highest to lowest information gain;
        features with equal gain keep their column order
    """
    assert target in dataset.columns, "Invalid target value!"
    assert measure in ["entropy", "gini"], "Invalid measure value!"
    assert 0 < k < len(dataset.columns), "Invalid k value!"

    # Candidate features are all columns except the target, wherever the
    # target happens to sit in the frame.
    features = [column for column in dataset.columns if column != target]
    # Keep the target as the last column: the impurity helpers condition on
    # the last column by default.
    ordered = dataset[features + [target]]

    info_gain = {
        feature: IG(feature, target, ordered, measure) for feature in features
    }

    # sorted() is stable, so ties are resolved by original column order.
    ranked = sorted(features, key=info_gain.get, reverse=True)
    return ranked[:k]


# Example: the 2 most informative features for "evaluation" by entropy-based information gain.
IUFS("evaluation", edf, 2, measure="entropy")

['safety', 'persons']

In [17]:
def GR(feature, target, dataset, measure):
    """Calculates the gain ratio of a feature for a given target variable and a dataset.

    The gain ratio is the information gain of the feature divided by its split
    information (the entropy of the feature's own value distribution). The
    split information is always entropy-based, also when ``measure`` is 'gini'.

    Parameters
    ----------
    feature: str
        name of the feature
    target: str
        name of the target variable
    dataset: pd.DataFrame
        dataframe for the dataset
    measure: str ('entropy' or 'gini')
        measure of impurity to be used for the information gain
    Returns
    -------
    float
        gain ratio for the feature in the dataset for a given target variable;
        0.0 when the split information is zero (e.g. the feature has a single
        category), since such a feature cannot separate the data
    """

    assert feature in dataset.columns, "Invalid feature value!"
    assert target in dataset.columns, "Invalid target value!"
    assert measure in ["entropy", "gini"], "Invalid measure value!"

    def _split_info(values):
        # Entropy (in bits) of the feature's category distribution. Zero
        # counts are dropped so log2 is always defined.
        counts = values.value_counts().to_numpy()
        probs = counts[counts > 0] / len(values)
        return -np.sum(probs * np.log2(probs))

    info_gain = IG(feature, target, dataset, measure)
    split_info = _split_info(dataset[feature])

    # Guard against dividing by zero, which would yield nan/inf and make any
    # ranking built on top of this score unreliable.
    if split_info == 0:
        return 0.0

    return info_gain / split_info


# Example: gain ratio of "buying" with respect to "evaluation", using the Gini measure for the gain.
GR("buying", "evaluation", edf, "gini")

0.007143038944615959

In [18]:
def IUFS2(target, dataset, k, measure="entropy", gain="IG"):
    """Finds k most informative features in the given dataset based on the target variable
        using information gain or gain ratio with the selected measure.

    Every column other than ``target`` is scored on its own (univariate
    selection) and the k highest-scoring feature names are returned.

    Parameters
    ----------
    target: str
        name of the target variable
    dataset: pd.DataFrame
        dataframe for the dataset
    k: int
        number of features to return; must satisfy 0 < k < len(dataset.columns),
        i.e. at most the number of descriptive features in the dataset.
    measure: str, 'entropy' or 'gini'
        measure of impurity
    gain: str, 'IG' or 'GR'
        feature selection metric ('IG' for information gain, 'GR' for gain ratio)
    Returns
    -------
    list
        k feature names ordered from highest to lowest score;
        features with equal score keep their column order
    """

    assert target in dataset.columns, "Invalid target value!"
    assert measure in ["entropy", "gini"], "Invalid measure value!"
    assert 0 < k < len(dataset.columns), "Invalid k value!"
    assert gain in ["IG", "GR"], "Invalid gain value!"

    # Candidate features are all columns except the target, wherever the
    # target happens to sit in the frame.
    features = [column for column in dataset.columns if column != target]
    # Keep the target as the last column: the impurity helpers condition on
    # the last column by default.
    ordered = dataset[features + [target]]

    # Pick the scoring function by value; `is` would compare object identity
    # and could silently match neither branch.
    score = IG if gain == "IG" else GR

    gain_metric = {
        feature: score(feature, target, ordered, measure) for feature in features
    }

    # sorted() is stable, so ties are resolved by original column order.
    ranked = sorted(features, key=gain_metric.get, reverse=True)
    return ranked[:k]


# Example: the 2 best features for "evaluation" ranked by gain ratio with the Gini measure.
IUFS2("evaluation", edf, 2, measure="gini", gain="GR")

['safety', 'persons']