In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import matplotlib.pyplot as plt
import pandas as pd

from collections import Counter
from math import log


from sklearn import tree
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.tree import DecisionTreeClassifier




from categorical_features_id3 import GiniRunner, EntropyRunner, TreeBaseRunner


<IPython.core.display.Javascript object>

# Create the data

In [3]:
OUTLOOK = [
    "sunny",
    "sunny",
    "overcast",
    "rainy",
    "rainy",
    "rainy",
    "overcast",
    "sunny",
    "sunny",
    "rainy",
    "sunny",
    "overcast",
    "overcast",
    "rainy",
]

TEMPERATURE = [
    "hot",
    "hot",
    "hot",
    "mild",
    "cool",
    "cool",
    "cool",
    "mild",
    "cool",
    "mild",
    "mild",
    "mild",
    "hot",
    "mild",
]
HUMIDITY = [
    "high",
    "high",
    "high",
    "high",
    "normal",
    "normal",
    "normal",
    "high",
    "normal",
    "normal",
    "normal",
    "high",
    "normal",
    "high",
]
WINDY = [
    False,
    True,
    False,
    False,
    False,
    True,
    True,
    False,
    False,
    False,
    True,
    True,
    False,
    True,
]
TARGET = [
    "no",
    "no",
    "yes",
    "yes",
    "yes",
    "no",
    "yes",
    "no",
    "yes",
    "yes",
    "yes",
    "yes",
    "yes",
    "no",
]

<IPython.core.display.Javascript object>

In [4]:
df = pd.DataFrame(
    {
        "outlook": OUTLOOK,
        "temperature": TEMPERATURE,
        "humidity": HUMIDITY,
        "windy": WINDY,
        "target": TARGET,
    }
)
df["windy"] = df["windy"].astype("str")

df

Unnamed: 0,outlook,temperature,humidity,windy,target
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


<IPython.core.display.Javascript object>

In [5]:
X = df.drop("target", axis=1)
y = df["target"]

<IPython.core.display.Javascript object>

In [6]:
def normalized_counts(df, feature):
    return df[feature].value_counts(normalize=True)


def calc_entropy_target(df, target: str):
    normed_counts = normalized_counts(df, target)
    return -sum(normed_counts * normed_counts.map(lambda x: log(x, 2)))


def calc_entropy_feature(df, feature: str, target: str):
    entropy = 0
    feature_labels = df[feature].unique()
    feature_labels_prob = normalized_counts(df, feature)

    for label in feature_labels:
        masked_df = df[df[feature] == label]
        label_entropy = calc_entropy_target(masked_df, target)

        label_prob = feature_labels_prob[label]

        entropy += label_prob * label_entropy

        print(f"{label} entropy: {label_entropy}")
    print(f"{feature} entropy: {entropy}")
    return entropy


def gain_information_feature(df, feature, target):
    target_entropy = calc_entropy_target(df, target)
    feature_entropy = calc_entropy_feature(df, feature, target)
    return target_entropy - feature_entropy


for feature_name in set(df.columns).difference(["target"]):
    print(
        f'{feature_name} gain information: {gain_information_feature(df, feature_name, "target")}'
    )
    print("\n")

False entropy: 0.8112781244591328
True entropy: 1.0
windy entropy: 0.8921589282623617
windy gain information: 0.04812703040826927


high entropy: 0.9852281360342516
normal entropy: 0.5916727785823275
humidity entropy: 0.7884504573082896
humidity gain information: 0.15183550136234136


hot entropy: 1.0
mild entropy: 0.9182958340544896
cool entropy: 0.8112781244591328
temperature entropy: 0.9110633930116763
temperature gain information: 0.029222565658954647


sunny entropy: 0.9709505944546686
overcast entropy: -0.0
rainy entropy: 0.9709505944546686
outlook entropy: 0.6935361388961918
outlook gain information: 0.2467498197744391




<IPython.core.display.Javascript object>

The outlook feature has the highest gain information so that is going to be the first node we will split the decision tree. Thus we will have three branches:
- sunny_df
- overcast_df
- rainy_df

In [7]:
for feature_name in set(df[df["outlook"] == "sunny"].columns).difference(["target"]):
    print(
        f'{feature_name} gain information: {gain_information_feature(df[df["outlook"] == "sunny"], feature_name, "target")}\n'
    )

False entropy: 0.9182958340544896
True entropy: 1.0
windy entropy: 0.9509775004326937
windy gain information: 0.01997309402197489

high entropy: -0.0
normal entropy: -0.0
humidity entropy: 0.0
humidity gain information: 0.9709505944546686

hot entropy: -0.0
mild entropy: 1.0
cool entropy: -0.0
temperature entropy: 0.4
temperature gain information: 0.5709505944546686

sunny entropy: 0.9709505944546686
outlook entropy: 0.9709505944546686
outlook gain information: 0.0



<IPython.core.display.Javascript object>

In [8]:
for feature_name in set(df[df["outlook"] == "rainy"].columns).difference(["target"]):
    print(
        f'{feature_name} gain information: {gain_information_feature(df[df["outlook"] == "rainy"], feature_name, "target")}\n'
    )

False entropy: -0.0
True entropy: -0.0
windy entropy: 0.0
windy gain information: 0.9709505944546686

high entropy: 1.0
normal entropy: 0.9182958340544896
humidity entropy: 0.9509775004326937
humidity gain information: 0.01997309402197489

mild entropy: 0.9182958340544896
cool entropy: 1.0
temperature entropy: 0.9509775004326937
temperature gain information: 0.01997309402197489

rainy entropy: 0.9709505944546686
outlook entropy: 0.9709505944546686
outlook gain information: 0.0



<IPython.core.display.Javascript object>

In [9]:
for feature_name in set(df[df["outlook"] == "overcast"].columns).difference(["target"]):
    print(
        f'{feature_name} gain information: {gain_information_feature(df[df["outlook"] == "overcast"], feature_name, "target")}\n'
    )

False entropy: -0.0
True entropy: -0.0
windy entropy: 0.0
windy gain information: -0.0

high entropy: -0.0
normal entropy: -0.0
humidity entropy: 0.0
humidity gain information: -0.0

hot entropy: -0.0
cool entropy: -0.0
mild entropy: -0.0
temperature entropy: 0.0
temperature gain information: -0.0

overcast entropy: -0.0
outlook entropy: 0.0
outlook gain information: -0.0



<IPython.core.display.Javascript object>

1st level: OUTLOOK

2nd level
- Sunny: HUMIDITY
- Rainy: WINDY
- Overcast: -

3rd level:




In [10]:
mask = (df["outlook"] == "sunny") & (df["humidity"] == "high")
masked_df = df[mask]

for feature_name in set(masked_df.columns).difference(["target"]):
    print(
        f'{feature_name} gain information: {gain_information_feature(masked_df, feature_name, "target")}\n'
    )

masked_df

False entropy: -0.0
True entropy: -0.0
windy entropy: 0.0
windy gain information: -0.0

high entropy: -0.0
humidity entropy: 0.0
humidity gain information: -0.0

hot entropy: -0.0
mild entropy: -0.0
temperature entropy: 0.0
temperature gain information: -0.0

sunny entropy: -0.0
outlook entropy: 0.0
outlook gain information: -0.0



Unnamed: 0,outlook,temperature,humidity,windy,target
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
7,sunny,mild,high,False,no


<IPython.core.display.Javascript object>

In [11]:
mask = (df["outlook"] == "sunny") & (df["humidity"] == "normal")
masked_df = df[mask]

for feature_name in set(masked_df.columns).difference(["target"]):
    print(
        f'{feature_name} gain information: {gain_information_feature(masked_df, feature_name, "target")}\n'
    )
masked_df

False entropy: -0.0
True entropy: -0.0
windy entropy: 0.0
windy gain information: -0.0

normal entropy: -0.0
humidity entropy: 0.0
humidity gain information: -0.0

cool entropy: -0.0
mild entropy: -0.0
temperature entropy: 0.0
temperature gain information: -0.0

sunny entropy: -0.0
outlook entropy: 0.0
outlook gain information: -0.0



Unnamed: 0,outlook,temperature,humidity,windy,target
8,sunny,cool,normal,False,yes
10,sunny,mild,normal,True,yes


<IPython.core.display.Javascript object>

In [12]:
mask = (df["outlook"] == "rainy") & (df["windy"] == "False")
masked_df = df[mask]

for feature_name in set(masked_df.columns).difference(["target"]):
    print(
        f'{feature_name} gain information: {gain_information_feature(masked_df, feature_name, "target")}\n'
    )
masked_df

False entropy: -0.0
windy entropy: 0.0
windy gain information: -0.0

high entropy: -0.0
normal entropy: -0.0
humidity entropy: 0.0
humidity gain information: -0.0

mild entropy: -0.0
cool entropy: -0.0
temperature entropy: 0.0
temperature gain information: -0.0

rainy entropy: -0.0
outlook entropy: 0.0
outlook gain information: -0.0



Unnamed: 0,outlook,temperature,humidity,windy,target
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
9,rainy,mild,normal,False,yes


<IPython.core.display.Javascript object>

In [13]:
mask = (df["outlook"] == "rainy") & (df["windy"] == "True")
masked_df = df[mask]

for feature_name in set(masked_df.columns).difference(["target"]):
    print(
        f'{feature_name} gain information: {gain_information_feature(masked_df, feature_name, "target")}\n'
    )
masked_df

True entropy: -0.0
windy entropy: 0.0
windy gain information: -0.0

normal entropy: -0.0
high entropy: -0.0
humidity entropy: 0.0
humidity gain information: -0.0

cool entropy: -0.0
mild entropy: -0.0
temperature entropy: 0.0
temperature gain information: -0.0

rainy entropy: -0.0
outlook entropy: 0.0
outlook gain information: -0.0



Unnamed: 0,outlook,temperature,humidity,windy,target
5,rainy,cool,normal,True,no
13,rainy,mild,high,True,no


<IPython.core.display.Javascript object>

# Classifier

In [14]:
pipeline = ColumnTransformer(
    [
        (
            "ohe",
            OneHotEncoder(),
            ["outlook", "temperature", "humidity", "windy"],
        )
    ]
)

X_transformed = pipeline.fit_transform(X)
display(X_transformed)

array([[0., 0., 1., 0., 1., 0., 1., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 1., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1., 1., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0., 0., 1., 1., 0.],
       [0., 1., 0., 1., 0., 0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 0., 0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 0., 1., 1., 0., 1., 0.],
       [0., 0., 1., 1., 0., 0., 0., 1., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 1., 1., 0.],
       [0., 0., 1., 0., 0., 1., 0., 1., 0., 1.],
       [1., 0., 0., 0., 0., 1., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 1., 1., 0.],
       [0., 1., 0., 0., 0., 1., 1., 0., 0., 1.]])

<IPython.core.display.Javascript object>

In [15]:
X_prepared = pd.DataFrame(X_transformed).rename(
    columns={
        0: "outlook_overcast",
        1: "outlook_rainy",
        2: "outlook_sunny",
        3: "temperature_cool",
        4: "temperature_hot",
        5: "temperature_cool",
        6: "humidity_high",
        7: "humidity_normal",
        8: "windy_False",
        9: "windy_True",
    }
)
X_prepared

Unnamed: 0,outlook_overcast,outlook_rainy,outlook_sunny,temperature_cool,temperature_hot,temperature_cool.1,humidity_high,humidity_normal,windy_False,windy_True
0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
6,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
7,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
8,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
9,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0


<IPython.core.display.Javascript object>

# Feature importance

In [16]:
clf = DecisionTreeClassifier(random_state=42, criterion="entropy")

clf.fit(X_prepared, y)

pd.DataFrame(
    index=X_prepared.columns,
    data=clf.tree_.compute_feature_importances(normalize=False),
    columns=["feature_importance"],
)

Unnamed: 0,feature_importance
outlook_overcast,0.226
outlook_rainy,0.114974
outlook_sunny,0.142857
temperature_cool,0.0
temperature_hot,0.0
temperature_cool,0.0
humidity_high,0.0
humidity_normal,0.198623
windy_False,0.257831
windy_True,0.0


<IPython.core.display.Javascript object>

# Runner

In [19]:
IMPURITY_RUNNER = EntropyRunner()

<IPython.core.display.Javascript object>

In [20]:
from typing import Dict, List, Tuple


def iteration(x, y, impurity_runner=EntropyRunner()) -> Dict:
    tree_runner = TreeBaseRunner(x, y, impurity_runner)
    return tree_runner.run()




<IPython.core.display.Javascript object>

In [21]:
dfs = iteration(X, y)

impurity: 0.9402859586706311
samples: 14
value: Counter({'yes': 9, 'no': 5})
best_feature_to_split_on: outlook




<IPython.core.display.Javascript object>

In [22]:
feature_to_split = "outlook"


for feature_label, (x, y) in dfs[feature_to_split].items():
    print("feature label:", feature_label)
    dfs[feature_to_split][feature_label] = iteration(x, y)

feature label: sunny
impurity: 0.9709505944546686
samples: 5
value: Counter({'no': 3, 'yes': 2})
best_feature_to_split_on: humidity


feature label: rainy
impurity: 0.9709505944546686
samples: 5
value: Counter({'yes': 3, 'no': 2})
best_feature_to_split_on: windy


feature label: overcast
impurity: -0.0
samples: 4
value: Counter({'yes': 4})
best_feature_to_split_on: temperature


<IPython.core.display.Javascript object>

In [23]:
for first_feature, v in dfs.items():
    for first_layer_feature_label, v in v.items():
        if v:
            for second_feature, v in v.items():
                for second_layer_feature_label, (x, y) in v.items():
                    print("first_feature:", first_feature)
                    print("first_feature_label:", first_layer_feature_label)
                    print("second_feature:", second_feature)
                    print("second_feature_label:", second_layer_feature_label)
                    dfs[first_feature][first_layer_feature_label][second_feature][
                        second_layer_feature_label
                    ] = iteration(x, y)
                    print("\n")
        else:
            continue

first_feature: outlook
first_feature_label: sunny
second_feature: humidity
second_feature_label: high
impurity: -0.0
samples: 3
value: Counter({'no': 3})
best_feature_to_split_on: temperature


first_feature: outlook
first_feature_label: sunny
second_feature: humidity
second_feature_label: normal
impurity: -0.0
samples: 2
value: Counter({'yes': 2})
best_feature_to_split_on: temperature


first_feature: outlook
first_feature_label: rainy
second_feature: windy
second_feature_label: False
impurity: -0.0
samples: 3
value: Counter({'yes': 3})
best_feature_to_split_on: temperature


first_feature: outlook
first_feature_label: rainy
second_feature: windy
second_feature_label: True
impurity: -0.0
samples: 2
value: Counter({'no': 2})
best_feature_to_split_on: temperature




<IPython.core.display.Javascript object>

# Test

In [24]:
mask = (df["outlook"] == "sunny") & (df["humidity"] == "normal")
df[mask]["target"]

8     yes
10    yes
Name: target, dtype: object

<IPython.core.display.Javascript object>

In [25]:
mask = (df["outlook"] == "rainy") & (df["windy"] == "True")
df[mask]["target"]

5     no
13    no
Name: target, dtype: object

<IPython.core.display.Javascript object>

In [26]:
mask = (df["outlook"] == "rainy") & (df["windy"] == "True")
df[mask]["target"]

5     no
13    no
Name: target, dtype: object

<IPython.core.display.Javascript object>