In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn import tree
import graphviz
from matplotlib import pyplot as plt
from mlxtend.plotting import plot_decision_regions
from sklearn.model_selection import train_test_split

In [2]:
# Toy data set: 14 observations, one list per attribute.
# c0 = temperature, c1 = wind, c2 = traffic, c3 = target label ("yes"/"no").
c0 = [
    "hot", "hot", "hot", "mild", "cool", "cool", "cool",
    "mild", "cool", "mild", "mild", "mild", "hot", "mild",
]
c1 = [
    "weak", "strong", "weak", "weak", "weak", "strong", "strong",
    "weak", "weak", "weak", "strong", "strong", "weak", "strong",
]
c2 = [
    "long", "long", "long", "long", "short", "short", "short",
    "long", "short", "short", "short", "long", "short", "long",
]
c3 = [
    "no", "no", "yes", "yes", "yes", "no", "yes",
    "no", "yes", "yes", "yes", "yes", "yes", "no",
]

# Build the frame row by row (one tuple per observation); the columns keep
# their default integer labels 0..3, which the later cells rely on.
df = pd.DataFrame(list(zip(c0, c1, c2, c3)))

In [3]:
# Encode every categorical column as integer codes. Each column gets its own
# fitted LabelEncoder, kept in `encoders`, so the code -> original-label mapping
# of every column stays recoverable (`encoders[col].classes_`,
# `encoders[col].inverse_transform(...)`). A single shared encoder only
# remembers the column it was fitted on last.
encoders = {}
for col in (0, 1, 2, 3):
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Backward-compatible name: `label_encoder` still ends up fitted on the
# target column (3), exactly as when one encoder was reused for all columns.
label_encoder = encoders[3]

# Columns 0-2 are the features, column 3 is the target.
X = df.iloc[:, 0:3]
y = df.iloc[:, 3]

In [4]:
# Split features/target into train and test parts (test_size=0.1); the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Decision tree that chooses splits by entropy, fitted on the training part only.
# The fit call stays the last expression so the cell still displays the estimator.
clf = DecisionTreeClassifier(criterion="entropy", random_state=0)
clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [5]:
# Display names for the plot: one per feature column (0-2), and one per encoded
# target class. export_graphviz expects class names in ascending class order,
# which matches the alphabetical codes 0 -> 'no', 1 -> 'yes'.
features = ['temperature', 'wind', 'traffic']
response = ['no', 'yes']

# Export the fitted tree as DOT source text (out_file=None returns a string).
dot_data = tree.export_graphviz(clf,
                                out_file=None,
                                feature_names=features,
                                class_names=response,
                                filled=True)

# Wrap the DOT source and write car_driving_tree.png to disk.
graph = graphviz.Source(dot_data, format="png")
rendered_path = graph.render("car_driving_tree")

# Only the last expression of a cell is displayed, so the graph object has to
# come last for the tree to be drawn inline (a bare `graph` in the middle of
# the cell is silently discarded).
graph


'car_driving_tree.png'

In [6]:
def entropy(target_col):
    """Return the Shannon entropy (base 2) of the values in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Observed class labels; anything ``np.unique`` accepts.

    Returns
    -------
    numpy.float64
        ``sum(-p * log2(p))`` over the relative frequency ``p`` of each
        distinct value.
    """
    # Only the per-class frequencies matter, not the class labels themselves.
    _, class_counts = np.unique(target_col, return_counts=True)
    probabilities = class_counts / np.sum(class_counts)
    return np.sum(-probabilities * np.log2(probabilities))


def info(data, split_attribute_name, target_name):
    """Return the information gain obtained by splitting on one attribute.

    gain = H(target) - sum_v (n_v / n) * H(target restricted to attribute == v)

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the attribute column and the target column.
    split_attribute_name : hashable
        Label of the column to split on.
    target_name : hashable
        Label of the target column.

    Returns
    -------
    numpy.float64
        Entropy of the target minus the weighted entropy of the target within
        each distinct value of the split attribute.
    """
    target = data[target_name]
    attribute = data[split_attribute_name]

    total_entropy = entropy(target)

    # Relative size of each partition induced by the split attribute.
    values, counts = np.unique(attribute, return_counts=True)
    weights = counts / counts.sum()

    # Pick each partition with a boolean mask on the attribute column only.
    # Masking the whole frame with where(...).dropna() would also throw away
    # matching rows that merely have a NaN in some unrelated column, making the
    # partition entropies inconsistent with the weights and the total entropy
    # (and it upcasts every column on every iteration).
    subset_entropies = np.array(
        [entropy(target[attribute == value]) for value in values]
    )
    weighted_entropy = np.sum(weights * subset_entropies)

    return total_entropy - weighted_entropy

In [7]:
# Information gain of each feature column (0-2) with respect to target column 3.
for label, column in (("Temperature: ", 0), ("Wind:  ", 1), ("Traffic: ", 2)):
    print(label, info(df, column, 3))

# Classifier score on the held-out rows, then on the complete data set
# (which includes the rows the tree was trained on).
for label, (features_part, target_part) in (
    ("Decision Tree score: ", (X_test, y_test)),
    ("Decision Tree score(no train_test split): ", (X, y)),
):
    print(label, clf.score(features_part, target_part))

Temperature:  0.02922256565895487
Wind:   0.04812703040826949
Traffic:  0.15183550136234159
Decision Tree score:  0.5
Decision Tree score(no train_test split):  0.7142857142857143
