In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot

In [2]:
# Load the Iris data from a CSV file located in the working directory.
df = pd.read_csv('iris.csv')
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,iris
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Keep the original column labels so results can be reported by name later.
names = list(df.columns)
# Positional index of the class-label column (the last column).
# It must be len(names) - 1: len(names) itself is one past the last column.
target = len(names) - 1
# Replace the string labels with integer positions 0..n-1.
# NOTE: this mutates df in place, so re-running this cell without first
# re-running the loading cell would overwrite `names` with integers.
df.columns = list(range(len(names)))
# Every column except the last one is a candidate split attribute.
attributes = list(df.columns)[:-1]
df.describe()

Unnamed: 0,0,1,2,3
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [4]:
# Display the original column labels captured before the columns were renumbered.
names

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'iris']

In [5]:
# Display the integer column labels of the candidate split attributes
# (every column except the last one).
attributes

[0, 1, 2, 3]

## 计算每个属性的Entropy (Compute the entropy of each attribute)

In [6]:
def entropy(df, targetAttr):
    """Return the Shannon entropy (in bits) of one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set to measure.
    targetAttr : hashable
        Label of the column whose value distribution is measured.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequencies ``p`` of the
        distinct values in the column; ``0.0`` for an empty or
        single-valued column.
    """
    proba = df[targetAttr].value_counts(normalize=True)
    # Categorical columns also report unobserved categories with p == 0,
    # and 0 * log2(0) evaluates to NaN; by convention such terms count as 0.
    proba = proba[proba > 0]
    # Subtracting from 0.0 (instead of negating) avoids returning -0.0 for a
    # pure column and keeps the result a float when there are no terms.
    return 0.0 - sum(proba * np.log2(proba))

In [7]:
# Print the entropy of every column's value distribution, labeled with its
# original name (column i of df corresponds to names[i]). Iterating over
# `names` avoids hard-coding the number of columns.
for i, name in enumerate(names):
    print(name, entropy(df, i))

sepal_length 4.822018088381164
sepal_width 4.0117097612189285
petal_length 5.033829378702223
petal_width 4.065662933799394
iris 1.584962500721156


Judged by entropy alone, the most informative attribute is "petal_length", followed by "sepal_length".  
The least informative attribute is "sepal_width", followed by "petal_width" (the class label "iris" is not considered an attribute here).

## Build a decision tree using ID3

In [8]:
def gain(df, attr, target):
    """
    Information gain obtained by partitioning ``df`` on ``attr``.

    Computed as the entropy of ``target`` over the whole frame minus the
    entropy of ``target`` inside each partition, where each partition's
    entropy is weighted by the relative frequency of its ``attr`` value.
    """
    # Relative frequency of every distinct value of the split attribute.
    shares = df[attr].value_counts(normalize=True)

    # Weighted entropy that remains after the split, accumulated one
    # partition at a time.
    remainder = 0.0
    for level, share in shares.items():
        partition = df[df[attr] == level]
        remainder += share * entropy(partition, target)

    baseline = entropy(df, target)
    return baseline - remainder

def choose_attr(df, attributes, target):
    """
    Pick the attribute whose split yields the largest information gain.

    Falls back to the first listed attribute when no attribute has a gain
    above zero; on ties the attribute listed earlier wins.
    """
    winner, top_score = attributes[0], 0
    for candidate in attributes:
        score = gain(df, candidate, target)
        # Strict comparison: only a strictly better gain replaces the winner.
        if score > top_score:
            winner, top_score = candidate, score
    return winner

def make_tree(df, attributes, target):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Training records for this node.
    attributes : sequence
        Column labels still available for splitting.
    target : hashable
        Label of the class column.

    Returns
    -------
    A class label for a leaf, otherwise ``{name: {value: subtree, ...}}``
    where ``name`` is ``names[best]`` for the chosen attribute ``best``.
    Relies on the notebook-level ``names`` list to translate column labels
    into display names.
    """
    unique_vals = df[target].unique()
    # Leaf: all records in this subset share one class.
    if len(unique_vals) == 1:
        return unique_vals[0]

    # Leaf: no attributes left but the classes are still mixed (records that
    # agree on every attribute yet carry different labels). Without this
    # case choose_attr would be called with an empty list and fail on
    # attributes[0]; ID3 resolves it by predicting the majority class.
    if len(attributes) == 0:
        class_counts = df[target].value_counts()
        return class_counts.idxmax() if len(class_counts) else None

    # Choose the attribute that best separates the classes at this node.
    best = choose_attr(df, attributes, target)
    tree = {names[best]: {}}

    # The attributes left for the children are the same for every branch,
    # so build the list once instead of once per value.
    remaining = list(attributes)
    remaining.remove(best)

    # One branch (subtree) per distinct value of the chosen attribute.
    for val in df[best].unique():
        examples = df[df[best] == val]
        tree[names[best]][val] = make_tree(examples, remaining, target)

    return tree

In [9]:
# Run ID3 on the full data set. The class label is the last column, so its
# positional index is derived from the column list instead of hard-coded.
tree = make_tree(df, attributes, len(names) - 1)
tree

{'petal_length': {1.4: 'Iris-setosa',
  1.3: 'Iris-setosa',
  1.5: 'Iris-setosa',
  1.7: 'Iris-setosa',
  1.6: 'Iris-setosa',
  1.1: 'Iris-setosa',
  1.2: 'Iris-setosa',
  1.0: 'Iris-setosa',
  1.9: 'Iris-setosa',
  4.7: 'Iris-versicolor',
  4.5: {'sepal_length': {6.4: 'Iris-versicolor',
    5.7: 'Iris-versicolor',
    5.6: 'Iris-versicolor',
    6.2: 'Iris-versicolor',
    6.0: 'Iris-versicolor',
    5.4: 'Iris-versicolor',
    4.9: 'Iris-virginica'}},
  4.9: {'sepal_width': {3.1: 'Iris-versicolor',
    2.5: 'Iris-versicolor',
    2.8: 'Iris-virginica',
    2.7: 'Iris-virginica',
    3.0: 'Iris-virginica'}},
  4.0: 'Iris-versicolor',
  4.6: 'Iris-versicolor',
  3.3: 'Iris-versicolor',
  3.9: 'Iris-versicolor',
  3.5: 'Iris-versicolor',
  4.2: 'Iris-versicolor',
  3.6: 'Iris-versicolor',
  4.4: 'Iris-versicolor',
  4.1: 'Iris-versicolor',
  4.8: {'sepal_length': {5.9: 'Iris-versicolor',
    6.8: 'Iris-versicolor',
    6.2: 'Iris-virginica',
    6.0: 'Iris-virginica'}},
  4.3: 'Iris-ver