In [1]:
# Will Hollingsworth, Colton Murray, Alexander Shiveley

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import math

# Getting the data into Python

In [3]:
# Load the csv as a numpy array of strings, 
# because it includes the column headers
raw_data = np.loadtxt('iris.csv', delimiter=',', dtype=str)

# Grab the headers
header_list = raw_data[0].tolist()

# Remove the headers from the data
stripped = np.delete(raw_data, [0], axis=0)

# We specifically want to know if a sample is setosa or NOT setosa
def apply_mapping(row):
    row[4] = 1 if row[4] == 'setosa' else 0
    return row

# Convert species to a numeric value
converted = np.apply_along_axis(apply_mapping, 1, stripped)

# Convert everything into floats!
clean_data = np.array(converted, dtype=float)

In [4]:
clean_data

array([[5.1, 3.5, 1.4, 0.2, 1. ],
       [4.9, 3. , 1.4, 0.2, 1. ],
       [4.7, 3.2, 1.3, 0.2, 1. ],
       [4.6, 3.1, 1.5, 0.2, 1. ],
       [5. , 3.6, 1.4, 0.2, 1. ],
       [5.4, 3.9, 1.7, 0.4, 1. ],
       [4.6, 3.4, 1.4, 0.3, 1. ],
       [5. , 3.4, 1.5, 0.2, 1. ],
       [4.4, 2.9, 1.4, 0.2, 1. ],
       [4.9, 3.1, 1.5, 0.1, 1. ],
       [5.4, 3.7, 1.5, 0.2, 1. ],
       [4.8, 3.4, 1.6, 0.2, 1. ],
       [4.8, 3. , 1.4, 0.1, 1. ],
       [4.3, 3. , 1.1, 0.1, 1. ],
       [5.8, 4. , 1.2, 0.2, 1. ],
       [5.7, 4.4, 1.5, 0.4, 1. ],
       [5.4, 3.9, 1.3, 0.4, 1. ],
       [5.1, 3.5, 1.4, 0.3, 1. ],
       [5.7, 3.8, 1.7, 0.3, 1. ],
       [5.1, 3.8, 1.5, 0.3, 1. ],
       [5.4, 3.4, 1.7, 0.2, 1. ],
       [5.1, 3.7, 1.5, 0.4, 1. ],
       [4.6, 3.6, 1. , 0.2, 1. ],
       [5.1, 3.3, 1.7, 0.5, 1. ],
       [4.8, 3.4, 1.9, 0.2, 1. ],
       [5. , 3. , 1.6, 0.2, 1. ],
       [5. , 3.4, 1.6, 0.4, 1. ],
       [5.2, 3.5, 1.5, 0.2, 1. ],
       [5.2, 3.4, 1.4, 0.2, 1. ],
       [4.7, 3

In [5]:
def split_input_output(data):
    """
    Splits the input array into two seperate sets:
        * the feature values
        * the output value
        
    :returns: (tuple) the features are the first element, the outputs are the second
    """
    return np.delete(data, [data.shape[1] - 1], axis=1), data[:, -1]

In [6]:
x, y = split_input_output(clean_data)
x, y

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5

In [7]:
# we also probably want to be able to separate our positive and negative examples
def split_pos_neg(data):
    """
    Returns two sets of positive, then negative examples (1's then 0's from the output column)
    """
    return data[data[:, -1]==1, :-1], data[data[:, -1]==0, :-1]

In [8]:
pos, neg = split_pos_neg(clean_data)

# Histograms

In [9]:
def get_range(data, attribute):
    return data[:, attribute].min(), data[:, attribute].max()

def get_bin_edges(data, attribute, num_bins):
    l, h = get_range(data, attribute)
    
    step = (h - l) / num_bins
    edges = np.arange(num_bins + 1)
    return (edges * step) + l

def hist(data, attribute, num_bins):
    """
    Produces two histograms, one for positive and one for negative examples.
    Calculates the bins from the entire data set.
    
    :param data: The TOTAL data set
    :attribute: the attribute (column index) you'd like to histogram
    :num_bins: the number of bins for the histogram
    """
    edges = get_bin_edges(data, attribute, num_bins)
    pos, neg = split_pos_neg(data)
    
    h_pos, _ = np.histogram(pos[:, attribute], edges)
    h_neg, _ = np.histogram(neg[:, attribute], edges)
    
    return h_pos, h_neg, edges
    

In [24]:
h_pos, h_neg, e = hist(clean_data, 2, 5)
h_pos, h_neg, e

(array([50,  0,  0,  0,  0], dtype=int64),
 array([ 0,  3, 34, 47, 16], dtype=int64),
 array([1.  , 2.18, 3.36, 4.54, 5.72, 6.9 ]))

In [16]:
# ^ Notice how they're completely separate for petal length

# Entropy

In [35]:
def total_entropy(data_set):
    """
    Calculates the entropy of a data set.
    Assumes that the final column is the classification for each sample.
    """
    x, y = split_input_output(data_set)
    
    # get the possible output values, and their totals
    values, counts = np.unique(y, return_counts=True)
    
    entropy = 0
    for count in counts:
        p = count / sum(counts)
        entropy -= p * math.log(p, len(values))
        
    return entropy

def bucket_entropy(pos_count, neg_count):
    if pos_count == 0 or neg_count == 0:
        return 0
    
    p = pos_count / (pos_count + neg_count)
    return -p * math.log(p, 2)


def info_gained(data_set, attribute, num_bins):
    h_pos, h_neg, _ = hist(data_set, attribute, num_bins)
    
    tot_entropy = total_entropy(data_set)
    for i in range(len(h_pos)):
        if h_pos[i] == 0 and h_neg[i] == 0:
            continue
            
        tot_entropy -= (h_pos[i] / (h_pos[i] + h_neg[i])) * bucket_entropy(h_pos[i], h_neg[i])
        
    return tot_entropy

In [36]:
total_entropy(clean_data)

0.9182958340544896

In [37]:
info_gained(clean_data, 2, 5)

0.9182958340544896

In [38]:
# ^ In this specific case, we gained ALL the info because the samples are perfectly seperated by petal length
# Theoretically, we would check every attribute and choose whichever one yielded the highest gain
# Then split our data_set by the bins of the histogram, if a bin has all the same kind of output we can stop there,
# but if there are still bins with both kinds of output we repeat this info_gained on a new attribute for that bin.

In [39]:
# TODO: make sure we're doing the bins right? the homework mentions rounding to the nearest integer, 
# currently we're just evenly splitting across the range of the attribute

# TODO: actually form the tree? I don't think it's actually necessary but usually you would actually output the tree with
# the sequences of tests it performs on which attributes

# TODO: accuracy?

In [40]:
for i in range(4):
    print(info_gained(clean_data, i, 5))

0.5145217274189858
0.5209306190327005
0.9182958340544896
0.8714208340544896


In [41]:
for i in range(4):
    print(info_gained(clean_data, i, 10))

0.37507517662068357
0.14405463858438253
0.9182958340544896
0.9182958340544896


In [42]:
for i in range(4):
    print(info_gained(clean_data, i, 15))

0.2625540243189166
-0.1359341262023851
0.9182958340544896
0.9182958340544896


In [43]:
for i in range(4):
    print(info_gained(clean_data, i, 20))

-0.004912453704329987
-0.4273491780277112
0.9182958340544896
0.9182958340544896
