In [22]:
import pandas

# Column labels to apply to the data; header=None below tells pandas the file has no header row.
names = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "high_income",
]
# index_col=False stops pandas from treating the first column as the row index.
income = pandas.read_csv(
    "income.csv",
    header=None,
    names=names,
    index_col=False,
)
# Preview the first rows to check that the values line up with their labels.
print(income.head())

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country high_income  
0          2174             0              40   United-States   

In [23]:
# Convert the categorical variables in our dataset to numeric variables.
# Wrapping a column in pandas.Categorical assigns every distinct value an integer
# code; the column is then overwritten in place with those codes.
# The constructor is used because Categorical.from_array is not available in
# current pandas releases; both produce the same codes.
convert_list = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 
                'race', 'sex', 'native_country', 'high_income']
for column in convert_list:
    col = pandas.Categorical(income[column])
    income[column] = col.codes
    # Show the first few codes so the conversion can be checked by eye.
    print(income[column].head())

0    7
1    6
2    4
3    4
4    4
Name: workclass, dtype: int8
0     9
1     9
2    11
3     1
4     9
Name: education, dtype: int8
0    4
1    2
2    0
3    2
4    2
Name: marital_status, dtype: int8
0     1
1     4
2     6
3     6
4    10
Name: occupation, dtype: int8
0    1
1    0
2    1
3    0
4    5
Name: relationship, dtype: int8
0    4
1    4
2    4
3    2
4    2
Name: race, dtype: int8
0    1
1    1
2    1
3    1
4    0
Name: sex, dtype: int8
0    39
1    39
2    39
3    39
4     5
Name: native_country, dtype: int8
0    0
1    0
2    0
3    0
4    0
Name: high_income, dtype: int8


In [24]:
# Compute the entropy of the high_income column in the income dataframe
import math

# Share of rows whose high_income code is 1; low_ratio is the complementary share.
total = income.shape[0]
high_income = sum(income['high_income'] == 1)
high_ratio = high_income / total
low_ratio = 1 - high_ratio

# Two-class entropy in bits: -(p * log2(p) + q * log2(q)).
income_entropy = -sum(ratio * math.log(ratio, 2) for ratio in (high_ratio, low_ratio))

print(income_entropy)

0.796383955202


In [25]:
# Compute the information gain for splitting on the age column of income
import numpy

def calc_entropy(column):
    """Return the entropy, in bits, of the values in `column`.

    `column` may be a pandas Series, list, or numpy array. It is handed to
    numpy.bincount, so its values must be non-negative integers.
    """
    # Occurrences of each integer value, scaled by the number of entries.
    shares = numpy.bincount(column) / len(column)
    # Values that never occur are skipped, since log(0) is undefined.
    return -sum(share * math.log(share, 2) for share in shares if share > 0)

# Split the rows at the median age: at-or-below goes left, above goes right.
median_age = numpy.median(income['age'])
left_split = income[income['age'] <= median_age]
right_split = income[income['age'] > median_age]

# Entropy of high_income inside each half and over the whole frame.
left_entropy = calc_entropy(left_split['high_income'])
right_entropy = calc_entropy(right_split['high_income'])
total_entropy = calc_entropy(income['high_income'])

# Information gain = overall entropy minus the size-weighted entropy of the halves.
row_count = len(income)
weighted_entropy = (len(left_split) / row_count * left_entropy +
                    len(right_split) / row_count * right_entropy)
age_information_gain = total_entropy - weighted_entropy
print(age_information_gain)

0.0470286613047


Make a list called information_gains. It should contain, in order, the information gain from splitting on these columns: age, workclass, education_num, marital_status, occupation, relationship, race, sex, hours_per_week, native_country.

Find the highest value in the information_gains list. Assign the name of the column with the highest information gain to highest_gain.

In [26]:
def calc_information_gain(data_set, split_name, target_name):
    """Return the information gain from splitting `data_set` at the median of `split_name`.

    Rows whose `split_name` value is at or below the median form one group and
    rows above it form the other. The gain is the entropy of `target_name` over
    all rows minus the size-weighted entropy of the two groups.
    """
    split_values = data_set[split_name]
    threshold = numpy.median(split_values)
    at_or_below = data_set[split_values <= threshold]
    above = data_set[split_values > threshold]

    # Weight each group's entropy by the fraction of rows it holds.
    row_count = len(data_set)
    weighted_entropy = (len(at_or_below) / row_count * calc_entropy(at_or_below[target_name]) +
                        len(above) / row_count * calc_entropy(above[target_name]))
    return calc_entropy(data_set[target_name]) - weighted_entropy

# Candidate columns, in the order their gains are stored.
columns = [
    "age", "workclass", "education_num", "marital_status", "occupation",
    "relationship", "race", "sex", "hours_per_week", "native_country",
]
# One information gain per candidate column, each measured against high_income.
information_gains = [calc_information_gain(income, col, 'high_income') for col in columns]

# The position of the largest gain identifies the winning column.
index = information_gains.index(max(information_gains))
highest_gain = columns[index]
print(highest_gain)

marital_status


In [27]:
def find_best_column(data, target_name, columns):
    """Return the entry of `columns` whose median split gives the highest information gain.

    When several columns share the highest gain, the earliest one is returned.
    """
    def gain_for(column):
        # Gain of splitting `data` on this column with respect to the target.
        return calc_information_gain(data, column, target_name)

    return max(columns, key=gain_for)

# Columns that are candidates for splitting the income data.
columns = [
    "age", "workclass", "education_num", "marital_status", "occupation",
    "relationship", "race", "sex", "hours_per_week", "native_country",
]

# Pick the candidate with the highest information gain for high_income.
income_split = find_best_column(data=income, target_name='high_income', columns=columns)
print(income_split)

marital_status


In [28]:
# To build up to making the full id3 function, let's first build a simpler algorithm that we 
# can extend. 
label_1s = []
label_0s = []
def id3(data_set, target, columns):
    """Recursively split `data_set` and tally the leaves reached, by label.

    A subset whose `target` values are all 1 appends 1 to `label_1s`; one whose
    values are all 0 appends 0 to `label_0s`. Any other subset is split at the
    median of the column chosen by find_best_column, and each half is processed
    in turn.

    If that median cannot separate the rows (one half would be empty), recursing
    would revisit the same rows forever, so the subset is instead counted as a
    leaf carrying its most frequent label.
    """
    def record_leaf(label):
        # Only the labels 1 and 0 are tallied; any other value is ignored.
        if label == 1:
            label_1s.append(1)
        elif label == 0:
            label_0s.append(0)

    unique_targets = pandas.unique(data_set[target])

    # A single remaining label means this subset is a pure leaf.
    if len(unique_targets) == 1:
        record_leaf(unique_targets[0])
        return

    best_column = find_best_column(data_set, target, columns)
    column_median = numpy.median(data_set[best_column])
    left_split = data_set[data_set[best_column] <= column_median]
    right_split = data_set[data_set[best_column] > column_median]

    # No progress is possible when every row lands on one side of the median;
    # stop here and fall back to the majority label (ties resolved by pandas).
    if len(left_split) == 0 or len(right_split) == 0:
        record_leaf(data_set[target].value_counts().idxmax())
        return

    for split in (left_split, right_split):
        id3(split, target, columns)

# Rebuild the small example dataset used on the previous screen, naming the
# columns at construction time.
data = pandas.DataFrame(
    [
        [0, 20, 0],
        [0, 60, 2],
        [0, 40, 1],
        [1, 25, 1],
        [1, 35, 2],
        [1, 55, 1],
    ],
    columns=["high_income", "age", "marital_status"],
)

# Running id3 fills label_1s and label_0s as a side effect.
id3(data, "high_income", ["age", "marital_status"])
print(label_1s, label_0s)

[1, 1, 1] [0, 0, 0]


In [29]:
# In order to keep track of the tree, we'll need to make some modifications to id3. The first 
# is that we'll be changing the definition to pass in the tree dictionary.
tree = {}
nodes = []

def id3(data_set, target, columns, tree):
    """Recursively build a decision tree for `data_set` inside the dict `tree`.

    Every visited node gets a sequential 'number' (tracked through the
    module-level `nodes` list). A subset whose `target` values are all 1 or all
    0 becomes a leaf with that 'label'. Any other subset stores the chosen
    'column' and its 'median', then grows 'left' (values at or below the
    median) and 'right' (values above it) sub-dictionaries.

    If the median cannot separate the rows (one side would be empty), recursing
    would revisit the same rows forever, so the node instead becomes a leaf
    labelled with the most frequent target value.
    """
    nodes.append(len(nodes) + 1)
    tree['number'] = nodes[-1]

    def make_leaf(label):
        # Only the labels 1 and 0 are stored; any other value leaves the node unlabelled.
        if label == 1:
            tree['label'] = 1
        elif label == 0:
            tree['label'] = 0

    unique_targets = pandas.unique(data_set[target])

    # A single remaining label means this subset is a pure leaf.
    if len(unique_targets) == 1:
        make_leaf(unique_targets[0])
        return

    best_column = find_best_column(data_set, target, columns)
    column_median = numpy.median(data_set[best_column])

    left_split = data_set[data_set[best_column] <= column_median]
    right_split = data_set[data_set[best_column] > column_median]

    # No progress is possible when every row lands on one side of the median;
    # stop here and fall back to the majority label (ties resolved by pandas).
    if len(left_split) == 0 or len(right_split) == 0:
        make_leaf(data_set[target].value_counts().idxmax())
        return

    tree['column'] = best_column
    tree['median'] = column_median

    for name, split in (('left', left_split), ('right', right_split)):
        tree[name] = {}
        id3(split, target, columns, tree[name])

# Grow the tree in place from the example data, then show the nested dictionary.
id3(data_set=data, target="high_income", columns=["age", "marital_status"], tree=tree)
print(tree)

{'column': 'age', 'median': 37.5, 'right': {'column': 'age', 'median': 55.0, 'right': {'number': 11, 'label': 0}, 'number': 7, 'left': {'column': 'age', 'median': 47.5, 'right': {'number': 10, 'label': 1}, 'number': 8, 'left': {'number': 9, 'label': 0}}}, 'number': 1, 'left': {'column': 'age', 'median': 25.0, 'right': {'number': 6, 'label': 1}, 'number': 2, 'left': {'column': 'age', 'median': 22.5, 'right': {'number': 5, 'label': 1}, 'number': 3, 'left': {'number': 4, 'label': 0}}}}


In [30]:
# A pretty tree
def print_with_depth(string, depth):
    """Print `string` indented by four spaces for each level of `depth`."""
    indent = "    " * depth
    print(indent + "{0}".format(string))
    
    
def print_node(tree, depth):
    """Print `tree` and everything below it, one node per line, indented by `depth`."""
    if "label" not in tree:
        # Internal node: show the split, then visit both children one level deeper.
        print_with_depth("{0} > {1}".format(tree["column"], tree["median"]), depth)
        left_branch, right_branch = tree["left"], tree["right"]
        print_node(left_branch, depth + 1)
        print_node(right_branch, depth + 1)
    else:
        # Leaf node: show its label. Leaves have no children, so the recursion ends here.
        print_with_depth("Leaf: Label {0}".format(tree["label"]), depth)

# Start printing at the root, which sits at indentation level zero.
print_node(tree, depth=0)

age > 37.5
    age > 25.0
        age > 22.5
            Leaf: Label 0
            Leaf: Label 1
        Leaf: Label 1
    age > 55.0
        age > 47.5
            Leaf: Label 0
            Leaf: Label 1
        Leaf: Label 0


In [32]:
# Make predictions
def predict(tree, row):
    """Return the label of the leaf that `row` reaches when walked down `tree`.

    At each internal node the row's value for the node's 'column' is compared
    with the node's 'median': at or below goes 'left', above goes 'right'.
    """
    node = tree
    # Descend until a node carrying a 'label' (a leaf) is reached.
    while 'label' not in node:
        column, median = node['column'], node['median']
        node = node['left'] if row[column] <= median else node['right']
    return node['label']

# Predict the label for the first row of the example data.
print(predict(tree=tree, row=data.iloc[0]))

0


In [33]:
# Make predictions on multiple rows at once 
# Unlabelled rows to classify, with the column names set at construction time.
new_data = pandas.DataFrame(
    [
        [40, 0],
        [20, 2],
        [80, 1],
        [15, 1],
        [27, 2],
        [38, 1],
    ],
    columns=["age", "marital_status"],
)

# axis=1 hands each row to predict in turn, giving one prediction per row.
predictions = new_data.apply(lambda record: predict(tree, record), axis=1)
print(predictions)

0    0
1    0
2    0
3    0
4    1
5    0
dtype: int64
