In [21]:
# Load data, convert categorical variables to numeric variables, split data set into train and 
# test. Those steps are the same as the steps we took before in the decision tree repository.

import pandas
import numpy
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

names = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status", 
         "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss", 
         "hours_per_week", "native_country", "high_income"]

income = pandas.read_csv("income.csv", header=None, index_col=False, names=names)

# Convert the categorical variables to numeric variables
convert_list = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 
                'race', 'sex', 'native_country', 'high_income']
for column in convert_list:
    col = pandas.Categorical.from_array(income[column])
    income[column] = col.codes

columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship",
           "race", "sex", "hours_per_week", "native_country"]

income = income.reindex(numpy.random.permutation(income.index))

train_max_row = math.floor(income.shape[0] * .7)
train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]

We'll first try to create two decision trees with slightly different parameters, and check their accuracy separately.

In [22]:
clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=75)
clf.fit(train[columns], train["high_income"])

clf2 = DecisionTreeClassifier(random_state=1, max_depth=6)
clf2.fit(train[columns], train["high_income"])

predict_1 = clf.predict(test[columns])
auc_1 = roc_auc_score(predict_1, test['high_income'])

predict_2 = clf2.predict(test[columns])
auc_2 = roc_auc_score(predict_2, test['high_income'])

print(auc_1)
print(auc_2)

0.786587913238
0.774814226817


In [23]:
# Combining predictions
predictions = clf.predict_proba(test[columns])[:,1]
predictions2 = clf2.predict_proba(test[columns])[:,1]

auc = roc_auc_score(numpy.round((predictions + predictions2) / 2), test['high_income'])
print(auc)

0.794347522655


As we can see, the combined predictions of the two trees are more accurate than any single tree.

Next, we will introduce variations to decision trees using the Bagging technique, which means sampling with replacement.

In [24]:
# We'll build 10 trees using Bagging
tree_count = 10

# Each bag will have 70% of the number of original rows.
bag_proportion = .7

predictions = []
for i in range(tree_count):
    
    # We set random state to i instead of a fixed value so we don't get the same sample every 
    # loop. That would make all of our trees the same.
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)
    
    # Fit a decision tree model to the "bag".
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=75)
    clf.fit(bag[columns], bag["high_income"])
    
    # Make predictions on the test data.
    predictions.append(clf.predict_proba(test[columns])[:,1])

rounded = numpy.round(sum(predictions)/len(predictions))
auc = roc_auc_score(rounded, test['high_income'])
print(auc)

0.785941008368


In [25]:
# Again we'll use calc_entropy and calc_information_gain we defined earlier in decision tree 
# repository to calculate entropy and information gain.

# Calculate entropy given a pandas Series, list, or numpy array.
def calc_entropy(column):

    counts = numpy.bincount(column)
    probabilities = counts / len(column)
    
    entropy = 0
    for prob in probabilities:
        if prob > 0: 
            entropy += prob * math.log(prob, 2)
    return -entropy

In [26]:
# Calculate information gain given a dataset, column to split on, and target.
def calc_information_gain(data_set, split_name, target_name):
    
    median = numpy.median(data_set[split_name])
    left_split = data_set[data_set[split_name] <= median]
    right_split = data_set[data_set[split_name] > median]
    
    left_entropy = calc_entropy(left_split[target_name])
    right_entropy = calc_entropy(right_split[target_name])
    total_entropy = calc_entropy(data_set[target_name])
    
    information_gain = total_entropy - (len(left_split) / len(data_set) * left_entropy +
                                       len(right_split) / len(data_set) * right_entropy)
    return information_gain

Modify find_best_column function we used before in decision tree repository to select a random sample from columns before computing information gain. Each subset will have 2 items in it.

In [27]:
# Select random features
# Create a model data set

data = pandas.DataFrame([
    [0,4,20,0],
    [0,4,60,2],
    [0,5,40,1],
    [1,4,25,1],
    [1,5,35,2],
    [1,5,55,1]
    ])
data.columns = ["high_income", "employment", "age", "marital_status"]

# Set a random seed to make results reproducible.
numpy.random.seed(1)

# The dictionary to store our tree.
tree = {}
nodes = []

# The function to find the column to split on.
def find_best_column(data, target_name, columns):
    information_gains = []
    
    for col in columns:
        information_gain = calc_information_gain(data, col, "high_income")
        information_gains.append(information_gain)

    # Find the name of the column with the highest gain.
    highest_gain_index = information_gains.index(max(information_gains))
    highest_gain = columns[highest_gain_index]
    return highest_gain

# The function to construct an id3 decision tree.
def id3(data, target, columns, tree):
    unique_targets = pandas.unique(data[target])
    nodes.append(len(nodes) + 1)
    tree["number"] = nodes[-1]

    if len(unique_targets) == 1:
        if 0 in unique_targets:
            tree["label"] = 0
        elif 1 in unique_targets:
            tree["label"] = 1
        return
    
    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()
    
    tree["column"] = best_column
    tree["median"] = column_median
    
    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]
    split_dict = [["left", left_split], ["right", right_split]]
    
    for name, split in split_dict:
        tree[name] = {}
        id3(split, target, columns, tree[name])

# Run the id3 algorithm on our dataset and print the resulting tree.
id3(data, "high_income", ["employment", "age", "marital_status"], tree)
print(tree)

def find_best_column(data, target_name, columns):
    information_gains = []
    
    # Select two columns randomly.
    cols = numpy.random.choice(columns, 2)
    
    for col in cols:
        information_gain = calc_information_gain(data, col, "high_income")
        information_gains.append(information_gain)

    highest_gain_index = information_gains.index(max(information_gains))
    
    # Get the highest gain by indexing cols.
    highest_gain = cols[highest_gain_index]
    
    return highest_gain

id3(data, "high_income", ["employment", "age", "marital_status"], tree)
print(tree)

{'median': 4.5, 'column': 'employment', 'number': 1, 'left': {'median': 25.0, 'column': 'age', 'number': 2, 'left': {'median': 22.5, 'column': 'age', 'number': 3, 'left': {'label': 0, 'number': 4}, 'right': {'label': 1, 'number': 5}}, 'right': {'label': 0, 'number': 6}}, 'right': {'median': 40.0, 'column': 'age', 'number': 7, 'left': {'median': 37.5, 'column': 'age', 'number': 8, 'left': {'label': 1, 'number': 9}, 'right': {'label': 0, 'number': 10}}, 'right': {'label': 1, 'number': 11}}}
{'median': 37.5, 'column': 'age', 'number': 12, 'left': {'median': 4.0, 'column': 'employment', 'number': 13, 'left': {'median': 22.5, 'column': 'age', 'number': 14, 'left': {'label': 0, 'number': 15}, 'right': {'label': 1, 'number': 16}}, 'right': {'label': 1, 'number': 17}}, 'right': {'median': 55.0, 'column': 'age', 'number': 18, 'left': {'median': 47.5, 'column': 'age', 'number': 19, 'left': {'label': 0, 'number': 20}, 'right': {'label': 1, 'number': 21}}, 'right': {'label': 0, 'number': 22}}}


We can also use scikit-learn library to repeat our random subset selection process. This is easier and with far less typing. We just set the splitter parameter on DecisionTreeClassifier to "random", and the max_features parameter to "auto". If we have N columns, this will pick a subset of features of size √N, compute the gini coefficient (similar to information gain) for each, and split the node on the best column in the subset.

In [28]:
# We'll build 10 trees
tree_count = 10

# Each bag will have 70% of the number of original rows.
bag_proportion = .7

predictions = []
for i in range(tree_count):

    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)
    
    # Fit a decision tree model to the "bag".
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=75, splitter='random', 
                                 max_features='auto')
    clf.fit(bag[columns], bag["high_income"])
    
    # Using the model, make predictions on the test data.
    predictions.append(clf.predict_proba(test[columns])[:,1])

combined = numpy.sum(predictions, axis=0) / 10
rounded = numpy.round(combined)

print(roc_auc_score(rounded, test["high_income"]))

0.779318823421


We can simplify this process further by using RandomForestClassifier from Scikit-learn.

In [29]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=75)

clf.fit(train[columns], train["high_income"])

predictions = clf.predict(test[columns])
print(roc_auc_score(predictions, test["high_income"]))

0.79247969822


Next, let's compare the accuracy of decision tree and random forest algorithms. We can conclude that random forests have a better accuracy over decision trees.

In [31]:
# Compare the results of decision tree and random forest
clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=75)

clf.fit(train[columns], train["high_income"])

predictions = clf.predict(train[columns])
print(roc_auc_score(predictions, train["high_income"]))

predictions = clf.predict(test[columns])
print(roc_auc_score(predictions, test["high_income"]))

clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=75)

clf.fit(train[columns], train['high_income'])

predictions_train = clf.predict(train[columns])
print(roc_auc_score(predictions_train, train['high_income']))

predictions_test = clf.predict(test[columns])
print(roc_auc_score(predictions_test, test['high_income']))

0.788248611021
0.786587913238
0.794602753238
0.79247969822
