In [52]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing, model_selection, metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [2]:
input_dat = "./adult.csv"

In [25]:
data = pd.read_csv(
    input_dat, 
    names=["age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
           "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
           "hours_per_week", "native_country", "wealth"], index_col=False)

In [26]:
data.loc[data["capital_gain"] != 0, ["capital_gain"]] = 1
data.loc[data["capital_loss"] != 0, ["capital_loss"]] = 1

In [27]:
# keep US only
data["country"] = "Other"
data.loc[data["native_country"] == " United-States", "country"] = "US"

In [29]:
data = pd.get_dummies(
    data, 
    columns=[
        'relationship', 'race', 'sex', "country"])

In [30]:
#TODO: clean up the other variables. Remove empty spaces and "?"

In [31]:
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'capital_gain', 'capital_loss',
       'hours_per_week', 'native_country', 'wealth', 'relationship_ Husband',
       'relationship_ Not-in-family', 'relationship_ Other-relative',
       'relationship_ Own-child', 'relationship_ Unmarried',
       'relationship_ Wife', 'race_ Amer-Indian-Eskimo',
       'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 'race_ White',
       'sex_ Female', 'sex_ Male', 'country_Other', 'country_US'],
      dtype='object')

In [33]:
feature_columns = ['age', 'education_num',
       'capital_gain', 'capital_loss',
       'hours_per_week', 'relationship_ Husband',
       'relationship_ Not-in-family', 'relationship_ Other-relative',
       'relationship_ Own-child', 'relationship_ Unmarried',
       'relationship_ Wife', 'race_ Amer-Indian-Eskimo',
       'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 'race_ White',
       'sex_ Female', 'sex_ Male', 'country_Other', 'country_US']

In [40]:
X = data[feature_columns]
Y = data["wealth"] == " >50K"

In [41]:
X_trainset, X_testset, Y_trainset, Y_testset = model_selection.train_test_split(X, Y, test_size=0.2, random_state=3)

In [90]:
# cross validation
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}

kfold = model_selection.KFold(n_splits=5, random_state=42)
for max_depth in range(2, 20):
    for min_node_size in range(5, 30, 5):
        richTree = DecisionTreeClassifier(criterion='entropy', 
                                          max_depth=max_depth,
                                          min_samples_leaf=min_node_size)
        results = model_selection.cross_validate(estimator=richTree,
                                                 X=X_trainset,
                                                 y=Y_trainset,
                                                 cv=kfold,
                                                 scoring=scoring)
        print("max_dept = {}, min_node_size = {}, accuracy={}, precision = {}, recall = {}, f1-score = {}".format(
            max_depth, 
            min_node_size,
            results['test_accuracy'].mean(),
            results['test_precision'].mean(),
            results['test_recall'].mean(),
            results['test_f1_score'].mean()))

max_dept = 2, min_node_size = 5, accuracy=0.8146114303127356, precision = 0.7264138917360363, recall = 0.36680327462296664, f1-score = 0.4874049726528103
max_dept = 2, min_node_size = 10, accuracy=0.8146114303127356, precision = 0.7264138917360363, recall = 0.36680327462296664, f1-score = 0.4874049726528103
max_dept = 2, min_node_size = 15, accuracy=0.8146114303127356, precision = 0.7264138917360363, recall = 0.36680327462296664, f1-score = 0.4874049726528103
max_dept = 2, min_node_size = 20, accuracy=0.8146114303127356, precision = 0.7264138917360363, recall = 0.36680327462296664, f1-score = 0.4874049726528103
max_dept = 2, min_node_size = 25, accuracy=0.8146114303127356, precision = 0.7264138917360363, recall = 0.36680327462296664, f1-score = 0.4874049726528103
max_dept = 3, min_node_size = 5, accuracy=0.8214832883732534, precision = 0.7235439602785274, recall = 0.416591895168753, f1-score = 0.5285666295166798
max_dept = 3, min_node_size = 10, accuracy=0.8214832883732534, precision =

max_dept = 12, min_node_size = 25, accuracy=0.8367244791515056, precision = 0.6938645079909127, recall = 0.5740708340038501, f1-score = 0.6277616524846832
max_dept = 13, min_node_size = 5, accuracy=0.8270117385051489, precision = 0.6646899730154099, recall = 0.5649549333749764, f1-score = 0.6105543793548691
max_dept = 13, min_node_size = 10, accuracy=0.8316184781323038, precision = 0.676640590174516, recall = 0.5729959806155355, f1-score = 0.6204235428876415
max_dept = 13, min_node_size = 15, accuracy=0.8356495346714622, precision = 0.6868010867602298, recall = 0.5816302120201016, f1-score = 0.6294232208784123
max_dept = 13, min_node_size = 20, accuracy=0.8349201901772696, precision = 0.6866935950774822, recall = 0.5756084419971613, f1-score = 0.6259472365686907
max_dept = 13, min_node_size = 25, accuracy=0.8366861282830655, precision = 0.6927672518419923, recall = 0.5757505997993901, f1-score = 0.6284906812324195
max_dept = 14, min_node_size = 5, accuracy=0.8247851772861748, precision

In [91]:
# modeling
richTree = DecisionTreeClassifier(criterion="entropy", max_depth=11, min_samples_leaf=25)
richTree.fit(X_trainset, Y_trainset)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=11,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=25, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [92]:
predTree = richTree.predict(X_testset)

In [93]:
cnf_matrix = confusion_matrix(Y_testset, predTree)

In [94]:
report = classification_report(Y_testset, predTree)

In [95]:
cnf_matrix

array([[4514,  417],
       [ 672,  910]])

In [96]:
print(report)

             precision    recall  f1-score   support

      False       0.87      0.92      0.89      4931
       True       0.69      0.58      0.63      1582

avg / total       0.83      0.83      0.83      6513

