In [1]:
import pandas as pd

from helpers import clean_and_backfill_data

from splitnode import SplitNode
from tree import Tree

In [2]:
# Load the "Males" wage panel from the Rdatasets mirror, discard the two
# identifier columns (the exported row index and the person id `nr`), and run
# the project's cleaning/back-filling helper on what is left.
male_wages = clean_and_backfill_data(
    pd.read_csv(
        'https://vincentarelbundock.github.io/Rdatasets/csv/plm/Males.csv'
    ).drop(columns=['Unnamed: 0', 'nr'])
)

# Preview the first rows of the cleaned frame.
male_wages.head()

Unnamed: 0,year,school,exper,union,ethn,married,health,wage,industry,occupation,residence
0,1980,14,1,False,other,False,False,1.19754,Business_and_Repair_Service,Service_Workers,north_east
1,1981,14,2,True,other,False,False,1.85306,Personal_Service,Service_Workers,north_east
2,1982,14,3,False,other,False,False,1.344462,Business_and_Repair_Service,Service_Workers,north_east
3,1983,14,4,False,other,False,False,1.433213,Business_and_Repair_Service,Service_Workers,north_east
4,1984,14,5,False,other,False,False,1.568125,Personal_Service,"Craftsmen, Foremen_and_kindred",north_east


In [3]:
# Every column except the target ('wage') is used as a feature.
# Built as a list rather than a one-shot `filter` iterator, so `features`
# is still usable after it has been used to index the frame below.
features = [col for col in male_wages.columns if col != 'wage']

# X holds the feature columns, y the 'wage' column to be predicted.
X, y = male_wages[features], male_wages['wage']

### Instantiating Node Object

In [4]:
# A freshly constructed node has no split yet; print it to show the empty state.
# `end` reproduces the trailing space and blank line that separate the two printouts.
n = SplitNode()
print(n, end=' \n\n')

# Search for a split on the full data set. The return value is not needed
# here, so it is bound to the throwaway name `_`.
_ = n.split_data(X, y)

# The same node now reports the feature and value it chose.
print(n)

feature: None
split value: None
 

feature: school
split value: 12.0



### Training a Tree

In [5]:
# Fit the custom tree on the un-encoded feature frame.
# The two limits are the same values given to scikit-learn's
# DecisionTreeRegressor later in the notebook, so the trees can be compared.
# NOTE(review): presumably these mean the same as scikit-learn's
# identically named parameters; confirm in tree.py.
t = Tree(max_depth=10, min_samples_leaf=5)
t.fit(X, y)

#### Traversing the Tree by Following the Child with the Higher Average y

In [6]:
# Greedy walk down the fitted tree: at each internal node, report its sample
# count, average y and split rule, then step into whichever child has the
# higher average y. The walk ends at the first leaf, which is not printed.
ct = 0       # index of the split currently being reported
branch = t   # start from the root of the fitted tree
while True:
    if branch.is_leaf:
        break

    print('Split {}'.format(ct))
    print('# Samples: {}, Avg y: {:.3f}'.format(branch.n_samples, branch.avg))
    print(branch.split_node)

    # Go left only when the left child is strictly higher; ties go right.
    if not (branch.left_child.avg > branch.right_child.avg):
        branch = branch.right_child
        print("==> right branch")
    else:
        branch = branch.left_child
        print("==> left branch")

    ct = ct + 1
    print('\n')

Split 0
# Samples: 4360, Avg y: 1.649
feature: school
split value: 12.0

==> right branch


Split 1
# Samples: 2888, Avg y: 1.741
feature: year
split value: 1984.0

==> right branch


Split 2
# Samples: 1444, Avg y: 1.857
feature: school
split value: 13.0

==> right branch


Split 3
# Samples: 520, Avg y: 2.010
feature: industry
split value: Trade

==> left branch


Split 4
# Samples: 417, Avg y: 2.051
feature: industry
split value: Agricultural

==> left branch


Split 5
# Samples: 406, Avg y: 2.065
feature: year
split value: 1985

==> right branch


Split 6
# Samples: 305, Avg y: 2.114
feature: industry
split value: Finance

==> right branch


Split 7
# Samples: 32, Avg y: 2.339
feature: residence
split value: north_east

==> right branch




In [7]:
# Display the first row of the cleaned data; this is the example record whose
# path through the tree is traced in the following cells.
male_wages.iloc[0]

year                                 1980
school                                 14
exper                                   1
union                               False
ethn                                other
married                             False
health                              False
wage                              1.19754
industry      Business_and_Repair_Service
occupation                Service_Workers
residence                      north_east
Name: 0, dtype: object

In [8]:
# Print the sequence of split decisions the first row follows through the
# fitted tree `t`, with the sample count and average y of each node reached
# and the change in average y relative to the previous node.
t.get_decision_path(male_wages.iloc[0])

# Samples: 4360
Population average y: 1.65
school >= 12.0
	# Samples: 2888	Avg Y: 1.741 (+0.092)
year < 1984.0
	# Samples: 1444	Avg Y: 1.624 (-0.117)
union != True
	# Samples: 1060	Avg Y: 1.572 (-0.052)
industry != Manufacturing
	# Samples: 831	Avg Y: 1.517 (-0.055)
married != True
	# Samples: 566	Avg Y: 1.454 (-0.064)
exper < 3.0
	# Samples: 128	Avg Y: 1.274 (-0.180)
occupation != Professional, Technical_and_kindred
	# Samples: 104	Avg Y: 1.220 (-0.053)
school >= 14.0
	# Samples: 38	Avg Y: 1.355 (+0.135)
exper < 2
	# Samples: 10	Avg Y: 1.179 (-0.176)
occupation == Service_Workers
	# Samples: 5	Avg Y: 1.063 (-0.116)


### Comparing with scikit-learn's DecisionTreeRegressor

In [9]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz

from helpers import get_dummie_data

In [10]:
# scikit-learn trees need numeric input, so the frame is first passed through
# the project's encoding helper.
# NOTE(review): presumably this one-hot encodes the categorical columns;
# confirm in helpers.get_dummie_data.
dummied_df = get_dummie_data(male_wages)

# Train on every encoded column except the target.
train_cols = [col for col in dummied_df.columns if col != 'wage']

# NOTE: X and y are rebound here to the encoded data, replacing the
# un-encoded X / y that the custom tree `t` was fitted on earlier.
X = dummied_df[train_cols]
y = dummied_df['wage']

# Same depth and leaf-size limits as the custom Tree, so the two models are
# comparable. random_state is fixed because scikit-learn permutes the features
# at every split even with splitter='best'; without a seed, equally good
# splits can be resolved differently from run to run.
dtr = DecisionTreeRegressor(min_samples_leaf=5, max_depth=10, random_state=0)
dtr.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=5,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [11]:
export_graphviz(dtr, out_file='images/wage_tree.dot',
                feature_names = train_cols, max_depth=4)

! dot -Tpng images/wage_tree.dot -o images/wage_tree.png

##### Visualizing the scikit-learn Tree

<img src="images/wage_tree.png">

In [12]:
# Predict for the first row with the scikit-learn tree.
# The list indexer `iloc[[0]]` keeps the row as a one-row DataFrame (2-D, with
# its column names and per-column dtypes), matching the frame the model was
# fitted on. Wrapping the Series from `iloc[0]` in a list would drop the
# column labels and collapse the row to a single dtype.
dtr.predict(X.iloc[[0]])

array([1.06303568])

In [13]:
# Predict for the same first row with the custom tree, to compare against the
# scikit-learn prediction.
# NOTE(review): the row passed in still contains the 'wage' target column;
# presumably Tree.predict only reads the feature columns, confirm in tree.py.
t.predict(male_wages.iloc[0])

1.06303568336

In [14]:
# Print the first row's decision path again so the average y of the final
# node can be read next to the two predictions.
t.get_decision_path(male_wages.iloc[0])

# Samples: 4360
Population average y: 1.65
school >= 12.0
	# Samples: 2888	Avg Y: 1.741 (+0.092)
year < 1984.0
	# Samples: 1444	Avg Y: 1.624 (-0.117)
union != True
	# Samples: 1060	Avg Y: 1.572 (-0.052)
industry != Manufacturing
	# Samples: 831	Avg Y: 1.517 (-0.055)
married != True
	# Samples: 566	Avg Y: 1.454 (-0.064)
exper < 3.0
	# Samples: 128	Avg Y: 1.274 (-0.180)
occupation != Professional, Technical_and_kindred
	# Samples: 104	Avg Y: 1.220 (-0.053)
school >= 14.0
	# Samples: 38	Avg Y: 1.355 (+0.135)
exper < 2
	# Samples: 10	Avg Y: 1.179 (-0.176)
occupation == Service_Workers
	# Samples: 5	Avg Y: 1.063 (-0.116)
