In [8]:
import numpy as np
import random
import sys
import pickle
sys.path.insert(0, '/Users/trimcao/Dropbox/Richardson/Fall-2017/cs6375-ml-ruozzi/solution/lib')
sys.path.insert(0, '/home/trimcao/Dropbox/Richardson/Fall-2017/cs6375-ml-ruozzi/solution/lib')
from DecisionTree import Branch, DecisionTree, BoostedTree

### Read in data

In [2]:
def read_in(file_path):
    X = []
    y = []
    with open(file_path, 'r') as f:
        for line in f:
            info = line.strip('\n').split(',')
            X.append([int(i) for i in info[1:]])
            y.append(int(info[0]))
    X = np.array(X)
    y = np.array(y)
    # make the labels -1 and 1
    y[y==0] = -1
    return X, y

In [3]:
X, y = read_in('hw3_data/heart_train.data')

In [4]:
X_test, y_test = read_in('hw3_data/heart_test.data')

In [5]:
N = X.shape[0]

### Preparation

In [6]:
# create 88 decision trees
def gen_tree(attribute, X, y, weights=None):
    """
    Generate trees with height = 1.
               a
             /   \
    """
    trees = []
    for l1 in [-1, 1]:
        for l2 in [-1, 1]:
            # create a decision tree
            tree = DecisionTree()
            tree.labels = set(y)
            root = Branch()
            tree.tree = root
            # split attribute 1
            root.split_feature = attribute
            # left branch of root
            left = Branch()
            left.predict = l1
            root.children[0] = left
            # right branch of root
            right = Branch()
            right.predict = l2
            root.children[1] = right
            # append tree to the list
            trees.append(tree)
    return trees

### Coordinate Descent

In [None]:
# build the trees
num_feats = X.shape[1]
trees = []
for i in range(num_feats):
    trees.extend(gen_tree(i, X, y))

In [9]:
# variables
alpha = [0 for each in trees]
r = [1 for each in trees]

In [10]:
# compute r
def find_r(trees, alpha, x, y, k=-1):
    """
    Compute exponential loss for one sample for every learner except k
    (if k is not -1, i.e. k is not None).
    Note: y is a single label, not a vector of label.
    """
    num_trees = len(trees)
    sum_predict = 0
    for i in range(num_trees):
        if i != k:
            sum_predict += alpha[i]*trees[i].predict_single(x)
    return np.exp(-y*sum_predict)

In [11]:
# compute exponential loss
def exp_loss(trees, alpha, X, y):
    loss = 0
    for i in range(X.shape[0]):
        loss += find_r(trees, alpha, X[i], y[i])
    return loss

In [31]:
def predict_single(trees, alpha, x):
    """
    Predict a single sample using trees.
    """
    result = 0
    for i in range(len(trees)):
        result += alpha[i]*trees[i].predict_single(x)
    if result >= 0:
        return 1
    else:
        return -1

In [13]:
def update_alpha(t, trees, alpha, X, y):
    """
    Update alpha of tree t.
    """
    sum_correct = 0
    sum_incorrect = 0 
    for i in range(X.shape[0]):
        r = find_r(trees, alpha, X[i], y[i], k=t)
        pred = trees[t].predict_single(X[i])
        if pred == y[i]:
            sum_correct += r
        else:
            sum_incorrect += r
    new_alpha = 0.5*np.log(sum_correct/sum_incorrect)
    return new_alpha

In [21]:
def fit(trees, X, y, epoch=50):
    """
    Method that trains a boosted tree using coordinate descent.
    """
    alpha = [0 for each in trees]
    num_trees = len(trees)
    # 1 epoch = 1 loop over all trees
    for e in range(epoch):
        # just iterate over the trees with no special selection.
        for t in range(num_trees):
            # update alpha of t
            alpha[t] = update_alpha(t, trees, alpha, X, y)
        # display exponential loss
        if e%20 == 0:
            print('Epoch:', e+1)
            print('Exponential loss =', exp_loss(trees, alpha, X, y))
    return alpha

In [23]:
# test 
alpha = fit(trees, X, y, epoch=500)

Epoch: 1
Exponential loss = 60.2226183657
Epoch: 21
Exponential loss = 44.1653073215
Epoch: 41
Exponential loss = 42.3974947525
Epoch: 61
Exponential loss = 41.6312418853
Epoch: 81
Exponential loss = 41.1810394122
Epoch: 101
Exponential loss = 40.8789105319
Epoch: 121
Exponential loss = 40.6606100925
Epoch: 141
Exponential loss = 40.4951345183
Epoch: 161
Exponential loss = 40.365323985
Epoch: 181
Exponential loss = 40.2607971584
Epoch: 201
Exponential loss = 40.1748669339
Epoch: 221
Exponential loss = 40.1030180912
Epoch: 241
Exponential loss = 40.0420869764
Epoch: 261
Exponential loss = 39.9897893475
Epoch: 281
Exponential loss = 39.9444341985
Epoch: 301
Exponential loss = 39.9047428938
Epoch: 321
Exponential loss = 39.8697308175
Epoch: 341
Exponential loss = 39.8386276185
Epoch: 361
Exponential loss = 39.8108220887
Epoch: 381
Exponential loss = 39.7858232177
Epoch: 401
Exponential loss = 39.7632321398
Epoch: 421
Exponential loss = 39.7427215745
Epoch: 441
Exponential loss = 39.724020

In [24]:
pickle.dump( alpha, open( "alpha_coordinate.p", "wb" ) )

In [33]:
# predict on test set
def predict(trees, alpha, X):
    preds = np.zeros(X.shape[0])
    for i in range(X.shape[0]):
        preds[i] = predict_single(trees, alpha, X[i])
    return preds

In [34]:
test_preds = predict(trees, alpha, X_test)
print(np.mean(test_preds==y_test))

0.700534759358


### Compare with adaBoost 

In [6]:
boost = BoostedTree(X, y, M=20)
boost.fit_hw2(X, y, X_test, y_test)

Iteration: 1
Accuracy on train set: 0.725
Accuracy on test set: 0.614973262032
Iteration: 2
Accuracy on train set: 0.725
Accuracy on test set: 0.614973262032
Iteration: 3
Accuracy on train set: 0.7875
Accuracy on test set: 0.711229946524
Iteration: 4
Accuracy on train set: 0.7625
Accuracy on test set: 0.652406417112
Iteration: 5
Accuracy on train set: 0.7625
Accuracy on test set: 0.652406417112
Iteration: 6
Accuracy on train set: 0.775
Accuracy on test set: 0.663101604278
Iteration: 7
Accuracy on train set: 0.775
Accuracy on test set: 0.663101604278
Iteration: 8
Accuracy on train set: 0.8125
Accuracy on test set: 0.679144385027
Iteration: 9
Accuracy on train set: 0.8125
Accuracy on test set: 0.679144385027
Iteration: 10
Accuracy on train set: 0.825
Accuracy on test set: 0.695187165775
Iteration: 11
Accuracy on train set: 0.825
Accuracy on test set: 0.72192513369
Iteration: 12
Accuracy on train set: 0.825
Accuracy on test set: 0.711229946524
Iteration: 13
Accuracy on train set: 0.825
Ac

In [12]:
print(boost.stages)

[0.48470027859405157, 0.23296515192905001, 0.30841915161660433, 0.26905996040337699, 0.12707489951194617, 0.1939136051934971, 0.16646581973436514, 0.20484093019665991, 0.11233780854211395, 0.16915310437537498, 0.12721184797605228, 0.15386377069176382, 0.098070542088112209, 0.15796706354327916, 0.13458519734012808, 0.13035666473731899, 0.10144574639261557, 0.12621641283923687, 0.10744964087976837, 0.11081674484994171]


In [17]:
alpha = pickle.load( open( "alpha_coordinate.p", "rb" ) )

In [19]:
print(np.isclose(alpha,0))

[ True False  True False  True False  True False  True False  True False
  True False  True False  True False  True False  True False  True False
  True False  True False  True False  True False  True False  True False
  True False  True False  True False  True False  True False  True False
  True False  True False  True False  True False  True False  True False
  True False  True False  True False  True False  True False  True False
  True False  True False  True False  True False  True False  True False
  True False  True False]


In [20]:
print(alpha)

[-1.0935696792557912e-14, -2.1113691460473563, -1.9262369477246838e-14, -0.76458219017592477, -6.7335026443520275e-14, -0.49233289336149083, -1.1934897514720575e-14, -0.41478204146804826, -4.6851411639183802e-14, -4.4058549874767028, -3.3140157285062021e-14, -2.8100377931850606, -1.056377207930948e-13, 0.80199150091920635, 2.9864999362415821e-14, 0.51902683511736547, -8.770761894538814e-15, 0.31751616799032567, -9.1593399531576251e-15, 0.14126530348945926, -9.9920072216264187e-16, 0.13729670511430384, -1.7652546091540301e-14, 0.10879826039253997, -3.2196467714129642e-15, 0.60814522507657842, 1.2212453270876572e-14, 0.32285429265169135, 6.6613381477509353e-16, 4.0254521840867517, 9.2148511043887141e-15, 2.6753260028212358, -3.3029134982599499e-14, -2.7467516857012848, -1.9761969838328178e-14, -2.0949377107361311, -8.6597395920769707e-14, 1.9790532209929785, 1.8762769116164792e-14, 0.75423913827232736, -7.9936057773011918e-15, 0.38770152892165788, 1.9984014443252778e-15, 0.25630175152301

In [18]:
alpha[np.isclose(alpha,0)] = 0
print(alpha)

TypeError: only integer scalar arrays can be converted to a scalar index

In [24]:
count = 0
for i in range(len(alpha)):
    if not np.isclose(alpha[i], 0):
        print('alpha', i, '=', alpha[i])
        count += 1
    else:
         print('alpha', i, '= 0')

alpha 0 : 0
alpha 1 : -2.11136914605
alpha 2 : 0
alpha 3 : -0.764582190176
alpha 4 : 0
alpha 5 : -0.492332893361
alpha 6 : 0
alpha 7 : -0.414782041468
alpha 8 : 0
alpha 9 : -4.40585498748
alpha 10 : 0
alpha 11 : -2.81003779319
alpha 12 : 0
alpha 13 : 0.801991500919
alpha 14 : 0
alpha 15 : 0.519026835117
alpha 16 : 0
alpha 17 : 0.31751616799
alpha 18 : 0
alpha 19 : 0.141265303489
alpha 20 : 0
alpha 21 : 0.137296705114
alpha 22 : 0
alpha 23 : 0.108798260393
alpha 24 : 0
alpha 25 : 0.608145225077
alpha 26 : 0
alpha 27 : 0.322854292652
alpha 28 : 0
alpha 29 : 4.02545218409
alpha 30 : 0
alpha 31 : 2.67532600282
alpha 32 : 0
alpha 33 : -2.7467516857
alpha 34 : 0
alpha 35 : -2.09493771074
alpha 36 : 0
alpha 37 : 1.97905322099
alpha 38 : 0
alpha 39 : 0.754239138272
alpha 40 : 0
alpha 41 : 0.387701528922
alpha 42 : 0
alpha 43 : 0.256301751523
alpha 44 : 0
alpha 45 : -0.453426019943
alpha 46 : 0
alpha 47 : -0.277082109197
alpha 48 : 0
alpha 49 : 0.794651871233
alpha 50 : 0
alpha 51 : 0.386217574

In [14]:
print(count)

44
