## Attempts to Create a Decision Tree from Integer Programming

In [48]:
from node import *
from get_int_data import *
import cplex
import numpy as np
import cvxpy as cp

In [49]:
cp.__version__

'0.4.11'

In [5]:
def depth_k_tree(k):
    """Generates a balanced decision tree of depth k."""

    tree = Tree()
    tree.add_node(0)
    count = 1
    for j in range(k-1):
        leaves = tree.get_leaf_nodes()
        for leaf in leaves:
            tree.add_node(count,leaf,'L')
            count = count + 1
            tree.add_node(count,leaf,'R')
            count = count + 1

    return tree


In [9]:
tree = depth_k_tree(3)
trs, trl, tes, tel, gs = get_int_data('spect-heart.csv', split=.9,gs=False)

In [37]:
build_int_model(tree)

TypeError: build_int_model() missing 2 required positional arguments: 'I' and 'labels'

In [45]:
def build_int_model(tree, I, labels, C=1, maxtime=1800, gs = []):
    """
    Sets up the IP model to solve for an ODT and then solves it.
    
    INPUTS:
    tree (required): a tree object from the Tree class in node.py. 
    I (required): n (samples) x p (features) numpy array of training data
    labels (required): length n (samples)) numpy array of 0-1 valued class labels
    
    Remaining inputs are optional:
    C: (default 1) weight on negative-class examples (set less than 1 to emphasize positive-class examples)
    maxtime: (default 1800) number of seconds before CPLEX quits and returns best found integer solution
    priorities: (default False) tell CPLEX priorities on which to branch variables (v > z > anything else,
    with v variables closest to the root node being the most important) 
    anchor: (default True) use the anchoring (symmetry-breaking) equalities discussed in the write-up 	
    deleted: (default True) delete spurious variables (this would be done by CPLEX automatically, but might as well)
    relaxed: (default True) relax the integrality of the z variables adjacent to leaf nodes
    relaxedobj: (default True) relax the integrality of the correct classification variables
    strengthen: (default True) use strengthened inequalities for paths through the tree
    gs: (default []) provide the group structure of the instance - see the README for more information

    OUTPUT:
    will be written on the tree object

    """	
#     p = cplex.Cplex()
#     p.objective.set_sense(p.objective.sense.maximize)

    numSamples = len(I)
    numFeatures = len(I[0])

    # get group structure from inputs, or else squeeze it out	
    if gs != []:
        groups = gs
        numGroups = int(max(groups)+1)
    else:
        # clone
        I2 = np.zeros((numSamples,2*numFeatures))
        for j in range(numFeatures):
            I2[:,2*j] = I[:,j]
            I2[:,2*j+1] = 1-I[:,j]
        numGroups = numFeatures 
        I = I2
        groups = np.zeros(2*numFeatures)
        for j in range(numGroups):
            groups[2*j] = j
            groups[2*j+1] = j
        numFeatures = 2*numFeatures

    leaves = tree.get_leaf_nodes()
    numNodes = max(leaves) + 1

    # set up the buckets
    numBuckets = 0
    bucket_dict = {}
    for leaf in leaves:
        bucket_dict[leaf] = numBuckets
        numBuckets += 1

    # establish the variables
    names = []
    for k in range(numNodes):
        for j in range(numFeatures):
            name = 'z_'+repr(j)+'_'+repr(k)
            names.append(name)
    z_ = numNodes*numFeatures

    for k in range(numNodes):
        for g in range(numGroups):
            name = 'v_'+repr(g)+'_'+repr(k)
            names.append(name)
    zv_ = z_ + numNodes*numGroups

    for b in range(2*numBuckets):
        for i in range(numSamples):
            name = 'c_'+repr(i)+'_'+repr(b)
            names.append(name)
    zvc_ = zv_ + 2*numBuckets*numSamples

    # path enumeration
    paths = []
    senses = []
    for leaf in leaves:
        curr = leaf
        path = []
        path.append(curr)
        leafsense = []
        while curr != 0:
            leafsense.append(tree[curr].sense)
            curr = tree[curr].parent
            path.append(curr)
        paths.append(path)
        senses.append(leafsense)


    A = np.zeros((numNodes*((numNodes+1)*numSamples + numGroups*numFeatures+2)+1,zvc_))
    rhs = []
    ineq = ''
    constraint_cnt = 0
    cnames = []

    # pick a group at each node
    for k in range(numNodes):
        for g in range(numGroups):
            A[constraint_cnt,z_ + k*numGroups + g] = 1
        rhs.append(1)
        ineq = ineq + "E"
        constraint_cnt = constraint_cnt + 1
        cname = 'OneGroupPerNode_Node_'+repr(k)
        cnames.append(cname)

    # group hierarchy constraints:
    group_no = 0
    curr_feature = 0
    for j in range(numFeatures):
        ind = groups[j]
        if ind != group_no or j == (numFeatures - 1):
            prev_feature = curr_feature
            if j == (numFeatures - 1):
                curr_feature = numFeatures
            else:
                curr_feature = j
            for k in range(numNodes):
                for j in range(prev_feature,curr_feature):
                    A[constraint_cnt, z_ + k * numGroups + int(group_no)] = -1
                    A[constraint_cnt, k * numFeatures + j] = 1

                    rhs.append(0)
                    ineq = ineq + "L"
                    constraint_cnt += 1
                    
                    cname = 'GroupHierarchy_Group_'+repr(group_no)+'_Feature_'+repr(j)+'_Node_'+repr(k)
                    cnames.append(cname)
            group_no = ind

        # weaker left tree constraints
    for k in range(numNodes):
        collected_buckets = []
        sensescopy = deepcopy(senses)
        for sense in sensescopy:
            sense.insert(0,'L')
        for path in range(len(paths)):
            if k in paths[path]:
                ind = paths[path].index(k)
                if sensescopy[path][ind] == 'L':
                    collected_buckets.append(bucket_dict[paths[path][0]])	
                if sensescopy[path][ind] == 'L' and ind != 0:
                    collected_buckets.append(numBuckets + bucket_dict[paths[path][0]])
    
        for i in range(numSamples):
            for c in collected_buckets:
                for j in range(numFeatures):
                    A[constraint_cnt,k*numFeatures + j] = -I[i][j]
                A[constraint_cnt,zv_ + c*numSamples + i] = 1
                rhs.append(0)				
                ineq = ineq + "L"
                constraint_cnt = constraint_cnt + 1
                cname = 'BucketConstraintLeft_'+repr(k)+'_Sample_'+repr(i)+'_Bucket_'+repr(c)
                cnames.append(cname)
    # weaker right tree constraints
    for k in range(numNodes):
        collected_buckets = []
        sensescopy = deepcopy(senses)
        for sense in sensescopy:
            sense.insert(0,'R')
        for path in range(len(paths)):
            if k in paths[path]:
                ind = paths[path].index(k)
                if sensescopy[path][ind] == 'R':
                    collected_buckets.append(numBuckets + bucket_dict[paths[path][0]])	
                if sensescopy[path][ind] == 'R' and ind != 0:
                    collected_buckets.append(bucket_dict[paths[path][0]])

        for i in range(numSamples):
            for c in collected_buckets:
                for j in range(numFeatures):
                    A[constraint_cnt,k*numFeatures + j] = I[i][j]
                A[constraint_cnt,zv_ + c*numSamples + i] = 1
                rhs.append(1)				
                ineq = ineq + "L"
                constraint_cnt = constraint_cnt + 1
                cname = 'BucketConstraintRight_'+repr(k)+'_Sample_'+repr(i)+'_Bucket_'+repr(c)
                cnames.append(cname)

    print("Number of rows: %s " % constraint_cnt)
    numRows = constraint_cnt
    numCols = zvc_
    print("Number of columns: %s " % numCols)

    indices = [[i for i in range(numRows) if A[i,j] != 0] for j in range(numCols)]
    values = [[A[i,j] for i in range(numRows) if A[i,j] != 0] for j in range(numCols)]
    cols = [[indices[i],values[i]] for i in range(numCols)]

    rhs = np.array(rhs)
    senses = ineq

    # define the objective	
    obj = np.zeros(numCols)

    for i in range(numSamples):
        if labels[i] == 0:
            for b in range(numBuckets):
                obj[zv_ + b*numSamples + i] = C
        else:
            for b in range(numBuckets):
                obj[zv_ + numBuckets*numSamples + b*numSamples + i] = 1

    # set up types, priorities
    priority_vec = []

    types = numNodes*(numGroups+numFeatures)*'I'
    for j in range(numNodes*numFeatures):
        priority_vec.append((j,1,p.order.branch_direction.down))


    types = types + 2*numBuckets*numSamples*'I'	

#     p = cplex.Cplex()
#     p.objective.set_sense(p.objective.sense.maximize)
    obj = cp.Maximize()

#     lb = np.zeros(numCols)
#     ub = np.ones(numCols)

#     # Load into p
#     p.linear_constraints.add(rhs=rhs,senses=senses)
#     p.linear_constraints.set_names(zip(range(constraint_cnt),cnames))

#     p.variables.add(obj = obj, lb = lb, ub = ub, columns=cols, types=types, names=names)

    
    prob = cp.Problem(obj, constraints)
#     p.parameters.timelimit.set(maxtime)
#     p.write("dtint.lp")
#     p.solve()
    sol = prob.solve()
    
    trial = sol.get_objective_value()
    print("Solution value = ",trial)


    bucket_counts = np.zeros(2*numBuckets)
    for b in range(numBuckets):
        indices = range(zv_ + b*numBad,zv_ + (b+1)*numBad)
        bucket_count = 0
        for i in indices:
            if p.solution.get_values(i) == 1:
                bucket_count = bucket_count + 1
        bucket_counts[b] = bucket_count
        indices = range(zv_ + numBuckets*numBad + b*numGood,zv_ + numBuckets*numBad + (b+1)*numGood)
        bucket_count = 0
        for i in indices:
            if p.solution.get_values(i) == 1:
                bucket_count = bucket_count + 1
        print('Left bucket count ' + repr(b) + ': ' + repr(bucket_counts[b]))
        bucket_counts[numBuckets + b] = bucket_count
        print('Right bucket count ' + repr(b) + ': ' + repr(bucket_counts[numBuckets+b]))

    # fill up the tree structure:

    # what are the groups in the solution?
    solgroups = []
    splits = [[] for k in range(numNodes)]

    for k in range(numNodes):
        for j in range(z_ + k*numGroups,z_ + (k+1)*numGroups):
            if sol.get_values(j) > 0.99:
                group = j - z_ - k*numGroups
                solgroups.append(group)
    for k in range(numNodes):
        indices = [i for i,x in enumerate(groups) if x == solgroups[k]]
        for ind in indices:
            dir = sol.get_values(k*numFeatures + ind)
            if dir > 0.99:
                splits[k].append(ind)
        tree[k].add_splitvar(splits[k])



In [46]:
# from dt_int import *
# from get_int_data import *
tree = depth_k_tree(3)

# note below gs = True since we have group structure present in the csv:\
trs,trl,tes,tel,gs = get_int_data('breast-cancer-wisconsin.csv', split = .5, gs = True) 
build_int_model(tree,trs,trl,gs=gs)



7 623
8 623
9 623
10 623
11 623
12 623
13 623
14 623
15 623
16 623
17 632
18 632
19 632
20 632
21 632
22 632
23 632
24 632
25 632
26 632
27 641
28 641
29 641
30 641
31 641
32 641
33 641
34 641
35 641
36 641
37 650
38 650
39 650
40 650
41 650
42 650
43 650
44 650
45 650
46 650
47 659
48 659
49 659
50 659
51 659
52 659
53 659
54 659
55 659
56 659
57 668
58 668
59 668
60 668
61 668
62 668
63 668
64 668
65 668
66 668
67 677
68 677
69 677
70 677
71 677
72 677
73 677
74 677
75 677
76 677
77 624.0
78 624.0
79 624.0
80 624.0
81 624.0
82 624.0
83 624.0
84 624.0
85 624.0
86 624.0
87 633.0
88 633.0
89 633.0
90 633.0
91 633.0
92 633.0
93 633.0
94 633.0
95 633.0
96 633.0
97 642.0
98 642.0
99 642.0
100 642.0
101 642.0
102 642.0
103 642.0
104 642.0
105 642.0
106 642.0
107 651.0
108 651.0
109 651.0
110 651.0
111 651.0
112 651.0
113 651.0
114 651.0
115 651.0
116 651.0
117 660.0
118 660.0
119 660.0
120 660.0
121 660.0
122 660.0
123 660.0
124 660.0
125 660.0
126 660.0
127 669.0
128 669.0
129 669.0
130 66

TypeError: non-float value in input sequence