In [4]:
import numpy as np
import random

In [5]:
def read_in(file_path):
    """
    Load a comma-separated data file into feature and label arrays.

    Every non-empty line is split on ','. The first field is taken as the
    label and all remaining fields as the features. No type conversion is
    done, so the values stay strings.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    X : np.ndarray
        One row per non-empty line, holding every field after the first.
    y : np.ndarray
        The first field of every non-empty line.
    """
    X = []
    y = []
    with open(file_path, 'r') as f:
        for line in f:
            # Remove only the line terminator; any other character may be a
            # legitimate field value.
            line = line.rstrip('\r\n')
            if not line:
                # A blank line (typically a trailing one) would otherwise add
                # an empty feature row and make X ragged.
                continue
            label, *features = line.split(',')
            X.append(features)
            y.append(label)
    return np.array(X), np.array(y)

In [6]:
# Load the training file: X gets the feature columns, y the first column (the label).
X, y = read_in('./hw2_data/mush_train.data')

In [28]:
# Distinct labels found in the training labels; read as a global by predictable().
# NOTE(review): this is an unordered set, so the tie-breaking and the random
# fallback in predictable() depend on set iteration order — confirm whether
# run-to-run reproducibility matters here.
CHOICES = set(y)

In [5]:
# Collect, for every feature (column of X), the set of values it takes in the
# training data. The column count comes from X itself instead of a hard-coded
# 22, so the cell keeps working if the data file changes.
num_feats = X.shape[1]
feats = {i: set(X[:, i]) for i in range(num_feats)}

In [6]:
print(feats)

{0: {'k', 'b', 'f', 'c', 's', 'x'}, 1: {'y', 'g', 'f', 's'}, 2: {'g', 'w', 'r', 'n', 'y', 'b', 'e', 'c', 'p', 'u'}, 3: {'f', 't'}, 4: {'s', 'l', 'n', 'a', 'y', 'f', 'c', 'p', 'm'}, 5: {'a', 'f'}, 6: {'c', 'w'}, 7: {'n', 'b'}, 8: {'k', 'g', 'w', 'r', 'n', 'b', 'y', 'e', 'o', 'p', 'u', 'h'}, 9: {'e', 't'}, 10: {'r', 'b', 'e', 'c', 'm'}, 11: {'y', 'k', 'f', 's'}, 12: {'y', 'k', 'f', 's'}, 13: {'g', 'w', 'n', 'b', 'y', 'e', 'o', 'c', 'p'}, 14: {'g', 'w', 'n', 'b', 'y', 'e', 'o', 'c', 'p'}, 15: {'p'}, 16: {'n', 'y', 'o', 'w'}, 17: {'o', 'n', 't'}, 18: {'l', 'n', 'e', 'f', 'p'}, 19: {'k', 'w', 'r', 'n', 'b', 'y', 'o', 'u', 'h'}, 20: {'v', 'n', 'a', 'y', 'c', 's'}, 21: {'g', 'd', 'w', 'l', 'p', 'u', 'm'}}


### Split a tree demo (using feature X[0])

In [36]:
X[:, 0:3]

array([['f', 'f', 'n'],
       ['x', 'y', 'y'],
       ['x', 'y', 'n'],
       ..., 
       ['f', 'f', 'n'],
       ['x', 'f', 'y'],
       ['x', 'f', 'g']], 
      dtype='<U1')

In [110]:
# Root node of the decision tree, holding all training data and every feature
# index as a split candidate. Nothing has been split or predicted yet.
tree = {
    'predict': None,
    'X': X,
    'y': y,
    'split_feature': None,
    'remain_feats': list(range(X.shape[1])),
    'children': {},
}

In [111]:
print(tree['remain_feats'])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]


In [39]:
# NOTE(review): the saved output of this cell is KeyError: 's' — the root had
# no 's' child when it ran (tree['children'] starts out empty and is only
# filled by expand_tree). Re-run after the tree has been expanded, or drop it.
print(tree['children']['s']['remain_feats'])

KeyError: 's'

### Design a recursive function

In [188]:
def predictable(branch):
    """
    Decide whether a node is a leaf and, if so, store its prediction.

    A node becomes a leaf (and branch['predict'] is set) when
    - all of its labels are identical: predict that label;
    - it holds no samples: predict a random member of CHOICES;
    - it has no features left to split on: predict the label in CHOICES
      with the most votes (the first such label in iteration order on ties).

    Returns True for a leaf and False when the node still has to be split.
    """
    labels = branch['y']
    samples = branch['X']

    # Pure node: every sample carries the same label.
    if len(set(labels)) == 1:
        branch['predict'] = labels[0]
        return True

    # Empty node: nothing to vote on, so fall back to a random label.
    if samples.shape[0] == 0:
        branch['predict'] = random.choice(list(CHOICES))
        return True

    # Features are still available, so the caller should keep splitting.
    if len(branch['remain_feats']) != 0:
        return False

    # No features left: majority vote over the known labels.
    branch['predict'] = max(
        CHOICES,
        key=lambda choice: np.sum(labels == choice),
        default=None,
    )
    return True

In [189]:
def expand_tree(parent, feature_dict):
    """
    Grow the subtree rooted at `parent`, in place.

    If predictable() marks `parent` as a leaf, nothing else happens.
    Otherwise the split feature is chosen with select_feature(), one child
    is created for every value listed in feature_dict for that feature, and
    each child is expanded recursively.

    parent       : node dict with 'X', 'y', 'remain_feats', 'children', ...
    feature_dict : maps a feature index to the collection of its values
    """
    samples = parent['X']
    labels = parent['y']

    # Leaf nodes get their prediction from predictable() and are not split.
    if predictable(parent):
        return

    chosen = select_feature(labels, samples, parent['remain_feats'])
    parent['split_feature'] = chosen

    for value in feature_dict[chosen]:
        # Rows of this node whose chosen feature equals the current value.
        rows = samples[:, chosen] == value

        # The child may split on anything the parent could, except `chosen`.
        remaining = list(parent['remain_feats'])
        remaining.remove(chosen)

        child = {
            'split_feature': None,   # filled in if the child is split
            'X': samples[rows],
            'y': labels[rows],
            'predict': None,         # filled in if the child is a leaf
            'remain_feats': remaining,
            'children': {},
        }
        parent['children'][value] = child
        expand_tree(child, feature_dict)

In [192]:
# Grow the whole tree in place: expand_tree fills in 'split_feature', 'children'
# and 'predict' on the nodes under `tree` and returns nothing.
expand_tree(tree, feats)

In [193]:
# let's try to make a prediction

# choices = 'f, y, g'

first_feat = tree['split_feature']
print(first_feat)

4


In [116]:
# Inspect the root's child for the value 'f' of the root split feature.
# NOTE(review): the saved output is KeyError: 'f'. This cell's execution count
# (116) is lower than that of the expand_tree call (192), so it presumably ran
# against an unexpanded tree — re-run the notebook in order.
first_split = tree['children']['f']
print(first_split['predict'])
print(first_split['split_feature'])

KeyError: 'f'

In [89]:
second_split = first_split['children']['g']
print(second_split['predict'])
print(second_split['split_feature'])

None
1


In [109]:
third_split = second_split['children']['y']
print(third_split['predict'])
print(third_split['split_feature'])
y_third = third_split['y']
print(np.sum(y_third=='p'))
print(np.sum(y_third=='e'))

p
None
90
87


In [214]:
def predict_single(x, tree):
    """
    Predict the label of a single sample by walking the decision tree.

    Parameters
    ----------
    x : sequence
        Feature values of one sample, indexable by feature number.
    tree : dict
        Root node. Every node carries 'predict' (a label, or None for an
        internal node), 'split_feature' (a feature index, or None for a
        leaf) and 'children' (maps a feature value to the child node).

    Returns
    -------
    The 'predict' value of the first node on the sample's path whose
    'predict' is not None.

    Raises
    ------
    KeyError
        If the sample's value for a split feature has no matching child.
    ValueError
        If a node has neither a prediction nor a split feature.
    """
    node = tree
    while True:
        prediction = node['predict']
        # Test against None explicitly: falsy labels such as 0 or '' are
        # valid predictions and must end the walk.
        if prediction is not None:
            return prediction

        split = node['split_feature']
        if split is None:
            # Nothing to return and nowhere to go; fail instead of looping
            # forever on the same node.
            raise ValueError(
                'decision tree node has neither a prediction nor a split feature')

        node = node['children'][x[split]]
        

In [215]:
def predict(X, tree):
    """
    Predict a label for every row of the 2-D sample matrix X.

    Each row is passed to predict_single(); the results are returned as a
    NumPy array in row order.
    """
    return np.array(
        [predict_single(X[row, :], tree) for row in range(X.shape[0])]
    )

In [120]:
# Predict a single test sample. predict() indexes X[i, :] and therefore needs a
# 2-D matrix, so one row goes through predict_single() instead.
# NOTE: X_test is only loaded in the "Prediction on test set" section further
# down; run that cell first.
predict_single(X_test[5], tree)

'p'

### Choose feature to split

In [122]:
def probability(y):
    """
    Empirical distribution of the values in the array y.

    Returns a dict mapping each distinct value of y to the fraction of
    entries equal to it.
    """
    total = y.shape[0]
    return {value: np.sum(y == value) / total for value in set(y)}

In [125]:
probability(y)

{'e': 0.51655348047538197, 'p': 0.48344651952461798}

In [174]:
def entropy(y):
    """
    Entropy of the empirical distribution of y, using the natural log.

    Computed as -sum(p * log(p)) over the distinct values of y.
    """
    table = probability(y)
    acc = 0
    for value in set(y):
        p = table[value]
        acc -= p * log_zero(p)
    return acc
    

In [129]:
entropy(y)

0.69259904497005076

In [143]:
def normalize(prob_dict):
    """
    Rescale the values of a probability table so that they sum to one.

    The dict is modified in place and also returned.
    """
    total = 0
    for weight in prob_dict.values():
        total += weight
    for key, weight in list(prob_dict.items()):
        prob_dict[key] = weight / total
    return prob_dict
    

In [182]:
def conditional_probability(y, x):
    """
    Estimate the conditional probability table P(y | x) from paired samples.

    Parameters
    ----------
    y, x : np.ndarray
        1-D arrays of observations; entry i of y is paired with entry i of x.
        They are expected to have the same length.

    Returns
    -------
    dict
        Nested table with result[y_value][x_value] = P(y = y_value | x = x_value),
        computed as the joint frequency divided by the marginal P(x = x_value).
    """
    # Marginal distribution of x.
    prob_x = probability(x)
    num_samples = y.shape[0]

    # One boolean mask per x value, built once and reused for every y value.
    x_masks = {each_x: (x == each_x) for each_x in set(x)}

    prob_dict = {}
    for each_y in set(y):
        y_mask = (y == each_y)
        row = {}
        for each_x, x_mask in x_masks.items():
            # Count the joint occurrences with a vectorised mask instead of a
            # Python loop over all samples.
            num_true = np.sum(y_mask & x_mask)
            row[each_x] = (num_true / num_samples) / prob_x[each_x]
        prob_dict[each_y] = row
    return prob_dict

In [141]:
# Column 1 of the *training* matrix X, used below to try out the probability
# helpers (not to be confused with the X_test matrix loaded later).
x_test = X[:, 1]

In [145]:
conditional_probability(y, x_test)

{'e': {'f': 0.42523118677721472,
  'g': 0.0,
  's': 0.27324977613864626,
  'y': 0.30151903708413913},
 'p': {'f': 0.1360060298394184,
  'g': 0.41263367876619927,
  's': 0.23487521350432544,
  'y': 0.21648507789005686}}

In [149]:
def log_zero(number):
    """
    Natural logarithm that returns 0 for an input of 0.

    Lets callers evaluate p * log(p) at p == 0 without hitting log(0).
    """
    return 0 if number == 0 else np.log(number)

In [150]:
def conditional_entropy(y, x):
    """
    Conditional entropy H(y | x) of the paired samples, using the natural log.

    Computed as -sum over x values and y values of
    P(x) * P(y | x) * log(P(y | x)).
    """
    marginal_x = probability(x)
    y_given_x = conditional_probability(y, x)
    y_values = set(y)
    x_values = set(x)

    acc = 0
    for x_val in x_values:
        weight = marginal_x[x_val]
        for y_val in y_values:
            p = y_given_x[y_val][x_val]
            acc -= weight * p * log_zero(p)
    return acc
    

In [151]:
conditional_entropy(y, x_test)

0.67673917732710642

In [191]:
def select_feature(y, X, possible_features):
    """
    Pick the feature with the highest information gain.

    Parameters
    ----------
    y : np.ndarray
        Labels of the samples at the current node.
    X : np.ndarray
        2-D feature matrix of the same samples (one column per feature).
    possible_features : iterable of int
        Column indices of X that may still be used for a split.

    Returns
    -------
    int
        Index of the feature maximising entropy(y) - conditional_entropy(y, X[:, feat]).
        The earliest feature wins on ties. Returns -1 if no candidate has a
        gain above -1, e.g. when possible_features is empty.
    """
    # entropy(y) is the same for every candidate, so compute it once.
    base_entropy = entropy(y)

    best_info_gain = -1
    split_feat = -1
    for feat in possible_features:
        info_gain = base_entropy - conditional_entropy(y, X[:, feat])
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            split_feat = feat
    return split_feat

In [159]:
split_feat = select_feature(y, X, [i for i in range(X.shape[1])])

In [160]:
print(split_feat)

15


#### Test on the small example in the lecture

In [199]:
# Toy data set: 8 samples with 2 binary features each; labels are 1 or -1.
X_small = np.array([[1,1], [1,0], [1,1], [1,0], [0,1], [0,0], [0,1], [0,0]])
y_small = np.array([1, 1, 1, 1, 1, -1, -1, -1])

In [200]:
print(select_feature(y_small, X_small, [i for i in range(X_small.shape[1])]))

0


In [201]:
print(entropy(y_small))

0.661563238158


In [202]:
# should be 0.2811
print(conditional_entropy(y_small, X_small[:,0]))

0.281167572309


In [203]:
print(conditional_entropy(y_small, X_small[:,1]))

0.627741162589


In [204]:
print(conditional_probability(y_small, X_small[:,0]))

{1: {0: 0.25, 1: 1.0}, -1: {0: 0.75, 1: 0.0}}


### Prediction on test set

In [205]:
# Load the test file with the same reader used for the training data.
X_test, y_test = read_in('./hw2_data/mush_test.data')

In [216]:
pred = predict(X_test, tree)

In [217]:
print('Accuracy: ', np.mean(pred==y_test))

Accuracy:  1.0


In [3]:
import sys
sys.path.insert(0, '/Users/trimcao/Dropbox/Richardson/Fall-2017/cs6375-ml-ruozzi/solution/lib')
from DecisionTree import DecisionTree, BoostedTree

In [7]:
clf = DecisionTree()
clf.fit(X, y)
clf.depth

4

In [8]:
train_preds = clf.predict(X)
print(np.mean(train_preds==y))

1.0
