Let $\{(X_1, Y_1), (X_2, Y_2), \ldots, (X_m, Y_m)\}$ denote a data set, where $X_i$ represents a vector of $k$ (binary) feature values,
and $Y_i$ is a corresponding binary class or label that we will need to learn to be able to predict from the X-values.
We generate data via the following scheme, defining a distribution for our data set: Let $X = (X_1, X_2, X_3, . . . , X_k)$
be a vector of binary values, satisfying the following
- $X_1 = 1$ with probability 1/2, $X_1 = 0$ with probability 1/2
- For $i = 2, \ldots, k$: $X_i = X_{i-1}$ with probability 3/4, and $X_i = 1 - X_{i-1}$ with probability 1/4.
In this way, the first feature value is uniformly random, but every successive feature is strongly correlated with the
value of the feature before it. We can then define Y to be a function of X as
$$Y = \begin{cases} X_1 & \text{if } w_2 X_2 + w_3 X_3 + \cdots + w_k X_k \ge 1/2 \\ 1 - X_1 & \text{otherwise} \end{cases}$$

In other words, if the ‘weighted average’ of $X_2, . . . X_k$ tilts high, Y will agree with $X_1$; if the weighted average of
$X_2, \ldots, X_k$ tilts low, $Y$ will disagree with $X_1$. Take the weights to be defined by $w_i = \frac{0.9^i}{0.9^2 + 0.9^3 + \cdots + 0.9^k}$ for $i = 2, \ldots, k$, so that the weights sum to 1.

#### 1. For a given value of k, m, (number of features, number of data points), write a function to generate a training data set based on the above scheme.

In [51]:
# Importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pprint

In [2]:
def create_data(k, m):
    """Sample ``m`` feature vectors of ``k`` correlated binary features.

    The first feature of every row is 0 or 1 with equal probability. Each
    later feature copies the feature before it with probability 3/4 and is
    its complement with probability 1/4.

    Args:
        k: Number of features per row; must be at least 1.
        m: Number of rows to generate.

    Returns:
        A list of ``m`` lists, each holding ``k`` Python ints in {0, 1}.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    X = []
    for _ in range(m):
        # Draw plain scalars: calling int() on a 1-element array is
        # deprecated in recent NumPy releases.
        row = [int(np.random.randint(2))]
        for _ in range(1, k):
            keep_previous = np.random.random() < 0.75
            row.append(row[-1] if keep_previous else 1 - row[-1])
        X.append(row)
    return X
            
def create_weights(k):
    """Return the weights ``0.9**i / (0.9**2 + ... + 0.9**k)`` for i = 1..k.

    The normaliser only sums the powers 2..k, so entries 1..k-1 of the
    returned list (the weights for i = 2..k) add up to 1, while entry 0
    (i = 1) is an extra leading value.

    Args:
        k: Number of features.

    Returns:
        A list of ``k`` floats. It is empty when ``k`` is 0 or negative.

    Raises:
        ZeroDivisionError: If ``k`` is 1, because the normaliser is then 0.
    """
    normaliser = 0
    for power in range(2, k + 1):
        normaliser += 0.9 ** power

    return [(0.9 ** power) / normaliser for power in range(1, k + 1)]
    
def create_y(X, w, k, m):
    """Label the first ``m`` rows of ``X`` from their weighted feature sum.

    For each row the features after the first are combined with ``w[1:]``.
    If that weighted sum is at least 0.5 the label equals the first feature,
    otherwise it is the complement of the first feature.

    Args:
        X: Indexable collection of rows (nested lists or a 2-D array) of
            binary feature values.
        w: One weight per feature, as a list or array. The first entry is
            skipped because the first feature is not part of the sum.
        k: Number of features. Not used by the computation; kept so the
            signature stays unchanged.
        m: Number of rows of ``X`` to label.

    Returns:
        A list with ``m`` labels, each 0 or 1.
    """
    # Coerce once so a plain list of weights works as well as an ndarray.
    tail_weights = np.asarray(w, dtype=float)[1:]

    y = []
    for i in range(m):
        row = X[i]
        score = np.dot(row[1:], tail_weights)
        y.append(row[0] if score >= 0.5 else 1 - row[0])
    return y

#### 2. Given a data set, write a function to fit a decision tree to that data based on splitting the variables by maximizing the information gain. Additionally, return the training error of this tree on the data set, $err_{train}(\hat{f})$. It may be useful to have a function that takes a data set and a variable, and returns the data set partitioned based on the values of that variable

In [159]:
class DecisionTree():
    """Decision tree grown by maximising information gain.

    Every method expects a pandas DataFrame whose last column holds the
    class label and whose other columns are the features. A tree is a nested
    dict of the form ``{feature: {feature_value: subtree_or_label}}``.
    """

    @staticmethod
    def _label_entropy(labels):
        """Return the Shannon entropy, in bits, of a Series of labels."""
        counts = labels.value_counts()
        counts = counts[counts > 0]
        total = counts.sum()
        if total == 0:
            return 0.0
        probs = counts / total
        # max() turns the -0.0 produced by a pure node into a clean 0.0.
        return max(0.0, float(-(probs * np.log2(probs)).sum()))

    @staticmethod
    def _has_splittable_feature(data):
        """Return True if some feature column takes more than one value."""
        return any(data[col].nunique() > 1 for col in data.keys()[:-1])

    def entropy(self, data):
        """Return the entropy (bits) of the label column of ``data``."""
        return self._label_entropy(data[data.keys()[-1]])

    def conditional_entropy(self, data, feature):
        """Return the entropy of the label column given ``feature``.

        This is the sum, over the values of ``feature``, of the share of
        rows having that value times the label entropy within those rows.
        """
        total = len(data)
        if total == 0:
            return 0.0

        labels = data[data.keys()[-1]]
        column = data[feature]
        weighted = 0.0
        for fval in column.unique():
            branch = labels[column == fval]
            weighted += (len(branch) / total) * self._label_entropy(branch)
        return weighted

    def information_gain_split(self, data):
        """Return the feature column with the highest information gain.

        Features that are constant in ``data`` cannot partition it, so they
        are only considered when every feature is constant. Ties go to the
        left-most column.

        Raises:
            ValueError: If ``data`` has no feature columns.
        """
        features = list(data.keys()[:-1])
        if not features:
            raise ValueError("data must contain at least one feature column")

        varying = [col for col in features if data[col].nunique() > 1]
        candidates = varying if varying else features

        base_entropy = self.entropy(data)
        gains = [
            base_entropy - self.conditional_entropy(data, col)
            for col in candidates
        ]
        return candidates[int(np.argmax(gains))]

    def get_subset(self, data, node, value):
        """Return the rows where column ``node`` equals ``value``, reindexed."""
        return data[data[node] == value].reset_index(drop=True)

    def build_tree(self, data, tree=None):
        """Recursively grow a tree on ``data`` and return it.

        Args:
            data: Training frame; the last column is the label.
            tree: Optional dict to add the root split to. A new dict is
                created when omitted.

        Returns:
            The nested-dict tree. A branch ends in a label when its rows all
            share one label, or when no feature can separate them any
            further; in the latter case the most frequent label is used
            (the smallest label on a tie).
        """
        target = data.keys()[-1]
        best_split = self.information_gain_split(data)

        if tree is None:
            tree = {}
        branches = tree.setdefault(best_split, {})

        for val in data[best_split].unique():
            subset = self.get_subset(data, best_split, val)
            target_val, target_counts = np.unique(
                subset[target], return_counts=True
            )

            if len(target_counts) == 0:
                # No rows matched this value (e.g. NaN), nothing to learn.
                continue
            if len(target_counts) == 1:
                branches[val] = target_val[0]
            elif self._has_splittable_feature(subset):
                branches[val] = self.build_tree(subset)
            else:
                # Identical feature rows with different labels: recursing
                # would never terminate, so settle on the majority label.
                branches[val] = target_val[int(np.argmax(target_counts))]

        return tree

    def predict(self, instance_data, tree):
        """Return the label ``tree`` assigns to one row.

        Args:
            instance_data: Mapping or Series from feature name to value.
            tree: Nested dict as returned by ``build_tree``.

        Raises:
            ValueError: If ``tree`` is an empty dict.
            KeyError: If the row has a feature value the tree never saw at
                the node being visited.
        """
        if not tree:
            raise ValueError("cannot predict with an empty tree")

        node = tree
        while isinstance(node, dict):
            # Each internal node is a dict keyed by its single split feature.
            feature = next(iter(node))
            node = node[feature][instance_data[feature]]
        return node

    def fit(self, data, tree):
        """Return the fraction of rows of ``data`` that ``tree`` misclassifies.

        Raises:
            ValueError: If ``data`` has no rows.
        """
        n_rows = len(data)
        if n_rows == 0:
            raise ValueError("cannot compute an error rate on empty data")

        error = 0
        for i in range(n_rows):
            row = data.iloc[i]
            # .iloc[-1] reads the label by position without relying on the
            # deprecated positional Series[...] lookup.
            if self.predict(row, tree) != row.iloc[-1]:
                error += 1
        return error / n_rows
        


In [160]:
# Experiment settings: k features per sample, m samples
k, m = 10, 100
# Machine epsilon of the default float type, exposed as a module-level global
epsilon = np.finfo(float).eps

# Sample the features first, then derive the weights and the labels
X = np.array(create_data(k, m))
w = np.array(create_weights(k))
y = np.array(create_y(X, w, k, m)).reshape(m, 1)

# Single-letter column names; the first k+1 of them are used below
cols = list('abcdefghijklmnopqrstuvwxyz')

# Training frame: the k feature columns followed by the label column
train_data = pd.DataFrame(np.concatenate((X, y), axis=1), columns=cols[:k+1])
print(train_data)

    a  b  c  d  e  f  g  h  i  j  k
0   1  1  0  0  0  0  0  0  0  0  0
1   0  0  0  0  0  1  0  0  0  1  1
2   1  1  1  0  1  0  0  1  1  1  1
3   1  1  1  1  1  1  1  1  0  0  1
4   1  0  1  1  1  1  1  0  0  0  1
5   1  0  0  1  1  1  1  1  0  0  1
6   1  1  1  1  1  1  0  0  1  1  1
7   0  0  1  1  1  1  1  1  1  0  0
8   0  1  1  1  1  1  1  1  1  1  0
9   1  1  0  0  1  1  1  1  1  0  1
10  0  1  1  1  0  1  1  0  0  1  0
11  0  0  0  1  1  0  0  1  0  0  1
12  0  0  0  0  0  0  0  0  0  0  1
13  0  1  1  1  1  1  1  1  0  1  0
14  0  0  1  0  0  0  0  0  0  0  1
15  0  1  1  1  0  0  0  0  0  0  1
16  1  0  0  0  1  1  1  1  1  1  1
17  0  0  1  1  1  0  0  1  1  1  0
18  1  1  1  1  1  1  0  0  1  0  1
19  1  1  1  1  1  1  1  1  0  1  1
20  0  0  0  0  0  0  0  1  1  0  1
21  1  1  1  1  1  0  0  0  0  0  1
22  1  1  1  1  0  0  1  1  1  1  1
23  1  1  1  1  0  1  1  0  0  0  1
24  0  0  0  0  0  0  0  1  1  1  1
25  0  0  0  1  1  1  1  1  1  0  0
26  1  1  1  1  1  1  1  1  

In [161]:
# Create the tree learner used by the cells below
dt = DecisionTree()

In [162]:
# Grow the tree on the training frame and pretty-print the nested dict
tree = dt.build_tree(train_data)
pprint.pprint(tree)

{'a': {0: {'e': {0: {'g': {0: 1, 1: {'j': {0: 1, 1: 0}}}},
                 1: {'f': {0: {'c': {0: {'g': {0: 1, 1: 0}}, 1: 0}}, 1: 0}}}},
       1: {'c': {0: {'e': {0: 0,
                           1: {'b': {0: {'d': {0: {'i': {0: 0,
                                                         1: {'f': {0: 0,
                                                                   1: 1}}}},
                                               1: 1}},
                                     1: 1}}}},
                 1: {'e': {0: {'i': {0: {'f': {0: 0, 1: 1}}, 1: 1}}, 1: 1}}}}}}


In [163]:
# Take row 24 of the training frame as a single example to classify
instance = train_data.iloc[24]
instance

a    0
b    0
c    0
d    0
e    0
f    0
g    0
h    1
i    1
j    1
k    1
Name: 24, dtype: int64

In [164]:
# Walk the tree with the selected row's feature values to get its label
prediction = dt.predict(instance, tree)
prediction

1

In [165]:
# Training error: fraction of training rows the tree misclassifies
error = dt.fit(train_data, tree)
error

0.0