Let $\{(X_1, Y_1), (X_2, Y_2), \ldots, (X_m, Y_m)\}$ denote a data set, where $X_i$ represents a vector of $k$ (binary) feature values,
and $Y_i$ is the corresponding binary class or label that we need to learn to predict from the X-values.
We generate data via the following scheme, defining a distribution for our data set: Let $X = (X_1, X_2, X_3, \ldots, X_k)$
be a vector of binary values satisfying the following:
- $X_1 = 1$ with probability 1/2, $X_1 = 0$ with probability 1/2.
- For $i = 2, \ldots, k$, $X_i = X_{i-1}$ with probability 3/4, and $X_i = 1 - X_{i-1}$ with probability 1/4.
In this way, the first feature value is uniformly random, but every successive feature is strongly correlated with the
value of the feature before it. We can then define Y to be a function of X as
$$Y = \begin{cases} X_1 & \text{if } w_2 X_2 + w_3 X_3 + \cdots + w_k X_k \geq 1/2 \\ 1 - X_1 & \text{otherwise} \end{cases}$$

In other words, if the ‘weighted average’ of $X_2, \ldots, X_k$ tilts high, Y will agree with $X_1$; if the weighted average of
$X_2, \ldots, X_k$ tilts low, Y will disagree with $X_1$. Take the weights to be defined by $w_i = \frac{0.9^i}{0.9^2 + 0.9^3 + \cdots + 0.9^k}$ for $i = 2, \ldots, k$.

#### 1. For given values of $k$ and $m$ (the number of features and the number of data points), write a function to generate a training data set based on the above scheme.

In [192]:
# Importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [188]:
def create_data(k, m, rng=None):
    """Generate ``m`` samples of ``k`` correlated binary features.

    The first feature of each sample is 0 or 1 with equal probability.
    Every later feature copies the previous one with probability 3/4 and
    flips it with probability 1/4.

    Args:
        k: Number of features per sample; must be at least 1.
        m: Number of samples to generate.
        rng: Optional random source exposing ``choice`` (for example
            ``np.random.default_rng(seed)``).  When omitted, the global
            ``np.random`` state is used.

    Returns:
        A list of ``m`` rows, each a list of ``k`` Python ints in {0, 1}.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    rng = np.random if rng is None else rng

    X = []
    for _ in range(m):
        # Draw scalars (no ``size=``): converting a 1-element array with
        # ``int(...)`` is deprecated in recent NumPy releases.
        row = [int(rng.choice(2))]
        for _ in range(1, k):
            # Outcome 1 has probability 0.75 and means "copy the previous bit".
            keep = rng.choice(2, p=[0.25, 0.75]) == 1
            previous = row[-1]
            row.append(previous if keep else 1 - previous)
        X.append(row)
    return X
            
def create_weights(k):
    """Return the list ``[w_1, ..., w_k]`` with ``w_i = 0.9**i / D``.

    ``D`` is ``0.9**2 + 0.9**3 + ... + 0.9**k``, so the entries from index 1
    onward (``w_2 .. w_k``) sum to one.  Index 0 holds ``w_1``, which is
    scaled by the same ``D`` but is not part of that normalised sum.
    """
    normaliser = sum(0.9 ** exponent for exponent in range(2, k + 1))
    return [(0.9 ** index) / normaliser for index in range(1, k + 1)]
    
def create_y(X, w, k, m):
    """Label the first ``m`` rows of ``X`` according to the weighted vote.

    For each row the score is the dot product of the features after the
    first with the weights after the first.  The label equals the row's
    first feature when the score is at least 0.5 and its complement
    otherwise.

    Args:
        X: Sequence of rows (list of lists or 2-D array) of 0/1 features.
        w: Weights aligned with the feature columns, as a list or array;
            ``w[0]`` is ignored.
        k: Number of features.  Unused; kept for interface compatibility.
        m: Number of rows of ``X`` to label.

    Returns:
        A list of ``m`` Python ints in {0, 1}.
    """
    # Coerce once so a plain list (as returned by create_weights) is accepted.
    tail_weights = np.asarray(w, dtype=float)[1:]

    y = []
    for i in range(m):
        row = X[i]
        first = int(row[0])
        score = np.dot(row[1:], tail_weights)
        y.append(1 - first if score < 0.5 else first)
    return y

#### 2. Given a data set, write a function to fit a decision tree to that data based on splitting the variables by maximizing the information gain. Additionally, return the training error of this tree on the data set, $err_{train}(\hat{f})$. It may be useful to have a function that takes a data set and a variable, and returns the data set partitioned based on the values of that variable.

In [308]:
class DecisionTree():
    """ID3-style decision tree over a DataFrame whose last column is the label.

    Every column except the last is treated as a discrete feature.  A fitted
    tree is a nested dict of the form ``{feature: {value: subtree_or_label}}``.
    """

    def __init__(self, data):
        """Store ``data``, a pandas DataFrame with the label in the last column."""
        self.data = data

    @staticmethod
    def _label_entropy(labels):
        """Return the Shannon entropy, in bits, of a Series of labels."""
        if len(labels) == 0:
            return 0.0
        # value_counts only reports labels that actually occur, so every
        # probability is positive and log2 never receives zero.
        probs = labels.value_counts(normalize=True)
        return max(0.0, float(-(probs * np.log2(probs)).sum()))

    def entropy(self):
        """Return the entropy of the label column."""
        target = self.data.keys()[-1]
        return self._label_entropy(self.data[target])

    def conditional_entropy(self, feature):
        """Return the entropy of the label conditioned on ``feature``.

        This is the size-weighted average of the label entropy inside each
        partition induced by the distinct values of ``feature``.
        """
        target = self.data.keys()[-1]
        total = len(self.data)
        if total == 0:
            return 0.0

        column = self.data[feature]
        cond_entropy_y = 0.0
        for fval in column.unique():
            labels = self.data[target][column == fval]
            cond_entropy_y += (len(labels) / total) * self._label_entropy(labels)
        return cond_entropy_y

    def information_gain(self):
        """Return the information gain of every feature, in column order."""
        base = self.entropy()
        return [base - self.conditional_entropy(key)
                for key in self.data.keys()[:-1]]

    def information_gain_split(self):
        """Return the name of the feature with the largest information gain.

        Ties resolve to the left-most feature.

        Raises:
            ValueError: If the data has no feature columns.
        """
        features = self.data.keys()[:-1]
        if len(features) == 0:
            raise ValueError("data has no feature columns to split on")
        # Index by the position of the best gain, not by the gain itself.
        return features[int(np.argmax(self.information_gain()))]

    def get_subset(self, node, value):
        """Return the rows where column ``node`` equals ``value``, re-indexed."""
        return self.data[self.data[node] == value].reset_index(drop=True)

    def buildTree(self, tree=None):
        """Fit the tree recursively and return it as a nested dict.

        Args:
            tree: Optional dict to add the root split to; a new dict is
                created when omitted.

        Returns:
            ``{feature: {value: subtree_or_label}}`` for the best split of
            this node's data.
        """
        target = self.data.keys()[-1]
        best_split = self.information_gain_split()
        feature_vals = self.data[best_split].unique()

        if tree is None:
            tree = {}
        tree[best_split] = {}

        for val in feature_vals:
            # The split column is constant inside the partition, so dropping
            # it loses nothing and guarantees the recursion terminates.
            subset = self.get_subset(best_split, val).drop(columns=[best_split])
            if len(subset) == 0:
                continue
            labels = subset[target]
            if labels.nunique() <= 1:
                # Pure partition: emit the single label as a leaf.
                tree[best_split][val] = labels.iloc[0]
            elif subset.shape[1] == 1:
                # No features left to split on: fall back to the majority label.
                tree[best_split][val] = labels.mode().iloc[0]
            else:
                tree[best_split][val] = type(self)(subset).buildTree()
        return tree

    def predict(self, row, tree):
        """Return the label ``tree`` assigns to ``row``.

        ``row`` is any mapping from feature name to value (for example a
        DataFrame row).  A feature value that has no branch in the tree
        yields the majority label of this object's data.
        """
        node = tree
        while isinstance(node, dict):
            feature = next(iter(node))
            branches = node[feature]
            value = row[feature]
            if value not in branches:
                return self.data[self.data.keys()[-1]].mode().iloc[0]
            node = branches[value]
        return node

    def training_error(self, tree=None):
        """Return the fraction of rows of ``self.data`` that ``tree`` mislabels.

        When ``tree`` is omitted, a tree is first fitted with ``buildTree``.
        """
        if len(self.data) == 0:
            return 0.0
        if tree is None:
            tree = self.buildTree()
        target = self.data.keys()[-1]
        wrong = sum(bool(self.predict(row, tree) != row[target])
                    for _, row in self.data.iterrows())
        return float(wrong) / len(self.data)

In [315]:
# Experiment configuration: k features per sample, m samples
k = 4
m = 30
# Machine epsilon for Python floats
epsilon = np.finfo(float).eps

# Sample the features, build the weight vector, and label every sample;
# the labels are shaped as an m x 1 column so they can sit beside X
X = np.asarray(create_data(k, m))
w = np.asarray(create_weights(k))
y = np.asarray(create_y(X, w, k, m)).reshape(m, 1)

# Training table: the feature columns followed by the label column
train_data = pd.DataFrame(np.append(X, y, axis=1))
print(train_data)

    0  1  2  3  4
0   1  0  0  0  0
1   1  1  1  1  1
2   1  1  1  1  1
3   1  1  1  1  1
4   1  0  0  0  0
5   1  1  1  1  1
6   1  1  1  1  1
7   0  0  1  1  0
8   1  1  0  0  0
9   1  0  0  0  0
10  1  1  1  0  1
11  1  1  0  0  0
12  1  1  1  1  1
13  1  1  1  0  1
14  1  1  1  0  1
15  1  1  0  0  0
16  0  1  1  1  0
17  1  0  1  0  0
18  1  1  1  0  1
19  0  1  1  1  0
20  0  0  0  0  1
21  0  0  0  0  1
22  0  1  0  0  1
23  1  1  1  1  1
24  1  1  1  1  1
25  1  1  0  0  0
26  1  1  0  0  0
27  0  0  0  1  1
28  1  1  1  1  1
29  0  0  0  0  1


In [316]:
dt = DecisionTree(train_data)

In [317]:
dt.information_gain()

[0.0006863849678270251, 0.03082069266505927, 0.10837633356070331, 0.0679808886855997]


In [280]:
len(train_data[4][train_data[4] == 0][train_data[0] == 0])

5

In [294]:
len(train_data.columns)-1

4

2.220446049250313e-16

In [292]:
train_data.iloc[:,:len(train_data.columns)-1]

Unnamed: 0,0,1,2,3
0,0,1,1,1
1,1,1,1,0
2,1,0,1,1
3,0,0,0,0
4,1,1,1,1
5,1,1,1,1
6,1,1,0,0
7,0,0,1,1
8,1,1,1,0
9,1,1,1,0


In [291]:
train_data.columns[:]

RangeIndex(start=0, stop=4, step=1)