# In [293]:
import pandas as pd
import numpy as np

def main():
    """Train a decision tree on the hw6 training set and print its test error.

    Both data files are fetched from the course web site and parsed as
    space-separated tables without a header row.  In each table the last
    column is used as the label and all preceding columns as features.
    """
    train_path = 'https://www.csie.ntu.edu.tw/~htlin/course/ml20fall/hw6/hw6_train.dat'
    test_path = 'https://www.csie.ntu.edu.tw/~htlin/course/ml20fall/hw6/hw6_test.dat'

    def load_xy(path):
        # Split one table into (feature matrix, label vector).
        frame = pd.read_csv(path, header=None, sep=' ')
        features = frame.iloc[:, :-1].to_numpy()
        labels = frame.iloc[:, -1].to_numpy()
        return features, labels

    X_train, y_train = load_xy(train_path)
    X_test, y_test = load_xy(test_path)

    Eout = error(train(X_train, y_train), X_test, y_test)
    print('Eout =', Eout)

def impurity(y):
    """Return the Gini impurity of a label array.

    Only the labels ``1`` and ``-1`` are counted.  An empty array yields 1.
    """
    total = len(y)
    if total == 0:
        return 1

    # Fraction of samples carrying each of the two labels.
    fractions = [(y == label).sum() / total for label in (1, -1)]
    return 1 - sum(frac ** 2 for frac in fractions)

def loss(X, y, theta):
    """Return the size-weighted Gini impurity of splitting ``y`` at ``theta``.

    ``X`` holds the values of a single feature; samples with ``X < theta``
    form one part and samples with ``X >= theta`` the other.  Each part
    contributes its impurity multiplied by its number of samples.
    """
    parts = (y[X < theta], y[X >= theta])
    return sum(len(part) * impurity(part) for part in parts)

def get_theta(X):
    """Return the candidate thresholds for one feature column.

    The candidates are the midpoints between consecutive sorted values,
    preceded by (smallest value - 1) and followed by (largest value + 1).
    """
    ordered = np.sort(X)
    lower, upper = ordered[:-1], ordered[1:]
    midpoints = (lower + upper) / 2

    return np.r_[ordered[0] - 1, midpoints, ordered[-1] + 1]

def dStump(X, y):
    """Search every feature and candidate threshold for the lowest split loss.

    Returns a tuple ``(feature, theta, loss)``.  Ties are resolved in favour
    of the candidate encountered first (lowest feature index, then the order
    produced by ``get_theta``), because only a strictly smaller loss replaces
    the current best.
    """
    _, num_features = X.shape

    # (feature index, threshold, loss) of the best split seen so far.
    best = (0, 0, float('inf'))

    for col in range(num_features):
        column = X[:, col]
        for candidate in get_theta(column):
            score = loss(column, y, candidate)
            if score < best[2]:
                best = (col, candidate, score)

    return best

def terminate(X, y):
    """Tell whether tree growth should stop for this subset.

    Growth stops when every row of ``X`` equals the first row, or when the
    labels ``y`` have zero impurity.
    """
    identical_inputs = np.all(X == X[0])
    pure_labels = impurity(y) == 0
    return identical_inputs | pure_labels

class Dtree:
    """A node of a binary decision tree.

    A node created with a non-``None`` ``value`` acts as a leaf that predicts
    that value.  Otherwise ``feature`` and ``theta`` describe the split, and
    ``left`` / ``right`` (initially ``None``) are filled in by the caller.
    """

    def __init__(self, theta, feature, value=None):
        self.theta, self.feature, self.value = theta, feature, value
        # Children are attached after construction.
        self.left = self.right = None


def _majority_label(y):
    """Return the most frequent label in ``y`` (smallest label on a tie)."""
    labels, counts = np.unique(y, return_counts=True)
    return labels[np.argmax(counts)]


def train(X, y):
    """Recursively grow an unpruned binary decision tree.

    Parameters:
        X: 2-D array of samples (rows) by features (columns).
        y: 1-D array of labels aligned with the rows of ``X``.

    Returns:
        The root ``Dtree`` node.  Leaves carry the majority label of the
        samples that reached them; internal nodes carry the chosen feature
        index and threshold, with ``left`` holding the samples whose feature
        value is below the threshold and ``right`` holding the rest.
    """
    if terminate(X, y):
        # For pure labels the majority is simply that label; when all inputs
        # are identical but labels differ, the majority minimises the
        # training error of the leaf (the first label need not).
        return Dtree(None, None, _majority_label(y))

    feature, theta, _ = dStump(X, y)
    goes_left = X[:, feature] < theta

    # dStump may return a threshold that puts every sample on one side when
    # no split strictly lowers the impurity (e.g. XOR-like data).  Recursing
    # would then hit an empty subset or never terminate, so stop here.
    if goes_left.all() or not goes_left.any():
        return Dtree(None, None, _majority_label(y))

    # Everything not sent left goes right, mirroring the else-branch used at
    # prediction time.
    goes_right = ~goes_left

    node = Dtree(theta, feature)
    node.left = train(X[goes_left], y[goes_left])
    node.right = train(X[goes_right], y[goes_right])
    return node

def predict(tree, X):
    """Return the label the tree assigns to a single sample.

    Parameters:
        tree: root node exposing ``value``, ``feature``, ``theta``, ``left``
            and ``right`` attributes.
        X: one sample, indexable by feature number.

    Returns:
        The ``value`` stored in the leaf reached by the sample.
    """
    node = tree
    # Walk down iteratively so very deep trees cannot exhaust the recursion
    # limit.  A node is a leaf exactly when its value is not None; identity
    # comparison avoids invoking ``__ne__`` on whatever is stored there.
    while node.value is None:
        if X[node.feature] < node.theta:
            node = node.left
        else:
            node = node.right
    return node.value


def error(tree, X, y):
    """Return the fraction of samples the tree misclassifies (0/1 error).

    Parameters:
        tree: root node accepted by ``predict``.
        X: iterable of samples; each item is passed to ``predict``.
        y: true labels, one per sample (array or plain sequence).

    Returns:
        The mean of the element-wise mismatches between predictions and ``y``.

    Raises:
        ZeroDivisionError: if ``y`` is empty.
    """
    # Convert explicitly so the comparison below is element-wise even when
    # the caller passes a plain list instead of an ndarray.
    y_true = np.asarray(y)
    N = len(y_true)
    if N == 0:
        raise ZeroDivisionError("cannot compute the error rate of an empty data set")

    y_pred = np.array([predict(tree, sample) for sample in X])
    return np.mean(y_pred != y_true)

# Run the experiment only when executed directly (script or notebook cell),
# so importing this module does not trigger the network download.
if __name__ == "__main__":
    main()

# Output: Eout = 0.166
