In [1]:
#from multiprocessing import Pool
#from functools import partial
import numpy as np
#from numba import jit

import random

In [2]:
#TODO: loss of least square regression and binary logistic regression
'''
    pred() takes GBDT/RF outputs, i.e., the "score", as its inputs, and returns predictions.
    g() is the gradient/1st order derivative, which takes true values "true" and scores as input, and returns gradient.
    h() is the heassian/2nd order derivative, which takes true values "true" and scores as input, and returns hessian.
'''
class leastsquare(object):
    '''Loss class for mse. As for mse, pred function is pred=score.'''
    def pred(self,score):
        return score

    def g(self,true,score):
        return -2*(true-score) 

    def h(self,true,score):
        return ([2]*len(score))
    
    def loss(self,true,score):
        return np.sum((true-score)**2)

class logistic(object):
    '''Loss class for log loss. As for log loss, pred function is logistic transformation.'''
    def pred(self,score):
        prob = (1/(1+np.exp(-1*score)))
        pred = np.array([1 if p > 0.5 else 0 for p in prob])
        return pred        

    def g(self,true,score):
        y = true
        yhat = score
        return (-y/(1+np.exp(yhat))) + ((1-y)/(1+np.exp(-1*yhat)))

    def h(self,true,score):
        y = true
        yhat = score
        return (y*np.exp(yhat))/((1+np.exp(yhat))**2)  +  ((1-y)*np.exp(-yhat))/((1+np.exp(-yhat))**2)

    def loss(self,true,score):
        y = true
        yhat = score
        return np.sum(y*np.log(1+np.exp(-yhat)) + (1-y)*np.log(1+np.exp(yhat)))


In [3]:
# TODO: class of Random Forest
class RF(object):
    '''
    Class of Random Forest
    
    Parameters:
        n_threads: The number of threads used for fitting and predicting.
        loss: Loss function for gradient boosting.
            'mse' for regression task and 'log' for classfication task.
            A child class of the loss class could be passed to implement customized loss.
        max_depth: The maximum depth d_max of a tree.
        min_sample_split: The minimum number of samples required to further split a node.
        lamda: The regularization coefficient for leaf score, also known as lambda.
        gamma: The regularization coefficient for number of tree nodes, also know as gamma.
        rf: rf*m is the size of random subset of features, from which we select the best decision rule.
        num_trees: Number of trees.
    '''
    def __init__(self,
        n_threads = None, loss = 'mse',
        max_depth = 3, min_sample_split = 10, 
        lamda = 1, gamma = 0,
        rf = 0.99, num_trees = 100):
        
        self.n_threads = n_threads
        self.loss = loss
        self.max_depth = max_depth
        self.min_sample_split = min_sample_split
        self.lamda = lamda
        self.gamma = gamma
        self.rf = rf
        self.num_trees = num_trees
        
        #Instantiate loss class
        if(self.loss == 'mse'):
            self.lossType = leastsquare()
        elif(self.loss == 'log'):
            self.lossType = logistic()        
        else:
            print("Invalid choice of loss")
        
        
        self.trees = []
        self.yhat0 = 0

    def fit(self, train, target):
        # train is n x m 2d numpy array
        # target is n-dim 1d array
        #TODO
        
        print("INSIDE RF FIT(" +
              "max_depth="+str(self.max_depth) +
              ", min_sample_split="+str(self.min_sample_split) +
              ", lamda="+str(self.lamda) +
              ", gamma="+str(self.gamma) +           
              ", rf="+str(self.rf) +
              ", num_trees="+str(self.num_trees) +
              ")")
        
            
        #Initialize yhat0
        #self.yhat0 = np.mean(target)
        #yhat0 = np.array([np.mean(target)]*len(target))
        
        self.yhat0 = 0
        yhat0 = np.zeros(len(target))
               
        for k in range(self.num_trees):
            #Random selection from train_target
            #randIndex = np.random.choice(range(len(target)), len(target))
            #X = train[randIndex,:]
            #y = target[randIndex]
            
            X=train
            y=target
            
           
            #Calculate g and h
            g = self.lossType.g(y, yhat0)
            h = self.lossType.h(y, yhat0)
            
            #Instantiate the Tree object
            myTree = Tree(n_threads = None, 
                      max_depth = self.max_depth, 
                      min_sample_split = self.min_sample_split,
                      lamda = self.lamda,
                      gamma = self.gamma, 
                      rf = self.rf)            
                    
            #Fit the kth tree
            myTree = myTree.fit(X,g,h)
            self.trees.append(myTree)
            
            if(DEBUG_FLAG1):
                #Do the prediction to compute loss
                yhat_k = self.predict(train)
                
                if(self.loss == 'mse'):
                    loss = root_mean_square_error(target,self.lossType.pred(yhat_k))
                    print("------Tree"+str(k)+": rmse loss=" + str(loss)+"-------")

                if(self.loss == 'log'):
                    acc = accuracy(target,yhat_k)
                    loss = self.lossType.loss(target,yhat_k)
                    print("------Tree"+str(k)+": accuracy=" + str(acc)+ ", loss=" + str(loss) +"-------")   
   
        return self

    def predict(self, test):
        #TODO
               
        yhat_k=0
        for k in range(len(self.trees)):
            yhat_k += self.trees[k].predict(test)
            
        
        score = yhat_k/len(self.trees)  
        score += np.array([self.yhat0]*test.shape[0]) #Add the bias term yhat_0
        return self.lossType.pred(score)
    

In [4]:
# TODO: class of GBDT
class GBDT(object):
    '''
    Class of gradient boosting decision tree (GBDT)
    
    Parameters:
        n_threads: The number of threads used for fitting and predicting.
        loss: Loss function for gradient boosting.
            'mse' for regression task and 'log' for classfication task.
            A child class of the loss class could be passed to implement customized loss.
        max_depth: The maximum depth D_max of a tree.
        min_sample_split: The minimum number of samples required to further split a node.
        lamda: The regularization coefficient for leaf score, also known as lambda.
        gamma: The regularization coefficient for number of tree nodes, also know as gamma.
        learning_rate: The learning rate eta of GBDT.
        num_trees: Number of trees.
    '''
    def __init__(self,
        n_threads = None, loss = 'mse',
        max_depth = 3, min_sample_split = 10, 
        lamda = 1, gamma = 0,
        learning_rate = 0.1, num_trees = 100):
        
        self.n_threads = n_threads
        self.loss = loss
        self.max_depth = max_depth
        self.min_sample_split = min_sample_split
        self.lamda = lamda
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.num_trees = num_trees
        
        self.trees = []
        self.yhat0 = []


       #Instantiate the loss class
        if(self.loss == 'mse'):
            self.lossType = leastsquare()
        elif(self.loss == 'log'):
            self.lossType = logistic()        
        else:
            print("Invalid choice of loss")

    def fit(self, train, target):
        # train is n x m 2d numpy array
        # target is n-dim 1d array
        #TODO
        
        print("INSIDE GBDT FIT(" +
              ", max_depth="+str(self.max_depth) +
              ", min_sample_split="+str(self.min_sample_split) +
              ", lamda="+str(self.lamda) +
              ", gamma="+str(self.gamma) +           
              ", lr="+str(self.learning_rate) +
              ", num_trees="+str(self.num_trees) +
              ")")        
            
        
        #Initialize yhat_prev
        yhat_prev = np.array([np.mean(target)]*len(target))
        self.yhat0 = np.mean(target)
        
        #yhat_prev = np.zeros(len(target))
        #self.yhat0 = 0.0
        
        for k in range(self.num_trees):           
            #Calculate g           
            g = self.lossType.g(target,yhat_prev)
            
            #Calculate h
            h = self.lossType.h(target,yhat_prev)            
            
            #Instantiate the Tree object
            myTree = Tree(n_threads = None, 
                      max_depth = self.max_depth, 
                      min_sample_split = self.min_sample_split,
                      lamda = self.lamda,
                      gamma = self.gamma, 
                      rf = 0)
            
                
            myTree = myTree.fit(train,g,h)
            self.trees.append(myTree)            
            
            #Get prediction fk
            fk = myTree.predict(train)
            
            #yhat_k = yhat_prev + eta*fk
            yhat_k = yhat_prev + self.learning_rate*fk
            
            #yhat_prev = yhat_k
            yhat_prev = yhat_k
            
            if(DEBUG_FLAG1):
                #loss = lossType.loss(target,yhat_k)
                if(self.loss == 'mse'):
                    loss = root_mean_square_error(target,self.lossType.pred(yhat_k))
                    print("------Tree"+str(k)+": rmse loss=" + str(loss)+"-------")

                if(self.loss == 'log'):
                    acc = accuracy(target,self.lossType.pred(yhat_k))
                    loss = self.lossType.loss(target,yhat_k)
                    print("------Tree"+str(k)+": accuracy=" + str(acc)+ ", loss=" + str(loss) +"-------")   

        
        return self

    def predict(self, test):
        #TODO
        
        yhat_prev = np.array([self.yhat0]*test.shape[0])
        
        for k in range(self.num_trees):
            #Get the prediction from current tree
            fk = self.trees[k].predict(test)  
            yhat_k = yhat_prev + self.learning_rate*fk
            yhat_prev = yhat_k
            
        
        score = yhat_k
        return self.lossType.pred(score)

In [5]:
# TODO: class of a node on a tree
class TreeNode(object):
    '''
    Data structure that are used for storing a node on a tree.
    
    A tree is presented by a set of nested TreeNodes,
    with one TreeNode pointing two child TreeNodes,
    until a tree leaf is reached.
    
    A node on a tree can be either a leaf node or a non-leaf node.
    '''
    
    #TODO
    def __init__(self, split_feature=0, split_threshold=0, left_child=None, right_child=None, is_leaf=False):
        
        self.is_leaf = is_leaf
        self.split_threshold = split_threshold
        self.split_feature = split_feature
        
        self.left = None
        self.right = None
        
        self.w = 0   
        
    def forward(self, x):
        if x[self.split_feature] < self.split_threshold:
            return self.left
        else:
            return self.right
        
    def print(self):
        print("Is leaf node =", self.is_leaf)
        print("Split_feature = ", self.split_feature)
        print("Split_threshold = ", self.split_threshold)
        print("Left Child = ", self.left)
        print("Right Child = ", self.right)
        print("w = ", self.w)
        


In [6]:
# TODO: class of single tree
class Tree(object):
    '''
    Class of a single decision tree in GBDT

    Parameters:
        n_threads: The number of threads used for fitting and predicting.
        max_depth: The maximum depth of the tree.
        min_sample_split: The minimum number of samples required to further split a node.
        lamda: The regularization coefficient for leaf prediction, also known as lambda.
        gamma: The regularization coefficient for number of TreeNode, also know as gamma.
        rf: rf*m is the size of random subset of features, from which we select the best decision rule,
            rf = 0 means we are training a GBDT.
    '''
    
    def __init__(self, n_threads = None, 
                 max_depth = 3, min_sample_split = 10,
                 lamda = 1, gamma = 0, rf = 0):
        self.n_threads = n_threads
        self.max_depth = max_depth
        self.min_sample_split = min_sample_split
        self.lamda = lamda
        self.gamma = gamma
        self.rf = 0
        self.int_member = 0
        
        self.root = None

    def fit(self, train, g, h):
        '''
        train is the training data matrix, and must be numpy array (an n_train x m matrix).
        g and h are gradient and hessian respectively.
        '''
        #TODO
        
        self.root = self.construct_tree(train,g,h,depth=0)
        
        return self

    def predict(self,test):
        '''
        test is the test data matrix, and must be numpy arrays (an n_test x m matrix).
        Return predictions (scores) as an array.
        '''
        #TODO
        result = np.zeros(test.shape[0])
        c = 0
        
        for x in test: #iterate through each row in test data matrix 
            curNode = self.root
            while(curNode.is_leaf == False):
                curNode = curNode.forward(x)
            
            result[c] = curNode.w
            c +=1
 
        return result

    def construct_tree(self, train, g, h, depth):
        '''
        Tree construction, which is recursively used to grow a tree.
        First we should check if we should stop further splitting.
        
        The stopping conditions include:
            1. tree reaches max_depth $d_{max}$
            2. The number of sample points at current node is less than min_sample_split, i.e., $n_{min}$
            3. gain <= 0
        '''
        #TODO

        
        #Check stopping critera
        if(depth==self.max_depth):
            #Max depth reached, this should be a leaf node
            node = TreeNode(split_feature = 0, split_threshold=0, left_child=None, right_child=None, is_leaf=True)          
            #Calculate the weight w of the leaf node
            node.w = (-1*np.sum(g))/(np.sum(h)+self.lamda)  
            if(DEBUG_FLAG2):
                print("Leaf Node at max depth")
                #print("g=",g)
                #print("h=",h)
                print("w=",node.w)
            return node
        
        if(len(g)<self.min_sample_split):
            #Less than min number of samples to split, this should be a leaf node
            node = TreeNode(split_feature = 0, split_threshold=0, left_child=None, right_child=None, is_leaf=True) 
 
            #Calculate the weight w of the leaf node
            node.w = (-1*np.sum(g))/(np.sum(h)+self.lamda)
            if(DEBUG_FLAG2):
                print("Leaf Node at min sample")
                #print("g=",g)
                #print("h=",h)
                print("w=",node.w)
            return node            
        
        
        #Find the decision rule for train,g,h
        p,t,gain = self.find_best_decision_rule(train,g,h)
        if(DEBUG_FLAG2):
            print("p="+str(p)+", t="+str(t)+", gain="+str(gain))
        
        
        #Check the other stopping criteria
        if(gain<=0):
            #Dont split, thsi should be a leaf node
            node = TreeNode(split_feature = 0, split_threshold=0, left_child=None, right_child=None, is_leaf=True)

            #Calculate the weight w of the leaf node
            node.w = (-1*np.sum(g))/(np.sum(h)+self.lamda)
            if(DEBUG_FLAG2):
                print("Leaf Node at min gain")
                #print("g=",g)
                #print("h=",h)
                print("w=",node.w)
            return node
        
        X1,g1,h1,X2,g2,h2 = self.split(train,g,h,p,t)
        
        node = TreeNode(split_feature=p, split_threshold=t, left_child=None, right_child=None, is_leaf=False)
        node.left = self.construct_tree(X1,g1,h1,depth+1)
        node.right = self.construct_tree(X2,g2,h2,depth+1)
        
        return node
    
    def split(self,X,g,h,p,t):
        
        #Sort Xgh according to pth column
        Xgh = np.append(X,np.array([g]).T, axis=1)
        Xgh = np.append(Xgh,np.array([h]).T, axis=1)
        Xgh = Xgh[np.argsort(Xgh[:,p])]
    
        X_sort = Xgh[:,0:X.shape[1]]
        g_sort = Xgh[:,X.shape[1]]
        h_sort = Xgh[:,Xgh.shape[1]-1]
        
        #Find the index to split according to threshold t
        Xp = X_sort[:,p]
        for i in range(len(Xp)):
            if(Xp[i] < t):
                i+=1
            else:
                break
            
        X1 = X_sort[0:i,:]
        g1 = g_sort[0:i]
        h1 = h_sort[0:i]
        
        X2 = X_sort[i:X.shape[0],:]
        g2 = g_sort[i:len(g)]
        h2 = h_sort[i:len(h)]
        
        #X1 = X(pth column) < t
        return X1,g1,h1,X2,g2,h2

    def find_best_decision_rule(self, train, g, h):
        '''
        Return the best decision rule [feature, treshold], i.e., $(p_j, \tau_j)$ on a node j, 
        train is the training data assigned to node j
        g and h are the corresponding 1st and 2nd derivatives for each data point in train
        g and h should be vectors of the same length as the number of data points in train
        
        for each feature, we find the best threshold by find_threshold(),
        a [threshold, best_gain] list is returned for each feature.
        Then we select the feature with the largest best_gain,
        and return the best decision rule [feature, treshold] together with its gain.
        '''
        #TODO
        best_gain = float('-inf')
        best_feature = 0
        best_threshold = 0
        
        #indices representing all feature indices
        p_choices = list(range(0,train.shape[1]))
        
        if(self.rf==0): #GBDT
            p_choices = p_choices
        elif(self.rf > 1):
            print("Wrong value for rf.")
        elif(self.rf<1): #Random  Forest
            p_choices = random.sample(p_choices, round(rf*m))
            
            
        for p in p_choices:
            threshold,gain = self.find_threshold(g,h,train,p)
            if (gain > best_gain):
                best_gain = gain
                best_threshold = threshold
                best_feature = p
                
        
        feature = best_feature
        threshold = best_threshold
        gain = best_gain
        
        return feature, threshold, gain
    
    def find_threshold(self, g, h, train,p):
        '''
        Given a particular feature $p_j$,
        return the best split threshold $\tau_j$ together with the gain that is achieved.
        '''
        #TODO
        
        #print("p=",p)
        
        #Sort train,g,h according to 'p'
        Xgh = np.append(train,np.array([g]).T, axis=1)
        Xgh = np.append(Xgh,np.array([h]).T, axis=1)
        Xgh = Xgh[np.argsort(Xgh[:,p])]
        train_sort = Xgh[:,0:train.shape[1]]
        g_sort = Xgh[:,train.shape[1]]
        h_sort = Xgh[:,Xgh.shape[1]-1]
        #print("g_sort=",g_sort)
        #print("h_sort=",h_sort)
        
        #Come up with the list of thresholds
        t_list = train_sort[:,p]
        ts_list = np.roll(t_list, 1)
        ts_list[0]= 0
        t_list = 0.5*(t_list + ts_list)
        t_list = t_list[1:len(t_list)] #The first entry is redundant actually.
        #print("t_list=",t_list)
         
        #Loop through the thresolds to find best gain
        gain_best = float('-inf')
        threshold_best = 0
        for i in range(len(t_list)):
            #print("g_sort[0:i+1]",g_sort[0:i+1])
            G_L = np.sum(g_sort[0:i+1])
            G_R = np.sum(g_sort[i+1:len(g_sort)])
            H_L = np.sum(h_sort[0:i+1])
            H_R = np.sum(h_sort[i+1:len(h_sort)])  
            #print("t=",t_list[i])
            #print("G_L=", G_L)
            #print("G_R=", G_R)
            #print("H_L=", H_L)
            #print("H_R=", H_R)
            gain = ((G_L*G_L)/(H_L+self.lamda)) 
            gain += ((G_R*G_R)/(H_R+self.lamda)) 
            gain -= (((G_L+G_R)**2)/(H_L+H_R+self.lamda))
            #print("gain=", gain)
            gain = 0.5*gain - self.gamma

            #gain = ((G_L*G_L)/(H_L+self.lamda)) + ((G_R*G_R)/(H_R+self.lamda)) - (((G_L+G_R)**2)/(H_L+H_R+self.lamda))
            
            
            if(gain>gain_best):
                gain_best = gain
                threshold_best = t_list[i]
                                           
        threshold = threshold_best
        best_gain = gain_best      
        
        return [threshold, best_gain]

In [7]:
# TODO: Evaluation functions (you can use code from previous homeworks)

# RMSE
def root_mean_square_error(pred, y):
    #TODO
    error = y - pred
    rmse = np.sqrt(np.sum(np.dot(error,error))/len(error))
    return rmse

# precision
def accuracy(pred, y):
    #TODO
    correct = 0
    for j in range(len(y)):
        if(pred[j]==y[j]):
            correct +=1
    precision = correct/len(pred) 
    return precision


In [9]:
DEBUG_FLAG1 = False
DEBUG_FLAG2 = False

# TODO: GBDT regression on boston house price dataset

# load data
from sklearn import datasets
boston = datasets.load_boston()
X = boston.data
y = boston.target

# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

for k in range(1):
    max_depth1 = random.randint(2,10)
    min_sample_split1 = random.randint(5,50)
    lamda1 = random.uniform(0.001,10)
    gamma1 = random.uniform(0,1)
    learning_rate1 = random.uniform(0.1,1.0)
    num_trees1 = random.randint(5,40)
    
    
    max_depth1=4
    min_sample_split1=11
    lamda1=2
    gamma1=0.1
    lr1=0.3
    num_trees1=16
    
    myGBDT = GBDT( n_threads = None, loss = 'mse',
                  max_depth = max_depth1, min_sample_split=min_sample_split1, 
                  lamda=lamda1 , gamma=gamma1 ,
                  learning_rate=lr1 , num_trees=num_trees1 )
    
    myGBDT = myGBDT.fit(X_train, y_train)
    train_pred = myGBDT.predict(X_train)
    test_pred = myGBDT.predict(X_test)
    
    train_rmse = root_mean_square_error(train_pred,y_train)
    test_rmse = root_mean_square_error(test_pred,y_test)
    print("train_rmse=", train_rmse)
    print("test_rmse=", test_rmse)
    print("\n")


for k in range(1):
    max_depth1 = random.randint(2,10)
    min_sample_split1 = random.randint(5,50)
    lamda1 = random.uniform(0.001,10)
    gamma1 = random.uniform(0,1)
    rf1 = random.uniform(0.2,0.5)
    num_trees1 = random.randint(5,40)
    
    max_depth1=4
    min_sample_split1=11
    lamda1=2
    gamma1=0.1
    rf1=0.5
    num_trees1=16
    
    myRF = RF( n_threads = None, loss = 'mse',
              max_depth=max_depth1, min_sample_split=min_sample_split1, 
              lamda=lamda1, gamma=gamma1, rf=rf1,
              num_trees=num_trees1)


    myRF = myRF.fit(X_train, y_train)
    train_pred = myRF.predict(X_train)
    test_pred = myRF.predict(X_test)
    train_rmse = root_mean_square_error(train_pred,y_train)
    test_rmse = root_mean_square_error(test_pred,y_test)
    print("train_rmse=", train_rmse)
    print("test_rmse=", test_rmse)
    print("\n")



INSIDE GBDT FIT(, max_depth=4, min_sample_split=11, lamda=2, gamma=0.1, lr=0.3, num_trees=16)
train_rmse= 1.6532897579764643
test_rmse= 3.3151756403522863


INSIDE RF FIT(max_depth=4, min_sample_split=11, lamda=2, gamma=0.1, rf=0.5, num_trees=16)
train_rmse= 4.285572755247948
test_rmse= 5.174724611481524




In [10]:
# TODO: GBDT classification on credit-g dataset

# load data
from sklearn.datasets import fetch_openml
X, y = fetch_openml('credit-g', version=1, return_X_y=True, data_home='credit/')
y = np.array(list(map(lambda x: 1 if x == 'good' else 0, y)))

# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

for k in range(1):
    max_depth1 = random.randint(2,10)
    min_sample_split1 = random.randint(5,50)
    lamda1 = random.uniform(0.001,10)
    gamma1 = random.uniform(0,1)
    learning_rate1 = random.uniform(0.1,1.0)
    num_trees1 = random.randint(5,40)
    
    
    max_depth1=4
    min_sample_split1=10
    lamda1=5.5
    gamma1=0.1
    lr1=0.7
    num_trees1=20
    
    myGBDT = GBDT( n_threads = None, loss = 'log',
                  max_depth = max_depth1, min_sample_split=min_sample_split1, 
                  lamda=lamda1 , gamma=gamma1 ,
                  learning_rate=lr1 , num_trees=num_trees1 )
    
    myGBDT = myGBDT.fit(X_train, y_train)
    train_pred = myGBDT.predict(X_train)
    test_pred = myGBDT.predict(X_test)
    train_acc = accuracy(train_pred,y_train)
    test_acc = accuracy(test_pred,y_test)
    print("train_acc=", train_acc)
    print("test_acc=", test_acc)
    print("\n")


for k in range(1):
    max_depth1 = random.randint(2,10)
    min_sample_split1 = random.randint(5,50)
    lamda1 = random.uniform(0.001,10)
    gamma1 = random.uniform(0,1)
    rf1 = random.uniform(0.2,0.5)
    num_trees1 = random.randint(5,40)
    
    max_depth1=4
    min_sample_split1=10
    lamda1=5.5
    gamma1=0.1
    rf1=0.5
    num_trees1=20
    
    myRF = RF( n_threads = None, loss = 'log',
              max_depth=max_depth1, min_sample_split=min_sample_split1, 
              lamda=lamda1, gamma=gamma1, rf=rf1,
              num_trees=num_trees1)
    
    myRF = myRF.fit(X_train, y_train)
    train_pred = myRF.predict(X_train)
    test_pred = myRF.predict(X_test)
    train_acc = accuracy(train_pred,y_train)
    test_acc = accuracy(test_pred,y_test)
    print("train_acc=", train_acc)
    print("test_acc=", test_acc)
    print("\n")

INSIDE GBDT FIT(, max_depth=4, min_sample_split=10, lamda=5.5, gamma=0.1, lr=0.7, num_trees=20)
train_acc= 0.8071428571428572
test_acc= 0.7733333333333333


INSIDE RF FIT(max_depth=4, min_sample_split=10, lamda=5.5, gamma=0.1, rf=0.5, num_trees=20)
train_acc= 0.7685714285714286
test_acc= 0.7633333333333333




In [None]:
# TODO: GBDT classification on breast cancer dataset

# load data
from sklearn import datasets
breast_cancer = datasets.load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

max_depth1=2
min_sample_split1=14
lamda1=5.5
gamma1=0.5
lr1=0.4
num_trees1=20
myGBDT = GBDT( n_threads = None, loss = 'log',
              max_depth = max_depth1, min_sample_split = min_sample_split1, 
              lamda = lamda1, gamma = gamma1,
              learning_rate = lr1, num_trees = num_trees1)

myGBDT = myGBDT.fit(X_train, y_train)
train_pred = myGBDT.predict(X_train)
test_pred = myGBDT.predict(X_test)
train_acc = accuracy(train_pred,y_train)
test_acc = accuracy(test_pred,y_test)
print("train_acc=", train_acc)
print("test_acc=", test_acc)
print("\n")


max_depth1=4
min_sample_split1=47 
lamda1=5.43
gamma1=0.9
rf1=0.4
num_trees1=35
myRF = RF( n_threads = None, loss = 'log',
              max_depth = max_depth1, min_sample_split = min_sample_split1, 
              lamda = lamda1, gamma = gamma1, rf=rf1,
              num_trees = num_trees1)

myRF = myRF.fit(X_train, y_train)
train_pred = myRF.predict(X_train)
test_pred = myRF.predict(X_test)
train_acc = accuracy(train_pred,y_train)
test_acc = accuracy(test_pred,y_test)
print("train_acc=", train_acc)
print("test_acc=", test_acc)
print("\n")