In [None]:
import sklearn
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
import numpy as np
import pandas as pd
import mosek
import gc
import cvxpy as cp

#### Helper Functions

def loss_gradient(y, y_hat):
    """Return the negated residual ``-(y - y_hat)``.

    This is the derivative of ``0.5 * (y - y_hat) ** 2`` with respect to
    ``y_hat``. Works element-wise on anything supporting subtraction and
    unary minus (scalars, numpy arrays, pandas Series).
    """
    residual = y - y_hat
    return -residual

def converge_test(sequence, threshold, length):
    """Tell whether the tail of ``sequence`` has stopped moving.

    The successive differences of ``sequence`` are computed; the test is
    only attempted once at least ``length + 1`` differences exist.  It then
    passes when the largest absolute value among the last ``length``
    differences is strictly below ``threshold``.

    Returns False while the sequence is still too short.
    """
    deltas = np.diff(sequence)
    if len(deltas) >= length + 1:
        recent = np.abs(deltas[-length:])
        return max(recent) < threshold
    return False
    
def check_OOB_convergence(OOB_error_list):
    """Decide whether the outer (depth) loop should stop.

    ``OOB_error_list`` holds one improvement value per depth stage.
    Stop (return True) when either
      * the most recent improvement is negative, or
      * at least four stages were recorded and each of the last three
        improvements is below 1e-4.
    Otherwise return False.
    """
    # A negative latest entry stops immediately, regardless of history length.
    if OOB_error_list[-1] < 0:
        return True

    # Too little history to judge a plateau.
    if len(OOB_error_list) < 4:
        return False

    return all(gain < 10**-4 for gain in OOB_error_list[-3:])

    
### Tree Growing Functions

def IncrementalDepthBaggingRegressor_fit(xTrain,yTrain,max_depth, threshold,tail):
    """Grow bagged regression trees depth by depth.

    For every depth from 1 to ``max_depth`` trees are fitted on bootstrap
    resamples of the training data until the training MSE of the running
    average of *that depth's* trees stabilises according to
    ``converge_test(errors, threshold, tail)``.

    Parameters
    ----------
    xTrain : pandas.DataFrame
        Feature matrix; its columns are the features used for fitting.
    yTrain : sequence
        Targets, in the same row order as ``xTrain``.
    max_depth : int
        Largest tree depth to grow (inclusive).
    threshold, tail :
        Passed to ``converge_test`` as its ``threshold`` and ``length``.

    Returns
    -------
    list
        All fitted ``DecisionTreeRegressor`` objects, in fitting order.

    Note: the inner loop has no iteration cap; it ends only when
    ``converge_test`` reports convergence.
    """
    features = xTrain.columns

    # drop=True discards the old index instead of materialising it as a
    # column, so named indexes and MultiIndexes are handled as well.
    train = xTrain.reset_index(drop=True)
    train['yTrain'] = list(yTrain)

    tree_list = []

    for depth in range(1, max_depth + 1):
        stage_preds = []
        stage_errors = []
        converged = False

        while not converged:
            # Bootstrap resample of the full training set.
            boot = train.sample(n=len(train), replace=True)

            tree = DecisionTreeRegressor(max_depth=depth)
            tree.fit(boot[features], boot['yTrain'])
            tree_list.append(tree)

            stage_preds.append(tree.predict(xTrain[features]))
            # Error of the running average over this depth's trees only.
            stage_errors.append(
                sklearn.metrics.mean_squared_error(yTrain, np.mean(stage_preds, axis=0))
            )
            converged = converge_test(stage_errors, threshold, tail)

    return tree_list

def IncrementalDepthBaggingRegressor_predict(xTest,tree_list):
    """Average the predictions of every model in ``tree_list`` on ``xTest``.

    Each element of ``tree_list`` only needs a ``predict`` method. The
    result is the element-wise mean across models.
    """
    per_tree = [model.predict(xTest) for model in tree_list]
    return np.mean(per_tree, axis=0)


def IncrementalDepthBagBoostRegressor_OOB_EarlyStop(xTrain,yTrain, threshold,tail):
    """Grow a bag-boosted tree ensemble with out-of-bag early stopping.

    The ensemble is built in stages of increasing tree depth (1, 2, ...).
    Within a stage, trees are fitted on bootstrap resamples of the current
    targets (the raw targets in the first stage, residuals afterwards) until
    the training MSE of ``previous prediction + mean(stage trees)``
    stabilises according to ``converge_test(errors, threshold, tail)``.
    After each stage the out-of-bag (OOB) improvement in MSE is recorded and
    ``check_OOB_convergence`` decides whether another, deeper stage is grown.

    Parameters
    ----------
    xTrain : pandas.DataFrame
        Feature matrix. It is not modified.
    yTrain : array-like
        Targets in the same row order as ``xTrain``.
    threshold, tail :
        Passed to ``converge_test`` as its ``threshold`` and ``length``.

    Returns
    -------
    list
        Every fitted ``DecisionTreeRegressor`` of every stage, in fitting
        order. ``IncrementalDepthBagBoostRegressor_predict`` recovers the
        stages from each tree's ``max_depth``.

    Notes
    -----
    Neither loop has an iteration cap. One line ``current_err next_err depth``
    is printed per stage.
    """
    features = xTrain.columns
    y_true = np.asarray(yTrain)

    # Working copy whose 'yTrain' column is replaced by residuals each stage.
    train = xTrain.copy()
    train['yTrain'] = list(yTrain)

    pred_train = np.zeros(len(yTrain))
    tree_list = []

    OOB_error_list = []
    OOB_converged = False
    depth = 1
    while not OOB_converged:

        # Ensemble prediction before this stage adds anything.
        if tree_list:
            current_pred = np.asarray(
                IncrementalDepthBagBoostRegressor_predict(xTrain, tree_list), dtype=float
            )
        else:
            current_pred = np.zeros(len(xTrain))

        stage_preds = []   # one full-training-set prediction per stage tree
        stage_errors = []
        oob_masks = []     # one boolean row mask per stage tree
        converged = False

        while not converged:
            boot = train.sample(n=len(train), replace=True)
            # A row is out-of-bag when its label was never drawn. Comparing
            # index labels (not row values) keeps rows with identical values
            # apart, and the mask is positional so any index works.
            oob_masks.append(~train.index.isin(boot.index))

            tree = DecisionTreeRegressor(max_depth=depth)
            tree.fit(boot[features], boot['yTrain'])
            tree_list.append(tree)
            stage_preds.append(tree.predict(xTrain[features]))

            # Candidate training prediction if the stage stopped here.
            candidate = pred_train + np.mean(stage_preds, axis=0)
            stage_errors.append(sklearn.metrics.mean_squared_error(yTrain, candidate))
            converged = converge_test(stage_errors, threshold, tail)

        # Commit only the converged stage; nothing is added and subtracted
        # on the way, so no rounding drift accumulates.
        pred_train = candidate

        ### compute OOB
        # Per row, average the stage trees for which that row was OOB. The
        # needed predictions are already in stage_preds.
        oob_masks = np.asarray(oob_masks)
        stage_preds = np.asarray(stage_preds)
        oob_counts = oob_masks.sum(axis=0)
        has_oob = oob_counts > 0   # rows with no OOB tree are left out
        oob_sum = np.where(oob_masks, stage_preds, 0.0).sum(axis=0)
        oob_mean = oob_sum[has_oob] / oob_counts[has_oob]

        y_oob = y_true[has_oob]
        current_oob = current_pred[has_oob]
        next_pred = current_oob + oob_mean

        current_err = sklearn.metrics.mean_squared_error(y_oob, current_oob)
        next_err = sklearn.metrics.mean_squared_error(y_oob, next_pred)
        print(current_err,next_err, depth)
        OOB_error_list.append(current_err-next_err)

        # The next stage fits the residuals of the committed prediction.
        train['yTrain'] = -loss_gradient(y_true, pred_train)

        OOB_converged = check_OOB_convergence(OOB_error_list)
        depth = depth + 1

    return tree_list

def IncrementalDepthBagBoostRegressor_predict(xTest,tree_list):
    """Predict with a bag-boosted ensemble.

    Trees are grouped by their ``max_depth`` attribute. Within a group the
    predictions are averaged (bagging); the group averages are then summed
    (boosting across depths).

    Parameters
    ----------
    xTest :
        Samples, passed unchanged to each tree's ``predict``.
    tree_list : sequence
        Objects exposing ``max_depth`` and ``predict``.

    Returns
    -------
    numpy.ndarray
        One prediction per sample, or ``0.0`` when ``tree_list`` is empty.
    """
    # Plain dict grouping keeps the arrays in numpy instead of storing
    # ndarrays inside DataFrame cells. Trees sharing max_depth=None form
    # one group of their own.
    preds_by_depth = {}
    for tree in tree_list:
        preds_by_depth.setdefault(tree.max_depth, []).append(tree.predict(xTest))

    if not preds_by_depth:
        return 0.0

    stage_means = [np.mean(group, axis=0) for group in preds_by_depth.values()]
    return np.sum(stage_means, axis=0)

### LASSO Functions

def OptimizationStepRegression(xTrain,yTrain,tree_list,lambd, optimization_type = 'penalized'):
    """Solve for non-negative tree weights with a feature-usage L1 term.

    Builds a prediction matrix (one column per tree) and a 0/1 indicator
    matrix marking which features each tree uses (positive
    ``feature_importances_``), then solves with cvxpy / MOSEK:

    * ``'penalized'``:   minimise ``mean squared error + lambd * ||ind @ w||_1``
    * ``'constrained'``: minimise ``sum of squares`` subject to
      ``||ind @ w||_1 <= lambd``

    Parameters
    ----------
    xTrain, yTrain :
        Training features and targets.
    tree_list : sequence
        Fitted trees exposing ``predict`` and ``feature_importances_``.
    lambd : float
        Penalty weight or constraint bound, depending on the formulation.
    optimization_type : {'penalized', 'constrained'}

    Returns
    -------
    numpy.ndarray
        One weight per tree; entries with magnitude below 1e-3 are set to 0.

    Raises
    ------
    ValueError
        If ``optimization_type`` is not one of the two supported values.
    RuntimeError
        If the solver returns no solution.
    """
    # Validate first so a typo fails clearly before any model is evaluated.
    if optimization_type not in ('penalized', 'constrained'):
        raise ValueError(
            "optimization_type must be 'penalized' or 'constrained', got {!r}".format(optimization_type)
        )

    # After transposing: rows = samples (pred) / features (ind), columns = trees.
    pred = np.transpose([tree.predict(xTrain) for tree in tree_list])
    ind = np.transpose(
        [(np.asarray(tree.feature_importances_) > 0).astype(int) for tree in tree_list]
    )

    w = cp.Variable(len(tree_list),nonneg=True)
    loss = cp.sum_squares(cp.matmul(pred,w)-yTrain)
    feature_usage = cp.norm(cp.matmul(ind,w),1)

    if optimization_type == 'penalized':
        objective = (1/len(yTrain))*loss + lambd*feature_usage
        constraints = []
    else:
        objective = loss
        constraints = [feature_usage <= lambd]

    prob = cp.Problem(cp.Minimize(objective),constraints)
    prob.solve(solver = cp.MOSEK,mosek_params = {mosek.dparam.optimizer_max_time: 10000.0} )

    if w.value is None:
        raise RuntimeError(
            "solver returned no solution (status: {})".format(prob.status)
        )

    weights = np.asarray(w.value)
    # Treat numerically tiny weights as exact zeros.
    weights[np.abs(weights) < 10**-3] = 0
    return weights

def ControlBurnRegressor_select_features(xTest, tree_list,weights):
    """Return the weight-scaled sum of the trees' feature importances.

    For every position ``k`` in ``weights`` the vector
    ``tree_list[k].feature_importances_`` is multiplied by ``weights[k]``;
    the scaled vectors are summed feature-wise. ``xTest`` is accepted but
    not used.
    """
    weighted = [
        weights[k] * tree_list[k].feature_importances_
        for k in range(len(weights))
    ]
    return np.sum(weighted, axis=0)

def ControlBurnRegressor_predict(xTest, tree_list, weights):
    """Predict as the weighted sum of the individual tree predictions.

    Tree ``k`` contributes ``weights[k] * tree_list[k].predict(xTest)``;
    the contributions are added sample-wise.
    """
    contributions = [
        weights[k] * tree_list[k].predict(xTest)
        for k in range(len(tree_list))
    ]
    return np.sum(contributions, axis=0)

# Experiment Functions

In [None]:
from sklearn.ensemble import RandomForestRegressor

def load_openml(dataset,y_label = ''):
    """Download an OpenML dataset and return shuffled, fully numeric (X, y).

    Parameters
    ----------
    dataset : str
        Dataset name passed to ``sklearn.datasets.fetch_openml``.
    y_label : str, optional
        When non-empty, the target returned by OpenML is indexed with this
        label (for datasets whose target has several columns). When empty
        the target is used as returned.

    Returns
    -------
    X : pandas.DataFrame
        Features in shuffled row order. Non-numeric columns are replaced by
        integer category codes, with missing values mapped to one past the
        largest code; remaining missing values are filled with the column
        median.
    y : pandas.Series
        Target in the same shuffled row order.
    """
    bunch = sklearn.datasets.fetch_openml(dataset,as_frame = True)
    target = bunch.target
    data = pd.DataFrame(bunch.data,columns = bunch.feature_names)
    data['y'] = target if len(y_label) == 0 else target[y_label]

    # Shuffle rows once so later un-shuffled K-fold splits are random.
    data = data.sample(frac = 1)
    y = data['y']
    X = data.drop('y',axis = 1)

    numeric_cols = set(X.select_dtypes(include=np.number).columns)
    for col in set(X.columns) - numeric_cols:
        # cat.codes marks missing values with -1 (never NaN), so they have to
        # be remapped explicitly. Widen to int64 first so max + 1 cannot
        # overflow the small integer type cat.codes may use.
        codes = X[col].astype('category').cat.codes.astype('int64')
        X[col] = codes.where(codes >= 0, codes.max() + 1)

    X = X.fillna(X.median())
    return X,y

def baseline_regressor(xTrain,yTrain,xTest,yTest, num_features, num_trees):
    """Random-forest baseline restricted to its own top-ranked features.

    A ``RandomForestRegressor`` with ``num_trees`` estimators is fitted on
    all features, the ``num_features`` features with the highest importance
    are kept, the same estimator is refitted on those columns only, and the
    test-set mean squared error of that refit is returned.
    """
    forest = RandomForestRegressor(n_estimators = num_trees)
    forest.fit(xTrain,yTrain)

    # Rank features by importance, highest first.
    ranking = pd.DataFrame(
        np.column_stack((xTrain.columns,forest.feature_importances_)),
        columns = ['features','scores'],
    ).sort_values('scores',ascending = False)
    selected = ranking['features'].head(num_features).values

    # Refit the same estimator object on the reduced feature set.
    forest.fit(xTrain[selected],yTrain)
    test_pred = forest.predict(xTest[selected])
    return sklearn.metrics.mean_squared_error(yTest,test_pred)

def evaluate_regression_experiment(xTrain,yTrain,xTest,yTest,lambd,tree_list,form):
    """Evaluate one value of ``lambd`` for a given tree ensemble.

    Steps: solve for tree weights (``OptimizationStepRegression`` with
    formulation ``form``), score the weighted ensemble on the test set,
    derive the selected features (positive weighted importance), refit a
    100-tree random forest on those features ("polish"), and score a
    random-forest baseline given the same number of features.

    Returns
    -------
    list
        ``[mse_no_polish, mse_polish, mse_baseline, n_selected]``, or the
        placeholder ``[1, 1, 1, 0]`` when no feature is selected.
    """
    weights = OptimizationStepRegression(xTrain, yTrain,tree_list, lambd,form)

    ensemble_pred = ControlBurnRegressor_predict(xTest, tree_list,weights)
    acc_no_polish = sklearn.metrics.mean_squared_error(yTest,ensemble_pred)

    imp1 = ControlBurnRegressor_select_features(xTest, tree_list,weights)
    selected_mask = imp1 > 0

    # Nothing selected: return the placeholder row.
    if sum(selected_mask) == 0:
        return [1,1,1,0]

    selected_cols = xTrain.columns[selected_mask]
    polished = RandomForestRegressor(n_estimators = 100).fit(xTrain[selected_cols],yTrain)
    acc_polish = sklearn.metrics.mean_squared_error(yTest,polished.predict(xTest[selected_cols]))

    num_features = np.sum(imp1 != 0)

    acc_base = baseline_regressor(xTrain,yTrain,xTest,yTest, np.sum(selected_mask), len(tree_list))

    return [acc_no_polish,acc_polish,acc_base,num_features]


def bisection_lambd(xTrain,yTrain,xTest,yTest,tree_list,form,lambd,count_limit,feature_to_find):
    """Search over ``lambd`` for solutions with 0..``feature_to_find`` features.

    Starting from target count 0, each iteration evaluates the current
    ``lambd`` and then does exactly one of:

    * selected count equals the target   -> move to the next target;
    * attempt counter exceeds the limit  -> give up on this target, move on
      and reset the counter;
    * too few features                   -> halve ``lambd``;
    * too many features                  -> multiply ``lambd`` by 1.5.

    Every evaluation (hit or miss) is appended to the returned list and one
    line ``target selected lambd`` is printed per evaluation.

    NOTE(review): the attempt counter is only reset when a target is
    abandoned, not when it is hit - confirm this is intended.
    """
    attempts = 0
    target = 0
    history = []

    while target <= feature_to_find:
        outcome = evaluate_regression_experiment(xTrain,yTrain,xTest,yTest,lambd,tree_list,form)
        selected = outcome[3]
        print(target,selected,lambd)

        if selected == target:
            target += 1
        elif attempts > count_limit:
            target += 1
            attempts = 0
        elif selected < target:
            lambd = lambd/2
        elif selected > target:
            lambd += lambd/2

        attempts += 1
        history.append(outcome)

    return history


In [None]:
def ControlBurnRegressorExperiment(dataset,y_label,form,lambd_start,count_limit,features_to_find):
    """Run the 5-fold ControlBurn regression experiment on an OpenML dataset.

    For every fold the features and targets of the train and test splits are
    standardised (each split separately), then both ensemble builders
    (bag-boost and plain incremental-depth bagging) are grown and swept with
    ``bisection_lambd``.

    Parameters
    ----------
    dataset, y_label :
        Forwarded to ``load_openml``.
    form : str
        Optimisation formulation forwarded to ``bisection_lambd``.
    lambd_start, count_limit, features_to_find :
        Forwarded to ``bisection_lambd``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Stacked per-fold results for the bag-boost and the bagging ensemble,
        with columns ``no_polish``, ``polish``, ``base``, ``nonzero``.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having been executed.
    from sklearn import preprocessing
    from sklearn.model_selection import KFold

    result_columns = ['no_polish','polish','base','nonzero']

    X,y = load_openml(dataset,y_label = y_label)
    kf = KFold(n_splits=5)

    bagboost_folds = []
    bag_folds = []

    for train_index, test_index in kf.split(X):
        features = X.columns
        xTrain = pd.DataFrame(preprocessing.scale(X.iloc[train_index]),columns = features)
        xTest = pd.DataFrame(preprocessing.scale(X.iloc[test_index]),columns = features)
        yTrain = pd.Series(preprocessing.scale(y.iloc[train_index]))
        yTest = pd.Series(preprocessing.scale(y.iloc[test_index]))

        tree_list = IncrementalDepthBagBoostRegressor_OOB_EarlyStop(xTrain,yTrain,10**-3,5)
        res = bisection_lambd(xTrain,yTrain,xTest,yTest,tree_list,form,lambd_start,count_limit,features_to_find)
        bagboost_folds.append(pd.DataFrame(res, columns = result_columns))

        tree_list_bag = IncrementalDepthBaggingRegressor_fit(xTrain,yTrain,25,10**-3,5)
        # The bagging sweep must be evaluated on the bagging trees.
        res_bag = bisection_lambd(xTrain,yTrain,xTest,yTest,tree_list_bag,form,lambd_start,count_limit,features_to_find)
        bag_folds.append(pd.DataFrame(res_bag, columns = result_columns))

    # pd.concat stacks the fold frames and keeps each fold's row labels.
    final_result_bagboost = pd.concat(bagboost_folds)
    final_result_bag = pd.concat(bag_folds)

    return final_result_bagboost, final_result_bag


In [None]:
%%time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import preprocessing
import pandas as pd
# Load the dataset and walk through the 5 folds; after the loop the four
# split variables hold the last fold.
dataset = 'boston'
X, y = load_openml(dataset, y_label='')
kf = KFold(n_splits=5)
kf.get_n_splits(X)
for train_index, test_index in kf.split(X):
    xTrain = X.iloc[train_index]
    xTest = X.iloc[test_index]
    yTrain = y.iloc[train_index]
    yTest = y.iloc[test_index]

# Experiments

In [None]:
%%time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import preprocessing
import pandas as pd

# Sequential 5-fold experiment: grow both ensembles per fold, sweep lambd
# for 0..10 selected features, and stack the per-fold result tables.
dataset = 'boston'
X,y = load_openml(dataset,y_label = '')


kf = KFold(n_splits=5)
kf.get_n_splits(X)

result_columns = ['no_polish','polish','base','nonzero']
bagboost_folds = []
bag_folds = []

for train_index, test_index in kf.split(X):
    xTrain, xTest = X.iloc[train_index], X.iloc[test_index]
    yTrain, yTest = y.iloc[train_index], y.iloc[test_index]

    # Standardise features and targets; each split is scaled separately.
    features = xTrain.columns
    xTrain = preprocessing.scale(xTrain)
    xTrain = pd.DataFrame(xTrain,columns = features)
    xTest = preprocessing.scale(xTest)
    xTest = pd.DataFrame(xTest,columns = features)
    yTest = pd.Series(preprocessing.scale(yTest))
    yTrain = pd.Series(preprocessing.scale(yTrain))


    tree_list = IncrementalDepthBagBoostRegressor_OOB_EarlyStop(xTrain,yTrain,10**-3,5)
    res = bisection_lambd(xTrain,yTrain,xTest,yTest,tree_list,'penalized',10,8,10)
    res = pd.DataFrame(res, columns = result_columns)
    bagboost_folds.append(res)

    tree_list_bag = IncrementalDepthBaggingRegressor_fit(xTrain,yTrain,25,10**-3,5)
    res_bag = bisection_lambd(xTrain,yTrain,xTest,yTest,tree_list_bag,'penalized',10,8,10)
    res_bag = pd.DataFrame(res_bag, columns = result_columns)
    bag_folds.append(res_bag)

# pd.concat stacks the fold tables and keeps each fold's row labels.
final_result_bagboost = pd.concat(bagboost_folds)
final_result_bag = pd.concat(bag_folds)


In [None]:
# Mean and standard deviation of every metric per number of selected
# features; the rows for zero selected features are dropped.
final_result_agg = (
    final_result_bagboost
    .groupby('nonzero')
    .agg(['mean', 'std'])
    .reset_index()
)
final_result_agg = final_result_agg[final_result_agg['nonzero'].ne(0)]
final_result_agg

In [None]:
# Same aggregation for the plain bagging ensemble: per-feature-count mean
# and standard deviation, without the zero-feature rows.
final_result_bag_agg = (
    final_result_bag
    .groupby('nonzero')
    .agg(['mean', 'std'])
    .reset_index()
)
final_result_bag_agg = final_result_bag_agg[final_result_bag_agg['nonzero'].ne(0)]
final_result_bag_agg

In [None]:
import matplotlib.pyplot as plt
# Bag-boost results: mean metric vs. number of selected features, with the
# per-count standard deviation drawn as error bars.

# Random-forest baseline (blue).
plt.plot(final_result_agg['nonzero'],final_result_agg['base']['mean'],label = 'baseline', color = 'blue')
plt.scatter(final_result_agg['nonzero'],final_result_agg['base']['mean'],color = 'blue')
plt.errorbar(final_result_agg['nonzero'],final_result_agg['base']['mean'],yerr = final_result_agg['base']['std'],color = 'blue')


# Polished model: random forest refitted on the selected features (green).
plt.plot(final_result_agg['nonzero'],final_result_agg['polish']['mean'],label = 'polished',color = 'green')
plt.scatter(final_result_agg['nonzero'],final_result_agg['polish']['mean'], color = 'green')
plt.errorbar(final_result_agg['nonzero'],final_result_agg['polish']['mean'],yerr = final_result_agg['polish']['std'], color = 'green')

# Disabled: un-polished weighted ensemble (orange) and the legend. The
# labels passed above are not shown while plt.legend() stays commented out.
#plt.plot(final_result_agg['nonzero'],final_result_agg['no_polish']['mean'],label = 'nonpolished', color = 'orange')
#plt.scatter(final_result_agg['nonzero'],final_result_agg['no_polish']['mean'], color = 'orange')
#plt.errorbar(final_result_agg['nonzero'],final_result_agg['no_polish']['mean'],yerr = final_result_agg['no_polish']['std'], color = 'orange')
#plt.legend()


In [None]:
import matplotlib.pyplot as plt
# Bagging results: mean metric vs. number of selected features, with the
# per-count standard deviation drawn as error bars.

# Random-forest baseline (blue).
plt.plot(final_result_bag_agg['nonzero'],final_result_bag_agg['base']['mean'],label = 'baseline', color = 'blue')
plt.scatter(final_result_bag_agg['nonzero'],final_result_bag_agg['base']['mean'],color = 'blue')
plt.errorbar(final_result_bag_agg['nonzero'],final_result_bag_agg['base']['mean'],yerr = final_result_bag_agg['base']['std'],color = 'blue')


# Polished model: random forest refitted on the selected features (green).
plt.plot(final_result_bag_agg['nonzero'],final_result_bag_agg['polish']['mean'],label = 'polished',color = 'green')
plt.scatter(final_result_bag_agg['nonzero'],final_result_bag_agg['polish']['mean'], color = 'green')
plt.errorbar(final_result_bag_agg['nonzero'],final_result_bag_agg['polish']['mean'],yerr = final_result_bag_agg['polish']['std'], color = 'green')

# Disabled: un-polished series and legend.
# NOTE(review): the disabled lines read final_result_agg (the bag-boost
# table), not final_result_bag_agg - switch the frame before re-enabling.
#plt.plot(final_result_agg['nonzero'],final_result_agg['no_polish']['mean'],label = 'nonpolished', color = 'orange')
#plt.scatter(final_result_agg['nonzero'],final_result_agg['no_polish']['mean'], color = 'orange')
#plt.errorbar(final_result_agg['nonzero'],final_result_agg['no_polish']['mean'],yerr = final_result_agg['no_polish']['std'], color = 'orange')
#plt.legend()



# Parallel Experiment

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import preprocessing
import pandas as pd

# Load the dataset once and pre-compute the five standardised folds so they
# can be handed to the parallel workers as plain [xTrain, xTest, yTrain,
# yTest] lists.
dataset = 'topo_2_1'
X, y = load_openml(dataset, y_label='')
print(len(X), len(X.columns))

kf = KFold(n_splits=5)
kf.get_n_splits(X)

final_result_bagboost = pd.DataFrame(None)
final_result_bag = pd.DataFrame(None)
folds_data = []

for train_index, test_index in kf.split(X):
    xTrain = X.iloc[train_index]
    features = xTrain.columns

    # Features and targets of each split are standardised separately.
    xTrain = pd.DataFrame(preprocessing.scale(xTrain), columns=features)
    xTest = pd.DataFrame(preprocessing.scale(X.iloc[test_index]), columns=features)
    yTrain = pd.Series(preprocessing.scale(y.iloc[train_index]))
    yTest = pd.Series(preprocessing.scale(y.iloc[test_index]))

    folds_data.append([xTrain, xTest, yTrain, yTest])

In [None]:
%%time
import ray
import time

# Initialise Ray before any remote task is declared or submitted.
ray.init()

@ray.remote
def parallel_wrapper(arg):
    """Run both ensemble experiments on one pre-computed fold.

    Parameters
    ----------
    arg : sequence
        ``[xTrain, xTest, yTrain, yTest]`` for a single fold.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Lambda-sweep results for the bag-boost and the bagging ensemble,
        each with columns ``no_polish``, ``polish``, ``base``, ``nonzero``.
    """
    xTrain, xTest, yTrain, yTest = arg[:4]

    # Derive the sweep range from the fold itself rather than from a global
    # on the driver, so the task depends only on its argument.
    features_to_find = min(len(xTrain.columns),10)
    result_columns = ['no_polish','polish','base','nonzero']

    tree_list = IncrementalDepthBagBoostRegressor_OOB_EarlyStop(xTrain,yTrain,10**-3,5)
    res = bisection_lambd(xTrain,yTrain,xTest,yTest,tree_list,'penalized',10,8,features_to_find)
    res = pd.DataFrame(res, columns = result_columns)

    tree_list_bag = IncrementalDepthBaggingRegressor_fit(xTrain,yTrain,25,10**-3,5)
    res_bag = bisection_lambd(xTrain,yTrain,xTest,yTest,tree_list_bag,'penalized',10,8,features_to_find)
    res_bag = pd.DataFrame(res_bag, columns = result_columns)

    return res,res_bag

# Submit one remote task per fold, wait for all of them, then shut Ray down.
result_ids = [parallel_wrapper.remote(fold) for fold in folds_data]

results = ray.get(result_ids)
ray.shutdown()

# Each task returns (bag-boost table, bagging table); stack them per method.
# pd.concat keeps each fold's row labels.
final_result_bagboost = pd.concat([fold_result[0] for fold_result in results])
final_result_bag = pd.concat([fold_result[1] for fold_result in results])

# Mean / std of every metric per number of selected features, without the
# zero-feature rows.
final_result_agg = final_result_bagboost.groupby('nonzero').agg(['mean','std']).reset_index()
final_result_agg = final_result_agg[final_result_agg['nonzero'] != 0]
final_result_bag_agg = final_result_bag.groupby('nonzero').agg(['mean','std']).reset_index()
final_result_bag_agg = final_result_bag_agg[final_result_bag_agg['nonzero'] != 0]

In [None]:
import matplotlib.pyplot as plt
# Bag-boost results of the parallel run: one line + markers + error bars
# (std) per metric, drawn in the order baseline, polished, non-polished.
_x = final_result_agg['nonzero']
for _column, _label, _color in (
    ('base', 'baseline', 'blue'),
    ('polish', 'polished', 'green'),
    ('no_polish', 'nonpolished', 'orange'),
):
    _stats = final_result_agg[_column]
    plt.plot(_x, _stats['mean'], label=_label, color=_color)
    plt.scatter(_x, _stats['mean'], color=_color)
    plt.errorbar(_x, _stats['mean'], yerr=_stats['std'], color=_color)

plt.legend()
plt.title(dataset+ ' bagboost')
plt.ylabel('MSE')
plt.xlabel('Features Selected')


In [None]:
import matplotlib.pyplot as plt
# Bagging results of the parallel run: one line + markers + error bars
# (std) per metric, drawn in the order baseline, polished, non-polished.
_x = final_result_bag_agg['nonzero']
for _column, _label, _color in (
    ('base', 'baseline', 'navy'),
    ('polish', 'polished', 'olive'),
    ('no_polish', 'nonpolished', 'salmon'),
):
    _stats = final_result_bag_agg[_column]
    plt.plot(_x, _stats['mean'], label=_label, color=_color)
    plt.scatter(_x, _stats['mean'], color=_color)
    plt.errorbar(_x, _stats['mean'], yerr=_stats['std'], color=_color)

plt.legend()
plt.title(dataset + ' bag')
plt.ylabel('MSE')
plt.xlabel('Features Selected')

In [None]:
# Write both aggregated tables to CSV, one file per ensemble type, named
# after the dataset.
# NOTE(review): '..Results/' is a directory literally named "..Results"
# relative to the working directory; if the parent-level folder was meant,
# the path should be '../Results/' - confirm against the repository layout.
final_result_agg.to_csv('..Results/'+dataset+'_bagboost.csv')
final_result_bag_agg.to_csv('..Results/'+dataset+'_bag.csv')