# Ensemble Learning through Pasting 

**Pasting == bagging (bootstrap aggregating) where each learner's training subset is sampled *without* replacement**

$\implies$ hyperparameters = {M = number of learners, alpha = relative training-set size per learner}


In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

# add project root folder to path to allow import local modules
import os
import sys
# The parent directory of the current working directory is treated as the
# project root; it is appended only once so re-running the cell does not
# keep growing sys.path.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
# import local modules
# NOTE(review): star imports hide where names such as MaxCallStochasticModel
# come from; prefer explicit imports once the owning modules are confirmed.
from stochastic_models import *
from visualisations import *
from basic_estimator_model import *

## Parameter Configuration

In [2]:
# Seed every random number generator this notebook imports so that
# "Restart & Run All" reproduces the same draws.
SEED = 2020
np.random.seed(SEED)  # NumPy's legacy global generator
random.seed(SEED)     # stdlib generator: imported at the top but previously left unseeded

In [3]:
# Problem sizes used throughout the notebook.
N_train, N_test = 5000, 50000  # number of training / test samples
# d is passed to the stochastic model below; T is not referenced in the visible cells.
# NOTE(review): presumably d = dimension and T = number of time steps — confirm.
d, T = 1, 2

In [4]:
# Search interval for the noise level, scaled with the training-set size.
lambda_range = tuple(N_train * factor for factor in (1e-9, 1e-3))
# Search interval for alpha.
alpha_range = (8.3*1e-5, 0.83)
# Each alpha bound is mapped to 1/sqrt(2*alpha); the mapping is decreasing, so
# the result is sorted to get (lower, upper) length-scale bounds.
length_scale = 1 / np.sqrt([2 * bound for bound in alpha_range])
length_scale.sort()

## Create Kernel & prepare datasets

In [5]:
#kernel
# RBF term: starts at the midpoint of the length-scale interval and is
# constrained to that interval.
rbf_term = RBF(
    length_scale=(length_scale[0] + length_scale[1])/2,
    length_scale_bounds=length_scale,
)
# Noise term: starts at the midpoint of the lambda interval and is
# constrained to that interval.
noise_term = WhiteKernel(
    noise_level=(lambda_range[0] + lambda_range[1])/2,
    noise_level_bounds=lambda_range,
)
kernel = rbf_term + noise_term


In [6]:
#generate drivers
# Training samples: a MaxCallStochasticModel of size N_train.
# NOTE(review): the meaning of `d` and of the [1/12, 11/12] argument is defined
# in the project-local model class — confirm there.
s_train = MaxCallStochasticModel(N_train,d,[1/12,11/12])
s_train.generate_samples()
# Separate, larger test set built with the same settings.
# The training samples are generated before the test model is created; keep this
# order, as the draws presumably depend on the RNG state seeded above — TODO confirm.
s_test = MaxCallStochasticModel(N_test, d, [1/12,11/12])
s_test.generate_samples()

In [7]:
#prepare datasets & values
# Pull targets (y), inputs (X) and S out of the two generated models.
y_train, X_train, S_train = s_train.y, s_train.X, s_train.S
y_test, X_test, S_test = s_test.y, s_test.X, s_test.S

# The test targets double as V_T.
V_T = y_test

# Mean of the model's true V at 0 over the test set.
V_0 = s_test.generate_true_V(0).mean()
V_0 # Average expected PROFIT!

0.0793534441124991

In [8]:
# Inspect the layout of the raw test inputs (the recorded output is (50000, 1, 2)).
X_test.shape

(50000, 1, 2)

## Create Custom Models
(to have more control over splitting, etc.)

In [9]:
from aggregating.models import SimpleBagger
# SimplePaster is instantiated below, so it has to be imported as well. Before,
# only SimpleBagger was imported and this cell raised NameError on a fresh
# kernel (unless one of the star imports happened to provide the name).
# NOTE(review): assumes SimplePaster lives in aggregating.models next to
# SimpleBagger — confirm. It is imported on its own line so that SimpleBagger
# stays available even if this assumption is wrong.
from aggregating.models import SimplePaster

# Smoke test: build a paster with M = 11 learners, alpha = 0.5 and no predictor,
# then display its hyper-parameters.
s = SimplePaster(11,0.5,None)
s.get_params()

{'M': 11, 'predictor': None, 'train_size_alpha': 0.5}

## GridSearch

NB: scikit-learn's `GridSearchCV` cannot be used here, because it scores each candidate by cross-validation on the training data, whereas we want to evaluate on a separate, larger test set.

In [None]:
import copy
from multiprocessing import  cpu_count,current_process
from multiprocessing.pool import ThreadPool
import functools
import threading

from aggregating.gridsearch import create_logger, evaluate_model
from aggregating.utils import flatten_X

# GRIDSEARCH PARAMS
# Template estimator. Its first two arguments (0, 0) are placeholders;
# presumably evaluate_model applies `hyperparams` to it — TODO confirm.
# NOTE(review): this notebook is about pasting, yet the searched model is
# SimpleBagger — confirm which ensemble is intended.
model = SimpleBagger(0,0,GaussianProcessRegressor(kernel,copy_X_train=False))
trials = 3
M_grid = [1,2,3,4,5,6,7,8]
alpha_grid = [0.5,0.7,0.9,1.0]
results = []


def report_worker_error(exc):
    """Print an exception that escaped a worker task.

    Without an error_callback, apply_async drops such exceptions silently
    because the AsyncResult objects are never inspected.
    """
    print(f"gridsearch task failed: {exc!r}")


### ACTUAL GRIDSEARCH
# The flattened training inputs are identical for every grid point, so they are
# computed once instead of once per task.
flat_X_train = flatten_X(X_train)

pool = ThreadPool(4) #avoid allocation issues
try:
    for m in M_grid:
        for alpha in alpha_grid:
            hyperparams= {'M':m, 'train_size_alpha':alpha}
            pool.apply_async(
                evaluate_model,
                # Each task gets its own copy of the template estimator: the
                # tasks run concurrently in 4 threads, and if evaluate_model
                # configures/fits the object it receives, a shared instance
                # would let grid points overwrite each other's state.
                # NOTE(review): the literal 1 presumably is `d` — confirm
                # against evaluate_model's signature.
                args=(copy.deepcopy(model),hyperparams,flat_X_train,y_train,1, [1/12,11/12],trials,N_test,MaxCallStochasticModel),
                callback = create_logger(hyperparams,results),
                error_callback = report_worker_error,
            )
    pool.close()
    pool.join()
finally:
    # Release the worker threads even if the cell is interrupted mid-search.
    pool.terminate()



 {'M': 1, 'train_size_alpha': 0.5} -> thread id = 30312
fit
(5000, 2)
 {'M': 1, 'train_size_alpha': 0.7} -> thread id = 19248
fit
(5000, 2)
 {'M': 1, 'train_size_alpha': 0.9} -> thread id = 2532
fit
(5000, 2)
 {'M': 1, 'train_size_alpha': 1.0} -> thread id = 4336
fit
(5000, 2)
predict
(50000, 2)
predict
(50000, 2)
predict
(50000, 2)
{'M': 1, 'train_size_alpha': 0.5} , 0 -> 0.24638305622193926
predict
(50000, 2)
predict
(50000, 2)
{'M': 1, 'train_size_alpha': 0.9} , 0 -> 0.22620107782688725
predict
(50000, 2)
{'M': 1, 'train_size_alpha': 0.5} , 1 -> 0.19748239864428063
predict
(50000, 2)
{'M': 1, 'train_size_alpha': 1.0} , 0 -> 0.25083322357295385
predict
(50000, 2)
{'M': 1, 'train_size_alpha': 0.7} , 0 -> 0.21515010491763945
predict
(50000, 2)
{'M': 1, 'train_size_alpha': 0.5} , 2 -> 0.2083075643798671
{'M': 1, 'train_size_alpha': 0.5} -> [0.24638305622193926, 0.19748239864428063, 0.2083075643798671]
 {'M': 2, 'train_size_alpha': 0.5} -> thread id = 30312logger {'M': 1, 'train_size_alp

predict
(50000, 2)
{'M': 4, 'train_size_alpha': 0.9} , 2 -> 0.25848875050424713
{'M': 4, 'train_size_alpha': 0.9} -> [0.21078455979161972, 0.23300284661523601, 0.25848875050424713]
 {'M': 5, 'train_size_alpha': 1.0} -> thread id = 30312
fit
(5000, 2)
logger {'M': 4, 'train_size_alpha': 0.9}, -> [0.21078455979161972, 0.23300284661523601, 0.25848875050424713]
{'M': 4, 'train_size_alpha': 1.0} , 2 -> 0.24890501266129125
{'M': 4, 'train_size_alpha': 1.0} -> [0.255399109741202, 0.24493980832421527, 0.24890501266129125]
 {'M': 6, 'train_size_alpha': 0.5} -> thread id = 19248logger {'M': 4, 'train_size_alpha': 1.0}, -> [0.255399109741202, 0.24493980832421527, 0.24890501266129125]

fit
(5000, 2)
predict
(50000, 2)
predict
(50000, 2)
predict
(50000, 2)
predict
(50000, 2)
{'M': 5, 'train_size_alpha': 0.9} , 0 -> 0.18255171536251663
predict
(50000, 2)
{'M': 5, 'train_size_alpha': 0.7} , 0 -> 0.15909589109479322
predict
(50000, 2)
{'M': 5, 'train_size_alpha': 1.0} , 0 -> 0.2594996730035609
predict

predict
(50000, 2)
predict
(50000, 2)
{'M': 1, 'train_size_alpha': 0.9} , 2 -> 0.18781109118281294
{'M': 1, 'train_size_alpha': 0.9} -> [0.18376331996696993, 0.17088990126740447, 0.18781109118281294]
 {'M': 1, 'train_size_alpha': 1.0} -> thread id = 2532
logger {'M': 1, 'train_size_alpha': 0.9}, -> [0.18376331996696993, 0.17088990126740447, 0.18781109118281294]
fit
(5000, 2)
predict
(50000, 2)
{'M': 1, 'train_size_alpha': 1.0} , 0 -> 0.23826376028536506
predict
(50000, 2)
{'M': 8, 'train_size_alpha': 1.0} , 0 -> 0.3399799970953494{'M': 1, 'train_size_alpha': 1.0} , 1 -> 0.22302788473666652

predict
(50000, 2)
predict
(50000, 2)
{'M': 1, 'train_size_alpha': 1.0} , 2 -> 0.21283132590257609
{'M': 1, 'train_size_alpha': 1.0} -> [0.23826376028536506, 0.22302788473666652, 0.21283132590257609]
 {'M': 2, 'train_size_alpha': 0.5} -> thread id = 2532
logger {'M': 1, 'train_size_alpha': 1.0}, -> [0.23826376028536506, 0.22302788473666652, 0.21283132590257609]fit
(5000, 2)

{'M': 8, 'train_size_alp

{'M': 4, 'train_size_alpha': 0.9} , 2 -> 0.17423794088215236
{'M': 4, 'train_size_alpha': 0.9} -> [0.25057752900771296, 0.24094789602710187, 0.17423794088215236]
 {'M': 5, 'train_size_alpha': 1.0} -> thread id = 19248logger {'M': 4, 'train_size_alpha': 0.9}, -> [0.25057752900771296, 0.24094789602710187, 0.17423794088215236]
fit
(5000, 2)

{'M': 4, 'train_size_alpha': 1.0} , 2 -> 0.15960210950977696
{'M': 4, 'train_size_alpha': 1.0} -> [0.17018600022043812, 0.17609008226876188, 0.15960210950977696]
 {'M': 6, 'train_size_alpha': 0.5} -> thread id = 2532
logger {'M': 4, 'train_size_alpha': 1.0}, -> [0.17018600022043812, 0.17609008226876188, 0.15960210950977696]fit
(5000, 2)

predict
(50000, 2)
predict
(50000, 2)
{'M': 5, 'train_size_alpha': 0.7} , 0 -> 0.23730875663959133
predict
(50000, 2)
predict
(50000, 2)
predict
(50000, 2)
{'M': 5, 'train_size_alpha': 0.9} , 0 -> 0.19340895250252688
predict
(50000, 2)
{'M': 5, 'train_size_alpha': 0.7} , 1 -> 0.2354526563095605
predict
(50000, 2)
{'M'

predict
(50000, 2)
{'M': 1, 'train_size_alpha': 0.9} , 2 -> 0.14422471086902347
{'M': 1, 'train_size_alpha': 0.9} -> [0.15993114142242784, 0.12012557320655037, 0.14422471086902347]
 {'M': 1, 'train_size_alpha': 1.0} -> thread id = 2532logger {'M': 1, 'train_size_alpha': 0.9}, -> [0.15993114142242784, 0.12012557320655037, 0.14422471086902347]

fit
(5000, 2)
{'M': 8, 'train_size_alpha': 0.7} , 1 -> 0.20618155322286602
predict
(50000, 2)
predict
(50000, 2)
{'M': 8, 'train_size_alpha': 0.9} , 0 -> 0.24089850181115466
predict
(50000, 2)
{'M': 8, 'train_size_alpha': 1.0}


Traceback (most recent call last):
  File "C:\Users\thoma\School\Machine_learning\ML-Project2\aggregating\gridsearch.py", line 47, in evaluate_model
    y_hat = model.predict(Flattened_X_test)
  File "C:\Users\thoma\School\Machine_learning\ML-Project2\aggregating\models.py", line 27, in predict
    predictions = predictions + self.predictors[i].predict(X)
  File "c:\users\thoma\appdata\local\programs\python\python37\lib\site-packages\sklearn\gaussian_process\_gpr.py", line 338, in predict
    K_trans = self.kernel_(X, self.X_train_)
  File "c:\users\thoma\appdata\local\programs\python\python37\lib\site-packages\sklearn\gaussian_process\kernels.py", line 758, in __call__
    return self.k1(X, Y) + self.k2(X, Y)
  File "c:\users\thoma\appdata\local\programs\python\python37\lib\site-packages\sklearn\gaussian_process\kernels.py", line 1435, in __call__
    K = np.exp(-.5 * dists)
MemoryError: Unable to allocate array with shape (50000, 5000) and data type float64


logger {'M': 8, 'train_size_alpha': 1.0}, -> None {'M': 2, 'train_size_alpha': 0.5} -> thread id = 19248

fit
(5000, 2)
{'M': 1, 'train_size_alpha': 1.0} , 0 -> 0.20726015600341657
predict
(50000, 2)
{'M': 1, 'train_size_alpha': 1.0} , 1 -> 0.20115846907627224
predict
(50000, 2)
predict
(50000, 2)
{'M': 2, 'train_size_alpha': 0.5} , 0 -> 0.1656885904550893
predict
(50000, 2)
{'M': 1, 'train_size_alpha': 1.0} , 2 -> 0.2145047751110511
{'M': 1, 'train_size_alpha': 1.0} -> [0.20726015600341657, 0.20115846907627224, 0.2145047751110511]
 {'M': 2, 'train_size_alpha': 0.7} -> thread id = 2532logger {'M': 1, 'train_size_alpha': 1.0}, -> [0.20726015600341657, 0.20115846907627224, 0.2145047751110511]

fit
(5000, 2)
{'M': 2, 'train_size_alpha': 0.5} , 1 -> 0.14133020760855625
predict
(50000, 2)
{'M': 8, 'train_size_alpha': 0.7} , 2 -> 0.22907781878972683
{'M': 8, 'train_size_alpha': 0.7} -> [0.25391039009714883, 0.20618155322286602, 0.22907781878972683]
 {'M': 2, 'train_size_alpha': 0.9} -> threa

predict
(50000, 2)
predict
(50000, 2)
{'M': 5, 'train_size_alpha': 0.7} , 0 -> 0.22698790625224324
predict
(50000, 2)
{'M': 6, 'train_size_alpha': 0.5} , 0 -> 0.15831020113868247
predict
(50000, 2)
{'M': 5, 'train_size_alpha': 0.9} , 1 -> 0.14383997465022963
predict
(50000, 2)
{'M': 5, 'train_size_alpha': 0.7} , 1 -> 0.17058423594741856
predict
(50000, 2)
{'M': 5, 'train_size_alpha': 1.0} , 0 -> 0.20502865297231423
predict
(50000, 2)
{'M': 6, 'train_size_alpha': 0.5} , 1 -> 0.1674200007883071
predict
(50000, 2)
{'M': 5, 'train_size_alpha': 0.7} , 2 -> 0.2334779827043963
{'M': 5, 'train_size_alpha': 0.7} -> [0.22698790625224324, 0.17058423594741856, 0.2334779827043963]
logger {'M': 5, 'train_size_alpha': 0.7}, -> [0.22698790625224324, 0.17058423594741856, 0.2334779827043963] {'M': 6, 'train_size_alpha': 0.7} -> thread id = 30312

fit
(5000, 2)
{'M': 6, 'train_size_alpha': 0.5} , 2 -> 0.1836707266996896
{'M': 6, 'train_size_alpha': 0.5} -> [0.15831020113868247, 0.1674200007883071, 0.1836

In [None]:
# Arrange the logged errors as an array of shape
# (len(M_grid), len(alpha_grid), trials). Entries that never receive a result
# keep the sentinel value -1.
converted_results = np.full((len(M_grid),len(alpha_grid),trials), -1.0)
for item in results:
    print(item)
    # item is indexed as (M, alpha, per-trial errors).
    if item[2] is None:
        # An aborted run is logged with None instead of a list of errors.
        # Assigning None into a float array would store NaN, which the
        # sentinel check (<= 0) does not recognise; keep the -1 sentinel instead.
        print(f"skipping aborted run: M = {item[0]}, alpha = {item[1]}")
        continue
    converted_results[M_grid.index(item[0]),alpha_grid.index(item[1])] = item[2]

print(converted_results.shape)
print(converted_results)

In [None]:
# some runs have been aborted every now and then -> filter them out
# An aborted run appears either as the -1 sentinel or as NaN. `<= 0.0` alone is
# False for NaN, so a single NaN would leak into the statistics; masking
# everything that is not strictly positive covers both cases.
not_valid = ~(converted_results > 0.0)
masked_results = np.ma.masked_where(not_valid, converted_results)
# Mean and spread over the trial axis for every (M, alpha) combination.
means = masked_results.mean(axis=2)
sigmas = masked_results.std(axis=2)
means.shape

In [None]:
# One error-bar curve per alpha: mean normalized error vs. number of learners M,
# with the standard deviation over the trials as the bar length.
fig, ax = plt.subplots()
m_values = np.array(M_grid)
for column, alpha_value in enumerate(alpha_grid):
    ax.errorbar(
        m_values,
        means[:,column],
        sigmas[:,column],
        marker ='o',
        label = f"alpha = {alpha_value}",
    )
ax.set_title(f"Bagging Normalized error: N_train = {N_train}, d= {d}, N_test= {N_test}")
ax.set_xlabel("M")
ax.set_xticks(M_grid)
ax.set_ylabel("normalized error")
ax.legend()
plt.show()

## Store results

In [None]:
# Bundle the run configuration and the raw per-trial errors; the error array is
# converted to nested lists so the dict contains only plain Python containers.
res_dict = dict(
    N_train=N_train,
    N_test=N_test,
    mgrid=M_grid,
    alpha_grid=alpha_grid,
    errors=converted_results.tolist(),
)

In [None]:
import json
import datetime
# NOTE(review): persisting is currently disabled (the two lines below are
# commented out), so res_dict is never written to disk. The file name mentions
# bagging although this notebook is titled pasting — adjust before re-enabling.
#with open(f'bagging_w_replacement_{str(datetime.date.today())}.json', 'w') as fp:
    #json.dump(res_dict, fp)