# Ensemble Learning through Bagging 

**Bagging = Bootstrap Aggregating**

$\implies$ hyperparameters = {M = number of learners, alpha = relative training-set size}


In [1]:
!git clone https://github.com/tlpss/ML-Project2.git

import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

# add project root folder to path to allow import local modules
import os
import sys

# Two candidate locations for the project's modules: the repository cloned
# above into ./ML-Project2, and the parent directory of the working directory.
repo_path = os.path.abspath(os.path.join('./ML-Project2'))
module_path = os.path.abspath(os.path.join('..'))

# Guard both insertions so that re-running this cell does not keep growing
# sys.path with duplicate entries.
for _candidate in (repo_path, module_path):
    if _candidate not in sys.path:
        sys.path.append(_candidate)
# import local modules
from stochastic_models import *
from visualisations import *


Cloning into 'ML-Project2'...
remote: Enumerating objects: 454, done.[K
remote: Counting objects: 100% (454/454), done.[K
remote: Compressing objects: 100% (315/315), done.[K
remote: Total 454 (delta 263), reused 295 (delta 124), pack-reused 0[K
Receiving objects: 100% (454/454), 1.48 MiB | 19.74 MiB/s, done.
Resolving deltas: 100% (263/263), done.


## Parameter Configuration

In [None]:
# Seed NumPy's legacy global random generator.
# NOTE(review): this only makes the sampling repeatable if the stochastic
# models draw from np.random; the stdlib `random` module imported in the
# first cell is not seeded here — confirm it is unused.
np.random.seed(2020)

In [None]:
# Sizes of the generated datasets.
N_train = 5000  # number of training samples (first argument of the training MaxCallStochasticModel)
N_test = 50000  # number of test samples (also handed to evaluate_model)
d = 1  # second argument of MaxCallStochasticModel; presumably the problem dimension — TODO confirm
T = 2  # NOTE(review): not referenced by any visible cell; presumably a time horizon — confirm or remove

In [None]:
# Bounds for the white-noise level; both ends scale with the training-set size.
lambda_range = (N_train*1e-9 , N_train*1e-3)

# Bounds expressed in terms of alpha, then mapped to RBF length scales through
# length_scale = 1 / sqrt(2 * alpha). The mapping is decreasing, so the result
# is sorted to obtain (lower bound, upper bound).
alpha_range = (8.3*1e-5, 0.83)
doubled_alpha = 2*np.array(alpha_range)
length_scale = np.sort(1/np.sqrt(doubled_alpha))

## Create Kernel & prepare datasets

In [None]:
# Kernel = RBF + white noise. Each hyperparameter starts at the midpoint of its
# interval, and the interval itself is passed as the hyperparameter's bounds.
initial_length_scale = (length_scale[0] + length_scale[1])/2
initial_noise_level = (lambda_range[0] + lambda_range[1])/2

rbf_part = RBF(length_scale=initial_length_scale, length_scale_bounds=length_scale)
noise_part = WhiteKernel(noise_level=initial_noise_level, noise_level_bounds=lambda_range)
kernel = rbf_part + noise_part


In [None]:
#generate drivers
# Training and test data come from two separate MaxCallStochasticModel
# instances that differ only in the number of samples.
# NOTE(review): keep this statement order — if sampling draws from the seeded
# global RNG, reordering would change the generated data.
s_train = MaxCallStochasticModel(N_train,d,[1/12,11/12])
s_train.generate_samples()
s_test = MaxCallStochasticModel(N_test, d, [1/12,11/12])
s_test.generate_samples()

In [None]:
# Pull the generated arrays out of the two sample generators.
# y: targets, X: inputs; S is the generator's `S` attribute
# (presumably the underlying simulated paths — TODO confirm).
y_train, X_train, S_train = s_train.y, s_train.X, s_train.S
y_test, X_test, S_test = s_test.y, s_test.X, s_test.S

# V_T is simply another name for the test targets.
V_T = y_test


In [None]:
## calculate V_0 separately using bigger dataset
# A dedicated generator with 200000 samples is used only for this estimate.
s_v = MaxCallStochasticModel(200000, d, [1/12,11/12])
s_v.generate_samples()

# Reduce the values returned by generate_true_V(0) to their mean in one step.
V_0 = s_v.generate_true_V(0).mean()
V_0 # Average expected PROFIT!

In [None]:
# Sanity check: display the shape of the test inputs.
X_test.shape

## Create Custom Models
(to have more control of splitting etc)

In [None]:
# Smoke test: build a SimpleBagger and display its parameters.
# NOTE(review): the positional arguments look like (M, train_size_alpha,
# base predictor), judging by the hyperparameter names used in the grid
# search — confirm against aggregating.models.
from aggregating.models import SimpleBagger
s = SimpleBagger(11,0.5,None)
s.get_params()

## GridSearch

NB: scikit-learn's grid search cannot be used here, because it performs cross-validation, whereas we want to evaluate on a separate, larger test set.

In [11]:
# Reference run: a single, un-bagged Gaussian process trained on the full
# training set (M = 1, alpha = 1.0), evaluated over 5 trials.
from aggregating.gridsearch import create_logger, evaluate_model
from aggregating.utils import flatten_X

model = GaussianProcessRegressor(kernel)
hyperparams = dict(M=1, train_size_alpha=1.0)  # baseline!
reference_error = evaluate_model(
    model,
    hyperparams,
    flatten_X(X_train),
    y_train,
    1,
    [1/12,11/12],
    5,
    N_test,
    MaxCallStochasticModel,
    V_0,
)
print(reference_error)

 {'M': 1, 'train_size_alpha': 1.0} -> thread id = 140363136583552
{'M': 1, 'train_size_alpha': 1.0} , 0 -> 0.1438355169296338
{'M': 1, 'train_size_alpha': 1.0} , 1 -> 0.17310338400870004
{'M': 1, 'train_size_alpha': 1.0} , 2 -> 0.12835492187083933
{'M': 1, 'train_size_alpha': 1.0} , 3 -> 0.14720125953904917
{'M': 1, 'train_size_alpha': 1.0} , 4 -> 0.11411198863413166
{'M': 1, 'train_size_alpha': 1.0} -> [0.1438355169296338, 0.17310338400870004, 0.12835492187083933, 0.14720125953904917, 0.11411198863413166]
[0.1438355169296338, 0.17310338400870004, 0.12835492187083933, 0.14720125953904917, 0.11411198863413166]


In [None]:
# Average the per-trial reference errors into one baseline value; it is drawn
# as the dashed horizontal line in the results plot.
reference_error_mean = sum(reference_error)/len(reference_error)
print(reference_error_mean)

0.1413214141964708


In [13]:
from multiprocessing import  cpu_count,current_process
from multiprocessing.pool import ThreadPool
import copy
import functools
import threading
import traceback

from aggregating.gridsearch import create_logger, evaluate_model
from aggregating.utils import flatten_X

# GRIDSEARCH PARAMS
# Template model; every task receives its own deep copy (see below).
model = SimpleBagger(0,0,GaussianProcessRegressor(kernel,copy_X_train=False))
trials = 3
M_grid = [1,3,5,7,9]
alpha_grid = [0.2,0.3,0.4,0.5]
results = []
print(cpu_count())


def report_failure(hyperparams):
    """Return an error callback that reports which configuration failed.

    Without an error callback, an exception raised inside a pool task is
    dropped silently and the configuration just never shows up in `results`.
    """
    def _on_error(exc):
        print(f"{hyperparams} -> FAILED: {exc!r}")
        traceback.print_exception(type(exc), exc, exc.__traceback__)
    return _on_error


### ACTUAL GRIDSEARCH
# One task per (M, alpha) combination, run on a thread pool with one worker
# per CPU. The success callback built by create_logger collects into `results`.
pool = ThreadPool(cpu_count())
try:
    for m in M_grid:
        for alpha in alpha_grid:
            hyperparams= {'M':m, 'train_size_alpha':alpha}
            # evaluate_model is defined elsewhere; if it configures or fits the
            # model it is given, sharing one instance between concurrently
            # running tasks would let them overwrite each other's settings.
            # Handing each task a private copy rules that out.
            # NOTE(review): the trailing list has one entry per trial
            # (presumably per-trial seeds) — keep its length equal to `trials`.
            pool.apply_async(
                evaluate_model,
                args=(copy.deepcopy(model),hyperparams,flatten_X(X_train),y_train,1, [1/12,11/12],trials,N_test,MaxCallStochasticModel,V_0,None,[2020,2021,2022]),
                callback=create_logger(hyperparams,results),
                error_callback=report_failure(hyperparams),
            )
    pool.close()
    pool.join()
finally:
    # No-op after a normal close()/join(); after an interrupt it drops the
    # tasks that are still queued so they do not keep running in the background.
    pool.terminate()



2
 {'M': 1, 'train_size_alpha': 0.2} -> thread id = 140362423404288
fit
(5000, 2)
 {'M': 1, 'train_size_alpha': 0.3} -> thread id = 140362415011584
fit
(5000, 2)
predict
(50000, 2)
predict
(50000, 2)


KeyboardInterrupt: ignored

In [None]:
# Arrange the collected results in an array indexed by (M, alpha, trial).
# Every cell starts at the placeholder -1 and keeps it when no result for
# that combination was collected.
grid_shape = (len(M_grid), len(alpha_grid), trials)
converted_results = np.full(grid_shape, -1.0)

for entry in results:
    print(entry)
    # entry[0] and entry[1] are looked up in the grids to find the cell;
    # entry[2] is stored along the trial axis.
    m_index = M_grid.index(entry[0])
    alpha_index = alpha_grid.index(entry[1])
    converted_results[m_index, alpha_index] = entry[2]

print(converted_results.shape)
print(converted_results)

In [None]:
# some runs have been aborted every now and then -> filter them out:
# non-positive entries (the -1 placeholders) are masked and therefore ignored
# by the statistics below.
masked_results = np.ma.masked_less_equal(converted_results, 0.0)

# Mean and standard deviation over the trial axis.
trial_axis = 2
means = masked_results.mean(axis=trial_axis)
sigmas = masked_results.std(axis=trial_axis)
means.shape

In [None]:
# Dashed baseline: mean error of the single, un-bagged reference model.
plt.hlines(
    reference_error_mean,
    xmin=M_grid[0],
    xmax=M_grid[-1],
    linestyles='dashed',
    label="reference error",
)

# One error-bar curve per alpha: mean error against M, with the standard
# deviation over the trials as the bar length.
m_values = np.array(M_grid)
for i, _ in enumerate(alpha_grid):
    plt.errorbar(
        m_values,
        means[:,i],
        yerr=sigmas[:,i],
        marker='o',
        label=f"alpha = {alpha_grid[i]}",
    )

plt.xlabel("M")
plt.ylabel("normalized error")
plt.xticks(M_grid)
plt.title(f"Bagging Normalized error: N_train = {N_train}, d= {d}, N_test= {N_test}")
plt.legend(loc='upper right')
plt.show()

## Store results

In [None]:
# Everything needed to reproduce the plot: dataset sizes, both grids and the
# raw per-trial errors (as nested lists, so the dict is JSON-serialisable).
res_dict = {
    'N_train': N_train,
    'N_test': N_test,
    'mgrid': M_grid,
    'alpha_grid': alpha_grid,
    'errors': converted_results.tolist(),
}

In [None]:
import json
import datetime

# The file name carries today's date; the file is written to the current
# working directory.
output_name = f'#0005_hard_bagging_{str(datetime.date.today())}.json'
with open(output_name, 'w') as fp:
    json.dump(res_dict, fp)