In [35]:
from joblib import Parallel, delayed
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.datasets import load_boston
# X, y = make_regression(n_samples=1000)

data = load_boston()
X, y = data['data'], data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [57]:
from sklearn.metrics import accuracy_score, mean_squared_error
from scipy.optimize import minimize
import numpy as np

np.random.seed(1123)

class ClassificationWeightsOptimizer(object):

    def __init__(self, score_func=None):
        self.score_func = score_func

    @staticmethod
    def objective(weights, y_true, y_pred):
        y_ens = np.round(np.average(y_pred, axis=1, weights=weights),0).astype(int)
        return (1-accuracy_score(y_true, y_ens))

    def _run_opt(self, y_true, y_pred):
        w0 = np.random.uniform(size=y_pred.shape[1])
        bounds = [(0.,1.)] * y_pred.shape[1]
        cons = [{'type': 'eq','fun': lambda w: w.sum() - 1}]
        res = minimize(self.objective,w0,
            args=(y_true,y_pred),method='SLSQP',bounds=bounds,
            options={'disp':False, 'maxiter':1000},constraints=cons)

        return (res.fun, res.x)

    def run_parallel(self, iters, y_true, y_pred):
        results = Parallel(n_jobs=-1)(delayed(self._run_opt)(y_true, y_pred) for i in range(iters))
        r, w = [re[0] for re in results], [re[1] for re in results]
        best_score = np.min(r)    
        best_weights = w[r.index(best_score)]
        print('\nOptimized weights:')
        print(f"Best Score: {best_score}")
        for i,w in enumerate(best_weights):
            print(f'Weight {i}: {w:.4f}')
        return {"best_score":best_score,"best_weights":best_weights}


    def run(self, iters, y_true, y_pred):
        results_list = [] 
        weights_list = []  
        for k in range(iters):
            r,w = self._run_opt(y_true, y_pred)
            results_list.append(r)
            weights_list.append(w)

        best_score = np.min(results_list)    
        best_weights = weights_list[results_list.index(best_score)]
        
        print('\nOptimized weights:')
        print(f"Best Score: {best_score}")
        for i,w in enumerate(best_weights):
            print(f'Weight {i}: {w:.4f}')

        return {"best_score":best_score,"best_weights":best_weights}


class RegressionWeightsOptimizer(object):

    def __init__(self):
        pass
    
    @staticmethod
    def objective(weights, y_true, y_pred):
        y_ens = np.average(y_pred, axis=1, weights=weights)
        return mean_squared_error(y_true, y_ens)

    def _run_opt(self, y_true, y_pred):
        # w0 = np.random.uniform(size=y_pred.shape[1])
        w0 = [1/y_pred.shape[1] for i in range(y_pred.shape[1])]
        bounds = [(0.,1.)] * y_pred.shape[1]
        cons = [{'type': 'eq','fun': lambda w: w.sum() - 1}]
        res = minimize(self.objective,w0,args=(y_true,y_pred),
            method='SLSQP',bounds=bounds,options={'disp':False, 'maxiter':1000},
            constraints=cons)
            
        return (res.fun, res.x)

    def run_parallel(self, iters, y_true, y_pred):
        results = Parallel(n_jobs=-1)(delayed(self._run_opt)(y_true, y_pred) for i in range(iters))
        r, w = [re[0] for re in results], [re[1] for re in results]
        best_score = np.min(r)    
        best_weights = w[r.index(best_score)]
        print('\nOptimized weights:')
        print(f"Best Score: {best_score}")
        for i,w in enumerate(best_weights):
            print(f'Weight {i}: {w:.4f}')
        return {"best_score":best_score,"best_weights":best_weights}


    def run(self, iters, y_true, y_pred):
        results_list = [] 
        weights_list = []  
        for k in range(iters):
            r,w = self._run_opt(y_true, y_pred)
            results_list.append(r)
            weights_list.append(w)

        best_score = np.min(results_list)    
        best_weights = weights_list[results_list.index(best_score)]
        
        print('\nOptimized weights:')
        print(f"Best Score: {best_score}")
        for i,w in enumerate(best_weights):
            print(f'Weight {i}: {w:.4f}')

        return {"best_score":best_score,"best_weights":best_weights}

In [58]:
from sklearn.ensemble import VotingRegressor

estimators = [
    ('m1', LinearRegression()),
    ('m2', ExtraTreesRegressor()),
    ('m3', RandomForestRegressor()),
    ('m4', GradientBoostingRegressor()),
    ('m5', KNeighborsRegressor()),
]

# voting = VotingRegressor(estimators, weights=None, n_jobs=-1, verbose=False)

In [59]:
from sklearn.base import clone

def _fit_single_estimator(model, X, y):
    return model.fit(X,y)


class AutoVotingRegressor():
    def __init__(self, estimators, weights=None, n_jobs=-1, verbose=False):
        
        self.estimators_ = estimators
        self.weights_ = weights
        self.n_jobs = n_jobs
        self.verbose=verbose
        self.n_estimators = len(self.estimators_)

        self.w_opt = RegressionWeightsOptimizer()


        self.estimators__ = estimators

    def _predict(self, X):
        """Collect results from clf.predict calls."""
        return np.asarray([est[1].predict(X) for est in self.estimators_]).T

    def predict(self, X):
        return np.average(self._predict(X), axis=1, weights=self.weights_)

    def fit(self, X, y):
        
        names, models = [e[0] for e in self.estimators__],[e[1] for e in self.estimators__]
        fitted_models = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_single_estimator)(
                clone(m),
                X,
                y
            )
            for m in models
        )
        self.estimators_ = [(names[i],fitted_models[i]) for i in range(self.n_estimators)]
        return self

    def compute_weights(self, X, y, eval_size=0.1, iters=1000):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=eval_size)

        self.fit(X_train, y_train)

        y_pred = self._predict(X_test)

        r = self.w_opt.run_parallel(iters, y_test, y_pred)

        self.weights_ = r['best_weights']

In [62]:
voting = AutoVotingRegressor(estimators=estimators)

voting.compute_weights(X_train, y_train)


# Optimized weights:
# Best Score: 5.186880268514192
# Weight 0: 0.1047
# Weight 1: 0.4823
# Weight 2: 0.0000
# Weight 3: 0.3239
# Weight 4: 0.0891


Optimized weights:
Best Score: 8.25379237381497
Weight 0: 0.0000
Weight 1: 0.0000
Weight 2: 0.0000
Weight 3: 1.0000
Weight 4: 0.0000


In [18]:
# voting.fit(X_train, y_train)

In [21]:
voting.score(X_test, y_test)

kf = KFold(n_splits=5)

0.9202159372071448

In [148]:
opt.run(iters=1, y_true=y, y_pred=val_preds)


Optimized weights:
Best Score: 0.0888888888888889
Weight 0: 0.0443
Weight 1: 0.3285
Weight 2: 0.0000
Weight 3: 0.0000
Weight 4: 0.3337
Weight 5: 0.2934


{'best_score': 0.0888888888888889,
 'best_weights': array([0.04430314, 0.32853739, 0.        , 0.        , 0.33374427,
        0.2934152 ])}

In [149]:
opt.run_parallel(iters=100, y_true=y_val, y_pred=val_preds)


Optimized weights:
Best Score: 0.0888888888888889
Weight 0: 0.0000
Weight 1: 0.1794
Weight 2: 0.2146
Weight 3: 0.0000
Weight 4: 0.3485
Weight 5: 0.2575


{'best_score': 0.0888888888888889,
 'best_weights': array([1.11022302e-16, 1.79394462e-01, 2.14585699e-01, 0.00000000e+00,
        3.48503571e-01, 2.57516268e-01])}

In [118]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, Lasso
from math import sqrt

NFOLDS = 6
SEED = 123



def get_oof(m, X_train, y_train, X_test, kf):
    ntrain = X_train.shape[0]
    ntest = X_test.shape[0]
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        x_tr = X_train[train_index]
        y_tr = y_train[train_index]
        x_te = X_train[test_index]

        m.fit(x_tr, y_tr)

        oof_train[test_index] = m.predict(x_te)
        oof_test_skf[i, :] = m.predict(X_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


In [123]:
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

model_1 = LinearRegression()
model_2 = RandomForestRegressor()
model_3 = ExtraTreesRegressor()
model_4 = Ridge()
all_models = [model_1, model_2, model_3, model_4]

oof = [get_oof(m, X_train, y_train, X_test, kf) for m in all_models]
oof_trains, oof_tests = [p[0] for p in oof],[p[1] for p in oof]


In [131]:
np.concatenate(oof_tests, axis=1)

array([[31.10217436, 29.22766667, 28.55516667, 30.73796599],
       [16.13644488, 16.78016667, 16.69216667, 15.54842484],
       [24.50123952, 21.73333333, 22.3765    , 24.0333555 ],
       [ 8.65779712, 12.595     , 13.03416667, 10.50764633],
       [16.64650655, 19.3745    , 20.28      , 17.3942869 ],
       [20.35858567, 15.5235    , 16.01883333, 20.55919493],
       [17.14314833, 16.06566667, 17.25516667, 17.66305658],
       [29.1170889 , 25.35883333, 25.24283333, 28.64992499],
       [22.30472703, 24.498     , 24.79233333, 22.84451068],
       [19.61202601, 19.63383333, 19.80983333, 20.15094986],
       [ 6.95782485,  7.96583333,  8.5115    ,  6.89361638],
       [36.49925801, 44.66016667, 44.39416667, 36.13784665],
       [15.34226987, 14.093     , 16.733     , 17.25835355],
       [22.93937797, 18.14133333, 20.50833333, 24.97382215],
       [25.31982075, 26.10433333, 25.84966667, 25.33839425],
       [ 6.02647584,  8.6235    ,  8.88416667,  6.24192213],
       [21.76602153, 21.

In [129]:
from sklearn.metrics import mean_squared_error

train_predictions = np.concatenate(oof_trains, axis=1)


def objective(weights):
    y_ens = np.average(train_predictions, axis=1, weights=weights)
    return mean_squared_error(y_train, y_ens)

from scipy.optimize import minimize

results_list = []    # a list to store the best score of each round
weights_list = []    # a list to store the best weights of each round

for k in range(1000):
    # I randomly set the initial weights from which the algorithm will try searching a minima    
    w0 = np.random.uniform(size=train_predictions.shape[1])

    # I define bounds, i.e. lower and upper values of weights.
    # I want the weights to be between 0 and 1.
    bounds = [(0,1)] * train_predictions.shape[1]

    # I set some constraints. Here, I want the sum of the weights to be equal to 1
    cons = [{'type': 'eq',
             'fun': lambda w: w.sum() - 1}]

    # I can now search for the best weights
    res = minimize(objective,
                   w0,
                   method='SLSQP',
                   bounds=bounds,
                   options={'disp':False, 'maxiter':1000},
                   constraints=cons)

    # I save the best score and the best weights of
    # this round in their respective lists
    results_list.append(res.fun)
    weights_list.append(res.x)

# After running all the rounds, I extract the best score
# and the corresponding weights
best_score = np.min(results_list)    
best_weights = weights_list[results_list.index(best_score)]


print('\nOptimized weights:')
for i,w in enumerate(best_weights):
    print(f'w {i}: {w:.4f}')
print('Best score: {:.4f}'.format(best_score))


Optimized weights:
w 0: 0.0000
w 1: 0.3485
w 2: 0.6515
w 3: 0.0000
Best score: 10.7048
