<a href="https://colab.research.google.com/github/zhensongren/learn-ml/blob/master/multi_output_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries and simulate input data

In [None]:
# imports, evaluation procedure and simulated data for multioutput regression
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from numpy import absolute
from numpy import mean
from numpy import std

from sklearn.model_selection import RepeatedKFold
from sklearn.multioutput import RegressorChain
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR
from sklearn.model_selection import cross_val_score
# Evaluation protocol reused by the cells below: 10-fold CV repeated 3 times (seeded)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Simulated regression problem: 1000 samples, 10 features (5 informative), 3 targets
X, y = make_regression(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_targets=3,
    noise=0.5,
    random_state=1,
)
# Show the generated target matrix as the cell output
y

array([[ 38.19768253,  23.41425082,  72.02700222],
       [ -3.59886923, 113.8431764 , -26.09104728],
       [309.54553436, 164.30758793, 183.95816511],
       ...,
       [186.48402065,  53.53872432,  90.95384241],
       [ 63.83768848,  25.87637243,  85.08239269],
       [276.61731597, 130.84468656, 105.49340764]])

In [None]:
# Overwrite two of the simulated targets in place so the outputs depend on each other.
# Target 2 becomes a linear function of target 0 ...
y[:, 2] = 3 * y[:, 0] + 5
# ... and target 1 a linear function of the *new* target 2, so this must run second.
y[:, 1] = 9 * y[:, 2] + 5

# Show the modified target matrix as the cell output
y

array([[ 3.81976825e+01,  1.08133743e+03,  1.19593048e+02],
       [-3.59886923e+00, -4.71694693e+01, -5.79660770e+00],
       [ 3.09545534e+02,  8.40772943e+03,  9.33636603e+02],
       ...,
       [ 1.86484021e+02,  5.08506856e+03,  5.64452062e+02],
       [ 6.38376885e+01,  1.77361759e+03,  1.96513065e+02],
       [ 2.76617316e+02,  7.51866753e+03,  8.34851948e+02]])

# Cross-validated model results

## LR

In [None]:
# Build the linear model and fit it on all three targets at once
# (fit() returns the estimator itself, so the two steps can be chained)
model = LinearRegression().fit(X, y)
# One hand-written sample with 10 feature values
row = [
    0.21947749, 0.32948997, 0.81560036, 0.440956, -0.0606303,
    -0.29257894, -0.2820059, -0.00290545, 0.96402263, 0.04992249,
]
# predict() takes a 2-D input, hence the extra list around the single sample
yhat = model.predict([row])
# Print the predicted target values for that sample
print(yhat[0])

[  64.61034995 1794.47944869  198.83104985]


In [None]:
# Cross-validate the current model with the shared splitter.
# The scorer returns negated MAE values, so take the absolute value right away.
n_scores = absolute(
    cross_val_score(
        model,
        X,
        y,
        scoring='neg_mean_absolute_error',
        cv=cv,
        n_jobs=-1,
    )
)
# Report mean and standard deviation of the MAE over all folds
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: 4.050 (0.273)


## KNN

In [None]:
# k-nearest neighbors for multioutput regression
# Build the model with default settings and fit it on all three targets
# (fit() returns the estimator itself, so the two steps can be chained)
model = KNeighborsRegressor().fit(X, y)
# One hand-written sample with 10 feature values
row = [
    0.21947749, 0.32948997, 0.81560036, 0.440956, -0.0606303,
    -0.29257894, -0.2820059, -0.00290545, 0.96402263, 0.04992249,
]
# predict() takes a 2-D input, hence the extra list around the single sample
yhat = model.predict([row])
# Print the predicted target values for that sample
print(yhat[0])

[ 0.56062403 65.13684884  6.68187209]


In [None]:
# Cross-validate the current model with the shared splitter.
# The scorer returns negated MAE values, so take the absolute value right away.
n_scores = absolute(
    cross_val_score(
        model,
        X,
        y,
        scoring='neg_mean_absolute_error',
        cv=cv,
        n_jobs=-1,
    )
)
# Report mean and standard deviation of the MAE over all folds
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: 536.078 (46.206)


## DecisionTreeRegressor

In [None]:
# define model: the tree is fitted directly on the 3-column target (no wrapper).
# Seed it like the CV splitter and the data generator (random_state=1):
# scikit-learn permutes the features at each split, so an unseeded tree is not
# guaranteed to give the same result on every run.
model = DecisionTreeRegressor(random_state=1)
# evaluate the model and collect the scores (negated MAE, one value per fold)
n_scores = cross_val_score(
    model,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)
# force the scores to be positive
n_scores = absolute(n_scores)
# summarize performance as mean (std) of the MAE over all folds
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: 577.021 (50.720)


## Use MultiOutputRegressor with LinearSVR

In [None]:
# define base model; seeded (random_state=1, like the CV splitter and the data
# generator) so repeated runs of this cell give the same scores
model = LinearSVR(random_state=1)
# define the direct multioutput wrapper model: one regressor is fitted per target
wrapper = MultiOutputRegressor(model)
# evaluate the model and collect the scores (negated MAE, one value per fold)
n_scores = cross_val_score(
    wrapper,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)
# force the scores to be positive
n_scores = absolute(n_scores)
# summarize performance as mean (std) of the MAE over all folds
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: 903.091 (87.952)


## Use RegressorChain, LinearSVR and specify order of prediction

In [None]:
# example of evaluating chained multioutput regression with an SVM model
# define base model; seeded (random_state=1, like the CV splitter and the data
# generator) so repeated runs of this cell give the same scores
model = LinearSVR(random_state=1)
# define the chained multioutput wrapper model and spell out the prediction order:
# each target is predicted from X plus the targets that precede it in the chain.
# [0, 1, 2] is the column order of y, i.e. the same order the chain uses when
# `order` is left unset; change the list here to try a different chain order.
wrapper = RegressorChain(model, order=[0, 1, 2])

# evaluate the model and collect the scores (negated MAE, one value per fold)
n_scores = cross_val_score(
    wrapper,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)
# force the scores to be positive
n_scores = absolute(n_scores)
# summarize performance as mean (std) of the MAE over all folds
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: 4.167 (0.273)


### Chained LinearRegression

In [None]:
# Base estimator used for every link of the chain
model = LinearRegression()
# Chain wrapper around the base estimator (default target order)
wrapper = RegressorChain(model)

# Cross-validate the chain with the shared splitter.
# The scorer returns negated MAE values, so take the absolute value right away.
n_scores = absolute(
    cross_val_score(
        wrapper,
        X,
        y,
        scoring='neg_mean_absolute_error',
        cv=cv,
        n_jobs=-1,
    )
)
# Report mean and standard deviation of the MAE over all folds
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: 4.050 (0.273)
