In [13]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor

# Example data set: Boston housing (features in x_data, response in target).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release -- confirm the pinned
# version before re-running.
bh = load_boston()
x_data, target = bh.data, bh.target

# Models are compared on mean squared error. sklearn's model-selection tools
# treat larger scores as better, so the MSE is wrapped with
# greater_is_better=False, which makes the scorer report the negated MSE.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Shared random seed, passed to estimators that accept one.
rand_state = 748932

## Building a Prediction Pipeline

Several of the models require that the input data is standardized. For these models we will use sklearn's prediction pipeline capabilities to ensure the standardization occurs as part of our cross-validated model tuning. For the remaining models we will use the un-standardized features in our model.

### Support Vector Regression 

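Support vector regression fits a function that ignores residuals smaller than a margin $\epsilon$ and penalizes larger ones, with the parameter $C$ controlling the trade-off between model flatness and the tolerated errors. A kernel function allows the fit to be non-linear in the original features. Below we tune the kernel, $C$, and $\epsilon$ by cross-validation.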

In [4]:
# Two-step pipeline: standardize the features, then fit the support vector regressor.
predPipeSVR = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('clf', SVR()),
])

# Search grid for the 'clf' step: kernel type, penalty C and epsilon.
parameters = dict(
    clf__kernel=('linear', 'rbf', 'poly', 'sigmoid'),
    clf__C=[0.001, 0.01, 0.1, 1, 10, 100],
    clf__epsilon=[0.1, 0.3, 0.5, 0.9],
)

# 5-fold cross-validated grid search scored with the negated-MSE scorer;
# fit() returns the fitted search object itself.
clf = GridSearchCV(
    predPipeSVR,
    parameters,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
).fit(x_data, target)
resultsSVR = clf.cv_results_

print('The best parameters searched are:', clf.best_params_)


The best parameters searched are: {'clf__C': 10, 'clf__epsilon': 0.9, 'clf__kernel': 'rbf'}


### Elastic Net Regression

Elastic Net regression uses a weighted average of the Lasso (L1) and Ridge (L2) penalties for regularization as shown in the formula below. 

$$\hat{\beta} = \underset{\beta}{\operatorname{argmin}}\left( \left\|y-X\beta\right\|_2^2 + \lambda\left(\alpha\sum_j{\left|\beta_j\right|} + \left(1-\alpha\right)\sum_j{\beta_j^2} \right) \right)$$

Both penalties trade bias in the coefficient estimates, which are shrunk toward zero, for reduced model variance, which can improve predictive accuracy. 

When $\alpha=1$ the penalty reduces to the usual Lasso (L1) penalty in terms of the absolute value of the coefficients. This has the effect that some coefficients are zeroed out or removed from the model, resulting in implicit feature selection. On the other hand $\alpha=0$ results in the Ridge (L2) penalty. Since the L2 penalty is a function of the squared coefficients, the individual coefficients are shrunk toward zero but not typically zeroed out. 

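Since the penalties are a function of the coefficients and these in turn are affected by the scale of the data, the features should be standardized before the model is fit. We therefore include a `StandardScaler` step in the pipeline below so that the scaling is re-estimated within each cross-validation fold.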



In [10]:
# Two-step pipeline: standardize the features, then fit a seeded ElasticNet.
predPipeENET = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('clf', ElasticNet(random_state=rand_state)),
])

# Naming differs from R's glmnet package:
#   * sklearn's alpha    == glmnet's lambda -> overall strength of the regularization
#   * sklearn's l1_ratio == glmnet's alpha  -> mix between the l1 and l2 penalties
#     (l1_ratio = 1 is a pure Lasso fit, l1_ratio = 0 is a pure ridge fit)
parameters = dict(
    clf__alpha=[0.25, 0.5, 1, 5, 10, 100],
    clf__l1_ratio=[0, 0.1, 0.25, 0.5, 0.75, 0.9, 1],
)

# 5-fold cross-validated grid search scored with the negated-MSE scorer;
# fit() returns the fitted search object itself.
clf = GridSearchCV(
    predPipeENET,
    parameters,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
).fit(x_data, target)
resultsENET = clf.cv_results_

print('The best parameters searched are:', clf.best_params_)


The best parameters searched are: {'clf__alpha': 0.25, 'clf__l1_ratio': 0}




### Random Forest Regressor 

We can also fit a random forest regressor.

In [11]:
# The forest is fit directly on the un-standardized features, so no scaling
# pipeline is built here. (The previous version of this cell re-assigned
# predPipeENET to an unused random-forest pipeline, which silently replaced
# the ElasticNet pipeline defined earlier in the notebook.)

# Seed the forest with the shared rand_state so the search is reproducible.
rfReg = RandomForestRegressor(n_estimators = 500, random_state = rand_state)

# Tune the number of features tried at each split and the maximum tree depth
# (max_depth = None places no limit on the depth).
parameters = {'max_features':[4, 8, 13], 'max_depth':[1, 5, 10, None]}

# 5-fold cross-validated grid search scored with the negated-MSE scorer;
# n_jobs = -1 matches the other grid searches in this notebook.
clf = GridSearchCV(rfReg, parameters, scoring = scorer, cv=5, n_jobs = -1)
clf.fit(x_data, target)

resultsRFReg = clf.cv_results_

print('The best parameters searched are:', clf.best_params_)


The best parameters searched are: {'max_depth': None, 'max_features': 8}


### Gradient Boosted Regression 

Describe gradient boosted regression here.