In [None]:
# Pull the numpy/matplotlib names into the notebook namespace and render figures inline.
# Later cells call plot(), legend() and ylabel() unqualified and rely on this magic.
%pylab inline

In [None]:
import seaborn

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.datasets import make_regression
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Plot appearance: seaborn's 'talk' context preset and the 'white' style preset.
seaborn.set_context('talk')
seaborn.set_style('white')
# Seed NumPy's global generator so random draws that use it are repeatable
# when the notebook is run top to bottom on a fresh kernel.
np.random.seed(12345)

# Dimensionality Reduction

## Generate example data

#### Generate data with many features, most of which are approximately linear combinations of a few underlying ones

In [None]:
# 100 samples x 200 features; effective_rank=2 makes the columns strongly
# correlated, and noise=3 is the standard deviation of the noise added to y.
x, y = make_regression(n_samples=100, n_features=200, effective_rank=2, noise=3)

# Hold-out split by position: the first 75 rows train, the remaining rows test.
n_train = 75
train_x, test_x = x[:n_train], x[n_train:]
train_y, test_y = y[:n_train], y[n_train:]

## Set up pipeline to perform PCA

In [None]:
# Standardise -> keep 5 principal components -> ordinary least squares.
# Step names are the ones make_pipeline would generate (lower-cased class names).
pca_model = Pipeline([
    ('standardscaler', StandardScaler()),
    ('pca', PCA(n_components=5)),
    ('linearregression', LinearRegression()),
])

## Compare Linear+PCA with simple Linear

In [None]:
# Draw the held-out targets first so they take the first legend entry.
plot(test_y)

# Fit each candidate on the training rows, score it on the test rows with
# mean squared error, and overlay its predictions on the same axes.
candidates = (LinearRegression(), pca_model)
errors = []
for model in candidates:
    pred = model.fit(train_x, train_y).predict(test_x)
    errors.append(mean_squared_error(test_y, pred))
    plot(pred)

# Legend order follows plotting order: truth, then one curve per candidate.
legend(['Truth', 'Linear', 'Linear with PCA'])

# Report the test MSE of each candidate, in the order they were fitted.
for label, mse in zip(('No PCA', 'PCA'), errors):
    print(label, mse)


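#### Ordinary linear regression performs badly when the input features are highly correlated (here there are also more features than training rows); projecting onto a few principal components first stabilises the fit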

# Feature Selection

## Generate example with only one informative feature

#### A dataset with 100 samples and 10 input variables; only the variable at index 1 is informative

In [None]:
# Standard-normal design matrix: 100 samples by 10 features.
n_samples, n_features = 100, 10
x = np.random.randn(n_samples, n_features)

# The target is the column at index 1 plus unit-variance Gaussian noise;
# no other column enters y.
noise = np.random.randn(n_samples)
y = x[:, 1] + noise

## Fit Lasso and Random Forest to compare

In [None]:
# Lasso whose regularisation strength is chosen by its built-in cross-validation.
model = LassoCV().fit(x, y)

# Random forest of 100 trees, each limited to depth 3.
rf = RandomForestRegressor(max_depth=3, n_estimators=100)
# Assign the return value so the fitted estimator's repr is not echoed as cell output.
_ = rf.fit(x, y)


## Plot feature importances

In [None]:
# One row per feature: its index, the magnitude of the Lasso coefficient and
# the random-forest importance. The feature count is taken from the fitted
# model rather than hard-coded, so this cell keeps working if the number of
# columns in the data cell changes.
importances = pd.DataFrame({
    'coef': np.arange(len(model.coef_)),
    'lasso': np.abs(model.coef_),
    'rf': rf.feature_importances_,
})

# Long format (coef, model, value) so seaborn can draw the two models side by side.
coefs = importances.melt(id_vars='coef', var_name='model')

seaborn.barplot(x='coef', y='value', data=coefs, hue='model')
_ = ylabel("Importance")

#### Lasso sets the coefficients of uninformative variables to exactly zero, whereas the random forest only gives less informative variables a lower (but non-zero) importance.