# Compare initialization strategies for the LLA algorithm

This notebook demonstrates the concave penalties that come with `ya_glm`. We fit these penalties using the LLA algorithm applied to a "good enough" initializer, as in Fan et al. 2014.

In [3]:
import numpy as np
import matplotlib.pyplot as plt

from ya_glm.toy_data import sample_sparse_lin_reg
from ya_glm.backends.fista.LinearRegression import FcpLLA, FcpLLACV,\
    LassoCV, RidgeCV, ENetCV

# Initialize, then fit the concave penalty with the LLA algorithm

In [4]:
# draw a toy sparse linear regression problem (100 samples, 10 features);
# the true coefficient vector and intercept are returned alongside the data
X, y, coef, intercept = sample_sparse_lin_reg(
    n_samples=100,
    n_features=10,
    n_nonzero=5,
    X_dist='corr',
    x_corr=0.1,
    random_state=1,
)

In [5]:
# initialize the coefficient with a Lasso estimate tuned with cross-validation
init = LassoCV().fit(X, y)

# these will give the same behavior
# init = LassoCV() # if an unfit estimator is passed in, it will be fit to the data
# init = 'default' # the default will use a LassoCV

# set up the concave penalty estimator, initialized from the Lasso estimate
# (the estimator is only constructed here; .fit is not called in this cell)
concave_est = FcpLLA(init=init)

# by default we take one LLA step -- see Fan et al. 2014
# concave_est = FcpLLA(lla_n_steps=1, init=init) # default behavior
# but we can of course run for more LLA steps!
# NOTE: this rebinds concave_est, replacing the one-step estimator built above
concave_est = FcpLLA(lla_n_steps=100, init=init)

# note the cross-validation estimator will fit the initializer once!
# then will use the same initializer for each cross-validation fold
cv_est = FcpLLACV(estimator=concave_est)

# Compare different estimators

Let's compare different initialization strategies.

In [6]:
# draw a fresh toy sparse linear regression problem, this time in a
# higher-dimensional setting (as many features as samples)
X, y, coef, intercept = sample_sparse_lin_reg(
    n_samples=100,
    n_features=100,
    n_nonzero=5,
    random_state=1,
)

In [None]:
cv_kws = {'cv': 10, 'cv_n_jobs': -1}

# Lasso
%time lasso = LassoCV(**cv_kws).fit(X, y)
print('Lasso L2 to truth',
      np.linalg.norm(lasso.best_estimator_.coef_ - coef), '\n')


# Ridge regression
%time ridge = RidgeCV(**cv_kws).fit(X, y)
print('Ridge L2 to truth',
      np.linalg.norm(ridge.best_estimator_.coef_ - coef), '\n')

# ElasticNet
%time enet = ENetCV(l1_ratio='tune', **cv_kws).fit(X, y)
print('ENet L2 to truth',
      np.linalg.norm(enet.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from lasso, one step
%time fcp_from_lasso_1 = FcpLLACV(estimator=FcpLLA(init=lasso, lla_n_steps=1),\
                                  **cv_kws).fit(X, y)
print('FCP, lasso init, one step L2 to truth',
      np.linalg.norm(fcp_from_lasso_1.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from ridge, one step
%time fcp_from_ridge_1 = FcpLLACV(estimator=FcpLLA(init=ridge, lla_n_steps=1),\
                                  **cv_kws).fit(X, y)
print('FCP, ridge init, one step L2 to truth',
      np.linalg.norm(fcp_from_ridge_1.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from enet, one step
%time fcp_from_enet_1 = FcpLLACV(estimator=FcpLLA(init=enet, lla_n_steps=1),\
                                 **cv_kws).fit(X, y)
print('FCP, enet init, one step L2 to truth',
      np.linalg.norm(fcp_from_enet_1.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from lasso, many steps
%time fcp_from_lasso_many = FcpLLACV(estimator=FcpLLA(init=ridge, lla_n_steps=100),\
                                     **cv_kws).fit(X, y)
print('FCP, lasso init, many steps L2 to truth',
      np.linalg.norm(fcp_from_lasso_many.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from ridge, many steps
%time fcp_from_ridge_many = FcpLLACV(estimator=FcpLLA(init=ridge, lla_n_steps=100),\
                                     **cv_kws).fit(X, y)
print('FCP, ridge init, many steps L2 to truth',
      np.linalg.norm(fcp_from_ridge_many.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from enet, many steps
%time fcp_from_enet_many = FcpLLACV(estimator=FcpLLA(init=enet, lla_n_steps=1),\
                                    **cv_kws).fit(X, y)
print('FCP, enet init, many steps L2 to truth',
      np.linalg.norm(fcp_from_ridge_many.best_estimator_.coef_ - coef), '\n')

CPU times: user 142 ms, sys: 75.6 ms, total: 218 ms
Wall time: 3.16 s
Lasso L2 to truth 0.5399115512254988 

CPU times: user 257 ms, sys: 117 ms, total: 374 ms
Wall time: 836 ms
Ridge L2 to truth 1.4667162260738444 

CPU times: user 823 ms, sys: 66.4 ms, total: 889 ms
Wall time: 13.9 s
ENet L2 to truth 0.5399115512254988 

