# Compare initialization strategies for the LLA algorithm

This notebook demonstrates the concave penalties that come with `ya_glm`. We fit these penalties using the LLA algorithm applied to a "good enough" initializer, as in Fan et al. (2014).

In [6]:
import numpy as np
import matplotlib.pyplot as plt

from ya_glm.toy_data import sample_sparse_lin_reg
from ya_glm.backends.fista.LinearRegression import FcpLLA, FcpLLACV,\
    LassoCV, RidgeCV, LassoENetCV

# Initialize, then fit a concave penalty with the LLA algorithm

In [2]:
# draw a small sparse linear regression problem:
# 100 samples, 10 features, n_nonzero=5, using the 'corr' design with x_corr=.1
X, y, coef, intercept = sample_sparse_lin_reg(
    n_samples=100,
    n_features=10,
    n_nonzero=5,
    X_dist='corr',
    x_corr=.1,
    random_state=1,
)

In [3]:
# initialize the coefficient with a Lasso estimate tuned with cross-validation
init = LassoCV().fit(X, y)

# these will give the same behavior
# init = LassoCV() # if an unfit estimator is passed in, it will be fit to the data
# init = 'default' # the default will use a LassoCV

# fit the concave penalty by initializing from the Lasso estimate
concave_est = FcpLLA(init=init)

# by default we take one LLA step -- see Fan et al. 2014
# concave_est = FcpLLA(lla_n_steps=1, init=init) # default behavior
# but we can of course run for more LLA steps!
# (this rebinds concave_est, replacing the estimator constructed above)
concave_est = FcpLLA(lla_n_steps=100, init=init)

# note the cross-validation estimator will fit the initializer once!
# then will use the same initializer for each cross-validation fold
# (cv_est is only constructed here; .fit is not called in this cell)
cv_est = FcpLLACV(estimator=concave_est)

# Compare different estimators

Let's compare different initialization strategies.

In [4]:
# draw a fresh regression problem, this time higher dimensional:
# as many features (100) as samples (100), with n_nonzero=5
X, y, coef, intercept = sample_sparse_lin_reg(
    n_samples=100,
    n_features=100,
    n_nonzero=5,
    random_state=1,
)

In [5]:
cv_kws = {'cv': 10, 'cv_n_jobs': -1}

# Lasso
%time lasso = LassoCV(**cv_kws).fit(X, y)
print('Lasso L2 to truth',
      np.linalg.norm(lasso.best_estimator_.coef_ - coef), '\n')


# Ridge regression
%time ridge = RidgeCV(**cv_kws).fit(X, y)
print('Ridge L2 to truth',
      np.linalg.norm(ridge.best_estimator_.coef_ - coef), '\n')

# ElasticNet
%time enet = LassoENetCV(l1_ratio='tune', **cv_kws).fit(X, y)
print('ENet L2 to truth',
      np.linalg.norm(enet.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from lasso, one step
%time fcp_from_lasso_1 = FcpLLACV(estimator=FcpLLA(init=lasso, lla_n_steps=1),\
                                  **cv_kws).fit(X, y)
print('FCP, lasso init, one step L2 to truth',
      np.linalg.norm(fcp_from_lasso_1.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from ridge, one step
%time fcp_from_ridge_1 = FcpLLACV(estimator=FcpLLA(init=ridge, lla_n_steps=1),\
                                  **cv_kws).fit(X, y)
print('FCP, ridge init, one step L2 to truth',
      np.linalg.norm(fcp_from_ridge_1.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from enet, one step
%time fcp_from_enet_1 = FcpLLACV(estimator=FcpLLA(init=enet, lla_n_steps=1),\
                                 **cv_kws).fit(X, y)
print('FCP, enet init, one step L2 to truth',
      np.linalg.norm(fcp_from_enet_1.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from lasso, many steps
%time fcp_from_lasso_many = FcpLLACV(estimator=FcpLLA(init=ridge, lla_n_steps=100),\
                                     **cv_kws).fit(X, y)
print('FCP, lasso init, many steps L2 to truth',
      np.linalg.norm(fcp_from_lasso_many.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from ridge, many steps
%time fcp_from_ridge_many = FcpLLACV(estimator=FcpLLA(init=ridge, lla_n_steps=100),\
                                     **cv_kws).fit(X, y)
print('FCP, ridge init, many steps L2 to truth',
      np.linalg.norm(fcp_from_ridge_many.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from enet, many steps
%time fcp_from_enet_many = FcpLLACV(estimator=FcpLLA(init=enet, lla_n_steps=1),\
                                    **cv_kws).fit(X, y)
print('FCP, enet init, many steps L2 to truth',
      np.linalg.norm(fcp_from_ridge_many.best_estimator_.coef_ - coef), '\n')

CPU times: user 152 ms, sys: 81.5 ms, total: 233 ms
Wall time: 3.7 s
Lasso L2 to truth 0.5399115512254988 

CPU times: user 246 ms, sys: 84 ms, total: 330 ms
Wall time: 1.08 s
Ridge L2 to truth 1.4667162260738444 

CPU times: user 1.02 s, sys: 77.9 ms, total: 1.1 s
Wall time: 18.1 s
ENet L2 to truth 0.5399115512254988 

CPU times: user 1.92 s, sys: 221 ms, total: 2.14 s
Wall time: 6.35 s
FCP, lasso init, one step L2 to truth 0.2934270519442212 

CPU times: user 2.01 s, sys: 205 ms, total: 2.21 s
Wall time: 11.1 s
FCP, ridge init, one step L2 to truth 0.3433528089934003 

CPU times: user 2.65 s, sys: 196 ms, total: 2.85 s
Wall time: 21.1 s
FCP, enet init, one step L2 to truth 0.2934270519442212 

CPU times: user 2.59 s, sys: 218 ms, total: 2.81 s
Wall time: 18.4 s
FCP, lasso init, many steps L2 to truth 0.2964122567536647 

CPU times: user 2.56 s, sys: 265 ms, total: 2.83 s
Wall time: 19.7 s
FCP, ridge init, many steps L2 to truth 0.2964122567536647 

CPU times: user 2.85 s, sys: 201 ms