# Compare initialization strategies for the LLA algorithm

This notebook demonstrates the concave penalties that come with `ya_glm`. We fit these penalties using the LLA algorithm applied to a "good enough" initializer, as in Fan et al. 2014.

In [6]:
import numpy as np
import matplotlib.pyplot as plt

from ya_glm.toy_data import sample_sparse_lin_reg


from ya_glm.models.Lasso import LassoCV
from ya_glm.models.Ridge import RidgeCV
from ya_glm.models.ENet import ENetCV

from ya_glm.models.FcpLLA import FcpLLA, FcpLLACV

# Initialize, then fit a concave penalty with the LLA algorithm

In [2]:
# draw a toy linear regression dataset (100 samples, 10 features)
X, y, coef, intercept = sample_sparse_lin_reg(
    n_samples=100,
    n_features=10,
    n_nonzero=5,
    X_dist='corr',
    x_corr=0.1,
    random_state=1,
)

In [3]:
# initialize the coefficient with a Lasso estimate tuned with cross-validation
init = LassoCV().fit(X, y)

# these will give the same behavior
# init = LassoCV() # if an unfit estimator is passed in, it will be fit to the data
# init = 'default' # the default will use a LassoCV

# fit the concave penalty by initializing from the Lasso estimate
concave_est = FcpLLA(init=init)

# by default we take one LLA step -- see Fan et al 2014
# concave_est = FcpLLA(lla_n_steps=1, init=init) # default behavior
# but we can of course run for more LLA steps!
# (this assignment replaces the one-step estimator created above)
concave_est = FcpLLA(lla_n_steps=100, init=init)

# note the cross-validation estimator will fit the initializer once!
# then will use the same initializer for each cross-validation fold
cv_est = FcpLLACV(estimator=concave_est)

# Compare different estimators

Let's compare different initialization strategies.

In [4]:
# draw another toy linear regression dataset;
# here let's use higher dimensional data (100 features for 100 samples)
X, y, coef, intercept = sample_sparse_lin_reg(
    n_samples=100,
    n_features=100,
    n_nonzero=5,
    random_state=1,
)

In [5]:
cv_kws = {'cv': 10, 'cv_n_jobs': -1}

# Lasso
%time lasso = LassoCV(**cv_kws).fit(X, y)
print('Lasso L2 to truth',
      np.linalg.norm(lasso.best_estimator_.coef_ - coef), '\n')


# Ridge regression
%time ridge = RidgeCV(**cv_kws).fit(X, y)
print('Ridge L2 to truth',
      np.linalg.norm(ridge.best_estimator_.coef_ - coef), '\n')

# ElasticNet
%time enet = ENetCV(l1_ratio='tune', **cv_kws).fit(X, y)
print('ENet L2 to truth',
      np.linalg.norm(enet.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from lasso, one step
%time fcp_from_lasso_1 = FcpLLACV(estimator=FcpLLA(init=lasso, lla_n_steps=1),\
                                  **cv_kws).fit(X, y)
print('FCP, lasso init, one step L2 to truth',
      np.linalg.norm(fcp_from_lasso_1.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from ridge, one step
%time fcp_from_ridge_1 = FcpLLACV(estimator=FcpLLA(init=ridge, lla_n_steps=1),\
                                  **cv_kws).fit(X, y)
print('FCP, ridge init, one step L2 to truth',
      np.linalg.norm(fcp_from_ridge_1.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from enet, one step
%time fcp_from_enet_1 = FcpLLACV(estimator=FcpLLA(init=enet, lla_n_steps=1),\
                                 **cv_kws).fit(X, y)
print('FCP, enet init, one step L2 to truth',
      np.linalg.norm(fcp_from_enet_1.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from lasso, many steps
%time fcp_from_lasso_many = FcpLLACV(estimator=FcpLLA(init=ridge, lla_n_steps=100),\
                                     **cv_kws).fit(X, y)
print('FCP, lasso init, many steps L2 to truth',
      np.linalg.norm(fcp_from_lasso_many.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from ridge, many steps
%time fcp_from_ridge_many = FcpLLACV(estimator=FcpLLA(init=ridge, lla_n_steps=100),\
                                     **cv_kws).fit(X, y)
print('FCP, ridge init, many steps L2 to truth',
      np.linalg.norm(fcp_from_ridge_many.best_estimator_.coef_ - coef), '\n')

# FCP, initialize from enet, many steps
%time fcp_from_enet_many = FcpLLACV(estimator=FcpLLA(init=enet, lla_n_steps=1),\
                                    **cv_kws).fit(X, y)
print('FCP, enet init, many steps L2 to truth',
      np.linalg.norm(fcp_from_ridge_many.best_estimator_.coef_ - coef), '\n')

CPU times: user 219 ms, sys: 107 ms, total: 325 ms
Wall time: 3.16 s
Lasso L2 to truth 0.5399082946992609 

CPU times: user 259 ms, sys: 185 ms, total: 444 ms
Wall time: 760 ms
Ridge L2 to truth 1.4668701386280432 

CPU times: user 919 ms, sys: 55.7 ms, total: 974 ms
Wall time: 13.2 s
ENet L2 to truth 0.5400790718053406 

CPU times: user 1.91 s, sys: 158 ms, total: 2.07 s
Wall time: 9.7 s
FCP, lasso init, one step L2 to truth 0.2934439869808888 

CPU times: user 2.39 s, sys: 170 ms, total: 2.56 s
Wall time: 15 s
FCP, ridge init, one step L2 to truth 0.34341860588566014 

CPU times: user 3.05 s, sys: 169 ms, total: 3.22 s
Wall time: 27.7 s
FCP, enet init, one step L2 to truth 0.29340474485873375 

CPU times: user 3.62 s, sys: 283 ms, total: 3.91 s
Wall time: 58.6 s
FCP, lasso init, many steps L2 to truth 0.2964121594377123 

CPU times: user 3.95 s, sys: 353 ms, total: 4.3 s
Wall time: 48.2 s
FCP, ridge init, many steps L2 to truth 0.2964121594377123 

CPU times: user 3.57 s, sys: 254 ms