# LASSO

lasso是一个估量稀疏系数的线性模型。因为它倾向于选择具有比较少的参数值的解，所以它在一些特定的情况下很有用，因此它可以有效的减少依赖的变量个数。它通过构造一个罚函数得到一个较为精炼的模型，使得它压缩一些系数，同时设定一些系数为零。因此保留了子集收缩的优点，是一种处理具有复共线性数据的有偏估计。

调整参数lambda的确定：

交叉验证法。对lambda的格点值，进行交叉验证，选取交叉验证误差最小的lambda值。最后，按照得到的lambda值，用全部数据重新拟合模型即可。

In [1]:
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.1)
reg.fit([[0,0],[1,1]],[0,1])
print(reg.predict([[1,1]]))

[ 0.8]


In [6]:
# 用于稀疏信号的Lasso及弹性网络
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
np.random.seed(42)

n_samples, n_features = 50, 200
X = np.random.randn(n_samples, n_features)
coef = 3 * np.random.randn(n_features)
inds = np.arange(n_features)
np.random.shuffle(inds)
coef[inds[10:]] = 0  # sparsify coef
y = np.dot(X, coef)

# add noise
y += 0.01 * np.random.normal((n_samples,))

# Split data in train set and test set
n_samples = X.shape[0]
X_train, y_train = X[:n_samples / 2], y[:n_samples / 2]
X_test, y_test = X[n_samples / 2:], y[n_samples / 2:]



In [9]:
from sklearn.linear_model import Lasso
alpha = 0.1
lasso = Lasso(alpha=alpha)
y_pred_lasso = lasso.fit(X_train,y_train).predict(X_test)
r2_score_lasso = r2_score(y_test,y_pred_lasso)
print(lasso)
print('r^2 on test data:%f',r2_score_lasso)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
r^2 on test data:%f 0.384710361065


In [12]:
from sklearn.linear_model import ElasticNet

enet = ElasticNet(alpha=alpha, l1_ratio=0.7)

y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)

plt.plot(enet.coef_, color='lightgreen', linewidth=2,
         label='Elastic net coefficients')
plt.plot(lasso.coef_, color='gold', linewidth=2,
         label='Lasso coefficients')
plt.plot(coef, '--', color='navy', label='original coefficients')
plt.legend(loc='best')
plt.title("Lasso R^2: %f, Elastic Net R^2: %f"
          % (r2_score_lasso, r2_score_enet))

ElasticNet(alpha=0.1, copy_X=True, fit_intercept=True, l1_ratio=0.7,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
r^2 on test data : 0.240176


<matplotlib.text.Text at 0x2433f505780>

## 设置正则化参数
aplha 参数控制了估量器次数的稀疏程度，可以使用交叉验证的方法来寻找最优的alpha

对于带有许多共线性回归的高维度数据集，LassoCV可以说是合适的，但是对于样本的数量与观察数量相比非常小的话，LassoLarsCV处理速度要远远快于LassoCV.

In [17]:
import time
from sklearn.linear_model import LassoCV,LassoLarsCV,LassoLarsIC
from sklearn import datasets
diabetes  = datasets.load_diabetes()
x = diabetes.data
y = diabetes.target
rng = np.random.RandomState(42)
x = np.c_[x,rng.randn(x.shape[0],14)]
x /= np.sqrt(np.sum(x**2,axis=0))   # 数据的表转化处理

In [18]:
# LassoLarsIC: least angle regression with BIC/AIC criterion
model_bic = LassoLarsIC(criterion='bic')
t1 = time.time()
model_bic.fit(x,y)
t_bic = time.time() - t1
alpha_bic_ = model_bic.alpha_
model_aic = LassoLarsIC(criterion='aic')
model_aic.fit(x,y)
alpha_aic_ = model_aic.alpha_

def plot_ic_criterion(model,name,color):
    alpha_ = model.alpha_
    alphas_ = model.alphas_
    criterion = model.criterion_
    plt.plot(-np.log10(alphas_), criterion_, '--', color=color,
             linewidth=3, label='%s criterion' % name)
    plt.axvline(-np.log10(alpha_), color=color, linewidth=3,
                label='alpha: %s estimate' % name)
    plt.xlabel('-log(alpha)')
    plt.ylabel('criterion')
plt.figure()
plot_ic_criterion(model_aic,'AIC','b')
plot_ic_criterion(model_bic, 'BIC', 'r')
plt.legend()
plt.title('Information-criterion for model selection (training time %.3fs)'
          % t_bic)



NameError: name 'criterion_' is not defined