In [1]:
import numpy as np
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import load_boston,load_iris

In [2]:
lb = load_boston()
features = lb.feature_names
x_reg ,y_reg = lb.data,lb.target
x_reg = pd.DataFrame(x_reg,columns=features)
y_reg = pd.DataFrame(y_reg,columns=['target'])

In [3]:
li = load_iris()
features2 = li.feature_names
x_cls ,y_cls = li.data,li.target
x_cls = pd.DataFrame(x_cls,columns=features2)
y_cls = pd.DataFrame(y_cls,columns=['target'])

In [4]:
x_reg.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [5]:
x_cls.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,accuracy_score,mean_absolute_error,roc_auc_score,mean_squared_error
from sklearn.linear_model import LinearRegression

## 线性回归

普通最小二乘的系数估计依赖于**特征的独立性**。

当特征相关且设计矩阵的列之间具有**近似线性相关性**时， 设计矩阵趋于**奇异矩阵**，最小二乘估计对观测目标的随机误差高度敏感，可能产生很大的方差。

R ^ 2定义为（1- u / v），其中u是残差平方和（（y_true-y_pred）** 2）.sum（），而v是总平方和（（y_true- y_true.mean（））** 2）.sum（）（也叫真实值的离差平方和）;

[线性模型系数解释中的常见缺陷](http://scikit-learn.org.cn/view/257.html)

In [7]:
X_reg_train,X_reg_test,y_reg_train,y_reg_test = train_test_split(x_reg,y_reg,random_state=2022,test_size=0.2)

'''
normalize=True , 在回归之前通过减去均值并除以l2范数来对回归变量X进行归一化；
如果不在LinearRegression对象里调用归一化，则使用sklearn.preprocessing.StandardScaler
'''
reg = LinearRegression(normalize=True) 

# 通过 sample_weight 的平方根重新缩放数据样本
reg.fit(X_reg_train,y_reg_train,sample_weight=0.25)
print(f'系数：{reg.coef_} \n 截距：{reg.intercept_}')
print(f'训练集的r2:{reg.score(X_reg_train,y_reg_train)}')

y_reg_pred = reg.predict(X_reg_test)
print(f'测试集的r2:{r2_score(y_reg_test,y_reg_pred)}')

系数：[[-1.09358524e-01  4.17604608e-02  4.92275845e-02  2.75483861e+00
  -1.37425402e+01  4.71242684e+00 -1.19656681e-02 -1.43522444e+00
   2.98673854e-01 -1.23637115e-02 -8.95540107e-01  1.03495304e-02
  -5.21139480e-01]] 
 截距：[27.61502321]
训练集的r2:0.7560832029228641
测试集的r2:0.6225687597000795


## 岭回归 Ridge

线性回归基础上 + **L2正则项**

![20220706210934](https://cdn.jsdelivr.net/gh/xihuishawpy/PicBad@main/blogs/pictures/20220706210934.png)

In [8]:
from sklearn.linear_model import Ridge

In [9]:
reg_ridge = Ridge(alpha=0.5,normalize=True) # alpha 为正则化系数，越大，则惩罚越强，w越小
reg_ridge.fit(X_reg_train,y_reg_train)
print(f'系数：{reg_ridge.coef_}')


print(f'训练集的r2:{reg_ridge.score(X_reg_train,y_reg_train)}')

y_reg_pred = reg_ridge.predict(X_reg_test)
print(f'测试集的r2:{r2_score(y_reg_test,y_reg_pred)}')

系数：[[-6.86575752e-02  1.83801716e-02 -5.51674004e-02  2.71481628e+00
  -4.30433161e+00  4.04915189e+00 -9.39472663e-03 -4.92180619e-01
   2.76924906e-02 -2.87975920e-03 -6.44100560e-01  7.47061607e-03
  -3.44996611e-01]]
训练集的r2:0.7033850851955237
测试集的r2:0.6452418447538283


## 岭回归变种 RidgeClassifier

也叫，带有线性核的最小二乘支持向量机

使用Ridge回归的分类器，首先**将目标值转换为{-1, 1}**，然后将问题视为回归任务（在多类情况下为多输出回归）。

**预测类对应于回归预测的符号**

In [10]:
from sklearn.linear_model import RidgeClassifier

X_cls_train,X_cls_test,y_cls_train,y_cls_test = train_test_split(x_cls,y_cls,random_state=2022,test_size=0.2)

cls_ridge = RidgeClassifier(normalize=True)
cls_ridge.fit(X_cls_train,y_cls_train)
cls_ridge.coef_
print(f'训练集准确率:{cls_ridge.score(X_cls_train,y_cls_train)}') 


y_cls_pred = cls_ridge.predict(X_cls_test)
print(f'测试集准确率：{accuracy_score(y_cls_test,y_cls_pred)}')

训练集准确率:0.8583333333333333
测试集准确率：0.9


![20220706234704](https://cdn.jsdelivr.net/gh/xihuishawpy/PicBad@main/blogs/pictures/20220706234704.png)

In [11]:
# 预测样本的置信度分数
# 样本的置信度分数与该样本到超平面的有符号距离成正比
confidence_score = cls_ridge.decision_function([[4.5,3.1,2.0,0.35]])
print(confidence_score)

print(f'属于第{np.argmax(confidence_score)}类')

[[ 0.38820477 -0.40871633 -0.97948845]]
属于第0类


## RidgeCV 

RidgeCV 通过内置的alpha 参数的控制交叉验证来实现岭回归。 

该对象与 GridSearchCV 的使用方法相同，只是它默认为 Generalized Cross-Validation(广义交叉验证 GCV)，这是一种有效的留一交叉验证方法（LOO-CV）

In [12]:
from sklearn.linear_model import RidgeCV

reg_ridgecv = RidgeCV(alphas=np.logspace(-6,6,13),cv=10) 
reg_ridgecv.fit(X_reg_train,y_reg_train)
print(f'最佳alpha取值：{reg_ridgecv.alpha_}')
reg_ridgecv.coef_


print(f'训练集的r2:{reg_ridgecv.score(X_reg_train,y_reg_train)}')

y_reg_pred = reg_ridgecv.predict(X_reg_test)
print(f'测试集的r2:{r2_score(y_reg_test,y_reg_pred)}')

最佳alpha取值：1.0
训练集的r2:0.7547330246063915
测试集的r2:0.6120292876368366


## Lasso

Lasso是一个估计稀疏系数的线性模型。它在某些情况下是有用的，因为它倾向于给出非零系数较少的解，从而有效地减少了给定解所依赖的特征数。

线性回归 + L1正则项

![20220707002437](https://cdn.jsdelivr.net/gh/xihuishawpy/PicBad@main/blogs/pictures/20220707002437.png)

Lasso产生稀疏矩阵，可用来作特征选择，alpha参数越高，所选择的特征就越少。

In [13]:
from sklearn.linear_model import  Lasso, LassoCV

reg_lasso = Lasso(alpha=1.1)
reg_lasso.fit(X_reg_train,y_reg_train)

print(f'系数：{np.round(reg_lasso.coef_,5)} \n 截距：{reg_lasso.intercept_}')
print(f'训练集的r2:{reg.score(X_reg_train,y_reg_train)}')

y_reg_pred = reg_lasso.predict(X_reg_test)
print(f'测试集的r2:{r2_score(y_reg_test,y_reg_pred)}')

系数：[-0.06458  0.04446 -0.       0.      -0.       1.13055  0.02012 -0.71969
  0.27929 -0.01466 -0.7708   0.00871 -0.80275] 
 截距：[41.15011875]
训练集的r2:0.7560832029228641
测试集的r2:0.6460796009959273


In [25]:
reg_lassocv = LassoCV(cv=5,random_state=2022,alphas=np.arange(0,2,0.01))
reg_lassocv.fit(X_reg_train,y_reg_train)
print(f'最佳alpha：{reg_lassocv.alpha_}')

print(f'系数：{np.round(reg_lassocv.coef_,5)} \n 截距：{reg_lassocv.intercept_}')
print(f'训练集score：{reg_lassocv.score(X_reg_train,y_reg_train)}')

y_reg_pred = reg_lassocv.predict(X_reg_test)
print(f'测试集score：{r2_score(y_reg_test,y_reg_pred)}')

最佳alpha：0.0
系数：[-1.093600e-01  4.176000e-02  4.923000e-02  2.754840e+00 -1.374254e+01
  4.712430e+00 -1.197000e-02 -1.435220e+00  2.986700e-01 -1.236000e-02
 -8.955400e-01  1.035000e-02 -5.211400e-01] 
 截距：27.615023209546624
训练集score：0.7560832029228641
测试集score：0.6225687597000796


## 弹性网络 -- ElasticNet 

通过调整**l1_ratio**，调整L1\L2的组合

![20220707101826](https://cdn.jsdelivr.net/gh/xihuishawpy/PicBad@main/blogs/pictures/20220707101826.png)


In [31]:
from sklearn.linear_model import ElasticNet

reg_elsnet = ElasticNet(alpha=0.5,l1_ratio=0.5)
reg_elsnet.fit(X_reg_train,y_reg_train)

print(f'系数：{np.round(reg_elsnet.coef_,5)} \n 截距：{reg_elsnet.intercept_}')
print(f'训练集r2：{reg_elsnet.score(X_reg_train,y_reg_train)}')

y_reg_pred = reg_elsnet.predict(X_reg_test)
print(f'测试集r2:{r2_score(y_reg_test,y_reg_pred)}')

系数：[-0.09379  0.05137 -0.       0.      -0.       2.01367  0.00582 -1.05469
  0.33827 -0.01671 -0.85229  0.0097  -0.7472 ] 
 截距：[38.58675094]
训练集r2：0.7200146013972221
测试集r2:0.6388913772483735


## 最小角回归 -- Least-angle regression， LARS