### 1. 理论知识

### 2. 数据准备

#### 2.1 导入数据集

In [1]:
# 导入模块
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

# Chinese display names for the four iris measurement columns,
# keyed by the column names that scikit-learn ships with the dataset.
map_ = {
    'sepal length (cm)': '花萼长度',
    'sepal width (cm)': '花萼宽度',
    'petal length (cm)': '花瓣长度',
    'petal width (cm)': '花瓣宽度',
}

# Load the dataset once; `iris.data` holds the measurements and
# `iris.target` the integer class labels.
iris = load_iris()

# Feature frame: build it with the original column names, then rename.
X = pd.DataFrame(iris.data, columns=iris.feature_names).rename(columns=map_)

# Label frame with a single "label" column.
y = pd.DataFrame({"label": iris.target})

# Display the renamed feature frame as the cell output.
X

Unnamed: 0,花萼长度,花萼宽度,花瓣长度,花瓣宽度
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


#### 2.2 数据处理

**注意！！一定要对数据进行归一化，统一量纲，不然梯度下降大概率无法收敛！！**

In [2]:
from sklearn.preprocessing import Normalizer

# Normalise the features before running gradient descent.
# NOTE(review): Normalizer rescales each sample (row) to unit norm; it does not
# put the feature columns on a common scale. If per-feature scaling was the
# intent, MinMaxScaler or StandardScaler would be the match -- confirm.
X_1 = Normalizer().fit_transform(X)

# Append a column of ones so the intercept can be absorbed into the weights.
X_1 = np.hstack((X_1, np.ones((len(X_1), 1))))

# Convert y to a NumPy array so that `@` and `.dot` work in later cells.
# np.asarray is used instead of `y.values` so the cell can be re-run safely:
# on a second run y is already an ndarray, which has no `.values` attribute.
y = np.asarray(y)
X_1

array([[0.80377277, 0.55160877, 0.22064351, 0.0315205 , 1.        ],
       [0.82813287, 0.50702013, 0.23660939, 0.03380134, 1.        ],
       [0.80533308, 0.54831188, 0.2227517 , 0.03426949, 1.        ],
       [0.80003025, 0.53915082, 0.26087943, 0.03478392, 1.        ],
       [0.790965  , 0.5694948 , 0.2214702 , 0.0316386 , 1.        ],
       [0.78417499, 0.5663486 , 0.2468699 , 0.05808704, 1.        ],
       [0.78010936, 0.57660257, 0.23742459, 0.0508767 , 1.        ],
       [0.80218492, 0.54548574, 0.24065548, 0.0320874 , 1.        ],
       [0.80642366, 0.5315065 , 0.25658935, 0.03665562, 1.        ],
       [0.81803119, 0.51752994, 0.25041771, 0.01669451, 1.        ],
       [0.80373519, 0.55070744, 0.22325977, 0.02976797, 1.        ],
       [0.786991  , 0.55745196, 0.26233033, 0.03279129, 1.        ],
       [0.82307218, 0.51442011, 0.24006272, 0.01714734, 1.        ],
       [0.8025126 , 0.55989251, 0.20529392, 0.01866308, 1.        ],
       [0.81120865, 0.55945424, 0.

### 3. 算法实现

#### 3.1 梯度下降
**Softmax 回归的交叉熵损失是凸函数，但其最优解没有闭式（解析）表达式，因此无法用正规方程求解，需要用梯度下降等迭代方法**

In [3]:
# Set up softmax regression: problem dimensions, an initial weight matrix and a
# one-hot label matrix with one row per sample and one column per class
# (e.g. 180 samples with 3 classes give a 180 x 3 matrix).

# Number of distinct classes
n_classes = np.unique(y).shape[0]

# Number of samples
n_samples = X_1.shape[0]

# Number of features (columns of X_1, including the appended column of ones)
n_features = X_1.shape[1]

# Initialise the weight matrix with shape (n_features, n_classes).
# NOTE: the draw is unseeded, so the values differ between kernel runs.
A = np.random.rand(n_features, n_classes)

# One-hot encode the labels. y is a single-column 2-D array, so indexing
# np.eye with it directly would produce shape (n_samples, 1, n_classes);
# flattening first gives the intended (n_samples, n_classes) matrix.
y_onehot = np.eye(n_classes)[np.asarray(y).ravel()]

In [4]:
# Softmax forward pass: turn the linear scores X_1 @ A into per-class
# probabilities, one row per sample.
logits = X_1 @ A

# Subtract each row's maximum before exponentiating. The resulting
# probabilities are mathematically unchanged, but np.exp can no longer
# overflow for large scores. The exponentials are also computed only once.
exp_scores = np.exp(logits - logits.max(axis=1, keepdims=True))

# Per-row normaliser (sum of the shifted exponentials).
sum_softmax = np.sum(exp_scores, axis=1, keepdims=True)
y_pre = exp_scores / sum_softmax
y_pre

array([[0.18543378, 0.4676689 , 0.34689733],
       [0.18334177, 0.47045038, 0.34620785],
       [0.18520107, 0.46772924, 0.34706968],
       [0.18393531, 0.47357605, 0.34248864],
       [0.18612465, 0.46755672, 0.34631863],
       [0.18491017, 0.46856256, 0.34652727],
       [0.18567741, 0.46779098, 0.34653161],
       [0.1847003 , 0.47071302, 0.34458668],
       [0.18370992, 0.47282155, 0.34346853],
       [0.18371281, 0.47426795, 0.34201924],
       [0.18536777, 0.46826702, 0.34636521],
       [0.18465356, 0.4737778 , 0.34156864],
       [0.18384025, 0.47269294, 0.34346681],
       [0.18639764, 0.46665979, 0.34694257],
       [0.18717898, 0.46004545, 0.35277556],
       [0.1873079 , 0.46208867, 0.35060343],
       [0.18653864, 0.45999846, 0.3534629 ],
       [0.18512776, 0.46593537, 0.34893687],
       [0.18421712, 0.46967675, 0.34610614],
       [0.18613981, 0.46705715, 0.34680304],
       [0.18318871, 0.47386768, 0.34294361],
       [0.18536835, 0.46574783, 0.34888382],
       [0.