# Section 1 : 数据预处理

In [4]:
import pandas as pd
import numpy as np
# 首先将缺失值中？表示替换为nan
data = pd.read_csv("breast-cancer-wisconsin.data",names=["Sample code number","Clump Thickness","Uniformity of Cell Size","Uniformity of Cell Shape","Marginal Adhesion","Single Epithelial Cell Size","Bare Nuclei","Bland Chromatin","Normal Nucleoli","Mitoses","Class"])
data.replace("?", np.nan , inplace = True)
data["Bare Nuclei"] = data["Bare Nuclei"].astype("float")
cleaned_data = data.dropna()
normalized_data = pd.DataFrame()
def Standard_Score(arr):
    mean = np.mean(arr)
    std = np.std(arr)
    return ((arr - mean)/std)
for (columnName, columnData) in cleaned_data.iteritems():
    normalized_data[columnName] = Standard_Score(columnData)
normalized_data["Sample code number"] = cleaned_data["Sample code number"]
normalized_data["Class"] = cleaned_data["Class"]
normalized_data.to_csv("normalized-breast-cancer-wisconsin.data",header=False,index=False,sep=',')
display(normalized_data)

X = normalized_data.iloc[:,1:-1]
#Class 一列中，4表明患癌症，2表示不换癌症。转化成1-0
y = normalized_data["Class"].replace([4,2],[1,0]).values

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,0.197905,-0.702212,-0.741774,-0.639366,-0.555608,-0.698853,-0.181827,-0.612927,-0.348400,2
1,1002945,0.197905,0.277252,0.262783,0.758032,1.695166,1.772867,-0.181827,-0.285105,-0.348400,2
2,1015425,-0.511643,-0.702212,-0.741774,-0.639366,-0.555608,-0.424217,-0.181827,-0.612927,-0.348400,2
3,1016277,0.552679,1.583204,1.602192,-0.639366,-0.105454,0.125054,-0.181827,1.354008,-0.348400,2
4,1017023,-0.156869,-0.702212,-0.741774,0.059333,-0.555608,-0.698853,-0.181827,-0.612927,-0.348400,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,-0.511643,-0.702212,-0.741774,-0.639366,-0.105454,-0.424217,-0.998853,-0.612927,-0.348400,2
695,841769,-0.866417,-0.702212,-0.741774,-0.639366,-0.555608,-0.698853,-0.998853,-0.612927,-0.348400,2
696,888820,0.197905,2.236180,2.271896,0.059333,1.695166,-0.149582,1.860738,2.337476,0.229166,4
697,897471,-0.156869,1.583204,0.932487,0.408682,-0.105454,0.125054,2.677764,1.026185,-0.348400,4


# Section 2 : 分割训练集和测试集
逻辑回归通过定义$\beta = \begin{bmatrix}w\\b \end{bmatrix},\hat{\mathbf{x}}=\begin{bmatrix}\mathbf{x}\\\mathbb{1}\end{bmatrix}$ ，将 $\mathbf{ \omega}^\top \mathbf{x} +{ b}$ 简写为 $\mathbf{\beta}^\top\hat{\mathbf{x}}$, 所以通过给$\mathbf{X}$加一列全1向量变成增广矩阵$\hat{\mathbf{X}}$

In [5]:
X_hat = np.concatenate((X.values,np.ones([X.shape[0],1])),axis = 1)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=20)
X_hat_train = np.concatenate((X_train.values,np.ones([X_train.shape[0],1])),axis = 1)
X_hat_test = np.concatenate((X_test.values,np.ones([X_test.shape[0],1])),axis = 1)

# Section 3 : 梯度下降法估计Logistic Regression模型的参数$\beta$
$$
\ell(\boldsymbol{\beta})=\sum_{i=1}^{m}\left(-y_{i} \boldsymbol{\beta}^{\mathrm{T}} \hat{\boldsymbol{x}}_{i}+\ln \left(1+e^{\boldsymbol{\beta}^{\mathrm{T}} \hat{\boldsymbol{x}}_{i}}\right)\right)
$$
$$
\boldsymbol{\beta}^{*}=\underset{\boldsymbol{\beta}}{\arg \min } \ell(\boldsymbol{\beta})
$$

### 对于梯度下降法而言：
$$
\boldsymbol{\beta}^{t+1}=\boldsymbol{\beta}^{t}- s \frac{\partial \ell(\boldsymbol{\beta})}{\partial \boldsymbol{\beta}}\\

\nabla \ell =\hat{\mathbf{X}}^{\top}(\boldsymbol{\mu}-\mathbf{y})
$$
### 其中：
$$\boldsymbol{\mu}=\left(\mu_{1}, \ldots, \mu_{n}\right)^{\mathrm{T}}$$
$$
\mu_{i}=\frac{1}{1+\exp \left(-\boldsymbol{\beta}^{\mathrm{T}} \hat{\mathbf{x}_{i}}\right)}, i=1, \ldots, n$$

In [6]:
#梯度下降法
def target_function(X_hat,beta,y):
    result = 0
    for i in range(X_hat.shape[0]):
        result += -y[i]* beta.T @ X_hat[i,:] + np.log( 1+ np.exp(beta.T @ X_hat[i,:]))
    return result

def mu(X_hat,beta,y):
    mu = []
    for i in range(X_hat.shape[0]):
        mu.append(float(1/(1+np.exp(-beta.T @ X_hat[i,:]))))
    mu = np.array(mu)
    return mu

def gradient(X_hat,beta,y):
    m = mu(X_hat,beta,y)
    return X_hat.T @ (m - y)

def Gradient_Decent(X_hat,y,step,eps):
    beta = np.zeros(X_hat.shape[1])
    t=0 #计数器 
    err = np.inf
    while err > eps and t < 1e6:
        original_lx = target_function(X_hat,beta,y)
        beta=beta-step*gradient(X_hat,beta,y)
        err = abs(target_function(X_hat,beta,y) - original_lx)
        t += 1
    return beta,t,target_function(X_hat,beta,y)
GD_beta,GD_t,GD_target = Gradient_Decent(X_hat_train , y_train , step = 1e-2, eps = 1e-6)
print( "使用梯度下降法迭代次数为:{t} \n目标函数最优值为：{target} \n最优解为：{beta}\n".format(t = GD_t,target = GD_target,beta = GD_beta))

使用梯度下降法迭代次数为:358 
目标函数最优值为：30.526983644244435 
最优解为：[ 1.24668391  1.61365038  1.55144388  0.50037237  0.29338002  1.73905281
  0.72092771  0.44772339  0.72605074 -0.71245936]



# Section 4 : 牛顿法估计Logistic Regression模型的参数$\beta$
$$
\ell(\boldsymbol{\beta})=\sum_{i=1}^{m}\left(-y_{i} \boldsymbol{\beta}^{\mathrm{T}} \hat{\boldsymbol{x}}_{i}+\ln \left(1+e^{\boldsymbol{\beta}^{\mathrm{T}} \hat{\boldsymbol{x}}_{i}}\right)\right)
$$
$$
\boldsymbol{\beta}^{*}=\underset{\boldsymbol{\beta}}{\arg \min } \ell(\boldsymbol{\beta})
$$
### 对于牛顿法而言：
$$
\boldsymbol{\beta}^{t+1}=\boldsymbol{\beta}^{t}-\left(\frac{\partial^{2} \ell(\boldsymbol{\beta})}{\partial \boldsymbol{\beta} \partial \boldsymbol{\beta}^{\mathrm{T}}}\right)^{-1} \frac{\partial \ell(\boldsymbol{\beta})}{\partial \boldsymbol{\beta}}=\boldsymbol{\beta}^{t} - \mathbf{H} \nabla \ell
$$
### 其中：
$$\boldsymbol{\mu}=\left(\mu_{1}, \ldots, \mu_{n}\right)^{\mathrm{T}}$$
$$
\mu_{i}=\frac{1}{1+\exp \left(-\boldsymbol{\beta}^{\mathrm{T}} \hat{\mathbf{x}_{i}}\right)}, i=1, \ldots, n$$
$$\nabla \ell=\hat{\mathbf{X}}^{\mathrm{T}}(\boldsymbol{\mu}-\mathbf{y})$$
$$\mathbf{H}=\hat{\mathbf{X}}^{\mathrm{T}} \mathbf{S} \hat{\mathbf{X}}$$
$$\mathbf{S}=\operatorname{diag}\left(\mu_{1}\left(1-\mu_{1}\right), \ldots, \mu_{n}\left(1-\mu_{n}\right)\right) $$

In [7]:
def Hessian(X_hat,beta,y):
    S = np.zeros([X_hat.shape[0],X_hat.shape[0]])
    m = mu(X_hat,beta,y)
    for i in range(X_hat.shape[0]):
        S[i,i] = m[i]*(1-m[i])
    return X_hat.T @ S @ X_hat

def Newton_Method(X_hat,y,eps):
    beta = np.zeros(X_hat.shape[1])
    t=0 #计数器 
    err = np.inf
    while err > eps and t < 1e6:
        original_lx = target_function(X_hat,beta,y)
        beta=beta - np.linalg.inv(Hessian(X_hat,beta,y)) @ gradient(X_hat,beta,y)
        err = abs(target_function(X_hat,beta,y) - original_lx)
        t += 1
    return beta,t,target_function(X_hat,beta,y)

newton_beta,newton_t,newton_target = Newton_Method(X_hat_train , y_train , eps = 1e-6)
print( "使用牛顿法迭代次数为:{t} \n目标函数最优值为：{target} \n最优解为：{beta}\n".format(t = newton_t,target = newton_target,beta = newton_beta))

使用牛顿法迭代次数为:9 
目标函数最优值为：30.526898976202812 
最优解为：[ 1.24547846  1.60078209  1.56240374  0.50129893  0.29353511  1.73881993
  0.72137369  0.44653756  0.72704434 -0.71371476]



# Section 5 : Sklearn 求解 Logistic Regression 模型的参数 $\beta$

In [8]:
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression().fit(X_train,y_train)
sklearn_beta = np.concatenate((logistic_regression.coef_.flatten(),logistic_regression.intercept_))
print("目标函数最优值为：{target} \n使用sklearn模型得到的参数beta为：{beta}\n".format(target = target_function(X_hat,sklearn_beta,y),beta = sklearn_beta))

目标函数最优值为：56.26488479882792 
使用sklearn模型得到的参数beta为：[ 1.09769359  1.15232508  1.2333593   0.51589683  0.40528952  1.46062189
  0.69647669  0.48455336  0.60369775 -0.78794857]



# Section 6 : 对比各种回归方式异同
将梯度下降法和牛顿法得到的系数放入sklearn对象中，方便使用sklearn中函数进行对比

In [9]:
gradient_decent = LogisticRegression().fit(X_train,y_train)
gradient_decent.coef_ = GD_beta[:-1].reshape(1,GD_beta.shape[0]-1)
gradient_decent.intercept_ = GD_beta[-1]
newton = LogisticRegression().fit(X_train,y_train)
newton.coef_ = newton_beta[:-1].reshape(1,newton_beta.shape[0]-1)
newton.intercept_ = newton_beta[-1]


### 对比回归系数 $ \beta$

In [10]:
print("GD方法系数：{gd}\nNewton方法系数：{nt}\nSklearn方法系数：{sk}\n".format(gd = GD_beta,nt = newton_beta,sk = sklearn_beta))

GD方法系数：[ 1.24668391  1.61365038  1.55144388  0.50037237  0.29338002  1.73905281
  0.72092771  0.44772339  0.72605074 -0.71245936]
Newton方法系数：[ 1.24547846  1.60078209  1.56240374  0.50129893  0.29353511  1.73881993
  0.72137369  0.44653756  0.72704434 -0.71371476]
Sklearn方法系数：[ 1.09769359  1.15232508  1.2333593   0.51589683  0.40528952  1.46062189
  0.69647669  0.48455336  0.60369775 -0.78794857]



### 对比准确率

In [11]:
GD_score = gradient_decent.score(X_train,y_train) #由于eps设置的小，所以测试集算出准确率相同
newton_score = newton.score(X_train,y_train)
sk_score = logistic_regression.score(X_train,y_train)
print("GD方法准确率：{gd}\nNewton法准确率：{nt}\nSklearn方法准确率：{sk}\n".format(gd = GD_score,nt = newton_score,sk = sk_score))

GD方法准确率：0.9761904761904762
Newton法准确率：0.9761904761904762
Sklearn方法准确率：0.978021978021978



### 结论
由上述结论可知，模型跟迭代次数有一定关系:

牛顿法的目标函数值是最小的，效果是最好的。

sklearn的目标函数值是最大的，效果最差。
