In [15]:
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
%matplotlib inline
import gc
gc.collect()

16

In [16]:
# Breast cancer dataset (binary classification problem).
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()

# Keep only the first 10 feature columns.
data = cancer.data[:, :10]
target = cancer.target

print(data.shape)
print(target.shape)

# Split BEFORE scaling so that the held-out samples cannot influence the
# scaler's min/max (fitting the scaler on the full dataset leaks test-set
# information into preprocessing).
# No random_state is passed, so the split depends on NumPy's global RNG state.
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

# Min-max scaling, fitted on the training split only and then applied to both.
# Author's note: without scaling the x values are too large, p1_xb becomes
# overly sensitive to the initial beta, and inverting the Hessian
# (dldl_dbetadbeta) hits a singular matrix so the step cannot be computed.
# Newton's method is quite restrictive and unstable in practice.
ss = MinMaxScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

# Show the dimensions of each split.
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(569, 10)
(569,)
(455, 10)
(114, 10)
(455,)
(114,)


In [17]:
class LogisticRegression:
    """Binary logistic regression fitted with Newton's method.

    The model is p(y=1 | x) = sigmoid(beta . [x, 1]). The feature weights and
    the bias are stored together in ``self.beta`` (bias last); ``self.beta``
    is ``None`` until ``fit`` has been called.
    """

    def __init__(self):
        self.beta = None

    @staticmethod
    def _add_bias_column(x):
        """Return ``x`` as a float array with a trailing column of ones."""
        x = np.asarray(x, dtype=float)
        return np.c_[x, np.ones(x.shape[0])]

    @staticmethod
    def _sigmoid(a):
        """Element-wise logistic function that cannot overflow in ``exp``.

        Equivalent to ``1 / (1 + exp(-a))`` for ``a >= 0`` and to
        ``exp(a) / (1 + exp(a))`` for ``a < 0``.
        """
        # exp(-|a|) is always <= 1, so both branches stay finite.
        e = np.exp(-np.abs(a))
        return np.where(a >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

    def fit(self, x, y, n_iters=100):
        """Estimate ``self.beta`` by running Newton iterations.

        Parameters
        ----------
        x : array-like, one row per sample.
        y : array-like of 0/1 labels, one per row of ``x``.
        n_iters : int, maximum number of Newton updates (default 100).

        Raises
        ------
        ValueError
            If ``x`` and ``y`` do not contain the same number of samples.

        Notes
        -----
        ``self.beta`` is initialised from NumPy's global random generator.
        Iteration stops early if the Hessian is singular, leaving ``beta`` at
        its last value.
        """
        X = self._add_bias_column(x)
        y = np.asarray(y, dtype=float).ravel()
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                "x and y must contain the same number of samples "
                "(got %d and %d)" % (X.shape[0], y.shape[0])
            )

        # Newton's method, following the "watermelon book" referenced by the
        # original comments (Eq. 3.29-3.31).
        self.beta = np.random.random((X.shape[1],))

        for _ in range(n_iters):
            p1_xb = self._sigmoid(X.dot(self.beta))

            # Sum over samples of x_i * (y_i - p1_i)  (Eq. 3.30, negated).
            dl_dbeta = X.T.dot(y - p1_xb)
            # Hessian: sum over samples of x_i x_i^T * p1_i * (1 - p1_i)
            # (Eq. 3.31).
            dldl_dbetadbeta = (X.T * (p1_xb * (1.0 - p1_xb))).dot(X)

            try:
                step = np.linalg.solve(dldl_dbetadbeta, dl_dbeta)
            except np.linalg.LinAlgError:
                # A singular Hessian usually means the gradient is already
                # tiny, so treat it as converged. beta would not change any
                # more, hence further iterations would fail the same way.
                break

            # Eq. 3.29: beta <- beta - H^{-1} * (-dl_dbeta).
            self.beta += step

    def predict(self, x):
        """Return the predicted class (0 or 1) for every row of ``x``.

        A sample is assigned class 1 when its estimated probability of being
        class 1 is strictly greater than 0.5.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if self.beta is None:
            raise ValueError("fit must be called before predict")

        X = self._add_bias_column(x)
        y1_proba = self._sigmoid(X.dot(self.beta))  # probability of class 1
        return (y1_proba > 0.5).astype(int)         # probability -> class
        
        
def accuracy_score(y, y_hat):
    """Return the fraction of positions where ``y`` and ``y_hat`` agree.

    Parameters
    ----------
    y : array-like of true labels.
    y_hat : array-like of predicted labels, same shape as ``y``.

    Returns
    -------
    float
        Number of matching entries divided by ``len(y)``.

    Raises
    ------
    ValueError
        If ``y`` and ``y_hat`` have different shapes.
    ZeroDivisionError
        If ``y`` is empty.
    """
    # Convert first: comparing two plain lists with == yields a single bool,
    # not an element-wise result.
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)
    if y.shape != y_hat.shape:
        raise ValueError(
            "y and y_hat must have the same shape (got %s and %s)"
            % (y.shape, y_hat.shape)
        )
    return np.count_nonzero(y == y_hat) / len(y)
    


In [18]:
# Hand-written NumPy implementation: fit on the training split with at most
# 100 Newton iterations, then predict labels for the test split.
lr = LogisticRegression()
lr.fit(x_train, y_train, n_iters=100)
y_hat = lr.predict(x_test)

# Last expression of the cell: displays the test-set accuracy.
accuracy_score(y_test, y_hat)

0.9473684210526315

In [19]:
# Reference run with scikit-learn's logistic regression ('newton-cg' solver),
# scored with the notebook's own accuracy_score for comparison.
# NOTE(review): scikit-learn presumably applies its default regularisation
# here, so this may not be a like-for-like comparison - confirm.
from sklearn import linear_model
# Rebinds `lr` and `y_hat`, replacing the values from the hand-written run.
lr = linear_model.LogisticRegression(max_iter=100,solver='newton-cg')
lr.fit(x_train, y_train)
y_hat = lr.predict(x_test)
print(accuracy_score(y_test, y_hat))


0.9298245614035088
