In [818]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score

In [819]:
# Load the breast-cancer dataset; later cells read its `.data` (features) and `.target` (labels).
data = load_breast_cancer()

In [820]:
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Switch to a column-per-sample layout: feature matrices become
# (n_features, n_samples) and label vectors become (1, n_samples) rows.
X, X_train, X_test = X.T, X_train.T, X_test.T
y, y_train, y_test = (labels.reshape(1, -1) for labels in (y, y_train, y_test))

# One shape per line, in the order: X_train, X_test, y_train, y_test.
print(*(arr.shape for arr in (X_train, X_test, y_train, y_test)), sep="\n")

(30, 455)
(30, 114)
(1, 455)
(1, 114)


In [821]:
# X was transposed in the data-prep cell, so its shape unpacks as
# (number of features, number of samples).
feature_num, m_total = X.shape
# Initial parameters: an all-zero (feature_num, 1) weight column and a zero bias.
w = np.zeros((feature_num, 1))
b = 0.

In [822]:
def sigmoid(z):
    """Element-wise logistic function 1 / (1 + exp(-z)), computed without overflow.

    Parameters
    ----------
    z : scalar or array-like of numbers
        Pre-activation value(s).

    Returns
    -------
    Same shape as ``z`` (a NumPy scalar for scalar input), with values in [0, 1].

    Notes
    -----
    The naive formula evaluates ``exp(-z)``, which overflows (and emits a
    RuntimeWarning) once ``z`` is a large negative number. Here only
    ``exp(-|z|)`` is ever evaluated, which always lies in (0, 1], and the
    algebraically equivalent branch is selected per element:

    * z >= 0:  1 / (1 + exp(-z))
    * z <  0:  exp(z) / (1 + exp(z))
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # never overflows: the exponent is <= 0
    out = np.where(z >= 0, 1. / (1. + decay), decay / (1. + decay))
    # Indexing with an empty tuple turns a 0-d result back into a NumPy scalar
    # and is a no-op view for arrays with one or more dimensions.
    return out[()]

def calc_z(w, b, X):
    """Return the linear pre-activation ``w.T @ X + b``.

    ``w`` is multiplied (transposed) against ``X`` and the bias ``b`` is
    added to the product by broadcasting.
    """
    projection = np.matmul(w.T, X)
    return projection + b

def loss(w, b, X, y_true, eps=1.e-15):
    """Mean binary cross-entropy of the model (w, b) on (X, y_true).

    ``eps`` is added inside both logarithms so that an activation of exactly
    0 or 1 does not produce ``log(0)``.
    """
    activation = sigmoid(calc_z(w, b, X))
    positive_term = y_true * np.log(activation + eps)
    negative_term = (1 - y_true) * np.log(1 - activation + eps)
    return -np.mean(positive_term + negative_term)

def calc_loss_der(w, b, X, y):
    """Return ``(dw, db)``, the gradients used by the gradient-descent update.

    Both are built from the residual ``sigmoid(w.T @ X + b) - y``:
    ``dw`` is ``X @ residual.T`` divided by the size of X's second axis,
    and ``db`` is the mean of the residual.
    """
    _, sample_count = X.shape
    residual = sigmoid(calc_z(w, b, X)) - y
    grad_w = np.matmul(X, residual.T) / sample_count
    grad_b = np.mean(residual)
    return grad_w, grad_b

def predict(w, b, X, threshold=0.5):
    """Return integer 0/1 labels: 1 where the sigmoid activation exceeds ``threshold``.

    The comparison is strict, so an activation exactly equal to
    ``threshold`` is labelled 0.
    """
    activation = sigmoid(calc_z(w, b, X))
    above_threshold = activation > threshold
    return above_threshold.astype(int)

In [823]:
def regress(w, b, X_train, y_train, alpha=0.0001, epoch=10, verbose=True):
    """Fit logistic-regression parameters with plain batch gradient descent.

    Parameters
    ----------
    w, b : initial weights and bias. They are NOT modified; updated copies
        are returned instead.
    X_train, y_train : training inputs and labels, passed straight through
        to ``loss`` and ``calc_loss_der``.
    alpha : learning rate (step size).
    epoch : number of gradient-descent steps to take.
    verbose : when true, print the training loss after every step.

    Returns
    -------
    (w, b, history) where ``history`` holds the training loss before the
    first step followed by the loss after each step (``epoch + 1`` entries).
    """
    history = [loss(w, b, X_train, y_train)]
    for _ in range(epoch):
        dw, db = calc_loss_der(w, b, X_train, y_train)
        # Rebind rather than use `-=`: the augmented form updates a NumPy
        # array in place, which would silently overwrite the caller's
        # initial parameters.
        w = w - alpha * dw
        b = b - alpha * db
        history.append(loss(w, b, X_train, y_train))
        if verbose:
            print(history[-1])
    return w, b, history


In [824]:
def score(w, b, X_train, X_test, y_train, y_test):
    """Print the accuracy of the model (w, b) on the training split, then the test split."""
    splits = (
        ('train accuracy:', X_train, y_train),
        ('test accuracy:', X_test, y_test),
    )
    for label, features, targets in splits:
        y_pred = predict(w, b, features)
        # accuracy_score is given flattened 1-D label vectors.
        print(label, accuracy_score(targets.flatten(), y_pred.flatten()))

In [825]:
# Run 100 gradient-descent steps with a very small learning rate and no
# per-step printing, then report train/test accuracy.
# NOTE(review): `w` and `b` are rebound to the trained values here, so
# re-running this cell continues training from the previous result rather
# than from the zero initialisation.
w, b, history = regress(w, b, X_train, y_train, alpha=0.000003, epoch=100, verbose=False)
score(w, b ,X_train, X_test, y_train, y_test)

train accuracy: 0.9032967032967033
test accuracy: 0.9122807017543859


For comparison, fit scikit-learn's built-in `LogisticRegression` on the same train/test split and compare its accuracy with the results above.

In [830]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier with scikit-learn's default settings.
clf = LogisticRegression()
# NOTE(review): this rebinds X, y and the four split variables to the arrays
# as returned by the dataset / train_test_split, i.e. WITHOUT the transpose
# and reshape applied in the earlier data-prep cell. The hand-written
# functions above compute `w.T @ X`, so they should not be called with these
# variables after this cell has run.
X = data.data
y = data.target
# Same test_size and random_state as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
clf.fit(X_train, y_train)

LogisticRegression()

In [831]:
# Accuracy of the scikit-learn model on the training split, then the test split.
# `y_pred` is reused, so after this cell it holds the test-split predictions.
y_pred = clf.predict(X_train)
print('train accuracy:', accuracy_score(y_train, y_pred))
y_pred = clf.predict(X_test)
print('test accuracy:', accuracy_score(y_test, y_pred))

train accuracy: 0.945054945054945
test accuracy: 0.9473684210526315
