In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import ShuffleSplit

def sigmoid(w, x):
    """Evaluate the logistic function on the linear scores ``np.dot(x, w)``.

    Parameters
    ----------
    w : array_like
        Weights, combined with ``x`` through ``np.dot(x, w)``.
    x : array_like
        Inputs (the design matrix in this notebook).

    Returns
    -------
    numpy.ndarray
        ``1 / (1 + exp(-z))`` evaluated element-wise for ``z = np.dot(x, w)``.
    """
    z = np.dot(x, w)
    # exp(-|z|) always lies in (0, 1] and therefore cannot overflow, whereas
    # the textbook exp(-z) overflows (RuntimeWarning) for large negative z.
    e = np.exp(-np.abs(z))
    # For z >= 0 use 1 / (1 + e^-z); for z < 0 use the algebraically equal
    # e^z / (1 + e^z). Both branches are built from the same safe `e`.
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

def cross_entropy_loss(y, yhat, eps=1e-15):
    """Summed binary cross-entropy between labels and predicted probabilities.

    Parameters
    ----------
    y : array_like
        Labels; used as the 0/1 weights of the two log terms.
    yhat : array_like
        Predicted probabilities.
    eps : float, optional
        Probabilities are clipped to ``[eps, 1 - eps]`` before the logarithm
        is taken. Values already inside that interval are left unchanged.

    Returns
    -------
    float
        ``-sum(y * log(yhat) + (1 - y) * log(1 - yhat))`` over all elements.
    """
    # Without clipping, a saturated probability (exactly 0 or 1) produces
    # log(0) = -inf, and 0 * -inf = nan, which poisons the whole sum.
    yhat = np.clip(yhat, eps, 1 - eps)
    return -np.sum(y * np.log(yhat)
                   + (1 - y) * np.log(1 - yhat))

def gradient(x, y, yhat):
    """Return the negated product of ``x.T`` with the residual ``y - yhat``.

    This is the gradient of the summed cross-entropy loss with respect to the
    weights when ``yhat`` is the sigmoid of ``x . w``.
    """
    residual = y - yhat
    return np.negative(x.T.dot(residual))

def hessian(x, yhat):
    """Return ``(yhat * (1 - yhat) * x).T . x``.

    Each row of ``x`` is scaled by ``yhat * (1 - yhat)`` (via broadcasting)
    before the transposed result is multiplied with ``x``.
    """
    weights = yhat * (1 - yhat)
    weighted_x = weights * x
    return weighted_x.T.dot(x)

def newtons_method(X_train, y_train, convergence=1e-12, max_iter=100):
    """Fit logistic-regression weights with Newton's method.

    Parameters
    ----------
    X_train : numpy.ndarray
        Design matrix; one weight is fitted per column.
    y_train : numpy.ndarray
        Training labels. Presumably a column of shape ``(n_samples, 1)`` so
        that the gradient has the same shape as the weights -- verify against
        the caller.
    convergence : float, optional
        Iteration stops once the change in cross-entropy loss between two
        consecutive iterations is no longer greater than this value.
    max_iter : int, optional
        Upper bound on the number of Newton steps, so a fit that never meets
        the tolerance cannot loop forever.

    Returns
    -------
    numpy.ndarray
        Weight column of shape ``(X_train.shape[1], 1)``.
    """
    # Size the weights from the data instead of a notebook-level global, so
    # the function does not depend on which cell happened to run first.
    w = np.zeros((X_train.shape[1], 1))
    error = np.inf

    for _ in range(max_iter):
        yhat_train = sigmoid(w, X_train)
        g = gradient(X_train, y_train, yhat_train)
        h = hessian(X_train, yhat_train)

        # Solve h * step = g directly rather than forming the inverse of h.
        w -= np.linalg.solve(h, g)

        # The loss is evaluated on the predictions made before this step.
        error_new = cross_entropy_loss(y_train, yhat_train)
        delta = error - error_new
        error = error_new

        # Written as `not (... > ...)` so a nan delta also ends the loop.
        if not abs(delta) > convergence:
            break

    return w
        
def accuracy(y, yhat):
    """Fraction of predictions that match the labels.

    Probabilities strictly above 0.5 are classified as positive; the number
    of element-wise matches with ``y`` is divided by ``len(yhat)``.
    """
    predicted = yhat > 0.5
    matches = (y == predicted)
    return matches.sum() / len(predicted)

In [4]:
%%time

# Column layout of the raw file: sample id, nine features, class label.
names = ['id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'malignancy']
df = pd.read_csv('breast-cancer-wisconsin.data.txt', names=names)
df['bias'] = 1 # dummy variable for bias

# Labels are coded 4 (malignant) / 2 (not malignant). An explicit comparison
# yields a boolean column directly instead of relying on the dtype that
# chained int -> bool `replace` calls happen to infer.
y = df['malignancy'].eq(4).values.reshape(len(df), 1)

# Kept at module level because newtons_method may read it as a global.
n_features = len(names) - 2 # exclude id and gold standard
feature_cols = names[1:-1]
X = df[['bias'] + feature_cols] # bias and feature columns
X = X.replace('?', np.nan) # '?' marks a missing value in the raw file
X = X.astype(float)
# NOTE(review): column means are computed over all rows, i.e. before the
# train/test split below, so test rows contribute to the imputed values.
X = X.fillna(X.mean())
X = X.values

# Ten random 80/20 splits; the seed is fixed so the reported mean is
# reproducible.
accuracies = []
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
for train, test in ss.split(X):
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]

    w = newtons_method(X_train, y_train)

    yhat_test = sigmoid(w, X_test)
    accuracies.append(accuracy(y_test, yhat_test))

print('Accuracy: %.4f' % np.mean(np.array(accuracies)))

Accuracy: 0.9643
Wall time: 64 ms
