## Data Source

Breast Cancer Wisconsin (Diagnostic) Data Set, from Kaggle: https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data

## Implementation

Logistic regression trained with batch gradient descent, based on `main-titanic-vectorized.ipynb`.

In [31]:
import pandas as pd
import numpy as np

data = pd.read_csv('./data/data.csv')
# Drop every column that contains at least one missing value.
data = data.dropna(axis=1)
n_columns = len(data.columns)

# Columns 0 and 1 are skipped; everything from column 2 onward is used as a feature.
feature_columns = data.columns[2:n_columns]
features = data[feature_columns].astype(float)

# Min-max normalize all features to [0, 1] in one vectorized step.
# NOTE: the minima/maxima are taken over the full dataset, i.e. before any train/test split.
feature_min = features.min()
feature_range = features.max() - feature_min
# A constant column has zero range; divide by 1 instead so it maps to 0 rather than NaN.
feature_range = feature_range.where(feature_range != 0, 1.0)
data[feature_columns] = (features - feature_min) / feature_range

X = data[feature_columns].to_numpy()
# Prepend a column of ones so the first weight acts as the intercept term.
X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
# Encode the label: 'M' -> 1, anything else -> 0.
y = np.where(data['diagnosis'].to_numpy() == 'M', 1, 0)
X.shape, y.shape

((569, 31), (569,))

In [32]:
# Shuffle the row indices with a fixed seed, then cut them 80% / 20% into train / test.
np.random.seed(0)
n_samples = X.shape[0]
indices = np.random.permutation(n_samples)
n_train = int(n_samples * 0.8)

train_idx = indices[:n_train]
test_idx = indices[n_train:]

X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

In [33]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy scalar or np.ndarray
        Element-wise sigmoid of ``z``, with the same shape as ``z``.
    """
    z = np.asarray(z)
    # exp(-|z|) never exceeds 1, so it cannot overflow the way exp(-z) does for very negative z.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: the algebraically equal exp(z) / (1 + exp(z)).
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves real arrays untouched.
    return result[()]

class LossFunction:
    """Binary cross-entropy objective of a logistic-regression model on a fixed dataset.

    Parameters
    ----------
    X : array-like
        Design matrix; it is multiplied by the weight vector, so it is expected
        to be 2-D with one row per sample.
    y : array-like
        Labels, one per row of ``X``; expected to be 0 or 1.
    """

    def __init__(self, X, y):
        self.X = np.asarray(X)
        self.y = np.asarray(y)

    def loss(self, a : np.ndarray):
        """Return the summed (not averaged) cross-entropy for weight vector ``a``."""
        z = np.dot(self.X, a)
        # -y*log(s(z)) - (1-y)*log(1-s(z)) simplifies to log(1 + exp(z)) - y*z.
        # logaddexp(0, z) evaluates log(1 + exp(z)) without overflow, so saturated
        # predictions no longer produce log(0) = -inf, 0 * inf = nan, or an infinite loss.
        # A plain sum is used on purpose: NaNs coming from bad inputs are not silently dropped.
        return np.sum(np.logaddexp(0, z) - self.y * z)

    def gradient(self, a : np.ndarray):
        """Return the gradient of ``loss`` with respect to ``a``: X^T (sigmoid(X a) - y)."""
        prob = sigmoid(np.dot(self.X, a))
        return np.dot(self.X.T, prob - self.y)

    def precision(self, a : np.ndarray):
        """Return the fraction of samples classified correctly at a 0.5 threshold.

        Despite its name this is classification accuracy, not precision in the
        TP / (TP + FP) sense; the name is kept for existing callers.
        """
        prob = sigmoid(np.dot(self.X, a))
        predicted = (prob >= 0.5).astype(np.int32)
        return np.sum(predicted == self.y) / self.y.shape[0]

# Objective evaluated on the training split.
loss_func = LossFunction(X=X_train, y=y_train)


In [34]:
def gradient_descent(loss_func, starting_point, learning_rate = 0.001, num_steps = 800, precision=0.00001, log_every=1):
    """Minimize ``loss_func`` with fixed-step batch gradient descent.

    Parameters
    ----------
    loss_func : object
        Must provide ``gradient(a)``; ``loss(a)`` and ``precision(a)`` are only
        called when progress is logged.
    starting_point : np.ndarray
        Initial weight vector.
    learning_rate : float
        Step size applied to the gradient.
    num_steps : int
        Maximum number of iterations.
    precision : float
        Stopping tolerance on the gradient's Euclidean norm. Unrelated to
        ``loss_func.precision`` despite the shared name.
    log_every : int
        Print progress every ``log_every`` iterations. The default of 1 prints
        every iteration; 0 or None disables logging and skips the extra
        loss/precision evaluations.

    Returns
    -------
    np.ndarray
        The weight vector after the last update.
    """
    cur_point = starting_point
    for i in range(num_steps):
        grad = loss_func.gradient(cur_point)
        if log_every and i % log_every == 0:
            print("Iteration {}: loss = {}, precision = {}".format(i, loss_func.loss(cur_point), loss_func.precision(cur_point)))
        cur_point = cur_point - learning_rate * grad
        # The step above is still taken when the gradient is already below tolerance.
        if np.linalg.norm(grad) < precision:
            break
    return cur_point

# Start from the all-zero weight vector, one weight per column of X_train.
optimal = gradient_descent(loss_func, starting_point=np.zeros(X_train.shape[1]))

Iteration 0: loss = 315.3819671547751, precision = 0.36923076923076925
Iteration 1: loss = 309.08937159031245, precision = 0.6351648351648351
Iteration 2: loss = 303.40784199196673, precision = 0.6527472527472528
Iteration 3: loss = 298.09108824029846, precision = 0.6791208791208792
Iteration 4: loss = 293.0329964129459, precision = 0.7142857142857143
Iteration 5: loss = 288.18547635115397, precision = 0.7626373626373626
Iteration 6: loss = 283.5244890248445, precision = 0.778021978021978
Iteration 7: loss = 279.03602567940436, precision = 0.7934065934065934
Iteration 8: loss = 274.71032801656247, precision = 0.8285714285714286
Iteration 9: loss = 270.53951063080365, precision = 0.8483516483516483
Iteration 10: loss = 266.51658665887453, precision = 0.8505494505494505
Iteration 11: loss = 262.6350707399034, precision = 0.8615384615384616
Iteration 12: loss = 258.88881832566085, precision = 0.865934065934066
Iteration 13: loss = 255.27196084418875, precision = 0.8681318681318682
Iterati

In [35]:
# Evaluate the trained weights on the held-out test split.
# A separate name is used so the training objective `loss_func` is not overwritten:
# re-running the training cell afterwards would otherwise fit on the test data.
test_loss_func = LossFunction(X_test, y_test)
print("Test precision: {}".format(test_loss_func.precision(optimal)))

Test precision: 0.9912280701754386
