In [78]:
import numpy as np

class LogisticRegressionSGD:
    """Binary logistic regression fitted with mini-batch stochastic gradient descent.

    Predictions are ``sigmoid(X @ weights)``; there is no separate bias term,
    so add a constant column to ``X`` if an intercept is needed.
    """

    def __init__(self, learning_rate = 0.01, batch_size = 10, max_iterations = 1000):
        """Store the hyper-parameters; no training happens here.

        Args:
            learning_rate: Step size applied to every mini-batch gradient.
            batch_size: Number of samples per mini-batch (the last batch of
                an epoch may be smaller).
            max_iterations: Number of full passes (epochs) over the training data.
        """
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.max_iterations = max_iterations
        # Weight vector of shape (d,); stays None until train() is called.
        self.weights = None
        # Cross-entropy on the full training set, appended once per epoch by train().
        self.loss_history = []

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        Evaluated through ``exp(-|z|)``, which always lies in (0, 1], so large
        negative inputs no longer overflow ``np.exp``.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        # z >= 0: 1 / (1 + e^-z)      z < 0: e^z / (1 + e^z)  (same value, no overflow)
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def error_function(self, y, t):
        """Return the mean binary cross-entropy between probabilities ``y`` and labels ``t``.

        A small constant (1e-9) is added inside each logarithm so that
        probabilities of exactly 0 or 1 do not produce ``log(0)``.
        """
        return -np.mean(t * np.log(y + 1e-9) + (1 - t) * np.log(1 - y + 1e-9))

    def train(self, X, t):
        """Fit ``self.weights`` on ``X`` / ``t`` with mini-batch SGD.

        Each epoch reshuffles the data, walks over it in mini-batches, and
        then records the full-training-set loss in ``self.loss_history``.
        Randomness comes from NumPy's global RNG (``np.random``); seed it
        beforehand for repeatable results.

        Args:
            X: 2-D array-like of shape (N, d).
            t: Array-like with N binary targets; any shape that flattens to
                length N (e.g. (N,) or (N, 1)) is accepted.

        Raises:
            ValueError: If the number of targets differs from the number of rows in ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten once so that neither the gradient nor the loss can silently
        # broadcast an (N, 1) target against (N,) predictions.
        t = np.asarray(t).reshape(-1)

        N, d = X.shape
        if t.shape[0] != N:
            raise ValueError(
                f"X has {N} rows but t has {t.shape[0]} targets"
            )

        self.weights = np.random.randn(d) * 0.01
        self.loss_history = []

        for _ in range(self.max_iterations):
            indices = np.random.permutation(N)
            X_random, t_random = X[indices], t[indices]

            for i in range(0, N, self.batch_size):
                X_batch = X_random[i:i + self.batch_size]
                t_batch = t_random[i:i + self.batch_size]

                y_batch = self.sigmoid(X_batch @ self.weights)

                # Gradient of the mean cross-entropy with respect to the weights.
                gradient = X_batch.T @ (y_batch - t_batch) / len(t_batch)

                self.weights -= self.learning_rate * gradient

            # Keep the per-epoch training loss so convergence can be inspected.
            epoch_loss = self.error_function(self.sigmoid(X @ self.weights), t)
            self.loss_history.append(float(epoch_loss))

    def predict_probability(self, X):
        """Return the predicted probability of class 1 for every row of ``X``."""
        return self.sigmoid(X @ self.weights)

    def predict(self, X):
        """Return hard 0/1 labels by thresholding the predicted probability at 0.5."""
        return (self.predict_probability(X) >= 0.5).astype(int)
        


In [79]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [80]:
# Load scikit-learn's built-in breast cancer dataset: feature matrix and 0/1 targets.
data = load_breast_cancer()
X = data.data
y = data.target

# First split: 70% training, 30% held out (stratified on the label).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
# Second split: the held-out 30% is halved into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Standardise features using statistics from the training set only, then
# apply the same transformation to every partition.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

# Class counts over training + validation labels.
unique, counts = np.unique(np.concatenate([y_train, y_val]), return_counts=True)
class_size = {label: count for label, count in zip(unique, counts)}
class_size


{0: 180, 1: 303}

In [81]:
# LogisticRegressionSGD.train draws its initial weights and per-epoch shuffles
# from NumPy's global RNG, so seed it to make the reported metrics repeatable
# under Restart & Run All.
np.random.seed(42)

model = LogisticRegressionSGD(learning_rate = 0.1, batch_size = 20, max_iterations = 1000)
model.train(X_train, y_train)

# Evaluate on the held-out test split only.
y_pred = model.predict(X_test)

# sklearn's precision/recall/F1 treat label 1 as the positive class by default.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

accuracy, precision, recall, f1

(0.9767441860465116, 0.9642857142857143, 1.0, 0.9818181818181818)

The model correctly classified 97.7% of the test data.
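Precision was 96.4%: of the cases the model predicted as benign (the positive class, label 1), 96.4% were actually benign. The remaining predictions were malignant cases misclassified as benign (false positives).
Recall was 100%: there were zero false negatives, so every benign case was identified as benign. Because benign is the positive class, this does not mean that every malignant tumor was detected; the false positives above are exactly the missed malignant cases.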
The F1 score was 98.18%, which means the model achieved both high precision and high recall on the benign class and was highly reliable on this test set.