In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [115]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    No intercept is added automatically: include a constant column in the
    design matrix if one is wanted. ``params`` holds the coefficient vector
    (one entry per design-matrix column).
    """

    def __init__(self, params=None):
        """Stores optional coefficients; note that ``fit`` resets them to zeros."""
        self.params = params

    @staticmethod
    def _as_array(data):
        """Converts a DataFrame, Series or other array-like to a float NumPy array."""
        return np.asarray(data, dtype=float)

    def sigmoid(self, x):
        """Applies the logistic sigmoid element-wise without overflowing.

        ``np.where`` evaluates both of its branches for every element, so the
        exponential is only ever taken of ``-|x|``; this keeps it in (0, 1]
        for any finite input instead of overflowing in the unused branch.
        """
        x = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(x))
        return np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

    def negative_log_likelihood(self, X, Y):
        """Computes the negative log-likelihood of labels ``Y`` given ``X``.

        Uses the identity ``-[y*log(s) + (1-y)*log(1-s)] = log(1 + e^z) - y*z``
        (``s = sigmoid(z)``) so that saturated predictions do not produce
        ``log(0)`` and a NaN loss.
        """
        logits = X @ self.params
        return np.sum(np.logaddexp(0, logits) - Y * logits)

    def gradient(self, X, Y):
        """Computes the gradient of the negative log-likelihood function."""
        logits = X @ self.params
        errors = self.sigmoid(logits) - Y
        return X.T @ errors

    def default_learning_rate(self, X):
        """Computes a default step size from the current predictions.

        Returns ``1 / sum(p * (1 - p))`` where ``p`` are the predicted
        probabilities, i.e. the reciprocal of the summed Hessian weights.
        The feature magnitudes in ``X`` enter only through the logits, so
        this is a scalar curvature proxy rather than the full Hessian.
        Falls back to ``1e-2`` when that sum is not above ``1e-10``.
        """
        probs = self.sigmoid(X @ self.params)
        curvature = np.sum(probs * (1 - probs))
        if curvature > 1e-10:
            return 1 / curvature
        return 1e-2

    def fit(self, X_train, Y_train, learning_rate=None, max_iter=10000, verbose=False):
        """Fits the model by gradient descent, starting from all-zero parameters.

        Args:
            X_train: design matrix (DataFrame or 2-D array-like), one row per sample.
            Y_train: 0/1 labels (Series or array-like), one per sample.
            learning_rate: step size; when ``None`` the value returned by
                ``default_learning_rate`` at the zero initialisation is used.
            max_iter: number of gradient steps to take.
            verbose: when true, prints the loss every 100 iterations and once
                more after the last step.
        """
        X = self._as_array(X_train)
        # Flatten so a single-column 2-D label input cannot broadcast to (n, n).
        Y = self._as_array(Y_train).ravel()
        self.params = np.zeros(X.shape[1])  # Initialize parameters

        if learning_rate is None:
            learning_rate = self.default_learning_rate(X)

        for iteration in range(max_iter):
            self.params -= learning_rate * self.gradient(X, Y)  # Gradient descent update

            # Print the loss every 100 iterations if verbosity mode is active
            if verbose and iteration % 100 == 0:
                loss = self.negative_log_likelihood(X, Y)
                print(f"Iteration {iteration}: Loss = {loss:.4f}")
        if verbose:
            print(f"Final iteration: Loss = {self.negative_log_likelihood(X, Y):.4f}")

    def predict_probs(self, X_train):
        """Predicts the probability of label 1 for each row of the input data."""
        return self.sigmoid(self._as_array(X_train) @ self.params)

    def predict(self, X_train, threshold=0.5):
        """Predicts class labels (0/1) for input data with a given threshold."""
        return (self.predict_probs(X_train) >= threshold).astype(int)

In [118]:
def test(logistic_regression_model, n_samples=1000, n_columns=2, true_params=None, 
         learning_rate=None, max_iter=1000, random_seed=None, verbose=False):
    """Function testing the accuracy of the logistic regression class"""
    random_state = np.random.default_rng(random_seed)
    if verbose:
        print(f"Shape of training data: ({n_samples}, {n_columns})")
    X_train = pd.DataFrame(random_state.normal(loc=0, scale=5, size=(n_samples, n_columns)))
    X_train["constant"] = 1
    if true_params is None:
        true_params = random_state.normal(loc=0, scale=5, size=n_columns+1)
    model = logistic_regression_model()
    Y_train = pd.Series((random_state.uniform(size=n_samples) < model.sigmoid(X_train.values @ true_params)).astype(int))
    model.fit(X_train, Y_train, learning_rate=learning_rate, max_iter=max_iter)
    if verbose:
        print(f"Correct parameters: {true_params}, Fitted parameters: {model.params}")
        
    predictions = model.predict(X_train)
    accuracy = np.mean(predictions == Y_train)
    if verbose:
        print(f"Model's accuracy:{accuracy}")
    return accuracy

In [119]:
# Single verbose run with known coefficients so the fitted parameters can be
# compared against the ground truth by eye.
test(
    LogisticRegression,
    n_samples=1000,
    n_columns=2,
    max_iter=10000,
    true_params=[5, -2, 3],
    random_seed=2,
    verbose=True,
)

Shape of training data: (1000, 2)
Correct parameters: [5, -2, 3], Fitted parameters: [ 4.41870322 -1.7140476   2.82231737]
Model's accuracy:0.972


np.float64(0.972)

In [120]:
# Randomised evaluation: each run draws its own problem size (100-999 samples,
# 1-9 feature columns) from a generator seeded with the run index, fits the
# model on synthetic data, and records the training accuracy.
test_numer = 100
accuracies = []
for i in range(test_numer):
    random_state = np.random.default_rng(i)
    n_samples = random_state.integers(100, 1000)
    n_columns = random_state.integers(1, 10)
    run_accuracy = test(
        LogisticRegression,
        n_samples=n_samples,
        n_columns=n_columns,
        max_iter=1000,
        random_seed=i,
        verbose=False,
    )
    accuracies.append(run_accuracy)
print(np.mean(accuracies))

0.9879059179357647
