# Logistic regression

In [13]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
def sigmoid(X, theta):
    return 1/(1+np.exp(-X@theta))

In [3]:
def loglikelihood(theta, X, y):
    return -np.sum(y*np.log(sigmoid(X, theta)) + (1-y)*np.log(1-sigmoid(X, theta)))/X.shape[0]

In [4]:
df = pd.read_csv("../data/BreastCancer.csv")
df.drop(labels=["id", "Unnamed: 32"], axis=1, inplace=True)

# Change the labels M and B to 1 and 0 respectively
df.replace("M", 1, inplace=True)
df.replace("B", 0, inplace=True)

In [5]:
# Create feature vector X and class vector y
X = df.drop(labels=['diagnosis'], axis=1).to_numpy()
y = df['diagnosis'].to_numpy()

# Split the data into a 80% training and 20% test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
def gradient_descent(X_train, y_train, learning_rate, num_iterations, threshold=1e-3, blog=False):
    """
    Use gradient descent to optimize regression parameters theta in order to find the best straight
    line for the given points

    :param points: points to fit the line
    :param learning_rate: learning rate used in the algorithm
    :param num_iterations: maximum number of iterations
    :param threshold: minimum difference between two sequential mean squared
        error values (default is 1e-3)
    :param blog: indicates to print the cost at every step (default is False)
    """

    # Init values
    theta = np.zeros(X_train.shape[1])
    m = X_train.shape[0]
    iteration = 0

    J = loglikelihood(theta, X_train, y_train)
    prev_J = np.inf

    # Loop until convergence or maximum number of iterations is reached
    while iteration < num_iterations and np.all(np.abs(J - prev_J) > threshold):

        new_thetas = np.zeros(len(theta))
        prev_J = J

        # Compute new theta's using the gradient
        for j, theta_j in enumerate(theta):
            new_thetas[j] = theta_j - learning_rate / m * np.sum(
                (sigmoid(X_train, theta) - y_train) * X_train[:, j]
            )

        theta = new_thetas
        # Compute new MSE
        J = loglikelihood(theta, X_train, y_train)

        if blog:
            print(f"{iteration}: {J}")

        iteration += 1

    return theta, J


In [8]:
X_train_scaled = StandardScaler().fit_transform(X_train)

theta_opt, J = gradient_descent(X_train_scaled, y_train, 1, 1e3, blog=True)

0: 0.1839334736126247
1: 0.14762858425386402
2: 0.13029026266271282
3: 0.12237317931385867
4: 0.11728024735654992
5: 0.11328934457317402
6: 0.10994902145089983
7: 0.10706906108956514
8: 0.10454225068132868
9: 0.10229786748414307
10: 0.10028512556386737
11: 0.09846573660598545
12: 0.09680993490853435
13: 0.09529407991787624
14: 0.09389908793533164
15: 0.09260934913436224
16: 0.09141195138863642
17: 0.09029610889576402
18: 0.08925273290649562
19: 0.08827410391658468


In [12]:
X_test_scaled = StandardScaler().fit_transform(X_test)

y_pred = [0 if h < 0.5 else 1 for h in sigmoid(X_test_scaled, theta_opt)]
accuracy = np.sum(y_pred == y_test)/y_test.shape[0]

print(accuracy)

0.9824561403508771
