# Assignment 11: Logistic Regression and kNN

## Data Preprocessing

In [35]:
import pandas
import numpy as np

labels_and_feats  = pandas.read_csv("data.csv", header=None, usecols=range(1, 32)).to_numpy()
np.random.shuffle(labels_and_feats)

label_map = {
    'B': 0,
    'M': 1,
}
labels = np.vectorize(lambda x: np.float32(label_map[x]))(labels_and_feats[:, 0])

feats = np.vectorize(lambda x: np.float32(x))(labels_and_feats[:, 1:31])

for i in range(feats.shape[1]):
    col = feats[:, i]
    feats[:, i] = (col - np.mean(col)) / np.std(col)

num_recs = feats.shape[0]
num_train = int(0.8 * num_recs)
num_test = num_recs - num_train

X_train = feats[0:num_train]
y_train = labels[0:num_train]

X_test = feats[num_train:][:num_test]
y_test = labels[num_train:][:num_test]

## Logistic Regression

In [36]:
def sigmoid(z: np.ndarray) -> np.ndarray:
    return 1 / (1 + np.exp(-z))

def initialize_weights(num_feats: float) -> np.ndarray:
    return np.random.random(num_feats) - 0.5

def compute_cost(Xs: np.ndarray, ys: np.ndarray, W: np.ndarray, b: float) -> float:
    y_preds = predict_logistic_regression(Xs, W, b)
    return np.sum(-ys * np.log(y_preds) - (1 - ys) * np.log(1 - y_preds)), (np.count_nonzero(np.abs(ys - y_preds) < 0.5) / Xs.shape[0])

def optimize_weights(Xs: np.ndarray, ys: np.ndarray, W: np.ndarray, b: float, lr: float) -> None:
    y_preds = predict_logistic_regression(Xs, W, b)
    dy_preds = (1 - ys) / (1 - y_preds) - ys / y_preds

    z = np.dot(W, Xs.T) + b
    dzs = sigmoid(z) * (1 - sigmoid(z)) * dy_preds

    W -= lr * np.dot(dzs, Xs)
    b -= lr * np.sum(dzs)

def train_logistic_regression(Xs: np.ndarray, ys: np.ndarray, lr: float, num_epochs: int) -> None:
    W = initialize_weights(Xs.shape[1])
    b = 0

    for epoch in range(1, num_epochs + 1):
        optimize_weights(Xs, ys, W, b, lr)
        cost, acc = compute_cost(Xs, ys, W, b)

        print(f"Epoch {epoch:02d}/{num_epochs}: Cost: {cost:.2f}, Acc: {acc:.2f}")

    return W, b

def predict_logistic_regression(Xs: np.ndarray, W: np.ndarray, b: float) -> np.ndarray:
    z = np.dot(W, Xs.T) + b
    return sigmoid(z)

W, b = train_logistic_regression(X_train, y_train, 0.001, 20)

test_acc = np.count_nonzero(np.abs(y_test - predict_logistic_regression(X_test, W, b)) < 0.5) / y_test.shape[0]
print(f"Test Acc: {test_acc:.2f}")

Epoch 01/20: Cost: 147.94, Acc: 0.88
Epoch 02/20: Cost: 123.72, Acc: 0.89
Epoch 03/20: Cost: 109.40, Acc: 0.91
Epoch 04/20: Cost: 99.63, Acc: 0.92
Epoch 05/20: Cost: 92.49, Acc: 0.92
Epoch 06/20: Cost: 87.00, Acc: 0.93
Epoch 07/20: Cost: 82.61, Acc: 0.93
Epoch 08/20: Cost: 78.99, Acc: 0.93
Epoch 09/20: Cost: 75.95, Acc: 0.93
Epoch 10/20: Cost: 73.34, Acc: 0.94
Epoch 11/20: Cost: 71.08, Acc: 0.94
Epoch 12/20: Cost: 69.08, Acc: 0.95
Epoch 13/20: Cost: 67.31, Acc: 0.95
Epoch 14/20: Cost: 65.73, Acc: 0.95
Epoch 15/20: Cost: 64.29, Acc: 0.95
Epoch 16/20: Cost: 62.99, Acc: 0.95
Epoch 17/20: Cost: 61.81, Acc: 0.95
Epoch 18/20: Cost: 60.71, Acc: 0.95
Epoch 19/20: Cost: 59.71, Acc: 0.95
Epoch 20/20: Cost: 58.77, Acc: 0.95
Test Acc: 0.99


## kNN

In [39]:
def euclidean_distance(X1: np.ndarray, X2: np.ndarray) -> float:
    return np.sum(np.square(X1 - X2)) ** 0.5

def get_neighbors(X_train: np.ndarray, X_test_instance: np.ndarray, k: int) -> np.ndarray:
    return np.argsort(np.apply_along_axis(lambda X: euclidean_distance(X_test_instance, X), 1, X_train))[:k]

def predict_kNN(X_train: np.ndarray, y_train: np.ndarray, X_test: np.ndarray, k: int) -> np.ndarray:
    labels_from_idxs = np.vectorize(lambda idx: y_train[idx])
    max_count_label_from_unique_counts = lambda unique_counts: unique_counts.values[np.argsort(unique_counts.counts)[-1]]
    return np.apply_along_axis(lambda X: max_count_label_from_unique_counts(np.unique_counts(labels_from_idxs(get_neighbors(X_train, X, k)))), 1, X_test)

for k in [1, 3, 5, 7]:
    test_acc = np.count_nonzero((predict_kNN(X_train, y_train, X_test, k) - y_test) == 0) / y_test.shape[0]
    print(f"k={k}: Test Acc: {test_acc:.2f}")

k=1: Test Acc: 0.96
k=3: Test Acc: 0.97
k=5: Test Acc: 0.99
k=7: Test Acc: 0.99


## Comparison and Analysis

- Both models perform very well, perhaps because the dataset is linearly separable (the logit of the logistic regression model, and the euclidean distance of the kNN model would benefit from this).
- For the kNN model, we can observe that even small values of k (3 and 5), perform well, however, the larger values (5 and 7) perform even better.
- Logistic Regression
    - Strengths
        - Computationally cheap
        - Model weights provide interpretability
    - Weaknesses
        - May diverge during training
        - Poor accuracy on multi-class prediction tasks
- kNN
    - Strengths
        - Matter of convergence/divergence is not an issue
        - Decent accuracy on multi-class prediction tasks
    - Weaknesses
        - Computationally expensive
        - Difficult to interpret