# Logistic regression

- Newton's method

In [233]:
import numpy as np

# NOTE(review): `path` is not defined in this cell; it must already be in scope
# (presumably a helper that resolves data file locations — confirm).
raw_X = np.loadtxt(path('data/logistic_x.txt')) # m x n
raw_y = np.loadtxt(path('data/logistic_y.txt')) # m labels

# Recode the labels to {0, 1}: a label equal to 1 stays 1, anything else becomes 0.
y = (raw_y == 1).astype(int)

# Intercept column of ones. It is sized from raw_X because X does not exist
# yet when this cell runs on a fresh kernel.
x_0 = np.ones((raw_X.shape[0], 1)) # (m, 1)
X = np.concatenate((x_0, raw_X), axis=1) # m, n+1

theta = np.zeros(X.shape[1])

print(X[:5])
print(y)
print(theta)

[[ 1.          1.3432504  -1.3311479 ]
 [ 1.          1.8205529  -0.6346681 ]
 [ 1.          0.98632067 -1.8885762 ]
 [ 1.          1.9443734  -1.635452  ]
 [ 1.          0.97673352 -1.3533151 ]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[0. 0. 0.]


## Newton's method

Update rule: $\theta := \theta - H^{-1} \nabla_{\theta} \ell(\theta)$

Partial derivative vector and Hessian:
$$
\begin{aligned}
\nabla_{\theta} \ell (\theta)_{j} &= \sum_{i = 1}^m (y^{(i)} - h_{\theta}(x^{(i)}))x_j^{(i)} \\
\\
H_{kj} &= \frac{\partial^2 \ell(\theta)}{\partial \theta_k \partial \theta_j} \\
&= -\sum_{i = 1}^m x_j^{(i)} x_k^{(i)} g(\theta^T x^{(i)}) (1 - g(\theta^T x^{(i)}))
\end{aligned}
$$

In [247]:
def sigmoid(z):
    '''Vectorized, numerically stable logistic function 1 / (1 + exp(-z)).

    z: scalar or array-like of real values.

    Returns the element-wise sigmoid of z.
    '''
    # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))). logaddexp evaluates
    # log(exp(0) + exp(-z)) without forming exp(-z) directly, so very negative
    # z no longer triggers an overflow warning on the way to 0.
    return np.exp(-np.logaddexp(0, np.negative(z)))

def hypothesis(theta, X):
    '''Model predictions for every row of the design matrix X.

    Applies the sigmoid to the linear scores X @ theta and returns the
    resulting vector (one entry per row of X).
    '''
    linear_scores = X @ theta # one score per training example
    return sigmoid(linear_scores)

def partials(theta, X, y):
    '''Gradient of the log-likelihood with respect to theta.

    Returns a vector with one partial derivative per column of X.
    '''
    errors = y - hypothesis(theta, X)
    # Entry j is the sum over all training examples of error_i * X[i, j].
    return errors @ X

def hessian(theta, X, y):
    '''Hessian of the logistic log-likelihood with respect to theta.

    Computes H = -X^T D X, where D is diagonal with entries h_i * (1 - h_i)
    and h = hypothesis(theta, X).

    theta: parameter vector.
    X: design matrix, one row per training example.
    y: labels; not used by the computation, kept so the signature matches
       partials().

    Returns a square matrix with one row and column per column of X.
    '''
    h = hypothesis(theta, X) # evaluate the model once and reuse it
    weights = h * (1 - h) # m-vector of sigmoid derivatives
    # Scaling the columns of X.T by the weights equals X.T @ diag(weights)
    # without materialising the dense m x m diagonal matrix.
    # The leading minus sign is essential: this is the Hessian of the
    # log-likelihood (which is being maximised), not of a loss.
    return -(X.T * weights) @ X

# for y = {-1, 1}
# def cost(theta, X, y):
#     m = X.shape[0]
#     return np.sum(np.log(1 + np.exp(-y * (X @ theta)))) / m

# def log_likelihood(theta, X, y):
    

In [248]:
# Sanity check at the current theta: gradient, Hessian, then predictions,
# each printed on its own line(s).
print(
    partials(theta, X, y),
    hessian(theta, X, y),
    hypothesis(theta, X),
    sep='\n',
)

[ -2.         101.14803662 118.76289677]
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0.
 0. 0. 0.]


In [249]:
def newton(theta, X, y, threshold = 0, max_iter = 15):
    '''Fit logistic regression parameters with Newton's method.

    Repeats the update theta := theta - pinv(H) @ gradient, where H and the
    gradient come from hessian() and partials().

    theta: initial parameter vector; it is copied, never modified in place.
    X: design matrix, one row per training example.
    y: labels passed through to partials() and hessian().
    threshold: iteration stops once the Euclidean norm of the last step drops
        below this value. The default of 0 disables early stopping, so
        exactly max_iter steps are taken.
    max_iter: maximum number of Newton steps.

    Prints theta after every step and returns the final parameter vector.
    '''
    # Work on a float copy so the caller's array is left untouched.
    theta = np.array(theta, dtype=float)

    delta = np.inf # size of the most recent step; inf forces a first pass
    iterations = 0

    while delta >= threshold and iterations < max_iter:
        args = (theta, X, y)
        # pinv keeps the step defined even when the Hessian is singular.
        step = np.linalg.pinv(hessian(*args)) @ partials(*args)
        theta = theta - step
        delta = np.linalg.norm(step)
        iterations += 1
        print(theta)

    return theta

In [250]:
# Start Newton's method from the zero vector and keep the fitted parameters.
theta = newton(np.zeros(X.shape[1]), X, y)
print(theta)

[-1.50983811  0.43509696  0.62161752]
[-2.21834632  0.64372727  0.95944716]
[-2.55431051  0.74137714  1.13493588]
[-2.61847133  0.75979248  1.1707512 ]
[-2.62050954  0.76037096  1.17194549]
[-2.6205116   0.76037154  1.17194674]
[-2.6205116   0.76037154  1.17194674]
[-2.6205116   0.76037154  1.17194674]
[-2.6205116   0.76037154  1.17194674]
[-2.6205116   0.76037154  1.17194674]
[-2.6205116   0.76037154  1.17194674]
[-2.6205116   0.76037154  1.17194674]
[-2.6205116   0.76037154  1.17194674]
[-2.6205116   0.76037154  1.17194674]
[-2.6205116   0.76037154  1.17194674]
[-2.6205116   0.76037154  1.17194674]
[-2.6205116   0.76037154  1.17194674]
