# Logistic Regression Fundamental


### Step 1: Make prediction through forward propagation

In [351]:
import pandas as pd
import numpy as np

N = 100  # 100 samples
D = 2    # 2 dimensions/features

X = np.random.randn(N, D)

X[:50,:] = X[:50,:] - 2*np.ones((50, D))
X[50:,:] = X[50:,:] + 2*np.ones((50, D))

T = np.array([0]*50 + [1]*50)

print(X.shape)
ones = np.ones((N,1))
Xb = np.concatenate((X, ones), axis=1)
print(Xb.shape)

# randomly initialize weights
w = np.random.randn(D + 1)


(100, 2)
(100, 3)


In [206]:
def forward(X, W):
    '''
    Compute the output of the Logistic Regression Network
    '''
    return sigmoid(X.dot(W))

def sigmoid(z):
    return 1/(1+np.exp(-z))

In [207]:
Y = forward(Xb, w)
predictions = np.round(Y)
print(predictions)

[ 0.  0.  1.  0.  0.  1.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  1.  1.  1.  0.  0.
  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  1.  0.  0.  0.  0.  1.  0.  0.  0.]


In [208]:
def classification_rate(Y, P):
    return np.mean(Y==P)


print("score:", classification_rate(predictions, T))

score: 0.4


### Step 2: Training the Model

* Loss function for Logistic Regression: Cross Entropy Error:

$$ J = -{tlog(y) + (1-t)log(1-y)} $$

where t = target (actual label), y = output of logistic (predicted output)

* If t=1, only first term matters, if t=0, only second term matters
* Multiple Training Examples:
<img src="images/cross_entropy.png" alt="Drawing" style="width:60%;height:60%"/>


In [209]:
def cross_entropy(T, Y):
    J = 0;
    N = len(T)
    for i in range(N):
        if T[i] == 1:
            J-=np.log(Y[i])
        else:
            J-=np.log(1-Y[i])
    return J / N

# def cross_entropy(T, Y):
#     return -np.mean(T*np.log(Y) + (1-T)*np.log(1-Y))

In [210]:
print("Cross entropy:", cross_entropy(T, Y))

Cross entropy: 175.743019268


In [211]:
w_b = np.array([0,4,4]) # Bayes classifier weights
Y_ = forward(Xb, w_b)
print("Cross entropy:", cross_entropy(T, Y_))

Cross entropy: 26.2565022792


**Optimize the weights**
* Gradient descent throught back propragation
* Vectorized form:
<img src="images/vectorize_form.png" alt="Drawing" style="width:30%;height:30%"/>
* Vectorized matrix form:
    * X is N x D, X.T (tranpose X) is D x N 
    * Y, T are N x 1, (Y-T) is still N x 1
    * Multiply X.T (Y-T): 
        * shape is (D x N)(N x 1) --> (D x 1), which is the correct shape for W
        * N gets summed over
<img src="images/vectorize_matrix_form.png" alt="Drawing" style="width:30%;height:30%"/>
* Vectorized form for bias term:
<img src="images/vectorize_form_bias.png" alt="Drawing" style="width:57%;height:57%"/>

In [214]:
learning_rate = 0.1
epochs = 100

for i in range(epochs):
    if i % 10 == 0:
        print(i, cross_entropy(T, Y))
        
    # gradient descent through back propragation updating the weights
    w += learning_rate * Xb.T.dot(T - Y)
    
    # forward propagation calculating the output
    Y = sigmoid(Xb.dot(w))
 
print("final w:", w)

0 0.0890789325707
10 0.0882156405002
20 0.0875952355412
30 0.0871141456062
40 0.0867136110172
50 0.0863601223747
60 0.08603449975
70 0.0857257329599
80 0.0854274848448
90 0.0851360978369
final w: [  8.49223425  12.97322782  -1.11169253]


**Implement Logistic Regression fit function**

In [224]:
def fit(X, T, epochs=100, learning_rate=0.1):
    
    N, D = X.shape
    
    # add bias 
    ones = np.ones((N,1))
    Xb = np.concatenate((X, ones), axis=1)

    # randomly initialize weights
    w = np.random.randn(D + 1)
    
#     Y = forward(Xb, w)

    # training weights using gradient descent
    for i in range(epochs):
        Y = forward(Xb, w)
        if i % 10 == 0:
            print(i, cross_entropy(T, Y))
        
        w += learning_rate * Xb.T.dot(T - Y)
#         Y = forward(Xb, w)
 
    print("final w:", w)
 

0 276.637844572
10 1.18118374042
20 0.373551821885
30 0.190401454182
40 0.12386126001
50 0.0910378734668
60 0.0718393809707
70 0.059367537013
80 0.0506755978232
90 0.044307457976
final w: [ 14.28726757  19.63086634  -1.915264  ]


In [None]:
fit(X, T)

In [353]:
def fit_ridge(X, T, epochs=100, learning_rate=0.1, lamba=0.1):
    
    N, D = X.shape
    
    # add bias 
    ones = np.ones((N,1))
    Xb = np.concatenate((X, ones), axis=1)

    # randomly initialize weights
    w = np.random.randn(D + 1)
    
    Y = forward(Xb, w)

    # training weights using gradient descent
    for i in range(epochs):
        if i % 10 == 0:
            print(i, cross_entropy(T, Y))
        
        w += learning_rate * (Xb.T.dot(T - Y) - lamba * w)
#         w += learning_rate * (np.dot((T - Y).T, Xb) - lamba * w)
        Y = forward(Xb, w)
 
    print("final w:", w)


0 55.8385093915
10 0.00123981685528
20 0.0024845510581
30 0.00470411134372
40 0.00845191410794
50 0.014451426845
60 0.0235428591108
70 0.0365268816369
80 0.0538802169219
90 0.0754043690019
final w: [ 2.92456498  2.93457206  0.22886414]


In [None]:
fit_ridge(X, T, epochs=100)