# Vectorization
You try to get rid of for loops
$$
z = w^tx + b
$$

In [None]:
# non-vectorized
z = 0
x = [1,1,1,1,1,1,1]
w = [1,1,1,1,1,1,1]
b = 1
for i in range(len(x)):
    z += w[i] * x[i]
z+=b
z

In [None]:
# vectorized
import numpy as np
np.dot(x,w)+b

In [None]:
import time
x = np.random.rand(10000000)
w = np.random.rand(10000000)

In [None]:
start = time.time()
z = np.dot(x,w)+b
end = time.time()
print('vectorized version takes ', (end-start), ' seconds')
print(z)

In [None]:
start = time.time()
z = 0
for i in range(len(x)):
    z += w[i] * x[i]
z+=b
end = time.time()
print('un-vectorized version takes ', (end-start), ' seconds')
print(z)

In [None]:
time.time?

![figures/vectorization.png](figures/vectorization.png)

### Possible problems with  vectorization

In [None]:
import numpy as np
a = np.random.randn(3)
print(a)
print("\nProblem in Shape :", a.shape)
print('\tnot certain, \n\ttoo much felxibility')

### Use expilict non-ambigous shape for matrices

In [None]:
a = np.random.randn(3,1)
print(a)

In [None]:
print("\nShape :", a.shape)
print('\tcertain')

In [None]:
print(a.T)
print("\nShape :", a.T.shape)
print('\tcertain')

## Logistic Regression Cost Function
The loss function computes the error for a single training example; the cost function is the average of the loss functions of the entire training set.

$$
L(a,y) = - (y*log(a) - (1-y)*log(1-a))
$$

Cost function
$$J(w,b) = (1/m) * \sum(L(a[i],y[i]))$$

$z = w^tx + b$ and $a = \sigma(z) \in [0,1]$
 - $y=1 \Rightarrow L(a,y) = -log(a)$
 - $y=0 \Rightarrow L(a,y) = -log(1-a)$
 

In [None]:
a = np.arange(0.01,1,0.01)
L1 = -np.log(a)
L0 = -np.log(1-a)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(a,L1,)
plt.plot(a,L0)
plt.grid(True) 
plt.ylabel('Cost Function'); 
plt.xlabel('hypothesis outputs a')
plt.legend(("y=1", "y=0"), shadow=True, fancybox=True)

### Lets calculate the derivatives $\frac{dL}{dw}$ and $\frac{dL}{db}$



>"Derivative, what are you doing?"
> - "I am trying to make a difference" 
 

Output loss function
$$
\frac{dL}{da} = \frac{-y}{a} + \frac{1-y}{1-a}
$$
One step back
$$
\frac{dL}{dz} = \frac{dL}{da} \frac{da}{dz} = \frac{dL}{da} \sigma(z) (1-\sigma(z)) = \frac{dL}{da} a(1-a)
$$
if yo do the math
$$
\frac{dL}{dz} = \frac{dL}{da} a(1-a) = [\frac{-y}{a} + \frac{1-y}{1-a}]a(1-a) = -y(1-a) + (1-y)a = a -y
$$

One more step back
$$
\frac{dL}{dw} = \frac{dL}{da}\frac{da}{dz}\frac{dz}{dw} = (a-y)\frac{dz}{dw} = (a-y)x
$$

> after you calculate $\frac{dL}{dz} = (a-y)$ you just multiply it by x to get $\frac{dL}{dw}$

and also

$$
\frac{dL}{db} = \frac{dL}{da}\frac{da}{dz}\frac{dz}{db} = (a-y)\frac{dz}{db} = (a-y)
$$

In [None]:
a

![figures/logisticregression.png](figures/logisticregression.png)

In [None]:
#Data
x1 = np.array([1,1])
x2 = np.array([1,0])
x3 = np.array([0,1])
x4 = np.array([0,0])
#Parameters
b = 1
w = np.array([[1],
              [1]])

In [None]:
w.shape

In [None]:
# Data matrix
X = np.stack((x1,x2,x3,x4)).T
X

In [None]:
X.shape

In [None]:
Z = np.dot(w.T, X)
Z

In [None]:
# Broadcasting
Z += b
Z

In [None]:
# output based on threshold: AND operation
A = Z > 2
A = A.astype(int)
A

In [None]:
def uAnd(x, w = np.array([1,1]), b = 1):
    """outputs AND operation"""
    Z = np.dot(w.T, x) + b
    A = Z > 2
    return A.astype(int)

In [None]:
print(uAnd([1,1]))
print(uAnd([1,0]))
print(uAnd([0,1]))
print(uAnd([1,0]))

In [None]:
print(uAnd(X))

#### Question: How to make 'or'?
Use same w and b!!

![figures/LRimplementation.png](figures/LRimplementation.png)

In [41]:
# n=2: number of features 
# m=4: number of observations

# X: Data is a nxm matrix
#    Each column c of X[c,:] is another observation
#    Each row corresponds to a feature 
X = np.array([
        [1, 1, 0, 0,1,1,1,1],
        [1, 0, 1, 0,1,0,0,0]
    ])
#True labels for and operation
y = np.array([[1,0,0,0,1,0,0,0]])

# w is a column vector with shape (n,1)
w = np.array([[1],
              [1]])
# Bias
b = 1

print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)
print("Shape of w: ", w.shape)

Shape of X:  (2, 8)
Shape of y:  (1, 8)
Shape of w:  (2, 1)


In [42]:
n, m = X.shape# n: number of features and m: number of observations

Z = np.dot(w.T,X) + b     #Forward Pass: Vectorization Then broadcasting, Z shape is (1, m)
A = 1 / 1 + np.exp(-Z)    #Sigmoid     :Vectorization, A shape is (1, m)
dLdz = A - y              #Compute derivative : dLdz shape is (1, m)

# Parameters to be optimized: w, b
dLdw = np.dot(X,dLdz.T)/m #Vectorization, dw shape is (Nx, 1) 
dLdb = dLdz.sum()/m         #Vectorization, dz shape is (1, 1)

print("Shape of Z: ", Z.shape)
print("Shape of A: ", A.shape)
print("Shape of dLdz: ", dLdz.shape)
print("Shape of dLdw: ", dLdw.shape)
print("Shape of dLdb: ", dLdb.shape)
print(n,m)

Shape of Z:  (1, 8)
Shape of A:  (1, 8)
Shape of dLdz:  (1, 8)
Shape of dLdw:  (2, 1)
Shape of dLdb:  ()
2 8


In [43]:
print("Z: ", Z)
print("A: ", A)
print("dLdz: ", dLdz)
print("dLdw: ", dLdw)
print("dLdb: ", dLdb)

Z:  [[3 2 2 1 3 2 2 2]]
A:  [[ 1.04978707  1.13533528  1.13533528  1.36787944  1.04978707  1.13533528
   1.13533528  1.13533528]]
dLdz:  [[ 0.04978707  1.13533528  1.13533528  1.36787944  0.04978707  1.13533528
   1.13533528  1.13533528]]
dLdw:  [[ 0.58011441]
 [ 0.15436368]]
dLdb:  0.893016249261


In [67]:
import numpy as np
class logisticRegression():
    def __init__(self, X, y, alpha = 0.01):
        self.alpha = alpha # learning rate
        self.X, self.y = X, y
        self.n, self.m = X.shape # n: number of features and m: number of observations
        self.w = np.random.randn(n,1) # w is a column vector with shape (n,1)
        self.b = 0.5
        
    def fit(self, n_iterations=1000):
        Z = np.dot(self.w.T,self.X) + self.b     #Vectorized output Z with shape is (1, m)
        A = 1 / 1 + np.exp(-Z)    #Vectorized Sigmoid for A with shape is (1, m)
        dLdz = A - self.y              #Compute derivative : dLdz shape is (1, m)
        
        # Parameters to be optimized: w, b
        dLdw = np.dot(self.X,dLdz.T)/self.m #Vectorization, dw shape is (Nx, 1) 
        dLdb = dLdz.sum()/self.m         #Vectorization, dz shape is (1, 1)
        
        for i in range(n_iterations):
            self.w = self.w - self.alpha * dLdw
            self.b = self.b - self.alpha * dLdb
    
    def predict(self, newX):
        Z = np.dot(self.w.T,newX) + self.b 
        A = 1 / 1 + np.exp(-Z)
        return np.round(A)
        
        

In [68]:
lr = logisticRegression(X,y)
lr.fit()
lr.predict(np.array([[1,1,0,0],[1,1,0,0]]))

array([[  8.36521848e+13,   8.36521848e+13,   3.47697400e+06,
          3.47697400e+06]])

In [69]:
lr.w, lr.b

(array([[-10.94795233],
        [ -6.04806377]]), -15.061672570008051)

In [66]:
lr.predict(X)

array([[ 796.,  484.,   17.,   11.,  796.,  484.,  484.,  484.]])