In [7]:
# Here I am going to try to answer two questions I had
# The example uses logistic classification for determining examples x to be 0 or 1
#1. Can theta still predict 1 well even though the examples labeled 0 keep bringing
#   the middle theta down, and how different is updating theta per example vs updating theta per whole training set?
#2. Why does it work?


In [8]:
import numpy as np

In [12]:
# The single training example that is paired with label 1.
ones = np.array([[0, 1, 0]])
# Three identical copies of the training example that is paired with label 0.
zeros = np.repeat(np.array([[1, 1, 0]]), 3, axis=0)

In [13]:
ones

array([[0, 1, 0]])

In [14]:
zeros

array([[1, 1, 0],
       [1, 1, 0],
       [1, 1, 0]])

In [15]:
# Training matrix: the row from `ones` on top, followed by the three rows of `zeros`.
x = np.vstack((ones, zeros))
# Labels as a single row, in the same order as the rows of x.
y = np.array([[1, 0, 0, 0]])

In [16]:
x

array([[0, 1, 0],
       [1, 1, 0],
       [1, 1, 0],
       [1, 1, 0]])

In [17]:
y

array([[1, 0, 0, 0]])

In [18]:
from scipy.special import expit

def h(x, theta):
    """Logistic hypothesis: the sigmoid of the linear score ``x . theta^T``.

    ``x`` is dotted with the transpose of ``theta`` and the result is passed
    element-wise through ``expit`` (the logistic sigmoid).
    """
    linear_score = x.dot(theta.T)
    return expit(linear_score)

def get_cost(x, theta, y):
    """Mean cross-entropy (logistic) cost over the whole training set.

    Computes ``1/m * ( -y . log(h) - (1 - y) . log(1 - h) )`` where
    ``h = sigmoid(x . theta^T)`` and ``m = y.shape[1]``.

    Parameters
    ----------
    x : array of examples, one per row (4 x 3 in this notebook).
    theta : row vector of weights (1 x 3 in this notebook).
    y : row vector of 0/1 labels (1 x 4 in this notebook).

    Returns
    -------
    A 1 x 1 array holding the mean cost.
    """
    m = y.shape[1]
    z = x.dot(theta.T)
    # log(sigmoid(z)) and log(1 - sigmoid(z)) are evaluated straight from the
    # logits. Going through np.log(h) / np.log(1 - h) yields -inf as soon as
    # the sigmoid saturates to exactly 0 or 1, and the 0 * -inf products that
    # follow turn the whole cost into nan.
    log_h = -np.logaddexp(0, -z)
    log_one_minus_h = -np.logaddexp(0, z)
    term_1 = -y.dot(log_h)
    term_2 = (1 - y).dot(log_one_minus_h)
    return 1 / m * (term_1 - term_2)


def get_cost_vector_sum(x_vector, theta, y_indiv):
    """Cross-entropy cost of a single example, with a printed trace.

    Prints the negated label, the element-wise product ``x_vector * theta``
    and whichever log term is active for this label (``log(h)`` when the
    label is 1, ``log(1 - h)`` when it is 0), then returns
    ``-y * log(h) - (1 - y) * log(1 - h)``.
    """
    hx = h(x_vector, theta)
    label = y_indiv[0]
    label_one_part = -y_indiv * np.log(hx)
    print("#########################  -y_indiv is ", -y_indiv)
    print("x_vector * theta is \n", x_vector * theta)
    if label == 1:
        print("np.log(h(x_vector, theta) is ", np.log(hx))
    label_zero_part = (1 - y_indiv) * np.log(1 - hx)
    if label == 0:
        print("np.log(1 -h(x_vector, theta) is ", np.log(1 - hx))
    return label_one_part - label_zero_part

def get_cost_vector_sum_clean(x_vector, theta, y_indiv):
    """Cross-entropy cost of a single example, without any printing.

    Computes ``-y * log(h) - (1 - y) * log(1 - h)`` where
    ``h = sigmoid(x_vector . theta^T)``.

    Parameters
    ----------
    x_vector : one example as a row (1 x 3 in this notebook).
    theta : row vector of weights (1 x 3 in this notebook).
    y_indiv : length-1 array holding the example's 0/1 label.

    Returns
    -------
    A 1 x 1 array holding the example's cost.
    """
    z = x_vector.dot(theta.T)
    # Work from the logits so a saturated sigmoid (exactly 0 or 1) cannot
    # produce log(0) = -inf and, from there, a 0 * -inf = nan cost.
    log_h = -np.logaddexp(0, -z)
    log_one_minus_h = -np.logaddexp(0, z)
    term_1 = -y_indiv * log_h
    term_2 = (1 - y_indiv) * log_one_minus_h
    return term_1 - term_2


def gradient_descent_vector(x_vector, theta, y_indiv):
    """Gradient contribution of a single example, with a printed trace.

    Prints the prediction and the error (prediction minus label), followed by
    an empty line, then returns the error dotted with the example.
    """
    prediction = h(x_vector, theta)
    print("hx is ", prediction)
    residual = prediction - y_indiv
    print("error is ", residual)
    print("")
    return residual.dot(x_vector)

def gradient_descent_vector_clean(x_vector, theta, y_indiv):
    """Gradient contribution of a single example, without any printing.

    Returns ``(h(x_vector, theta) - y_indiv) . x_vector``.
    """
    return (h(x_vector, theta) - y_indiv).dot(x_vector)


def gradient_descent(x, theta, y):
    """Gradient of the cost averaged over the whole training set.

    Returns ``1/m * (h(x, theta) - y^T)^T . x`` with ``m = y.shape[1]``.
    """
    n_examples = y.shape[1]
    residual = h(x, theta) - y.T
    return 1 / n_examples * residual.T.dot(x)

In [19]:
repeat = 10

In [22]:
def first_question_indiv():
    """Train with one theta update per example and report the result.

    Starts from an all-zero theta, cycles through the examples of the global
    ``x`` / ``y`` for ``repeat`` passes (learning rate 1), then prints the
    final cost, the final theta, and the prediction and cost of each example.
    """
    n_examples = y.shape[1]
    theta = np.zeros((1, x.shape[1]))
    alpha = 1
    print("initial cost is ", get_cost(x, theta, y))
    for step in range(n_examples * repeat):
        # wrap the running step counter back onto a valid example index
        idx = step % n_examples
        sample = x[idx][None]
        label = y[0][idx][None]
        theta = theta - alpha * gradient_descent_vector_clean(sample, theta, label)

    print("\nfinal cost ", get_cost(x, theta, y))
    print("----theta is ", theta)
    print("now printing final cost vectors")
    for idx in range(n_examples):
        sample = x[idx][None]
        label = y[0][idx][None]
        cost_vector_sum = get_cost_vector_sum_clean(sample, theta, label)
        hx = h(sample, theta)
        print("hx is ", hx)
        print("cost_vector_sum is ", cost_vector_sum)
        print()

In [23]:
first_question_indiv()

initial cost is  [[0.69314718]]

final cost  [[0.12250137]]
----theta is  [[-3.86226378  0.9171501   0.        ]]
now printing final cost vectors
hx is  [[0.71446106]]
cost_vector_sum is  [[0.33622678]]

hx is  [[0.04996796]]
cost_vector_sum is  [[0.05125957]]

hx is  [[0.04996796]]
cost_vector_sum is  [[0.05125957]]

hx is  [[0.04996796]]
cost_vector_sum is  [[0.05125957]]



In [24]:
# the above shows that even though the majority of examples are [1 1 0], it starts converging to a minimum
# in terms of the cost function. If I increase repeat, the cost will get even closer to zero

In [25]:
repeat = 1000

In [26]:
first_question_indiv()

initial cost is  [[0.69314718]]

final cost  [[0.00125525]]
----theta is  [[-13.11287837   5.80198744   0.        ]]
now printing final cost vectors
hx is  [[0.99698756]]
cost_vector_sum is  [[0.00301699]]

hx is  [[0.00066778]]
cost_vector_sum is  [[0.000668]]

hx is  [[0.00066778]]
cost_vector_sum is  [[0.000668]]

hx is  [[0.00066778]]
cost_vector_sum is  [[0.000668]]



In [27]:
# okay, lets try out the whole training set this time

In [28]:
def first_question_whole():
    """Train with whole-training-set (batch) updates and report the result.

    Starts from an all-zero theta, applies ``y.shape[1] * repeat`` batch
    updates on the global ``x`` / ``y`` (learning rate 1), then prints the
    final cost, the final theta, and the prediction and cost of each example.
    """
    n_examples = y.shape[1]
    theta = np.zeros((1, x.shape[1]))
    alpha = 1
    print("initial cost is ", get_cost(x, theta, y))
    for _ in range(n_examples * repeat):
        theta = theta - alpha * gradient_descent(x, theta, y)

    print("\nfinal cost ", get_cost(x, theta, y))
    print("----theta is ", theta)
    print("now printing final cost vectors")
    for idx in range(n_examples):
        sample = x[idx][None]
        label = y[0][idx][None]
        cost_vector_sum = get_cost_vector_sum_clean(sample, theta, label)
        hx = h(sample, theta)
        print("hx is ", hx)
        print("cost_vector_sum is ", cost_vector_sum)
        print()

In [29]:
# this is for repeat = 1000
first_question_whole()

initial cost is  [[0.69314718]]

final cost  [[0.00126252]]
----theta is  [[-13.10091562   5.79700349   0.        ]]
now printing final cost vectors
hx is  [[0.99697255]]
cost_vector_sum is  [[0.00303204]]

hx is  [[0.00067245]]
cost_vector_sum is  [[0.00067267]]

hx is  [[0.00067245]]
cost_vector_sum is  [[0.00067267]]

hx is  [[0.00067245]]
cost_vector_sum is  [[0.00067267]]



In [30]:
repeat = 10

In [31]:
# this is for repeat = 10
first_question_whole()

initial cost is  [[0.69314718]]

final cost  [[0.13225419]]
----theta is  [[-3.63849549  0.90117202  0.        ]]
now printing final cost vectors
hx is  [[0.71119029]]
cost_vector_sum is  [[0.34081524]]

hx is  [[0.06080658]]
cost_vector_sum is  [[0.06273383]]

hx is  [[0.06080658]]
cost_vector_sum is  [[0.06273383]]

hx is  [[0.06080658]]
cost_vector_sum is  [[0.06273383]]



In [32]:
# so they both work in reducing the cost and better predicting 

In [33]:
# now the question of why does it work

In [36]:
def second_question_indiv():
    """Per-example training with a full trace of every update.

    Same loop as the per-example run (all-zero start, learning rate 1,
    ``repeat`` passes over the global ``x`` / ``y``), but before each update
    it prints the example's cost, the current theta, the cost over the whole
    training set and the gradient about to be applied. Ends with the final
    cost, the final theta, and the prediction and cost of each example.
    """
    n_examples = y.shape[1]
    theta = np.zeros((1, x.shape[1]))
    alpha = 1
    print("initial cost is ", get_cost(x, theta, y))
    print("\n")
    for step in range(n_examples * repeat):
        idx = step % n_examples
        sample = x[idx][None]
        label = y[0][idx][None]
        # the two verbose helpers print their own traces, in this order
        cost_vector_sum = get_cost_vector_sum(sample, theta, label)
        gradient_vector = gradient_descent_vector(sample, theta, label)
        print("theta is ", theta)
        print("cost_vector_sum is ", cost_vector_sum)
        print("get_cost is ", get_cost(x, theta, y))
        print("gradient_vector is ", gradient_vector)
        theta = theta - alpha * gradient_vector
        print()

    print("\nfinal cost ", get_cost(x, theta, y))
    print("----theta is ", theta)
    print("now printing final cost vectors")
    for idx in range(n_examples):
        sample = x[idx][None]
        label = y[0][idx][None]
        cost_vector_sum = get_cost_vector_sum_clean(sample, theta, label)
        hx = h(sample, theta)
        print("prediction is ", hx)
        print("cost_vector_sum is ", cost_vector_sum)
        print()

In [37]:
# let's see what happens to cost, theta and gradients as we run logistic regression
second_question_indiv()

initial cost is  [[0.69314718]]


#########################  -y_indiv is  [-1]
x_vector * theta is 
 [[0. 0. 0.]]
np.log(h(x_vector, theta) is  [[-0.69314718]]
hx is  [[0.5]]
error is  [[-0.5]]

theta is  [[0. 0. 0.]]
cost_vector_sum is  [[0.69314718]]
get_cost is  [[0.69314718]]
gradient_vector is  [[ 0.  -0.5  0. ]]

#########################  -y_indiv is  [0]
x_vector * theta is 
 [[0.  0.5 0. ]]
np.log(1 -h(x_vector, theta) is  [[-0.97407698]]
hx is  [[0.62245933]]
error is  [[0.62245933]]

theta is  [[0.  0.5 0. ]]
cost_vector_sum is  [[0.97407698]]
get_cost is  [[0.84907698]]
gradient_vector is  [[0.62245933 0.62245933 0.        ]]

#########################  -y_indiv is  [0]
x_vector * theta is 
 [[-0.62245933 -0.12245933  0.        ]]
np.log(1 -h(x_vector, theta) is  [[-0.38850402]]
hx is  [[0.32192951]]
error is  [[0.32192951]]

theta is  [[-0.62245933 -0.12245933  0.        ]]
cost_vector_sum is  [[0.38850402]]
get_cost is  [[0.48044057]]
gradient_vector is  [[0.32192951 0.32

In [38]:
# examine the cases where -y_indiv is [-1] (i.e. the label is 1): in the first go-around, the error, which is pred - label, is -0.5
# in the second round, it's actually bigger in magnitude since pred was .34434475, which is farther from 1 than .5
# this happened because for the next training examples [1 1 0], [1 1 0], [1 1 0], the labels were 0 and they
# decreased the second theta, because it was helping those examples classify as 1, whereas the truth(label) is 0

# However, look at the third and subsequent rounds, the errors are -0.6275, -0.574, -0.5163, -0.463...-0.311
# The reason why is that the first theta, which is a feature of things that are labeled 0 and those not of 1
# compensates, and thus our one layer neural net is finding the correlation between the 
# training set and labels over time even though there are inverse overlaps(the middle theta). 

# Additionally for more detailed explanation, the first theta gets affected only by training examples of label 0
# and thus "soaks up" more negative gradients over time in order to counter effect the problem of the theta 2
# being increased to correlate the training examples of label 1 to 1
# This is evident if you look at the last theta as well ----theta is  [[-3.86226378  0.9171501   0.        ]]

In [39]:
# finally, let's answer the question of how much using the whole training set to update theta changes things
# 

In [40]:
def second_question_whole():
    """Batch training that prints theta before every update.

    Starts from an all-zero theta and applies ``y.shape[1] * repeat`` batch
    updates on the global ``x`` / ``y`` (learning rate 1), printing the
    current theta ahead of each one. Ends with the final cost, the final
    theta, and the prediction and cost of each example.
    """
    n_examples = y.shape[1]
    theta = np.zeros((1, x.shape[1]))
    alpha = 1
    print("initial cost is ", get_cost(x, theta, y))
    for _ in range(n_examples * repeat):
        descent = gradient_descent(x, theta, y)
        # shown before the step is applied, so the last line printed here is
        # the theta going into the final update, not the final theta
        print("----theta is ", theta)
        theta = theta - alpha * descent

    print("\nfinal cost ", get_cost(x, theta, y))
    print("----theta is ", theta)
    print("now printing final cost vectors")
    for idx in range(n_examples):
        sample = x[idx][None]
        label = y[0][idx][None]
        cost_vector_sum = get_cost_vector_sum_clean(sample, theta, label)
        hx = h(sample, theta)
        print("hx is ", hx)
        print("cost_vector_sum is ", cost_vector_sum)
        print()


In [41]:
second_question_whole()

initial cost is  [[0.69314718]]
----theta is  [[0. 0. 0.]]
----theta is  [[-0.375 -0.25   0.   ]]
----theta is  [[-0.63648385 -0.37093973  0.        ]]
----theta is  [[-0.83709712 -0.42363149  0.        ]]
----theta is  [[-1.00273349 -0.43817988  0.        ]]
----theta is  [[-1.14628645 -0.42977653  0.        ]]
----theta is  [[-1.27480179 -0.40683679  0.        ]]
----theta is  [[-1.39246075 -0.37441346  0.        ]]
----theta is  [[-1.50190936 -0.33573083  0.        ]]
----theta is  [[-1.60490733 -0.29294051  0.        ]]
----theta is  [[-1.70267152 -0.24752573  0.        ]]
----theta is  [[-1.79607041 -0.20053277  0.        ]]
----theta is  [[-1.88574043 -0.15271133  0.        ]]
----theta is  [[-1.97215879 -0.10460373  0.        ]]
----theta is  [[-2.05569077 -0.05660394  0.        ]]
----theta is  [[-2.13662145 -0.00899782  0.        ]]
----theta is  [[-2.21517742  0.03800857  0.        ]]
----theta is  [[-2.29154209  0.08426866  0.        ]]
----theta is  [[-2.36586651  0.1296805

In [43]:
# the end results are pretty similar
# when examining theta, I'm going to view the thetas from the individuals update every four times(set size)
# vs the whole training set

#indv set
#theta is  [[0. 0. 0.]]
#theta is  [[-1.1439918 -0.6439918  0.       ]]
#theta is  [[-1.6774822  -0.52182695  0.        ]]
#theta is  [[-2.08005631 -0.29682618  0.        ]]
#theta is  [[-2.42250962 -0.06561303  0.        ]]
#theta is  [[-2.72522964  0.14806432  0.        ]]
#theta is  [[-2.99703489  0.33931047  0.        ]]
#theta is  [[-3.2431771   0.50914524  0.        ]]
#theta is  [[-3.46748227  0.660234    0.        ]]
#theta is  [[-3.67300426  0.79539906  0.        ]]


#whole set
#----theta is  [[0. 0. 0.]]
#----theta is  [[-0.375 -0.25   0.   ]]
#----theta is  [[-0.63648385 -0.37093973  0.        ]]
#----theta is  [[-0.83709712 -0.42363149  0.        ]]
#----theta is  [[-1.00273349 -0.43817988  0.        ]]
#----theta is  [[-1.14628645 -0.42977653  0.        ]]
#----theta is  [[-1.27480179 -0.40683679  0.        ]]
#----theta is  [[-1.39246075 -0.37441346  0.        ]]
#----theta is  [[-1.50190936 -0.33573083  0.        ]]
#----theta is  [[-1.60490733 -0.29294051  0.        ]]
#----theta is  [[-1.70267152 -0.24752573  0.        ]]
#----theta is  [[-1.79607041 -0.20053277  0.        ]]
#----theta is  [[-1.88574043 -0.15271133  0.        ]]
#----theta is  [[-1.97215879 -0.10460373  0.        ]]
#----theta is  [[-2.05569077 -0.05660394  0.        ]]
#----theta is  [[-2.13662145 -0.00899782  0.        ]]
#----theta is  [[-2.21517742  0.03800857  0.        ]]
#----theta is  [[-2.29154209  0.08426866  0.        ]]


In [44]:
# as you can see, it took going through and updating theta using batch (whole training set) many more times than
# just going through and updating theta each time you encounter an example.
# this is because theta is being updated by a more weighted gradient and since there are more examples with
# label 0 than label 1, the middle theta's update to a positive direction is weighted down by the overall sum of
# the update for the training set, which is initially wanting it to go in the negative direction since there are
# more 0's. However, again, once the first theta "soaks up" enough of the negative gradients over time, the neural net
# can once again start changing the middle theta to a positive direction since the loss is eventually bigger
# for the middle theta, if it stays negative, since the loss is smaller for the first theta since it got updated
# so much in the negative direction.

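# this concludes that both individual updates and batch updates work. Here is the last theta printed inside the batch update loop
# (it is printed before the final update is applied; the final theta after that update is [[-3.63849549  0.90117202  0.        ]])
#----theta is  [[-3.59205821  0.87400384  0.        ]] 
# vs the last theta of the individual update
#----theta is  [[-3.86226378  0.9171501   0.        ]]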