In [1]:
#Forward Multiplicative Gate

def forMulGate(x,y):
    return x*y

In [2]:
forMulGate(-1.99,2.99)

-5.950100000000001

In [5]:
#Searching randomly for the values of x and y that will improve ciruit output
import random
x = -2
y = 3
tweak_amount = 0.01
best_x = x
best_y = y
best_out = -float('inf')

for i in range(100):
    x_try = x + tweak_amount * (random.random() * 2 - 1)
    y_try = y + tweak_amount * (random.random() * 2 - 1)
    
    out = forMulGate(x_try,y_try)
#     print(x_try,y_try,out)
    if out > best_out:
        best_out = out
        best_x = x_try
        best_y = y_try

print(best_x)
print(best_y)

-1.992285139476791
2.991595251530799


In [7]:
""" Searching using Gradient descent
"""
###### A simple derivative
x=-2;y=3
out = forMulGate(x,y)
h= 0.00001

# Derivative w.r.t x
xph= x + h
out1 = forMulGate(xph,y)
x_derivative = (out1 - out)/h

# Derivative w.r.t y
yph= y + h
out2 = forMulGate(x,yph)
y_derivative = (out2 - out)/h

print(x_derivative,y_derivative)



3.000000000064062 -2.0000000000131024


In [67]:
step_size = 0.01

x = x + step_size * x_derivative
y = y + step_size * y_derivative
forMulGate(x,y)

-5.870599999997832

In [68]:
# Finding analytic Gradient

x = -2; y = 3
out = forMulGate(x,y)

x_gradient = y
y_gradient = x

step_size = 0.01

x = x + step_size * x_derivative
y = y + step_size * y_derivative
forMulGate(x,y)

-5.870599999997832

INPUT: We are given a circuit, some inputs and compute an output value.
OUTPUT: We are then interested finding small changes to each input (independently) that would make the output higher.
Strategy #1: One silly way is to randomly search for small pertubations of the inputs and keep track of what gives the highest increase in output.
Strategy #2: We saw we can do much better by computing the gradient. Regardless of how complicated the circuit is, the numerical gradient is very simple (but relatively expensive) to compute. We compute it by probing the circuit’s output value as we tweak the inputs one at a time.
Strategy #3: In the end, we saw that we can be even more clever and analytically derive a direct expression to get the analytic gradient. It is identical to the numerical gradient, it is fastest by far, and there is no need for any tweaking.

In [3]:
""" Chain Rule (Example of : f(x,y,z)=(x+y)z) """

def forMulGate(x,y):
    return x*y

def forAddGate(x,y):
    return x+y

def forwardCircuit(x,y,z):
    q = forAddGate(x,y)
    f = forMulGate(q,z)
    return f

x = - 2; y = 5; z = -4
forwardCircuit(x,y,z)


### Gradient computation

#Inital Conditions
x = - 2; y = 5; z = -4
q = forAddGate(x,y)
f = forMulGate(q,z)

#MUL Gate
derivative_f_wrt_z = q
derivative_f_wrt_q = z

#ADD gate
derivative_q_wrt_x = 1
derivative_q_wrt_y = 1


#Cahin rule
derivative_f_wrt_x = derivative_f_wrt_q * derivative_q_wrt_x
derivative_f_wrt_y = derivative_f_wrt_q * derivative_q_wrt_y

print(derivative_f_wrt_x,derivative_f_wrt_y,derivative_f_wrt_z)


-4 -4 3


In [7]:
#Updated x,y,z

x = - 2; y = 5; z = -4
step_size = 0.01

x = x + step_size * derivative_f_wrt_x
y = y + step_size * derivative_f_wrt_y
z = z + step_size * derivative_f_wrt_z
print(x,y,z)

q = forAddGate(x,y)
f = forMulGate(q,z)

print(q,f)

-2.04 4.96 -3.97
2.92 -11.5924


Lets recap once again what we learned:

In the previous chapter we saw that in the case of a single gate (or a single expression), we can derive the analytic gradient using simple calculus. We interpreted the gradient as a force, or a tug on the inputs that pulls them in a direction which would make this gate’s output higher.

In case of multiple gates everything stays pretty much the same way: every gate is hanging out by itself completely unaware of the circuit it is embedded in. Some inputs come in and the gate computes its output and the derivate with respect to the inputs. The only difference now is that suddenly, something can pull on this gate from above. That’s the gradient of the final circuit output value with respect to the ouput this gate computed. It is the circuit asking the gate to output higher or lower numbers, and with some force. The gate simply takes this force and multiplies it to all the forces it computed for its inputs before (chain rule). This has the desired effect:

If a gate experiences a strong positive pull from above, it will also pull harder on its own inputs, scaled by the force it is experiencing from above
And if it experiences a negative tug, this means that circuit wants its value to decrease not increase, so it will flip the force of the pull on its inputs to make its own output value smaller.

In [8]:
#Sanity check using Numerical gradient

x = -2; y = 5; z = -4

h = 0.0001
x_derivative = (forwardCircuit(x+h,y,z) - forwardCircuit(x,y,z))/ h
y_derivative = (forwardCircuit(x,y+h,z) - forwardCircuit(x,y,z))/ h
z_derivative = (forwardCircuit(x,y,z+h) - forwardCircuit(x,y,z))/ h

print(x_derivative,y_derivative,z_derivative)

-3.9999999999906777 -3.9999999999906777 3.000000000010772


In [12]:
""" Example: Single Neuron
f(x,y,a,b,c)=σ(ax+by+c)
"""

class Unit():
    
    def __init__(self,value,grad):
        self.value = value
        self.grad = grad
        print(self.value)

class multiplyGate():
    
    def forward(self,u0,u1):
        self.u0 = u0
        self.u1 = u1
        self.utop = Unit(u0.value * u1.value, 0.0)
        return self.utop
    
    def backward(self):
        self.u0.grad = self.u0.grad + (self.u1.value * self.utop.grad) 
        self.u1.grad = self.u1.grad + (self.u0.value * self.utop.grad) 
    
class addGate():
    
    def forward(self,u0,u1):
        self.u0 = u0
        self.u1 = u1
        self.utop = Unit(u0.value + u1.value, 0.0)
        return self.utop
    
    def backward(self):
        self.u0.grad = self.u0.grad + (self.utop.grad) 
        self.u1.grad = self.u1.grad + (self.utop.grad) 

import numpy as np

class sigmoidGate():
    
    def sig(self,x):
        return 1/(1+np.exp(-x))
    
    def forward(self,u0):
        self.u0 = u0
        self.utop = Unit(self.sig(self.u0.value), 0.0)
        return self.utop
    
    def backward(self):
        s = self.sig(self.u0.value)
        self.u0.grad = self.u0.grad + (s * (1 - s)) * self.utop.grad


In [14]:
# Create input units
a = Unit(1.0, 0.0)
b = Unit(2.0, 0.0)
c = Unit(-3.0, 0.0)
x = Unit(-1.0, 0.0)
y = Unit(3.0, 0.0)

# Create the gates
mulg0 = multiplyGate()
mulg1 = multiplyGate()
addg0 = addGate()
addg1 = addGate()
sg0 = sigmoidGate()


def forwardNeuron():
    ax = mulg0.forward(a,x)
    by = mulg1.forward(b,y)
    axpby = addg0.forward(ax,by)
    axpbypc = addg1.forward(axpby,c)
    s = sg0.forward(axpbypc)
    return s
s = forwardNeuron()

print(s.value)



1.0
2.0
-3.0
-1.0
3.0
-1.0
6.0
5.0
2.0
0.8807970779778823
0.8807970779778823


In [15]:
s.grad = 1.0
sg0.backward()
addg1.backward()
addg0.backward()
mulg1.backward()
mulg0.backward()


In [20]:
print(a.grad,b.grad,c.grad,x.grad,y.grad)

step_size = 0.01
a.value += step_size * a.grad
b.value += step_size * b.grad
c.value += step_size * c.grad
x.value += step_size * x.grad
y.value += step_size * y.grad

-0.10499358540350662 0.31498075621051985 0.10499358540350662 0.10499358540350662 0.20998717080701323


In [19]:
s = forwardNeuron()
print(s.value)

-0.9979012306572276
6.013655780294242
5.015754549637014
2.0168044854910496
0.8825501816218984
0.8825501816218984


In [23]:
def forwardCircuitFast(a,b,c,x,y):
    return 1/(1+np.exp(- (a*x+b*y+c)))

a = 1;b = 2; c = -3; x = -1; y = 3
h = 0.0001
a_grad = (forwardCircuitFast(a+h,b,c,x,y) - forwardCircuitFast(a,b,c,x,y))/h
b_grad = (forwardCircuitFast(a,b+h,c,x,y) - forwardCircuitFast(a,b,c,x,y))/h
c_grad = (forwardCircuitFast(a,b,c+h,x,y) - forwardCircuitFast(a,b,c,x,y))/h
x_grad = (forwardCircuitFast(a,b,c,x+h,y) - forwardCircuitFast(a,b,c,x,y))/h
y_grad = (forwardCircuitFast(a,b,c,x,y+h) - forwardCircuitFast(a,b,c,x,y))/h

print(a_grad,b_grad,c_grad,x_grad,y_grad)

-0.10499758359205913 0.3149447748351797 0.10498958734506125 0.10498958734506125 0.2099711788272618


### Becoming a Backprop Ninja

In [None]:
""" Ex-1 Multiplication Gate """
x = a*b

#Backward pass
da = b * dx # Gradient of a
db = a * dx # Gradient of b


""" Ex-2 Addition Gate """
x = a + b

#Backward pass
da = 1.0 * dx # Gradient of a
db = 1.0 * dx # Gradient of b

""" Ex-3 x = a + b + c """
q = a + b
x = q + c

#Backward pass
dc = 1.0 * dx
dq = 1.0 * dx
da = 1.0 * dq  #(or) da = 1.0 * dx
db = 1.0 * dq  #(or) db = 1.0 * dx


""" Ex-4 x = a * b + c (comining gates)"""
x = a * b + c

da = db * dx
db = da * dx
dc = 1.0 * dx

""" Ex-5 Neuron """
q = a * x + b * y + c
f = sig(q) # Sigmoid function

df = 1
dq = (f * (1-f)) * df


da = x * dq
db = y * dq
dx = a * dq
dy = b * dq
dc = 1.0 * dq



""" Ex-6 x = a*a """
x = a*a

da = 2 * a * dx

""" Ex-7 x= a*a + b*b + c*c """
x= a*a + b*b + c*c

da = 2*a*dx
db = 2*b*dx
dc = 2*c*dx

""" Ex-8 x= ((ab+c)d)^2 """

x1 = a*b + c
x2 = x1 * d
x = x2 * x2

dx2 = 2 * x2 * dx
dd = x1 * dx2
dx1 = d * dx2
da = b * dx1
db = a * dx1
dc = 1.0 * dx1


""" Ex-8 division """
x = 1.0/a

da = -(1.0)/(a*a)


""" Ex-8 divison 2 """
x= (a+b)/(c+d)

x1 = (a+b)
x2 = (c+d)
x3 = 1.0/x2
x = x1*x3


dx1 = x3 * dx
dx3 = x1 * dx
dx2 = (-(1.0)/(x2 * x2)) * dx3
da = 1.0 * dx1
db = 1.0 * dx1
dc = 1.0 * dx2
dd = 1.0 * dx2

""" Ex-8 max """
x = max(a,b)

da = 1.0 * dx if a == x else 0.0
db = 1.0 * dx if b == x else 0.0

""" Ex-8 ReLU """
x = max(a,0)

da = 1.0 * dx if a>0 else 0.0


Everything we’ve done in this chapter comes down to this: We saw that we can feed some input through arbitrarily complex real-valued circuit, tug at the end of the circuit with some force, and backpropagation distributes that tug through the entire circuit all the way back to the inputs. If the inputs respond slightly along the final direction of their tug, the circuit will “give” a bit along the original pull direction. Maybe this is not immediately obvious, but this machinery is a powerful hammer for Machine Learning.

# Machine Learning

## SVM

In [None]:

class Circuit():
    
    self.mulg0 = multiplyGate()
    self.mulg1 = multiplyGate()
    self.addg0 = addGate()
    self.addg1 = addGate()
    
    def forward(self,x,y,a,b,c):
        self.ax = self.mulg0.forward(a,x)
        self.by = self.mulg1.forward(b,y)
        self.axpby = self.addg0.forward(self.ax,self.by)
        self.axpbypc = self.addg1.forward(self.axpby,c)
        
        return self.axpbypc
    
    def backward(gradient_top):
        self.axpbyc.grad = gradient_top
        self.addg1.backward()
        self.addg0.backward()
        self.mulg1.backward()
        self.mulg0.backward()

In [None]:
#SVM class

class SVM():
    
    self.a = Unit(1.0,0.0)
    self.b = Unit(-2.0,0.0)
    self.c = Unit(-1.0,0.0)
    
    self.circuit = Circuit()
    
    def forward(self,x,y):
        self.unit_out = self.circuit.forward(x,y,self.a,self.b,self.c)
        return self.unit_out
    
    def backward(label):
        
        self.a.grad = 0
        self.b.grad = 0
        self.c.grad = 0
        
        pull = 0.0
        if(label == 1 && self.unit_out.value < 1):
            pull = 1
        if(label == -1 && self.unit_out.value > -1):
            pull = -1
            
        self.circuit.backward(pull)
        
        self.a.grad += - self.a.value
        self.b.grad += - self.b.value
        
        def learnFrom(x,y,label):
            self.forward(x,y)
            self.backward(label)
            self.parameterUpdate()
            
        
