### Implementation of gradient descent with Numpy

In [32]:
import numpy as np 

#### Variables
- We use linear regression formula f= w * x 

In [40]:
X= np.array([1,2,3,4], dtype=np.float32)    #input 
Y= np.array([2,4,6,8], dtype=np.float32)    # ground_truth
w=0.0   #weight parameter 


In [41]:
def forward(x):         #forward pass function to compute output 
    return w * x 

def loss(y_hat, y): #calculate loss from model prediction y_hat and actual output y 
    return ((y_hat-y)**2).mean()    #average MSE error with 1/n 

#### Calculating the gradient 
MSE= 1/N * (w*x-y)^2 

Gradient = 1/N * 2(wx-y)*x = dl/dw   

Because our parameter w is 1 x 1, so when we take the mean, the gradient of dl/dw goes from 1x4 vector to a 1x1 element. Need to be careful of the gradient dimensions

In [42]:
def gradient(x,y,y_hat):
    return np.multiply(2*x, y_hat-y).mean()

In [43]:
print("Prediction before training f([1,2,3,4]):", forward(X))
error= loss(forward(X), Y)
print("Error before training:", error)
grad = gradient(X,Y,forward(X))
print("Gradient before training:", grad)

Prediction before training f([1,2,3,4]): [0. 0. 0. 0.]
Error before training: 30.0
Gradient before training: -30.0


Training over iterations 

In [47]:
n_iter= 10 
LEARNING_RATE= 0.01 
for epoch in range(n_iter):
    #generate prediction 
    y_hat= forward(X)
    #loss 
    error= loss(y_hat, Y)
    #gradient 
    dw= gradient(X,Y,y_hat)
    #update weights 
    w -= LEARNING_RATE* dw  #upate weight to the opposite direction of the gradient (element wise change)
    print(f"epoch: {epoch+1}:\n w={w:.3f}, loss= {error:.8f}")
print(f"Prediction after training: {forward(X)}")


epoch: 1:
 w=1.987, loss= 0.00174685
epoch: 2:
 w=1.989, loss= 0.00126211
epoch: 3:
 w=1.991, loss= 0.00091188
epoch: 4:
 w=1.992, loss= 0.00065882
epoch: 5:
 w=1.993, loss= 0.00047601
epoch: 6:
 w=1.994, loss= 0.00034391
epoch: 7:
 w=1.995, loss= 0.00024848
epoch: 8:
 w=1.996, loss= 0.00017952
epoch: 9:
 w=1.996, loss= 0.00012971
epoch: 10:
 w=1.997, loss= 0.00009371
Prediction after training: [1.9969954 3.993991  5.9909863 7.987982 ]


### PyTorch with Gradient Descent using Autograd package

In [48]:
import torch 

In [53]:
X=torch.tensor([1,2,3,4], dtype=torch.float32)  #require_grad = False by default 
Y= torch.tensor([2,4,6,8], dtype=torch.float32)
w= torch.tensor(0.0,dtype=torch.float32, requires_grad=True)    #out parameter w 

# forward pass 
def forward(x): 
    return w * x    #element wise multiplication 

# loss function 
def loss(y_hat, y):
    return ((y_hat-y)**2).mean()    # MSE error

print("Prediction before training f([1,2,3,4]):", forward(X))
error= loss(forward(X), Y)
print("Error before training:", error)


Prediction before training f([1,2,3,4]): tensor([0., 0., 0., 0.], grad_fn=<MulBackward0>)
Error before training: tensor(30., grad_fn=<MeanBackward0>)


In [57]:
LEARNING_RATE= 0.01 
n_iter=10

for epoch in range (n_iter):
    #forward pass 
    y_hat= forward(X)

    #calculate loss 
    error= loss(y_hat, Y)

    #calculate gradient with Autograd package 
    error.backward()    #dw is automatically calculated and stored in w.grad 

    #update w 
    with torch.no_grad(): #not included in gradient computation
        w-= LEARNING_RATE* w.grad   #gradient auto computed by Autograd
    
    #empty the gradient of w because they accumulate in error.backward() 
    w.grad.zero_()

    print(f"epoch: {epoch+1}:\n w={w:.3f}, loss= {error:.8f}")

print(f"Prediction after training: {forward(X)}")
    

epoch: 1:
 w=1.987, loss= 0.00174685
epoch: 2:
 w=1.989, loss= 0.00126211
epoch: 3:
 w=1.991, loss= 0.00091188
epoch: 4:
 w=1.992, loss= 0.00065882
epoch: 5:
 w=1.993, loss= 0.00047601
epoch: 6:
 w=1.994, loss= 0.00034392
epoch: 7:
 w=1.995, loss= 0.00024848
epoch: 8:
 w=1.996, loss= 0.00017952
epoch: 9:
 w=1.996, loss= 0.00012971
epoch: 10:
 w=1.997, loss= 0.00009371
Prediction after training: tensor([1.9970, 3.9940, 5.9910, 7.9880], grad_fn=<MulBackward0>)


#### Gradient Descent with Optimizer 

In [64]:
X=torch.tensor([1,2,3,4], dtype=torch.float32)  #require_grad = False by default 
Y= torch.tensor([2,4,6,8], dtype=torch.float32)
w= torch.tensor(0.0,dtype=torch.float32, requires_grad=True)    #out parameter w 

# forward pass 
def forward(x): 
    return w * x    #element wise multiplication 
LEARNING_RATE=0.001
n_iter= 10
loss= torch.nn.MSELoss() 
optimizer= torch.optim.SGD([w], lr=LEARNING_RATE)

for epoch in range(n_iter):
    #prediction
    y_hat= forward(X)
    #loss 
    error= loss(y_hat, Y)

    #calculate gradient 
    error.backward()

    #update weight through our optimizer 
    optimizer.step()    #optimize the gradients 

    #clear out our gredients 
    optimizer.zero_grad() 

    print(f"epoch: {epoch+1}:\n w={w:.3f}, loss= {error:.8f}")

print(f"Prediction after training: {forward(X)}")


epoch: 1:
 w=0.030, loss= 30.00000000
epoch: 2:
 w=0.060, loss= 29.10675049
epoch: 3:
 w=0.089, loss= 28.24009705
epoch: 4:
 w=0.117, loss= 27.39924622
epoch: 5:
 w=0.146, loss= 26.58343506
epoch: 6:
 w=0.173, loss= 25.79191589
epoch: 7:
 w=0.201, loss= 25.02395821
epoch: 8:
 w=0.228, loss= 24.27886963
epoch: 9:
 w=0.254, loss= 23.55596542
epoch: 10:
 w=0.281, loss= 22.85458755
Prediction after training: tensor([0.2805, 0.5611, 0.8416, 1.1222], grad_fn=<MulBackward0>)
