In [4]:
# Tensor - Warm-up:numpy
# -*- coding:utf-8 -*-
import numpy as np
import math

# Training data: 2000 evenly spaced points over one period of sin(x).
x = np.linspace(-math.pi, math.pi, 2000)
y = np.sin(x)

# The powers of x never change during training, so evaluate them once.
x_squared = x ** 2
x_cubed = x ** 3

# Coefficients of the cubic y = a + b*x + c*x^2 + d*x^3, each drawn from a
# standard normal distribution.
a = np.random.randn()
b = np.random.randn()
c = np.random.randn()
d = np.random.randn()

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: evaluate the current cubic at every sample point.
    y_pred = a + b * x + c * x_squared + d * x_cubed

    # Sum-of-squares loss, reported on every 100th iteration.
    residual = y_pred - y
    loss = np.square(residual).sum()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass by hand: d(loss)/d(y_pred) = 2 * residual, followed by
    # the chain rule through each term of the polynomial.
    grad_y_pred = 2.0 * residual
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x_squared).sum()
    grad_d = (grad_y_pred * x_cubed).sum()

    # Plain gradient-descent step on every coefficient.
    a = a - learning_rate * grad_a
    b = b - learning_rate * grad_b
    c = c - learning_rate * grad_c
    d = d - learning_rate * grad_d

print(f'Result: y = {a} + {b}x + {c}x^2 + {d}x^3')

99 281.17632209823404
199 199.5501705579192
299 142.4662811376773
399 102.51910060181923
499 74.5468626309526
599 54.948285291219975
699 41.20893657528268
799 31.571970575705826
899 24.809027618825684
999 20.0606930011762
1099 16.72530443325826
1199 14.38139523485781
1299 12.733557535431348
1399 11.574625827799935
1499 10.75924245092921
1599 10.185366957099408
1699 9.781333877726217
1799 9.496788483345005
1899 9.296334848831195
1999 9.155082150324205
Result: y = 0.01897170172864683 + 0.8527719846656407x + -0.003272934238670997x^2 + -0.09276585349316349x^3


In [10]:
# PyTorch: Tensor

# -*- coding: utf-8 -*-
import torch
import math

dtype = torch.float
device = torch.device("cpu")
# To run on a GPU instead, use: device = torch.device("cuda:0")

# Keyword arguments shared by every tensor created in this cell.
tensor_opts = dict(device=device, dtype=dtype)

# Training data: 2000 evenly spaced points over one period of sin(x).
x = torch.linspace(-math.pi, math.pi, 2000, **tensor_opts)
y = torch.sin(x)

# The powers of x are constant across iterations, so compute them once.
x_squared = x ** 2
x_cubed = x ** 3

# Zero-dimensional coefficient tensors for y = a + b*x + c*x^2 + d*x^3.
a = torch.randn((), **tensor_opts)
b = torch.randn((), **tensor_opts)
c = torch.randn((), **tensor_opts)
d = torch.randn((), **tensor_opts)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: evaluate the cubic at every sample point.
    y_pred = a + b * x + c * x_squared + d * x_cubed

    # Sum-of-squares loss as a Python float, reported every 100th iteration.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Hand-written backward pass: gradient of the loss with respect to the
    # prediction, then the chain rule through each polynomial term.
    grad_y_pred = 2.0 * residual
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x_squared).sum()
    grad_d = (grad_y_pred * x_cubed).sum()

    # In-place gradient-descent step on each coefficient tensor.
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 3898.492431640625
199 2592.6630859375
299 1725.7352294921875
399 1150.0379638671875
499 767.630615234375
599 513.5413818359375
699 344.659912109375
799 232.37489318847656
899 157.69309997558594
999 108.00337982177734
1099 74.92901611328125
1199 52.90510559082031
1299 38.23316192626953
1399 28.45458221435547
1499 21.93402099609375
1599 17.58387565612793
1699 14.680091857910156
1799 12.740701675415039
1899 11.444633483886719
1999 10.577943801879883
Result: y = -0.019980119541287422 + 0.820310652256012 x + 0.0034469044767320156 x^2 + -0.08814851194620132 x^3


In [21]:
# Autograd
# When using autograd, the forward pass of your network will define a computational graph

# -*- coding: utf-8 -*-
import torch
import math

dtype = torch.float
device = torch.device("cpu")
# To run on a GPU instead, use: device = torch.device("cuda:0")

# Input and target tensors. They are created without requires_grad, so no
# gradients are computed with respect to them during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Four weights for the third-order polynomial y = a + bx + cx^2 + dx^3.
# requires_grad=True asks autograd to compute gradients with respect to them.
weight_opts = dict(device=device, dtype=dtype, requires_grad=True)
a = torch.randn((), **weight_opts)
b = torch.randn((), **weight_opts)
c = torch.randn((), **weight_opts)
d = torch.randn((), **weight_opts)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: tensor operations on the weights define the graph that
    # autograd walks backwards below.
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # The loss is a zero-dimensional Tensor; loss.item() extracts the Python
    # scalar it holds.
    loss = (y_pred - y).pow(2).sum()
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backward pass: afterwards a.grad, b.grad, c.grad and d.grad are Tensors
    # holding the gradient of the loss with respect to each weight.
    loss.backward()

    # The update itself must not be tracked by autograd, so it runs under
    # torch.no_grad().
    with torch.no_grad():
        for weight in (a, b, c, d):
            weight -= learning_rate * weight.grad
            # Drop the gradient so the next backward() starts from scratch.
            weight.grad = None

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 5720.466796875
199 3977.04638671875
299 2768.25830078125
399 1929.34814453125
499 1346.5977783203125
599 941.4244995117188
699 659.4710693359375
799 463.0989074707031
899 326.2210693359375
999 230.73779296875
1099 164.08035278320312
1199 117.51292419433594
1299 84.95793151855469
1399 62.18383026123047
1499 46.24205017089844
1599 35.07603454589844
1699 27.25060272216797
1799 21.76329803466797
1899 17.913490295410156
1999 15.211201667785645
Result: y = -0.08047949522733688 + 0.8326709270477295 x + 0.013884052634239197 x^2 + -0.08990665525197983 x^3


In [22]:
# PyTorch: Define new autograd functions
# -*- coding : utf-8 -*-
import torch
import math

class LegendrePolynomial3(torch.autograd.Function):
    """
    Custom autograd Function for the degree-3 Legendre polynomial
    P3(x) = 0.5 * (5x^3 - 3x).

    Subclassing torch.autograd.Function and providing static forward and
    backward methods that operate on Tensors is how a custom operation is
    plugged into autograd.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Evaluate P3 element-wise on ``input`` and return the result.

        ``ctx`` is the context object shared with ``backward``; the input is
        stashed on it with ``ctx.save_for_backward`` because the derivative
        is evaluated at the same points.
        """
        ctx.save_for_backward(input)
        cubic_term = 5 * input ** 3
        linear_term = 3 * input
        return 0.5 * (cubic_term - linear_term)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient of the loss with respect to the output
        (``grad_output``) into the gradient with respect to the input, using
        dP3/dx = 1.5 * (5x^2 - 1).
        """
        (saved_input,) = ctx.saved_tensors
        slope_core = 5 * saved_input ** 2 - 1
        return grad_output * 1.5 * slope_core
        
# Fit y = a + b * P3(c + d * x) to sin(x) with gradient descent, using the
# custom LegendrePolynomial3 autograd Function for P3.
dtype = torch.float
# Prefer the first CUDA GPU, but fall back to the CPU so this cell also runs
# on machines without CUDA (a hard-coded "cuda:0" device makes tensor
# creation fail there).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Input and target tensors. They are created without requires_grad, so no
# gradients are computed with respect to them during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Four weights for y = a + b * P3(c + d * x). They are initialized to fixed
# values not too far from the correct result to ensure convergence.
# requires_grad=True asks autograd to compute gradients with respect to them.
a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True)
c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
d = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)

# A custom Function is invoked through its apply method; alias it as 'P3'.
# The alias does not change between iterations, so it is bound once here.
P3 = LegendrePolynomial3.apply

learning_rate = 5e-6
for t in range(2000):
    # Forward pass: P3 is computed by the custom autograd operation.
    y_pred = a + b * P3(c + d * x)

    # Sum-of-squares loss, reported on every 100th iteration.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Gradient-descent step; run under no_grad so the update itself is not
    # recorded by autograd.
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

        # Drop the gradients so the next backward() does not accumulate
        # into the old values.
        a.grad = None
        b.grad = None
        c.grad = None
        d.grad = None

print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')


99 214.2296905517578
199 147.53099060058594
299 102.63783264160156
399 72.34272766113281
499 51.86297607421875
599 38.002010345458984
699 28.61264419555664
799 22.24827003479004
899 17.932327270507812
999 15.004491806030273
1099 13.0177001953125
1199 11.669212341308594
1299 10.753830909729004
1399 10.132356643676758
1499 9.71036434173584
1599 9.42381477355957
1699 9.22919750213623
1799 9.097031593322754
1899 9.007257461547852
1999 8.946285247802734
Result: y = -1.132859570041056e-10 + -2.208261013031006 * P3(-4.4554464284640716e-11 + 0.25548428297042847 x)


In [40]:
# PyTorch:nn
# -*- coding: utf-8 -*-
import torch
import math

# Input and target tensors: 2000 evenly spaced samples of sin(x).
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# The output y is modelled as a linear function of (x, x^2, x^3), i.e. a
# single linear layer applied to those three features.
# x.unsqueeze(-1) has shape (2000, 1) and p has shape (3,), so broadcasting
# produces a feature tensor of shape (2000, 3).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# nn.Sequential is a Module that contains other Modules and applies them in
# sequence. Linear holds the weight and bias tensors and computes a linear
# function of its input; Flatten turns the linear layer's output into a 1D
# tensor so that it matches the shape of `y`.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1),
)

# Mean Squared Error loss from the nn package, configured with
# reduction='sum'.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-6
for t in range(2000):
    # Forward pass. Modules override __call__, so the model is called like a
    # function: a Tensor of input data goes in, a Tensor of output data comes
    # out.
    y_pred = model(xx)

    # The loss function takes the predicted and true values of y and returns
    # a Tensor containing the loss.
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Clear the gradients left over from the previous iteration.
    model.zero_grad()

    # Backward pass: module parameters are Tensors with requires_grad=True,
    # so this computes gradients for every learnable parameter in the model.
    loss.backward()

    # Manual gradient-descent step over every parameter, outside of autograd.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

# Layers of a Sequential can be indexed like the items of a list.
linear_layer = model[0]

# A linear layer stores its parameters as `weight` and `bias`.
print(
    f"Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + "
    f"{linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3"
)

99 723.571533203125
199 481.4560546875
299 321.3550720214844
399 215.4867401123047
499 145.4801025390625
599 99.1874008178711
699 68.57588195800781
799 48.33369827270508
899 34.94819641113281
999 26.096834182739258
1099 20.243677139282227
1199 16.37320327758789
1299 13.813765525817871
1399 12.121288299560547
1499 11.002110481262207
1599 10.261998176574707
1699 9.772605895996094
1799 9.448976516723633
1899 9.234972953796387
1999 9.093460083007812
Result: y = 0.0004967047716490924 + 0.8405892252922058 x + -8.569059718865901e-05 x^2 + -0.0910329595208168 x^3


In [47]:
# PyTorch: optim
# -*- coding: utf-8 -*-
import torch
import math

# Input and target tensors: 2000 evenly spaced samples of sin(x).
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Feature tensor (x, x^2, x^3), built by broadcasting against the exponents.
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# Model and loss function from the nn package.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# The optim package provides Optimizers that update the model's weights for
# us. RMSprop is used here; the package contains many other algorithms. The
# first constructor argument tells the optimizer which Tensors to update.
learning_rate = 1e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

for t in range(2000):
    # Forward pass: run the feature tensor through the model.
    y_pred = model(xx)

    # Compute the loss and report it on every 100th iteration.
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Gradients are accumulated in buffers (not overwritten) whenever
    # .backward() is called, so zero them for every tensor the optimizer
    # updates before the backward pass. model.zero_grad() would also work.
    optimizer.zero_grad()

    # Backward pass: gradient of the loss with respect to model parameters.
    loss.backward()

    # One optimizer step updates the parameters.
    optimizer.step()

linear_layer = model[0]
print(
    f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + '
    f'{linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3'
)

99 3619.53369140625
199 2330.880126953125
299 1836.126220703125
399 1480.8514404296875
499 1188.1561279296875
599 945.32275390625
699 743.3233032226562
799 575.8994750976562
899 438.266357421875
999 325.9455871582031
1099 234.63442993164062
1199 161.29855346679688
1299 104.43608093261719
1399 62.82135009765625
1499 35.086944580078125
1599 18.795944213867188
1699 11.458581924438477
1799 9.249364852905273
1899 8.924577713012695
1999 8.912392616271973
Result: y = 0.0005184923647902906 + 0.8566716909408569 x + 0.0005210209055803716 x^2 + -0.09277936816215515 x^3


In [56]:
# PyTorch: Custom nn Modules
# For complex models, you can define your own Modules by subclassing nn.Module and
# defining a forward method, instead of composing a sequence of existing Modules

# -*- coding: utf-8 -*-
import torch
import math

class Polynomial3(torch.nn.Module):
    """Cubic polynomial y = a + b x + c x^2 + d x^3 with learnable coefficients."""

    def __init__(self):
        """
        Create the four coefficients a, b, c and d as zero-dimensional
        Parameters drawn from torch.randn and attach them to the module.
        """
        super().__init__()
        # Assigning a Parameter as an attribute registers it with the module;
        # the loop keeps the a, b, c, d creation order.
        for coefficient in ("a", "b", "c", "d"):
            setattr(self, coefficient, torch.nn.Parameter(torch.randn(())))

    def forward(self, x):
        """
        Evaluate the polynomial on the input Tensor ``x`` and return the
        resulting Tensor. Arbitrary tensor operators may be used here.
        """
        constant_and_linear = self.a + self.b * x
        quadratic = self.c * x ** 2
        cubic = self.d * x ** 3
        return constant_and_linear + quadratic + cubic

    def string(self):
        """
        Return the fitted polynomial as a human-readable equation. Like any
        Python class, a PyTorch module can define custom methods.
        """
        a, b, c, d = (coeff.item() for coeff in (self.a, self.b, self.c, self.d))
        return f'y = {a} + {b} x + {c} x^2 + {d} x^3'
    

 # Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)  # 2000 samples over [-pi, pi]
y = torch.sin(x)                             # regression targets

# Instantiate the custom module defined above.
model = Polynomial3()

# Loss function and optimizer. model.parameters() hands SGD the learnable
# parameters (those defined with torch.nn.Parameter) that belong to the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)
for t in range(2000):
    # Forward pass: run the inputs through the model.
    y_pred = model(x)

    # Compute the loss and report it on every 100th iteration.
    loss = criterion(y_pred, y)
    report_now = (t + 1) % 100 == 0
    if report_now:
        print(t, loss.item())

    # Clear old gradients, backpropagate, then let the optimizer update the
    # weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

99 3975.397216796875
199 2688.24072265625
299 1820.58447265625
399 1235.1533203125
499 839.7623901367188
599 572.4539794921875
699 391.5525817871094
799 268.99908447265625
899 185.8855438232422
999 129.45811462402344
1099 91.1063232421875
1199 65.01091003417969
1299 47.23489761352539
1399 35.11223602294922
1499 26.835609436035156
1599 21.178281784057617
1699 17.30684471130371
1799 14.654511451721191
1899 12.835271835327148
1999 11.586068153381348
Result: y = -0.04373353719711304 + 0.8250649571418762 x + 0.007544761057943106 x^2 + -0.08882477134466171 x^3


In [74]:
#PyTorch: Control Flow + Weight Sharing

# -*- coding: utf-8 -*-
import random
import torch
import math

class DynamicNet(torch.nn.Module):
    """
    Cubic polynomial plus a random number of higher-order terms that all
    share one coefficient, illustrating Python control flow and parameter
    reuse inside a forward pass.
    """

    def __init__(self):
        """
        Create five zero-dimensional Parameters (a, b, c, d, e) drawn from
        torch.randn and attach them to the module.
        """
        super().__init__()
        # Assigning a Parameter as an attribute registers it with the module;
        # the loop keeps the a, b, c, d, e creation order.
        for coefficient in ("a", "b", "c", "d", "e"):
            setattr(self, coefficient, torch.nn.Parameter(torch.randn(())))

    def forward(self, x):
        """
        Evaluate the cubic a + b x + c x^2 + d x^3, then add e * x^k for each
        k in range(4, n), where n = random.randint(4, 6) is drawn anew on
        every call. Depending on the draw this adds no extra term, the x^4
        term, or both the x^4 and x^5 terms.

        Each forward pass builds its own dynamic computation graph, so
        ordinary Python loops and conditionals can be used here, and the
        same parameter (e) can safely appear several times in one graph.
        """
        result = self.a + self.b * x
        result = result + self.c * x ** 2
        result = result + self.d * x ** 3

        stop_order = random.randint(4, 6)
        order = 4
        while order < stop_order:
            result = result + self.e * x ** order
            order += 1
        return result

    def string(self):
        """
        Return the model as a human-readable equation. The x^4 and x^5 terms
        are marked with '?' because they are only present on some forward
        passes.
        """
        a, b, c, d, e = (
            coeff.item() for coeff in (self.a, self.b, self.c, self.d, self.e)
        )
        cubic_part = f'y = {a} + {b} x + {c} x^2 + {d} x^3 + '
        optional_part = f'{e} x^4 ? + {e} x^5 ?'
        return cubic_part + optional_part

# Input and target tensors: 2000 evenly spaced samples of sin(x).
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Instantiate the custom module defined above and show its repr.
model = DynamicNet()
print(model)

# Loss function and optimizer. Training this strange model with vanilla
# stochastic gradient descent is tough, so momentum is used.
# NOTE(review): in the captured run the loss is still in the thousands after
# these 2000 steps - consider training longer if a tighter fit is needed.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9)

for t in range(2000):
    # Forward pass; the set of terms in the model is re-drawn on each call.
    y_pred = model(x)

    # Compute the loss and report it on every 100th iteration.
    loss = criterion(y_pred, y)
    report_now = (t + 1) % 100 == 0
    if report_now:
        print(t, loss.item())

    # Clear old gradients, backpropagate, then let the optimizer update the
    # weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

DynamicNet()
99 62420520.0
199 1332840.125
299 4954.85791015625
399 4458.6494140625
499 5308.33056640625
599 4889.14501953125
699 3563.900634765625
799 3256.008056640625
899 3288.347412109375
999 3176.693115234375
1099 3017.99072265625
1199 2898.8603515625
1299 2668.00634765625
1399 2918.82568359375
1499 2567.76171875
1599 2471.395263671875
1699 2858.795166015625
1799 2269.5537109375
1899 2656.509521484375
1999 2102.899658203125
Result: y = -0.2559147775173187 + 2.239171266555786 x + 0.03527602553367615 x^2 + -0.2964957654476166 x^3 + 0.00017770705744624138 x^4 ? + 0.00017770705744624138 x^5 ?
