In [None]:
# https://towardsdatascience.com/understanding-pytorch-with-an-example-a-step-by-step-tutorial-81fc5f8c4e8e
# Data Generation

import numpy as np

In [None]:
# Seed NumPy's global generator so every run produces the same synthetic data
np.random.seed(42)

# Features: 100 uniform draws in [0, 1), kept as a (100, 1) column
x = np.random.rand(100, 1)
# Targets: the line y = 1 + 2x plus Gaussian noise scaled by 0.1
noise = .1 * np.random.randn(100, 1)
y = 1 + 2 * x + noise

# Row indices 0..99, shuffled in place
idx = np.arange(100)
np.random.shuffle(idx)

In [None]:
# Split the shuffled indices: the first 80 are used for training,
# the remaining ones are held out for validation
train_idx, val_idx = idx[:80], idx[80:]

# Index the full arrays to build the train and validation sets
x_train, x_val = x[train_idx], x[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

In [None]:
# Gradient descent
# Draw random initial values for the two parameters "a" and "b"
a, b = np.random.randn(1), np.random.randn(1)

print(a,b)

In [None]:
# Learning rate (step size) used by the gradient-descent updates
lr = 0.1
lr

In [None]:
# Number of epochs to run
n_epochs = 100

for epoch in range(n_epochs):
    # Forward pass: predictions of the current linear model
    yhat = a + b * x_train

    # Residuals between targets and predictions; this is a regression,
    # so the loss is their mean squared error (MSE)
    error = y_train - yhat
    loss = np.mean(error ** 2)

    # Gradients of the MSE with respect to "a" and "b"
    a_grad = -2 * np.mean(error)
    b_grad = -2 * np.mean(x_train * error)

    # Gradient-descent step: move each parameter against its gradient,
    # scaled by the learning rate
    a = a - lr * a_grad
    b = b - lr * b_grad

print(a, b)

In [None]:
 # Sanity check: fit the same training data with scikit-learn and print its
 # intercept and coefficient next to the gradient-descent result
 from sklearn.linear_model import LinearRegression
 linr = LinearRegression().fit(x_train, y_train)
 print(linr.intercept_, linr.coef_[0])

In [None]:
# A scalar (a single number) has zero dimensions, 
# a vector has one dimension, 
# a matrix has two dimensions and 
# a tensor has three or more dimensions. That’s it!
# But, to keep things simple, it is commonplace to 
# call vectors and matrices tensors as well — so, 
# from now on, everything is either a scalar or a tensor.
import torch
import torch.optim as optim
import torch.nn as nn
# from torchviz import make_dot


In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# The data lives in NumPy arrays; wrap each array as a PyTorch tensor,
# cast it to 32-bit floats and send it to the chosen device
x_train_tensor = torch.from_numpy(x_train).to(dtype=torch.float).to(device)
y_train_tensor = torch.from_numpy(y_train).to(dtype=torch.float).to(device)

# Compare the Python types of the array and the tensor; the tensor's
# .type() string is more informative since it also reflects WHERE
# the tensor is (device)
print(type(x_train), type(x_train_tensor), x_train_tensor.type())

In [None]:
# First
# Initialize parameters "a" and "b" randomly, ALMOST as we did
# in numpy. Since we want to apply gradient descent to these 
# parameters, 
# we need to set requires_grad=True
a = torch.randn(1, requires_grad=True, dtype=torch.float)
b = torch.randn(1, requires_grad=True, dtype=torch.float)
print(a, b)

# Second
# But what if we want to run it on a GPU? We could just send them
# to the device, right?
a = torch.randn(1, requires_grad=True, dtype=torch.float).to(device)
b = torch.randn(1, requires_grad=True, dtype=torch.float).to(device)
print(a, b)
# Sorry, but NO! The to(device) "shadows" the gradient
# NOTE(review): presumably this only matters when device is a GPU, where
# .to(device) has to produce a new tensor - confirm against the PyTorch docs.

# Third
# We can either create regular tensors and send them to the device ...
a = torch.randn(1, dtype=torch.float).to(device)
b = torch.randn(1, dtype=torch.float).to(device)
# ... and THEN set them as requiring gradients
a.requires_grad_()
b.requires_grad_()
# In PyTorch, every method that ends with an underscore (_) makes 
# changes in-place, meaning, they will modify the underlying variable.
print(a, b)

In [None]:
# The last approach worked fine, but it is much better to assign a tensor
# to its device (and mark it as requiring gradients) at creation time.
# RECOMMENDED!
a = torch.randn(1, dtype=torch.float, device=device, requires_grad=True)
b = torch.randn(1, dtype=torch.float, device=device, requires_grad=True)
print(a, b)

In [None]:
# Autograd
# Autograd is PyTorch's automatic differentiation package.
# Thanks to it, we don't need to worry about partial derivatives,
# the chain rule or anything like it.
#
# How do we tell PyTorch to compute all gradients? By calling backward()
# on the starting point of the differentiation: the loss, whose partial
# derivatives w.r.t. our parameters we computed by hand before.
# Hence: loss.backward().
#
# The actual gradient values can be inspected through the grad attribute
# of each parameter tensor.
#
# The documentation states that gradients are ACCUMULATED, so every time
# we use the gradients to update the parameters we must zero them
# afterwards. That is what zero_() is for.

lr = 1e-1
n_epochs = 100

# Seed PyTorch's random number generator
torch.manual_seed(42)

a = torch.randn(1, dtype=torch.float, device=device, requires_grad=True)
b = torch.randn(1, dtype=torch.float, device=device, requires_grad=True)

for epoch in range(n_epochs):

    # Forward pass and MSE loss, expressed with tensor operations
    yhat = a + b * x_train_tensor
    error = y_train_tensor - yhat
    loss = error.pow(2).mean()

    # No more manual computation of gradients: PyTorch works its way
    # BACKWARDS from the specified loss
    # (print a.grad / b.grad after this call to inspect the gradients)
    loss.backward()

    # The update itself must stay out of the gradient computation, hence
    # NO_GRAD. This boils down to the DYNAMIC GRAPH that PyTorch uses ...
    with torch.no_grad():
        for param in (a, b):
            param -= lr * param.grad

    # PyTorch is "clingy" to its computed gradients; reset them in place
    # so they do not carry over into the next epoch
    for param in (a, b):
        param.grad.zero_()

print(a, b)

In [None]:
# Recreate the parameters from the same seed and run one forward pass,
# leaving yhat, error and loss available for inspection
torch.manual_seed(42)
a = torch.randn(1, dtype=torch.float, device=device, requires_grad=True)
b = torch.randn(1, dtype=torch.float, device=device, requires_grad=True)

yhat = a + b * x_train_tensor
error = y_train_tensor - yhat
loss = error.pow(2).mean()

In [None]:
# from torchviz import make_dot

In [None]:
# make_dot(yhat)

In [None]:
# So far, we've been manually updating the parameters using the
# computed gradients.
# That's probably fine for two parameters... but what if we had
# a whole lot of them?!
# We use one of PyTorch's optimizers, like SGD or Adam.
# An optimizer takes the parameters we want to update, the learning rate
# we want to use (and possibly many other hyper-parameters as well!) and
# performs the updates through its step() method.
# Besides, we also don't need to zero the gradients one by one anymore.
# We just invoke the optimizer's zero_grad() method and that's it!
#
torch.manual_seed(42)
# Use torch.float here, exactly as in the other cells: the Python builtin
# `float` would be interpreted as a 64-bit dtype, which matches neither the
# float32 training tensors nor the parameters created elsewhere.
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
print(a, b)

lr = 1e-1
n_epochs = 100

# Define a SGD optimizer to update the parameters
optimizer = optim.SGD([a, b], lr=lr)

for epoch in range(n_epochs):
    # Forward pass and MSE loss
    yhat = a + b * x_train_tensor
    error = y_train_tensor - yhat
    loss = (error ** 2).mean()

    loss.backward()

    # No more manual update (previously: `a -= lr * a.grad` and
    # `b -= lr * b.grad` inside torch.no_grad())
    optimizer.step()

    # No more zeroing each gradient by hand (previously: a.grad.zero_()
    # and b.grad.zero_())
    optimizer.zero_grad()

print(a, b)

In [None]:
# Loss
# We now tackle the loss computation; PyTorch has us covered once again.
# There are many loss functions to choose from, depending on the task.
# Since ours is a regression, we use the Mean Squared Error (MSE) loss.
# Notice that nn.MSELoss actually CREATES a loss function for us -
# it is NOT the loss function itself. You can also specify a reduction
# method, that is, how the results for individual points are aggregated:
# averaged (reduction='mean') or summed up (reduction='sum').

torch.manual_seed(42)
a = torch.randn(1, dtype=torch.float, device=device, requires_grad=True)
b = torch.randn(1, dtype=torch.float, device=device, requires_grad=True)

print(a, b)



In [None]:
lr = 1e-1
n_epochs = 100

# Defines a MSE loss function
loss_fn = nn.MSELoss(reduction='mean')

optimizer = optim.SGD([a, b], lr=lr)

for epoch in range(n_epochs):
    yhat = a + b * x_train_tensor

    # No more manual loss!
    # PyTorch losses are called as loss_fn(prediction, target); MSE is
    # symmetric so the value is unchanged, but keep the conventional order.
    loss = loss_fn(yhat, y_train_tensor)

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

print(a, b)

In [None]:
# Model
# In PyTorch, a model is represented by a regular Python class that 
# inherits from the Module class.
# The most fundamental methods it needs to implement are:
# __init__(self): it defines the parts that make up the model —in our case, 
# two parameters, a and b.
# You are not limited to defining parameters, though… models can contain 
# other models (or layers) 
# as its attributes as well, so you can easily nest them. We’ll see 
# an example of this shortly as well.
# forward(self, x): it performs the actual computation, that is, 
# it outputs a prediction, given the input x.
# You should NOT call the forward(x) method, though. You should call 
# the whole model itself, 
# as in model(x) to perform a forward pass and output predictions.

class ManualLinearRegression(nn.Module):
    """Linear model ``a + b * x`` with two hand-made scalar parameters."""

    def __init__(self):
        """Create the parameters "a" and "b" from random starting values."""
        super().__init__()
        # Wrapping the tensors with nn.Parameter tells PyTorch that they are
        # parameters of the model they are an attribute of. That way the
        # model's parameters() method yields them (including parameters of
        # nested models), so they can be fed to an optimizer without building
        # a list by hand, and state_dict() reports their current values.
        intercept_init = torch.randn(1, requires_grad=True, dtype=torch.float)
        self.a = nn.Parameter(intercept_init)
        slope_init = torch.randn(1, requires_grad=True, dtype=torch.float)
        self.b = nn.Parameter(slope_init)

    def forward(self, x):
        """Compute the outputs / predictions for input ``x``."""
        return self.a + self.b * x

In [None]:
# IMPORTANT: we need to send our model to the same device where the data is.
# If our data is made of GPU tensors, our model must "live" inside the GPU as well.
# We can use all these handy methods to change our code, which should be
# looking like this:
torch.manual_seed(42)

# Now we can create a model and send it at once to the device
model = ManualLinearRegression().to(device)

# We can also inspect its parameters using its state_dict
print(model.state_dict())

lr = 1e-1
n_epochs = 100

loss_fn = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=lr)

for epoch in range(n_epochs):

    # In PyTorch, models have a train() method which, somewhat disappointingly,
    # does NOT perform a training step. Its only purpose is to set the model to
    # training mode. Why is this important? Some models may use mechanisms like
    # Dropout, for instance, which behave differently in training and evaluation.
    model.train()
    # No more manual prediction (previously: yhat = a + b * x_tensor)
    yhat = model(x_train_tensor)

    # Loss functions are called as loss_fn(prediction, target)
    loss = loss_fn(yhat, y_train_tensor)
    loss.backward()

    optimizer.step()
    optimizer.zero_grad()

print(model.state_dict())

In [None]:
# In our model, we manually created two parameters to perform a linear regression. 
# Let’s use PyTorch’s Linear model as an attribute of our own, thus creating 
# a nested model.

# In the __init__ method, we created an attribute that contains 
# our nested Linear model.
# In the forward() method, we call the nested model itself to 
# perform the forward pass 
# (notice, we are not calling self.linear.forward(x)!).

class LayerLinearRegression(nn.Module):
    """Linear regression implemented with a nested ``nn.Linear`` layer."""

    def __init__(self):
        """Create the nested layer: one input feature, one output feature."""
        super().__init__()
        # A Linear layer replaces the hand-made parameters
        self.linear = nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        """Return the predictions obtained by calling the nested layer on ``x``."""
        prediction = self.linear(x)
        return prediction

In [None]:
torch.manual_seed(42)
# Now we can create a model and send it at once to the device
model = LayerLinearRegression().to(device)
# We can also inspect its parameters using its state_dict
print(model.state_dict())

lr = 1e-1
n_epochs = 100

loss_fn = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=lr)

for epoch in range(n_epochs):

    # Set the model to training mode
    model.train()

    yhat = model(x_train_tensor)
    # Loss functions are called as loss_fn(prediction, target)
    loss = loss_fn(yhat, y_train_tensor)
    loss.backward()

    optimizer.step()
    optimizer.zero_grad()

print(model.state_dict())

In [None]:
torch.manual_seed(42)
# The same single-layer model, built with nn.Sequential instead of a class
model = nn.Sequential(nn.Linear(1, 1)).to(device)
print(model.state_dict())

lr = 1e-1
n_epochs = 100

loss_fn = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=lr)

for epoch in range(n_epochs):
    model.train()
    yhat = model(x_train_tensor)
    # Loss functions are called as loss_fn(prediction, target)
    loss = loss_fn(yhat, y_train_tensor)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
print(model.state_dict())

In [None]:
# Training Step
# So far, we’ve defined an optimizer, a loss function and a model. 
# Scroll up a bit and take a quick look at the code inside the loop. 
# Would it change if we were using a different optimizer, or loss, 
# or even model? 
# If not, how can we make it more generic?
# Well, I guess we could say all these lines of code perform a training 
# step, 
# given those three elements (optimizer, loss and model),the features 
# and the labels.
# So, how about writing a function that takes those three elements and 
# returns another function that performs a training step, taking a set 
# of 
# features and labels as arguments and returning the corresponding loss?
# Then we can use this general-purpose function to build a train_step() 
# function 
# to be called inside our training loop. Now our code should look 
# like this… 
# see how tiny the training loop is now?
def make_train_step(model, loss_fn, optimizer):
    """Build a function that performs one step of the training loop.

    Args:
        model: the model to train; it is called as ``model(x)`` and must
            provide ``train()``.
        loss_fn: callable invoked as ``loss_fn(prediction, target)`` whose
            result supports ``backward()`` and ``item()``.
        optimizer: object providing ``step()`` and ``zero_grad()``.

    Returns:
        A function ``train_step(x, y)`` that runs a forward pass on ``x``,
        computes the loss against ``y``, back-propagates, updates the
        parameters, zeroes the gradients and returns ``loss.item()``.
    """
    def train_step(x, y):
        # Sets model to TRAIN mode
        model.train()
        # Makes predictions
        yhat = model(x)
        # Computes loss. The prediction goes first and the target second,
        # so that non-symmetric loss functions also get what they expect.
        loss = loss_fn(yhat, y)
        # Computes gradients
        loss.backward()
        # Updates parameters and zeros gradients
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss
        return loss.item()
    # Returns the function that will be called inside the train loop
    return train_step

In [None]:
# Creates the train_step function for our model, loss function and optimizer
train_step = make_train_step(model, loss_fn, optimizer)
losses = []

# For each epoch ...
for epoch in range(n_epochs):
    # Performs one train step and records the corresponding loss
    # (the list was previously created but never filled)
    loss = train_step(x_train_tensor, y_train_tensor)
    losses.append(loss)

print(model.state_dict())

In [None]:
# PyTorch Dataset
from torch.utils.data import Dataset, TensorDataset

class CustomDataset(Dataset):
    """Dataset pairing a feature tensor with a label tensor, item by item."""

    def __init__(self, x_tensor, y_tensor):
        """Store the two tensors as given (no copy, no conversion)."""
        self.x, self.y = x_tensor, y_tensor

    def __getitem__(self, index):
        """Return the ``(x[index], y[index])`` tuple."""
        features = self.x[index]
        label = self.y[index]
        return (features, label)

    def __len__(self):
        """Return the length of the feature tensor."""
        return len(self.x)

# Notice there is no .to(device) here: these tensors are deliberately
# NOT sent to the device when they are created
x_train_tensor = torch.from_numpy(x_train).to(torch.float)
y_train_tensor = torch.from_numpy(y_train).to(torch.float)

# Wrap the tensors with our custom dataset and look at the first sample
train_data = CustomDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

# Do the same with the TensorDataset class imported from torch.utils.data
train_data = TensorDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

In [None]:
# Show which dataset class train_data currently is
type(train_data)

In [None]:
# DataLoader
# if we want to go serious about all this, we must use mini-batch gradient descent. 
# Thus, we need mini-batches. Thus, we need to slice our dataset accordingly. 

from torch.utils.data import DataLoader

# Serve the training data in shuffled mini-batches of 16 samples
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)

In [None]:
# Fetch one mini-batch from the loader to inspect it
next(iter(train_loader))

In [None]:
# One loss value is recorded per mini-batch update
losses = []
train_step = make_train_step(model, loss_fn, optimizer)

for epoch in range(n_epochs):
    for x_batch, y_batch in train_loader:
        # The dataset "lives" in the CPU, and so do our mini-batches;
        # send each one to the device where the model "lives"
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        loss = train_step(x_batch, y_batch)
        losses.append(loss)
print(model.state_dict())

In [None]:
# Estimate a and b
# Seed the generator, as every other training cell does, so the fresh
# model's initial weights (and any later random draws from the global
# generator) are the same on every run
torch.manual_seed(42)

# Alternative model: ManualLinearRegression().to(device)
model = nn.Sequential(nn.Linear(1, 1)).to(device)
loss_fn = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=1e-1)
train_step = make_train_step(model, loss_fn, optimizer)
print(model.state_dict())

In [None]:
# PyTorch’s random_split() method is an easy and familiar way of performing 
# a training-validation split. Just keep in mind that, in our example,
# we need to apply it to the whole dataset (not the training dataset 
# we built in two sections ago).

from torch.utils.data.dataset import random_split 

# Convert the WHOLE dataset (not only the earlier training split) to
# 32-bit float tensors
x_tensor = torch.from_numpy(x).to(torch.float)
y_tensor = torch.from_numpy(y).to(torch.float)

dataset = TensorDataset(x_tensor, y_tensor)

# Randomly split the 100 samples into 80 for training and 20 for validation
train_dataset, val_dataset = random_split(dataset, lengths=[80, 20])

# Mini-batches of 16 for training; the validation set fits in one batch of 20
train_loader = DataLoader(train_dataset, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=20)


In [None]:
# There are two small, yet important, things to consider:
# torch.no_grad(): even though it won't make a difference in our
# simple model, it is good practice to wrap the validation inner loop
# with this context manager to disable any gradient calculation that
# you may inadvertently trigger -
# gradients belong in training, not in validation steps;
# eval(): the only thing it does is set the model to evaluation mode
# (just like its train() counterpart did), so the model can adjust
# its behavior regarding some operations, like Dropout.

training_losses = []
validation_losses = []

train_step = make_train_step(model, loss_fn, optimizer)

for epoch in range(n_epochs):
    # Training: one pass over the mini-batches, then average their losses
    batch_losses = []
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        loss = train_step(x_batch, y_batch)
        batch_losses.append(loss)
    training_loss = np.mean(batch_losses)
    training_losses.append(training_loss)

    # Validation: switch to evaluation mode once per epoch (train_step
    # switches back to training mode on its next call)
    model.eval()
    with torch.no_grad():
        val_losses = []
        # Dedicated loop names, so the NumPy arrays x_val / y_val created
        # by the manual split earlier in the notebook are not overwritten
        for x_val_batch, y_val_batch in val_loader:
            x_val_batch = x_val_batch.to(device)
            y_val_batch = y_val_batch.to(device)

            yhat = model(x_val_batch)
            # Loss functions are called as loss_fn(prediction, target)
            val_loss = loss_fn(yhat, y_val_batch)
            val_losses.append(val_loss.item())
        validation_loss = np.mean(val_losses)
        validation_losses.append(validation_loss)
    print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")
print(model.state_dict())

In [None]:
import matplotlib.pyplot as plt 
# Learning curves: per-epoch mean training loss (blue) and validation
# loss (red), labelled so the figure can be read on its own
plt.plot(np.arange(len(training_losses)), training_losses, "-b",
         label="Training loss")
plt.plot(np.arange(len(validation_losses)), validation_losses, "-r",
         label="Validation loss")
plt.xlabel("Epoch")
plt.ylabel("MSE loss")
plt.legend()
plt.show()
#plt.plot(np.arange(len(losses)), losses, "-b", 
#         np.arange(len(val_losses)), val_losses, "-r")