##### Epic Tutorial: https://towardsdatascience.com/understanding-pytorch-with-an-example-a-step-by-step-tutorial-81fc5f8c4e8e

In [1]:
import numpy as np

In [3]:
# Synthetic data set: y = 1 + 2x plus Gaussian noise scaled by 0.1 (100 points, fixed seed)
np.random.seed(42)
x = np.random.rand(100, 1)
noise = .1 * np.random.randn(100, 1)
y = 1 + 2 * x + noise

# Put the row indices 0..99 into a random order
idx = np.arange(100)
np.random.shuffle(idx)

# First 80 shuffled indices go to training, the remaining 20 to validation
train_idx, val_idx = idx[:80], idx[80:]

# Select the matching rows of features and targets for each subset
x_train, x_val = x[train_idx], x[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

In [4]:
# Gradient descent for the linear model y = a + b * x, written with plain NumPy
from sklearn.linear_model import LinearRegression

# Random starting values for intercept "a" and slope "b" (seeded, "a" drawn first)
np.random.seed(42)
a, b = np.random.randn(1), np.random.randn(1)

print(a, b)

# Step size and number of passes over the training set
lr = 1e-1
n_epochs = 1000

for _ in range(n_epochs):
    # Forward pass: predictions of the current line
    yhat = a + b * x_train

    # Residuals and their mean square (the MSE being minimized)
    error = y_train - yhat
    loss = np.mean(error ** 2)

    # Partial derivatives of the MSE with respect to "a" and "b"
    a_grad = -2 * error.mean()
    b_grad = -2 * (x_train * error).mean()

    # Move both parameters against their gradients
    a, b = a - lr * a_grad, b - lr * b_grad

print(a, b)

# Sanity check: scikit-learn's fit should give the same intercept and slope
linr = LinearRegression().fit(x_train, y_train)
print(linr.intercept_, linr.coef_[0])

[0.49671415] [-0.1382643]
[1.02354094] [1.96896411]
[1.02354075] [1.96896447]


In [69]:
import torch
import torch.optim as optim
import torch.nn as nn
from torchviz import make_dot

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# The data lives in NumPy arrays: convert each one to a float tensor
# and send it to the chosen device
x_train_tensor, y_train_tensor = (
    torch.from_numpy(array).float().to(device) for array in (x_train, y_train)
)

# type() only names the Python class, while Tensor.type() also reveals the
# dtype and whether the tensor is a CPU or CUDA tensor
print(type(x_train), type(x_train_tensor), x_train_tensor.type())

<class 'numpy.ndarray'> <class 'torch.Tensor'> torch.cuda.FloatTensor


In [9]:
# Autograd version: PyTorch computes the gradients, so no hand-written derivatives
lr = 1e-1
n_epochs = 1000

# Seeded random starting values; requires_grad=True makes autograd track them
torch.manual_seed(42)
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)

for _ in range(n_epochs):
    # Forward pass and mean squared error
    yhat = a + b * x_train_tensor
    error = y_train_tensor - yhat
    loss = error.pow(2).mean()

    # Backward pass populates a.grad and b.grad
    loss.backward()
    for param in (a, b):
        print(param.grad)

    # Update in place with gradient tracking disabled, so the update itself
    # is not recorded in the computation graph
    with torch.no_grad():
        for param in (a, b):
            param -= lr * param.grad

    # backward() accumulates into .grad, so reset it before the next epoch
    for param in (a, b):
        param.grad.zero_()

print(a, b)

tensor([-3.3881], device='cuda:0')
tensor([-1.9439], device='cuda:0')
tensor([-2.5268], device='cuda:0')
tensor([-1.5005], device='cuda:0')
tensor([-1.8796], device='cuda:0')
tensor([-1.1666], device='cuda:0')
tensor([-1.3935], device='cuda:0')
tensor([-0.9151], device='cuda:0')
tensor([-1.0283], device='cuda:0')
tensor([-0.7254], device='cuda:0')
tensor([-0.7541], device='cuda:0')
tensor([-0.5822], device='cuda:0')
tensor([-0.5483], device='cuda:0')
tensor([-0.4741], device='cuda:0')
tensor([-0.3938], device='cuda:0')
tensor([-0.3922], device='cuda:0')
tensor([-0.2780], device='cuda:0')
tensor([-0.3301], device='cuda:0')
tensor([-0.1912], device='cuda:0')
tensor([-0.2829], device='cuda:0')
tensor([-0.1262], device='cuda:0')
tensor([-0.2469], device='cuda:0')
tensor([-0.0777], device='cuda:0')
tensor([-0.2193], device='cuda:0')
tensor([-0.0414], device='cuda:0')
tensor([-0.1981], device='cuda:0')
tensor([-0.0144], device='cuda:0')
tensor([-0.1816], device='cuda:0')
tensor([0.0056], dev

tensor([-0.0279], device='cuda:0')
tensor([0.0141], device='cuda:0')
tensor([-0.0275], device='cuda:0')
tensor([0.0139], device='cuda:0')
tensor([-0.0271], device='cuda:0')
tensor([0.0136], device='cuda:0')
tensor([-0.0267], device='cuda:0')
tensor([0.0134], device='cuda:0')
tensor([-0.0263], device='cuda:0')
tensor([0.0132], device='cuda:0')
tensor([-0.0259], device='cuda:0')
tensor([0.0130], device='cuda:0')
tensor([-0.0255], device='cuda:0')
tensor([0.0128], device='cuda:0')
tensor([-0.0251], device='cuda:0')
tensor([0.0126], device='cuda:0')
tensor([-0.0247], device='cuda:0')
tensor([0.0125], device='cuda:0')
tensor([-0.0244], device='cuda:0')
tensor([0.0123], device='cuda:0')
tensor([-0.0240], device='cuda:0')
tensor([0.0121], device='cuda:0')
tensor([-0.0236], device='cuda:0')
tensor([0.0119], device='cuda:0')
tensor([-0.0233], device='cuda:0')
tensor([0.0117], device='cuda:0')
tensor([-0.0229], device='cuda:0')
tensor([0.0115], device='cuda:0')
tensor([-0.0226], device='cuda:0')

tensor([0.0023], device='cuda:0')
tensor([-0.0046], device='cuda:0')
tensor([0.0023], device='cuda:0')
tensor([-0.0045], device='cuda:0')
tensor([0.0023], device='cuda:0')
tensor([-0.0044], device='cuda:0')
tensor([0.0022], device='cuda:0')
tensor([-0.0044], device='cuda:0')
tensor([0.0022], device='cuda:0')
tensor([-0.0043], device='cuda:0')
tensor([0.0022], device='cuda:0')
tensor([-0.0042], device='cuda:0')
tensor([0.0021], device='cuda:0')
tensor([-0.0042], device='cuda:0')
tensor([0.0021], device='cuda:0')
tensor([-0.0041], device='cuda:0')
tensor([0.0021], device='cuda:0')
tensor([-0.0040], device='cuda:0')
tensor([0.0020], device='cuda:0')
tensor([-0.0040], device='cuda:0')
tensor([0.0020], device='cuda:0')
tensor([-0.0039], device='cuda:0')
tensor([0.0020], device='cuda:0')
tensor([-0.0039], device='cuda:0')
tensor([0.0019], device='cuda:0')
tensor([-0.0038], device='cuda:0')
tensor([0.0019], device='cuda:0')
tensor([-0.0037], device='cuda:0')
tensor([0.0019], device='cuda:0')


tensor([-0.0008], device='cuda:0')
tensor([0.0004], device='cuda:0')
tensor([-0.0007], device='cuda:0')
tensor([0.0004], device='cuda:0')
tensor([-0.0007], device='cuda:0')
tensor([0.0004], device='cuda:0')
tensor([-0.0007], device='cuda:0')
tensor([0.0004], device='cuda:0')
tensor([-0.0007], device='cuda:0')
tensor([0.0004], device='cuda:0')
tensor([-0.0007], device='cuda:0')
tensor([0.0004], device='cuda:0')
tensor([-0.0007], device='cuda:0')
tensor([0.0003], device='cuda:0')
tensor([-0.0007], device='cuda:0')
tensor([0.0003], device='cuda:0')
tensor([-0.0007], device='cuda:0')
tensor([0.0003], device='cuda:0')
tensor([-0.0007], device='cuda:0')
tensor([0.0003], device='cuda:0')
tensor([-0.0007], device='cuda:0')
tensor([0.0003], device='cuda:0')
tensor([-0.0006], device='cuda:0')
tensor([0.0003], device='cuda:0')
tensor([-0.0006], device='cuda:0')
tensor([0.0003], device='cuda:0')
tensor([-0.0006], device='cuda:0')
tensor([0.0003], device='cuda:0')
tensor([-0.0006], device='cuda:0')

tensor([-0.0001], device='cuda:0')
tensor([6.4231e-05], device='cuda:0')
tensor([-0.0001], device='cuda:0')
tensor([6.3159e-05], device='cuda:0')
tensor([-0.0001], device='cuda:0')
tensor([6.2273e-05], device='cuda:0')
tensor([-0.0001], device='cuda:0')
tensor([6.1347e-05], device='cuda:0')
tensor([-0.0001], device='cuda:0')
tensor([6.0582e-05], device='cuda:0')
tensor([-0.0001], device='cuda:0')
tensor([5.9600e-05], device='cuda:0')
tensor([-0.0001], device='cuda:0')
tensor([5.8705e-05], device='cuda:0')
tensor([-0.0001], device='cuda:0')
tensor([5.7837e-05], device='cuda:0')
tensor([-0.0001], device='cuda:0')
tensor([5.6868e-05], device='cuda:0')
tensor([-0.0001], device='cuda:0')
tensor([5.5869e-05], device='cuda:0')
tensor([-0.0001], device='cuda:0')
tensor([5.5023e-05], device='cuda:0')
tensor([-0.0001], device='cuda:0')
tensor([5.4283e-05], device='cuda:0')
tensor([-0.0001], device='cuda:0')
tensor([5.3387e-05], device='cuda:0')
tensor([-0.0001], device='cuda:0')
tensor([5.2570e-

tensor([-2.4954e-05], device='cuda:0')
tensor([1.2611e-05], device='cuda:0')
tensor([-2.4608e-05], device='cuda:0')
tensor([1.2333e-05], device='cuda:0')
tensor([-2.4277e-05], device='cuda:0')
tensor([1.2227e-05], device='cuda:0')
tensor([-2.3868e-05], device='cuda:0')
tensor([1.2118e-05], device='cuda:0')
tensor([-2.3473e-05], device='cuda:0')
tensor([1.1960e-05], device='cuda:0')
tensor([-2.3111e-05], device='cuda:0')
tensor([1.1732e-05], device='cuda:0')
tensor([-2.2784e-05], device='cuda:0')
tensor([1.1500e-05], device='cuda:0')
tensor([-2.2470e-05], device='cuda:0')
tensor([1.1255e-05], device='cuda:0')
tensor([-2.2158e-05], device='cuda:0')
tensor([1.1197e-05], device='cuda:0')
tensor([-2.1774e-05], device='cuda:0')
tensor([1.1108e-05], device='cuda:0')
tensor([-2.1408e-05], device='cuda:0')
tensor([1.0944e-05], device='cuda:0')
tensor([-2.1092e-05], device='cuda:0')
tensor([1.0894e-05], device='cuda:0')
tensor([-2.0701e-05], device='cuda:0')
tensor([1.0646e-05], device='cuda:0')

tensor([-5.0471e-06], device='cuda:0')
tensor([2.6161e-06], device='cuda:0')
tensor([-4.9580e-06], device='cuda:0')
tensor([2.5821e-06], device='cuda:0')
tensor([-4.8858e-06], device='cuda:0')
tensor([2.5565e-06], device='cuda:0')
tensor([-4.8155e-06], device='cuda:0')
tensor([2.5011e-06], device='cuda:0')
tensor([-4.7556e-06], device='cuda:0')
tensor([2.5053e-06], device='cuda:0')
tensor([-4.6529e-06], device='cuda:0')
tensor([2.4876e-06], device='cuda:0')
tensor([-4.5736e-06], device='cuda:0')
tensor([2.4019e-06], device='cuda:0')
tensor([-4.5378e-06], device='cuda:0')
tensor([2.4252e-06], device='cuda:0')
tensor([-4.4265e-06], device='cuda:0')
tensor([2.3986e-06], device='cuda:0')
tensor([-4.3471e-06], device='cuda:0')
tensor([2.3651e-06], device='cuda:0')
tensor([-4.2728e-06], device='cuda:0')
tensor([2.3558e-06], device='cuda:0')
tensor([-4.1875e-06], device='cuda:0')
tensor([2.3199e-06], device='cuda:0')
tensor([-4.1242e-06], device='cuda:0')
tensor([2.1812e-06], device='cuda:0')

tensor([-1.0040e-06], device='cuda:0')
tensor([5.5600e-07], device='cuda:0')
tensor([-1.0434e-06], device='cuda:0')
tensor([6.6496e-07], device='cuda:0')
tensor([-9.7021e-07], device='cuda:0')
tensor([5.3365e-07], device='cuda:0')
tensor([-1.0058e-06], device='cuda:0')
tensor([6.4494e-07], device='cuda:0')
tensor([-9.3086e-07], device='cuda:0')
tensor([5.2759e-07], device='cuda:0')
tensor([-9.6869e-07], device='cuda:0')
tensor([6.5519e-07], device='cuda:0')
tensor([-8.7474e-07], device='cuda:0')
tensor([5.4669e-07], device='cuda:0')
tensor([-9.0536e-07], device='cuda:0')
tensor([6.3749e-07], device='cuda:0')
tensor([-8.4157e-07], device='cuda:0')
tensor([5.1083e-07], device='cuda:0')
tensor([-8.8313e-07], device='cuda:0')
tensor([6.2957e-07], device='cuda:0')
tensor([-8.0210e-07], device='cuda:0')
tensor([5.2433e-07], device='cuda:0')
tensor([-8.2317e-07], device='cuda:0')
tensor([6.1933e-07], device='cuda:0')
tensor([-7.6508e-07], device='cuda:0')
tensor([4.7870e-07], device='cuda:0')

tensor([-5.7358e-07], device='cuda:0')
tensor([5.4762e-07], device='cuda:0')
tensor([-5.7358e-07], device='cuda:0')
tensor([5.4762e-07], device='cuda:0')
tensor([-5.7358e-07], device='cuda:0')
tensor([5.4762e-07], device='cuda:0')
tensor([-5.7358e-07], device='cuda:0')
tensor([5.4762e-07], device='cuda:0')
tensor([-5.7358e-07], device='cuda:0')
tensor([5.4762e-07], device='cuda:0')
tensor([-5.7358e-07], device='cuda:0')
tensor([5.4762e-07], device='cuda:0')
tensor([-5.7358e-07], device='cuda:0')
tensor([5.4762e-07], device='cuda:0')
tensor([-5.7358e-07], device='cuda:0')
tensor([5.4762e-07], device='cuda:0')
tensor([-5.7358e-07], device='cuda:0')
tensor([5.4762e-07], device='cuda:0')
tensor([-5.7358e-07], device='cuda:0')
tensor([5.4762e-07], device='cuda:0')
tensor([-5.7358e-07], device='cuda:0')
tensor([5.4762e-07], device='cuda:0')
tensor([-5.7358e-07], device='cuda:0')
tensor([5.4762e-07], device='cuda:0')
tensor([-5.7358e-07], device='cuda:0')
tensor([5.4762e-07], device='cuda:0')

In [20]:
# Re-create both parameters ("a" is drawn first, then "b") and run a single
# forward pass, so yhat / error / loss exist for inspection
torch.manual_seed(42)
a, b = (
    torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
    for _ in range(2)
)

yhat = a + b * x_train_tensor
error = y_train_tensor - yhat
loss = error.pow(2).mean()
# make_dot(yhat)

In [22]:
# Optimizer version: optim.SGD performs the parameter update and the gradient reset
torch.manual_seed(42)
a, b = (
    torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
    for _ in range(2)
)
print(a, b)

lr = 1e-1
n_epochs = 1000

# SGD optimizer in charge of the two parameters
optimizer = optim.SGD([a, b], lr=lr)

for _ in range(n_epochs):
    yhat = a + b * x_train_tensor
    loss = (y_train_tensor - yhat).pow(2).mean()

    loss.backward()

    # step() replaces the manual update:
    #     with torch.no_grad():
    #         a -= lr * a.grad
    #         b -= lr * b.grad
    optimizer.step()

    # zero_grad() replaces the manual reset:
    #     a.grad.zero_()
    #     b.grad.zero_()
    optimizer.zero_grad()

print(a, b)

tensor([0.1940], device='cuda:0', requires_grad=True) tensor([0.1391], device='cuda:0', requires_grad=True)
tensor([1.0235], device='cuda:0', requires_grad=True) tensor([1.9690], device='cuda:0', requires_grad=True)


In [70]:
# Loss-function version: nn.MSELoss replaces the hand-written mean squared error
torch.manual_seed(42)
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
print(a, b)

lr = 1e-1
n_epochs = 1000

# Mean squared error, averaged over all elements
loss_fn = nn.MSELoss(reduction='mean')

optimizer = optim.SGD([a, b], lr=lr)

for epoch in range(n_epochs):
    yhat = a + b * x_train_tensor

    # No more manual loss!
    # error = y_train_tensor - yhat
    # loss = (error ** 2).mean()
    # torch.nn losses are called as loss_fn(input, target): predictions first,
    # ground truth second. MSE is symmetric, so the value is the same either way,
    # but this order stays correct if the loss is swapped for a non-symmetric one.
    loss = loss_fn(yhat, y_train_tensor)

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

print(a, b)

tensor([0.1940], device='cuda:0', requires_grad=True) tensor([0.1391], device='cuda:0', requires_grad=True)
tensor([1.0235], device='cuda:0', requires_grad=True) tensor([1.9690], device='cuda:0', requires_grad=True)


In [106]:
# start to build a model
class ManualLinearRegression(nn.Module):
    """Linear regression ``a + b * x`` with two explicitly declared parameters."""

    def __init__(self):
        super().__init__()
        # Wrapping each tensor in nn.Parameter registers it with the module, so it
        # appears in parameters() and state_dict(); "a" is drawn before "b"
        for name in ("a", "b"):
            initial = torch.randn(1, requires_grad=True, dtype=torch.float)
            self.register_parameter(name, nn.Parameter(initial))

    def forward(self, x):
        """Return the predictions ``a + b * x`` for the input ``x``."""
        scaled = self.b * x
        return self.a + scaled

In [107]:
torch.manual_seed(42)

# Build the model and move all of its parameters to the device in one call
model = ManualLinearRegression().to(device)
# state_dict() exposes the current value of every parameter
print(model.state_dict())

lr = 1e-1
n_epochs = 1000

loss_fn = nn.MSELoss(reduction='mean')
# parameters() yields every parameter of the model for the optimizer to update
optimizer = optim.SGD(model.parameters(), lr=lr)

# Put the model in training mode. Nothing inside the loop switches the mode,
# so setting it once before the loop is enough.
model.train()

for epoch in range(n_epochs):
    # No more manual prediction!
    # yhat = a + b * x_train_tensor
    yhat = model(x_train_tensor)

    # torch.nn losses are called as loss_fn(input, target): predictions first
    loss = loss_fn(yhat, y_train_tensor)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

print(model.state_dict())

OrderedDict([('a', tensor([0.3367], device='cuda:0')), ('b', tensor([0.1288], device='cuda:0'))])
OrderedDict([('a', tensor([1.0235], device='cuda:0')), ('b', tensor([1.9690], device='cuda:0'))])


In [62]:
# nested model
class LayerLinearRegression(nn.Module):
    """Linear regression expressed through a nested ``nn.Linear`` layer."""

    def __init__(self):
        super().__init__()
        # One input feature, one output: the layer holds the weight and the bias
        # that the custom "a" and "b" parameters would otherwise provide
        self.linear = nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        """Return the layer's predictions for ``x``."""
        prediction = self.linear(x)
        return prediction

In [97]:
# Materialize the parameters of a fresh model as a list so they can be displayed
list(LayerLinearRegression().parameters())

[Parameter containing:
 tensor([[-0.4062]], requires_grad=True), Parameter containing:
 tensor([0.6634], requires_grad=True)]

In [30]:
# parameters() on its own returns a generator object (see the output below), not a list
LayerLinearRegression().parameters()

<generator object Module.parameters at 0x000002808240EB88>

In [108]:
# Build a reusable train-step function so the body of the training loop shrinks to a single call
def make_train_step(model, loss_fn, optimizer):
    """Build a function that performs one training step for ``model``.

    Args:
        model: module called as ``model(x)`` to produce predictions.
        loss_fn: callable invoked as ``loss_fn(predictions, targets)``, the
            ``(input, target)`` order used by the losses in ``torch.nn``.
        optimizer: object whose ``step()`` and ``zero_grad()`` are called after
            the backward pass (presumably built over ``model``'s parameters).

    Returns:
        A function ``train_step(x, y)`` that runs one forward / backward /
        update cycle and returns the loss as a Python float.
    """
    def train_step(x, y):
        """Run one optimization step on the batch ``(x, y)`` and return its loss."""
        # Sets model to TRAIN mode
        model.train()
        # Makes predictions
        yhat = model(x)
        # Computes loss: predictions first, targets second. Passing them the other
        # way round only works by accident for symmetric losses such as MSE.
        loss = loss_fn(yhat, y)
        # Computes gradients
        loss.backward()
        # Updates parameters and zeroes gradients
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss as a plain number
        return loss.item()

    # Returns the function that will be called inside the train loop
    return train_step

# Bind the current model, loss function and optimizer into one step function
train_step = make_train_step(model, loss_fn, optimizer)

# Full-batch training: one step per epoch, keeping the loss of every epoch
losses = [train_step(x_train_tensor, y_train_tensor) for _ in range(n_epochs)]

# Inspect the learned parameters
print(model.state_dict())

OrderedDict([('a', tensor([1.0235], device='cuda:0')), ('b', tensor([1.9690], device='cuda:0'))])


In [88]:
losses

[3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636505127,
 3.8999626636

In [34]:
# Dataset
from torch.utils.data import Dataset, TensorDataset

class CustomDataset(Dataset):
    """Dataset pairing a feature tensor with a label tensor, index by index."""

    def __init__(self, x_tensor, y_tensor):
        # Keep references to both tensors; nothing is copied or moved here
        self.x, self.y = x_tensor, y_tensor

    def __len__(self):
        """Return the number of samples, taken from the feature tensor."""
        n_samples = len(self.x)
        return n_samples

    def __getitem__(self, index):
        """Return the ``(features, label)`` tuple stored at ``index``."""
        features = self.x[index]
        label = self.y[index]
        return (features, label)

# These tensors stay on the CPU: there is no .to(device) here. The mini-batch
# training loop later sends each batch to the device instead.
x_train_tensor, y_train_tensor = (
    torch.from_numpy(array).float() for array in (x_train, y_train)
)

# Build the hand-written dataset and then the built-in TensorDataset, printing
# the first sample of each; train_data ends up being the TensorDataset
for dataset_cls in (CustomDataset, TensorDataset):
    train_data = dataset_cls(x_train_tensor, y_train_tensor)
    print(train_data[0])

(tensor([0.7713]), tensor([2.4745]))
(tensor([0.7713]), tensor([2.4745]))


In [52]:
# Data loader
from torch.utils.data import DataLoader
# Mini-batches of 16 samples; shuffle=True draws them in a different order on each pass
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)

In [53]:
# Fetch the first mini-batch: a list of two tensors, the features first and then the labels
next(iter(train_loader))

[tensor([[0.2921],
         [0.7320],
         [0.6842],
         [0.1560],
         [0.8948],
         [0.3664],
         [0.8287],
         [0.0055],
         [0.8022],
         [0.0977],
         [0.8631],
         [0.9507],
         [0.0254],
         [0.1220],
         [0.1196],
         [0.3252]]), tensor([[1.5848],
         [2.4732],
         [2.3492],
         [1.2901],
         [2.7393],
         [1.7093],
         [2.7388],
         [1.0632],
         [2.6229],
         [1.4417],
         [2.9128],
         [2.8715],
         [1.0785],
         [1.2406],
         [1.3214],
         [1.7291]])]

In [42]:
train_step = make_train_step(model, loss_fn, optimizer)
losses = []

for _ in range(n_epochs):
    for features, targets in train_loader:
        # The dataset lives on the CPU and so does every mini-batch drawn from it,
        # so each batch is sent to the device where the model lives before the step
        loss = train_step(features.to(device), targets.to(device))
        losses.append(loss)

print(model.state_dict())

OrderedDict([('a', tensor([1.0300], device='cuda:0')), ('b', tensor([1.9724], device='cuda:0'))])


In [43]:
# Show the current parameter values as the cell's output value
model.state_dict()

OrderedDict([('a', tensor([1.0300], device='cuda:0')),
             ('b', tensor([1.9724], device='cuda:0'))])

In [44]:
# random_split
from torch.utils.data.dataset import random_split

# Turn the complete NumPy arrays (not the earlier manual split) into CPU float tensors
x_tensor, y_tensor = (torch.from_numpy(array).float() for array in (x, y))

# One dataset over every point, divided at random into 80 training / 20 validation samples
dataset = TensorDataset(x_tensor, y_tensor)
train_dataset, val_dataset = random_split(dataset, [80, 20])

# Training batches of 16; a batch size of 20 covers the whole validation split at once
train_loader = DataLoader(train_dataset, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=20)

In [74]:
losses = []
val_losses = []
train_step = make_train_step(model, loss_fn, optimizer)

for epoch in range(n_epochs):
    # Training pass: one optimization step per mini-batch
    # (train_step puts the model back into training mode on every call)
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        loss = train_step(x_batch, y_batch)
        losses.append(loss)

    # Validation pass: switch to evaluation mode once per epoch, not once per batch
    model.eval()
    # No gradients are needed while evaluating
    with torch.no_grad():
        # Distinct loop names keep the NumPy arrays x_val / y_val created by the
        # data-generation cell from being overwritten with device tensors
        for x_val_batch, y_val_batch in val_loader:
            x_val_batch = x_val_batch.to(device)
            y_val_batch = y_val_batch.to(device)

            yhat = model(x_val_batch)
            # torch.nn losses are called as loss_fn(input, target): predictions first
            val_loss = loss_fn(yhat, y_val_batch)
            val_losses.append(val_loss.item())

print(model.state_dict())

OrderedDict([('a', tensor([1.0234], device='cuda:0')), ('b', tensor([1.9694], device='cuda:0'))])


In [75]:
val_losses

[0.006585801485925913,
 0.006713002920150757,
 0.006419164594262838,
 0.006548687815666199,
 0.0067115724086761475,
 0.006412731949239969,
 0.006831628270447254,
 0.006514499429613352,
 0.006474392022937536,
 0.006651032716035843,
 0.006858211942017078,
 0.006600898690521717,
 0.006616558879613876,
 0.006494350731372833,
 0.006777647882699966,
 0.006517117377370596,
 0.006254221778362989,
 0.006527803838253021,
 0.006379834841936827,
 0.006541030015796423,
 0.006497163325548172,
 0.006428246386349201,
 0.006102553103119135,
 0.006159006152302027,
 0.006592012941837311,
 0.006090415176004171,
 0.006257259286940098,
 0.006405841093510389,
 0.006981156300753355,
 0.006321539171040058,
 0.006769771222025156,
 0.006555234082043171,
 0.0065987976267933846,
 0.006625098641961813,
 0.006531473249197006,
 0.0065105631947517395,
 0.0062184808775782585,
 0.0064619677141308784,
 0.006634061224758625,
 0.00636137742549181,
 0.006646141409873962,
 0.006743823178112507,
 0.006669320166110992,
 0.0065