2021 Takahiro Shinozaki @ Tokyo Tech

A quick introduction to neural network training using an optimizer

References:

    https://pytorch.org/docs/stable/optim.html



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import matplotlib.pyplot as plt

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('CUDA（GPU）is available' if device == 'cuda' else 'CUDA（GPU）is not available')

In [None]:
# Generate simulated data
NumSamples = 10000
torch.manual_seed(0)  # fixed seed so the same data set is generated on every run

# Inputs: two-dimensional standard-normal samples scaled by 2
x = torch.randn(NumSamples, 2).mul(2.0).to(device)

# Labels: the score below lies in [-1.8, 1.8]; converting it to an integer
# and adding 1 yields the three categories (0, 1, 2)
score = torch.sin(x[:, 0]) * 0.9 + torch.cos(x[:, 1]) * 0.9
y = (score.long() + 1).to(device)

# Plot the samples coloured by label (matplotlib needs CPU tensors)
x_cpu = x.to('cpu')
plt.scatter(x_cpu[:, 0], x_cpu[:, 1], c=y.to('cpu'))
print(x.dtype)
print(y.dtype)

In [None]:
# Prepare train, dev, test sets:
# the first 80% of the samples, the next 10%, and the last 10%
train_end = int(NumSamples * 0.8)
dev_end = int(NumSamples * 0.9)

x_train, y_train = x[:train_end], y[:train_end]
x_dev, y_dev = x[train_end:dev_end], y[train_end:dev_end]
x_test, y_test = x[dev_end:NumSamples], y[dev_end:NumSamples]

In [None]:
# An alternative test set using meshgrid: a regular 101 x 101 grid covering
# [-5, 5] x [-5, 5], flattened to a (101*101, 2) tensor of input points.
a = b = torch.linspace(-5, 5, 101)
# indexing='ij' is the layout torch.meshgrid used when called without the
# argument; passing it explicitly avoids the warning newer PyTorch versions
# emit (the argument requires PyTorch >= 1.10).
grid_a, grid_b = torch.meshgrid(a, b, indexing='ij')
x_mesh = torch.flatten(torch.stack((grid_a, grid_b), 2), 0, 1).to(device)
# Labels follow the same rule as the simulated training data
y_mesh = ((torch.sin(x_mesh[:,0])*0.9+torch.cos(x_mesh[:,1])*0.9).long()+1).to(device)

In [None]:
# Prepare data loaders for mini-batch training
BatchSize = 15

# Pair the inputs of each split with the corresponding labels
dataset_train = TensorDataset(x_train, y_train)
dataset_dev = TensorDataset(x_dev, y_dev)
dataset_test = TensorDataset(x_test, y_test)

# Training batches are shuffled and the incomplete last batch is dropped;
# the dev and test loaders keep the sample order and every sample.
loader_train = DataLoader(dataset_train, BatchSize, shuffle=True, drop_last=True)
loader_dev, loader_test = (
    DataLoader(split, BatchSize) for split in (dataset_dev, dataset_test)
)

In [None]:
# Define neural network
class NeuralNetwork(nn.Module):
    """Fully connected classifier: 2 inputs -> 100 -> 20 -> 3 outputs.

    Both hidden layers are followed by a sigmoid activation. The output
    layer is left linear, so ``forward`` returns raw, unnormalized class
    scores (logits), which is the input nn.CrossEntropyLoss expects.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(2, 100)
        self.layer2 = nn.Linear(100, 20)
        self.layer_out = nn.Linear(20, 3)

    def forward(self, z):
        """Return the 3 class scores for each row of the input ``z``."""
        hidden1 = torch.sigmoid(self.layer1(z))
        hidden2 = torch.sigmoid(self.layer2(hidden1))
        # No activation here: the loss function applies the normalization
        return self.layer_out(hidden2)


In [None]:
# Instantiate the network and move its parameters to the selected device
model = NeuralNetwork()
model = model.to(device)

# Show the initial parameter values
params = model.state_dict()
print('params =', params)

# Show which device the parameters are stored on
layer1_weight = params['layer1.weight']
print(layer1_weight.device)

In [None]:
# Prepare the loss function and the optimizer
criterion = nn.CrossEntropyLoss()

# Plain stochastic gradient descent over all model parameters.
# Variant with weight decay:
#   optimizer = optim.SGD(model.parameters(), lr=0.02, weight_decay=0.001)
optimizer = optim.SGD(params=model.parameters(), lr=0.02)

In [None]:
# Define a step of batch processing
def train_step(x, y):
    """Run one optimization step on a mini-batch.

    Uses the module-level ``model``, ``criterion`` and ``optimizer``.

    Args:
        x: batch of inputs passed to ``model``.
        y: batch of target class labels for ``criterion``.

    Returns:
        Tuple ``(loss, num_correct)``: the loss value of the batch as a
        Python float and the number of correctly classified samples as a
        Python int. Both are measured before the parameter update.
    """
    model.train()  # training mode

    optimizer.zero_grad()  # start from clean gradients
    logits = model(x)  # forward propagation
    batch_loss = criterion(logits, y)
    batch_loss.backward()  # back-propagate to obtain the gradients
    optimizer.step()  # apply the parameter update

    # Count the hits on a detached copy so no gradient is tracked
    predicted = torch.argmax(logits.detach(), 1)
    hits = (y == predicted).sum()

    return (batch_loss.item(), hits.item())

def test_step(x, y):
    """Evaluate the model on one mini-batch without updating it.

    Uses the module-level ``model`` and ``criterion``.

    Args:
        x: batch of inputs passed to ``model``.
        y: batch of target class labels for ``criterion``.

    Returns:
        Tuple ``(loss, num_correct)``: the loss value of the batch as a
        Python float and the number of correctly classified samples as a
        Python int.
    """
    model.eval() # set evaluation mode

    # Evaluation needs no gradients, so the whole forward pass (not only the
    # accuracy count) runs without recording an autograd graph.
    with torch.no_grad():
        out = model(x)
        loss = criterion(out, y)
        num_crr = (y == torch.argmax(out,1)).sum()

    return (loss.item(), num_crr.item())


In [None]:
# Train the model
NumEpocs = 50
log_train = []  # average training loss of each epoch
log_dev = []    # average dev loss of each epoch
for epoch in range(NumEpocs):
    train_loss_total = 0.0
    train_num_crr_total = 0.0
    dev_loss_total = 0.0
    dev_num_crr_total = 0.0
    num_train_sample = 0
    num_dev_sample = 0

    # Mini-batch training pass. The batch variables are not named x / y so
    # that the full data tensors of the same name are not overwritten.
    for x_batch, y_batch in loader_train:
        loss, num_crr = train_step(x_batch, y_batch)
        batch_len = len(y_batch)
        # The criterion returns the mean loss of the batch; weight it by the
        # batch size so that total / num_samples is the per-sample average.
        train_loss_total += loss * batch_len
        train_num_crr_total += num_crr
        num_train_sample += batch_len

    # Evaluation pass over the dev set (no parameter updates)
    for x_batch, y_batch in loader_dev:
        loss, num_crr = test_step(x_batch, y_batch)
        batch_len = len(y_batch)
        dev_loss_total += loss * batch_len
        dev_num_crr_total += num_crr
        num_dev_sample += batch_len

    train_loss_avg = train_loss_total / num_train_sample
    train_crr_avg = train_num_crr_total / num_train_sample
    # Dev statistics must come from the dev totals, not the training totals
    dev_loss_avg = dev_loss_total / num_dev_sample
    dev_crr_avg = dev_num_crr_total / num_dev_sample

    log_train.append(train_loss_avg)
    log_dev.append(dev_loss_avg)

    print(f'[Epoch {epoch+1:3d}]'
          f' tr_loss: {train_loss_avg:.5f}, tr_crr: {train_crr_avg:.5f}'
          f' dev_loss: {dev_loss_avg:.5f}, dev_crr: {dev_crr_avg:.5f}')

print('Done training')
# print(model.state_dict())

In [None]:
# Evaluation using the test set
model.eval() # set evaluation mode
with torch.no_grad():  # inference only: do not record an autograd graph
    out = model(x_test)
    c = torch.argmax(out, 1)  # predicted class = index of the highest score
# Accuracy: fraction of test samples whose prediction equals the label
test_crr = (y_test == c).sum().to('cpu') / y_test.size(0)
print('test_crr =', test_crr.item())
plt.scatter(x_test[:,0].to('cpu'),x_test[:,1].to('cpu'),c=c.to('cpu'))

In [None]:
# Distribution of errors: colour every test point by
# (predicted label - true label), so 0 marks a correct prediction
test_label_diff = (c - y_test).to('cpu')
plt.scatter(x_test[:, 0].to('cpu'), x_test[:, 1].to('cpu'), c=test_label_diff)
plt.colorbar()

In [None]:
# Systematic evaluation of the input domain using the mesh data
model.eval() # set evaluation mode
with torch.no_grad():  # inference only: do not record an autograd graph
    out = model(x_mesh)
    c = torch.argmax(out, 1)  # predicted class = index of the highest score
# Accuracy: fraction of mesh points whose prediction equals the label
mesh_crr = (y_mesh == c).sum().to('cpu') / y_mesh.size(0)
print('mesh_crr =', mesh_crr.item())
plt.scatter(x_mesh[:,0].to('cpu'),x_mesh[:,1].to('cpu'),c=c.to('cpu'))

In [None]:
# Distribution of errors: colour every mesh point by
# (predicted label - true label), so 0 marks a correct prediction
mesh_label_diff = (c - y_mesh).to('cpu')
plt.scatter(x_mesh[:, 0].to('cpu'), x_mesh[:, 1].to('cpu'), c=mesh_label_diff)
plt.colorbar()