# Using PyTorch Modules

The basic way to create a neural net in PyTorch is to specify all the matrices and vectors for weights/biases directly. Below, we see this approach used for a small neural net for classifying MNIST digits.

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd;
from scipy.stats import zscore
import torch as torch;

import numpy as np

import torchvision.datasets as datasets


# Seed NumPy's global RNG so the permutations and the weight initialisation below are repeatable.
# Only NumPy is seeded here; no torch seed is set in this cell.
np.random.seed(42)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# read in the dataset and convert it to NumPy arrays

n_classes = 10;
# each image is flattened into a vector of 28*28 values
n_features = 28*28;

full_train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=None)
full_test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=None)

# Flatten every image to one row of n_features values, cast to float64 (Python float) and divide by 255.
# presumably the raw pixels are 0..255 integers, so this maps them into [0, 1] — confirm
x_train = full_train_dataset.data.numpy().reshape(-1,n_features).astype(dtype=float)/255.0;
x_test = full_test_dataset.data.numpy().reshape(-1,n_features).astype(dtype=float)/255.0;

# integer class labels
y_train_cat = full_train_dataset.targets.numpy()
y_test_cat = full_test_dataset.targets.numpy()

# One-hot encode the labels with pandas: one indicator column per distinct label value.
dd_y=pd.DataFrame(data=y_train_cat)
y_train=pd.get_dummies(dd_y,columns = list(dd_y.columns)).to_numpy().astype(dtype=float);

dd_y=pd.DataFrame(data=y_test_cat)
y_test=pd.get_dummies(dd_y,columns = list(dd_y.columns)).to_numpy().astype(dtype=float);


# permute the training set (features and labels with the same permutation)
permutation = np.random.permutation(x_train.shape[0]);
x_train=x_train[permutation,:]
y_train=y_train[permutation,:]

# keep only the first subset_size rows of the shuffled training set
# NOTE(review): the standard MNIST training split has 60000 images, so this keeps a strict subset — confirm
subset_size = 50000;
x_train=x_train[:subset_size,:]
y_train=y_train[:subset_size,:]


# number of training examples actually used
n_train=x_train.shape[0];

# Create tensors for the test data; we do not need gradients w.r.t. them.
# The training data stays in NumPy and is converted batch by batch inside the training loop.
t_x_test=torch.tensor(x_test,requires_grad=False,device=device);
t_y_test=torch.tensor(y_test,requires_grad=False,device=device);


# number of activations (neurons) in the hidden layer
n_hidden = 100;


# create weights for layer 1 (W1, b1) and layer 2 (W2, b2)
# weights start as small Gaussian noise (standard normal scaled by 0.01), biases start at zero
initialW1=0.01*np.random.randn(n_features,n_hidden)
initialW2=0.01*np.random.randn(n_hidden,n_classes)

# requires_grad=True asks autograd to track these tensors so that backward() fills in their .grad
# W1/W2 take their dtype from the NumPy arrays above; b1/b2 are created by torch.zeros with torch's default dtype
W1 = torch.tensor(initialW1,requires_grad=True,device=device);
b1 = torch.zeros((1,n_hidden),requires_grad=True,device=device);

W2 = torch.tensor(initialW2,requires_grad=True,device=device);
b2 = torch.zeros((1,n_classes),requires_grad=True,device=device);


# this optimizer will do gradient descent for us
# experiment with learning rate and optimizer type
learning_rate = 0.0001;
# note that we have to add all weights&biases, for both layers, to the optimizer
optimizer = torch.optim.SGD([W1,b1,W2,b2],lr=learning_rate)
# alternative optimizer to try instead of plain SGD:
#optimizer = torch.optim.Adam([W1,b1,W2,b2],lr=learning_rate)

# experiment with batch size (small batch size needs small learning rate)
batch_size=32;
# number of full passes over the training set
n_epochs = 30;

# Mini-batch training: one pass over the training set per epoch, followed by an evaluation on the test set.
for i in range(n_epochs):

    # permute the training set for each epoch
    # doing this on CPU. Alternatively, it might be done on GPU, on the tensor version of x_train, y_train
    permutation = np.random.permutation(x_train.shape[0]);
    x_train=x_train[permutation,:]
    y_train=y_train[permutation,:]

    # running count of correctly classified training examples in this epoch
    accuracy = 0.0;

    #go through the training set in batches of size batch_size
    for j in range(0,n_train,batch_size):
        t_x_train = torch.tensor(x_train[j:j+batch_size,:],requires_grad=False,device=device);
        t_y_train = torch.tensor(y_train[j:j+batch_size,:],requires_grad=False,device=device);

        # clear previous gradient calculations
        optimizer.zero_grad();
        
        # calculate model predictions
        first_layer = torch.matmul(t_x_train,W1)+b1
        # 2*sigmoid(z)-1, i.e. a sigmoid rescaled to the range (-1, 1)
        activations_first_layer = 2.0 / (1.0 + torch.exp(-first_layer)) -1.0;

        # done with first layer, it will serve as input to the second layer
        second_layer = torch.matmul(activations_first_layer,W2)+b2
        # plain sigmoid, one value in (0, 1) per class
        activations_second_layer = 1.0 / (1.0 + torch.exp(-second_layer));
        # done with second layer

        # normalize each row to sum to 1 so it can be used as class probabilities in the cross-entropy risk
        # (note: this divides the sigmoid outputs by their row sum; it is not the exponential softmax)
        sum_activations = torch.sum(activations_second_layer,dim=1,keepdim=True)
        normalized_activations = torch.div(activations_second_layer, sum_activations);
        # cross-entropy: minus the log-probability of the true class (picked out by the one-hot labels), averaged over the batch
        risk = -1.0 * torch.mean(torch.sum(torch.multiply(t_y_train,torch.log(normalized_activations)),dim=1 ) );

        # calculate gradients
        risk.backward();
        
        # take the gradient step
        optimizer.step();
        

        # risk of the current batch; after the inner loop this holds the value for the last batch only
        batch_risk=risk.item();

        # count how many examples in this batch were classified correctly
        true_class = np.argmax(t_y_train.detach().cpu().numpy(),axis=1)
        pred_class = np.argmax(normalized_activations.detach().cpu().numpy(),axis=1)
        accuracy += np.count_nonzero(true_class == pred_class);
        
    # after all the batches in this epoch are done, we calculate test set risk and accuracy
    # we don't need gradients, so we turn them off
        
    with (torch.no_grad()):
        # same forward computation as in training, applied to the whole test set at once
        first_layer = torch.matmul(t_x_test,W1)+b1
        activations_first_layer = 2.0 / (1.0 + torch.exp(-first_layer)) -1.0;
        second_layer = torch.matmul(activations_first_layer,W2)+b2
        activations_second_layer = 1.0 / (1.0 + torch.exp(-second_layer));
        sum_activations = torch.sum(activations_second_layer,dim=1,keepdim=True)
        test_normalized_activations = torch.div(activations_second_layer, sum_activations);
        
        #calculate loss

        test_risk = -1.0 * torch.mean(torch.sum(torch.multiply(t_y_test,torch.log(test_normalized_activations)),dim=1 ) );

        test_true_class = np.argmax(t_y_test.detach().cpu().numpy(),axis=1)
        test_pred_class = np.argmax(test_normalized_activations.detach().cpu().numpy(),axis=1)
        test_accuracy = np.count_nonzero(test_true_class == test_pred_class)/test_pred_class.shape[0];
        # computed for convenience; it is not printed below
        test_error = 1.0 - test_accuracy;
        
    # turn the count of correct training predictions into a fraction
    accuracy = accuracy / float(x_train.shape[0])
    # epoch index, risk of the last training batch, test risk, training accuracy, test accuracy
    print(i,batch_risk,test_risk.item(),accuracy,test_accuracy)

This is the same network as above, but the linear layer is now encapsulated in a PyTorch Module - that is, a class that holds W & b together with the operations on them. (This version also stores the data as float32 and uses the Adam optimizer.)

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd;
from scipy.stats import zscore
import torch as torch;

import numpy as np

import torchvision.datasets as datasets


# Seed NumPy's global RNG so the permutations below are repeatable.
# Only NumPy is seeded here; no torch seed is set in this cell.
np.random.seed(42)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# read in the dataset and convert it to NumPy arrays

n_classes = 10;
# each image is flattened into a vector of 28*28 values
n_features = 28*28;

full_train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=None)
full_test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=None)

# Flatten every image to one row of n_features values, cast to float32 and divide by 255.
# presumably the raw pixels are 0..255 integers, so this maps them into [0, 1] — confirm
x_train = full_train_dataset.data.numpy().reshape(-1,n_features).astype(dtype=np.float32)/255.0;
x_test = full_test_dataset.data.numpy().reshape(-1,n_features).astype(dtype=np.float32)/255.0;

# integer class labels
y_train_cat = full_train_dataset.targets.numpy()
y_test_cat = full_test_dataset.targets.numpy()

# One-hot encode the labels with pandas: one indicator column per distinct label value.
dd_y=pd.DataFrame(data=y_train_cat)
y_train=pd.get_dummies(dd_y,columns = list(dd_y.columns)).to_numpy().astype(dtype=np.float32);

dd_y=pd.DataFrame(data=y_test_cat)
y_test=pd.get_dummies(dd_y,columns = list(dd_y.columns)).to_numpy().astype(dtype=np.float32);


# permute the training set (features and labels with the same permutation)
permutation = np.random.permutation(x_train.shape[0]);
x_train=x_train[permutation,:]
y_train=y_train[permutation,:]

# keep only the first subset_size rows of the shuffled training set
# NOTE(review): the standard MNIST training split has 60000 images, so this keeps a strict subset — confirm
subset_size = 50000;
x_train=x_train[:subset_size,:]
y_train=y_train[:subset_size,:]


# number of training examples actually used
n_train=x_train.shape[0];

# Create tensors for the test data; we do not need gradients w.r.t. them.
# The training data stays in NumPy and is converted batch by batch inside the training loop.
t_x_test=torch.tensor(x_test,requires_grad=False,device=device);
t_y_test=torch.tensor(y_test,requires_grad=False,device=device);

# create a layer - we inherit from torch.nn.Module
class MyLinear(torch.nn.Module):
    """
    Fully connected (affine) layer computing ``x @ weight + bias``.

    Unlike ``torch.nn.Linear``, the weight matrix is stored as
    ``(dim_in, dim_out)`` so that the input can be multiplied from the left
    without a transpose.
    """

    def __init__(self,dim_in,dim_out):
        """
        Create the two learnable parameters of the layer.

        Args:
            dim_in: number of input features (columns of ``x``).
            dim_out: number of output features (neurons) of the layer.
        """
        super().__init__()
        self.dim_in=dim_in
        self.dim_out=dim_out
        # Wrapping tensors in Parameter registers them with the Module, so that
        # .parameters() and .to(device) pick them up automatically.
        self.weight = torch.nn.Parameter(torch.empty(dim_in,dim_out))
        self.bias = torch.nn.Parameter(torch.zeros(dim_out))
        # kaiming_uniform_ treats dimension 1 of a 2-D tensor as the fan-in and
        # dimension 0 as the fan-out. Our weight is stored as (dim_in, dim_out),
        # so the true number of inputs per neuron is what the initializer calls
        # "fan_out"; selecting that mode scales the weights by dim_in as intended.
        torch.nn.init.kaiming_uniform_(self.weight, mode='fan_out')

    def forward(self, x):
        """
        Apply the affine map to a batch of inputs.

        Args:
            x: 2-D tensor with ``dim_in`` columns (one example per row).

        Returns:
            Tensor with one row per example and ``dim_out`` columns, equal to
            ``x @ weight + bias``.
        """
        # addmm computes bias + x @ weight in a single call
        return torch.addmm(self.bias, x, self.weight)

    def extra_repr(self):
        """
        Short description of the layer, shown when the module is printed.
        """
        return 'out={}, in={}'.format(
            self.dim_out, self.dim_in
        )


# width of the hidden layer (number of neurons)
n_hidden = 100

# two custom linear layers: input -> hidden and hidden -> class scores
lin1 = MyLinear(dim_in=n_features, dim_out=n_hidden).to(device)
lin2 = MyLinear(dim_in=n_hidden, dim_out=n_classes).to(device)

# step size of the optimizer; worth experimenting with, together with the optimizer type
learning_rate = 0.0001
# the optimizer must be given the weights and biases of both layers
optimizer = torch.optim.Adam(
    [*lin1.parameters(), *lin2.parameters()],
    lr=learning_rate,
)

# mini-batch size (a small batch size needs a small learning rate) and number of passes over the training set
batch_size = 32
n_epochs = 30

# Mini-batch training: one pass over the training set per epoch, followed by an evaluation on the test set.
for i in range(n_epochs):

    # permute the training set for each epoch
    # doing this on CPU. Alternatively, it might be done on GPU, on the tensor version of x_train, y_train
    permutation = np.random.permutation(x_train.shape[0]);
    x_train=x_train[permutation,:]
    y_train=y_train[permutation,:]

    # running count of correctly classified training examples in this epoch
    accuracy = 0.0;

    #go through the training set in batches of size batch_size
    for j in range(0,n_train,batch_size):
        t_x_train = torch.tensor(x_train[j:j+batch_size,:],requires_grad=False,device=device);
        t_y_train = torch.tensor(y_train[j:j+batch_size,:],requires_grad=False,device=device);

        # clear previous gradient calculations
        optimizer.zero_grad();
        
        # calculate model predictions; calling the module runs its forward()
        first_layer = lin1(t_x_train)
        # 2*sigmoid(z)-1, i.e. a sigmoid rescaled to the range (-1, 1)
        activations_first_layer = 2.0 / (1.0 + torch.exp(-first_layer)) -1.0;

        # done with first layer, it will serve as input to the second layer
        second_layer = lin2(activations_first_layer)
        # plain sigmoid, one value in (0, 1) per class
        activations_second_layer = 1.0 / (1.0 + torch.exp(-second_layer));
        # done with second layer

        # normalize each row to sum to 1 so it can be used as class probabilities in the cross-entropy risk
        # (note: this divides the sigmoid outputs by their row sum; it is not the exponential softmax)
        sum_activations = torch.sum(activations_second_layer,dim=1,keepdim=True)
        normalized_activations = torch.div(activations_second_layer, sum_activations);
        # cross-entropy: minus the log-probability of the true class (picked out by the one-hot labels), averaged over the batch
        risk = -1.0 * torch.mean(torch.sum(torch.multiply(t_y_train,torch.log(normalized_activations)),dim=1 ) );

        # calculate gradients
        risk.backward();
        
        # take the gradient step
        optimizer.step();
        

        # risk of the current batch; after the inner loop this holds the value for the last batch only
        batch_risk=risk.item();

        # count how many examples in this batch were classified correctly
        true_class = np.argmax(t_y_train.detach().cpu().numpy(),axis=1)
        pred_class = np.argmax(normalized_activations.detach().cpu().numpy(),axis=1)
        accuracy += np.count_nonzero(true_class == pred_class);
        
    # after all the batches in this epoch are done, we calculate test set risk and accuracy
    # we don't need gradients, so we turn them off
        
    with (torch.no_grad()):
        # same forward computation as in training, applied to the whole test set at once
        first_layer = lin1(t_x_test)
        activations_first_layer = 2.0 / (1.0 + torch.exp(-first_layer)) -1.0;
        second_layer = lin2(activations_first_layer)
        activations_second_layer = 1.0 / (1.0 + torch.exp(-second_layer));
        sum_activations = torch.sum(activations_second_layer,dim=1,keepdim=True)
        test_normalized_activations = torch.div(activations_second_layer, sum_activations);
        
        #calculate loss

        test_risk = -1.0 * torch.mean(torch.sum(torch.multiply(t_y_test,torch.log(test_normalized_activations)),dim=1 ) );

        test_true_class = np.argmax(t_y_test.detach().cpu().numpy(),axis=1)
        test_pred_class = np.argmax(test_normalized_activations.detach().cpu().numpy(),axis=1)
        test_accuracy = np.count_nonzero(test_true_class == test_pred_class)/test_pred_class.shape[0];
        # computed for convenience; it is not printed below
        test_error = 1.0 - test_accuracy;
        
    # turn the count of correct training predictions into a fraction
    accuracy = accuracy / float(x_train.shape[0])
    # epoch index, risk of the last training batch, test risk, training accuracy, test accuracy
    print(i,batch_risk,test_risk.item(),accuracy,test_accuracy)

It's cleaner to create the whole network as a single Module, and use already existing pytorch Modules for the individual layers.

We will also use a DataLoader, which simplifies creating random batches of data for training.

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd;
from scipy.stats import zscore
import torch as torch;

import numpy as np

import torchvision.datasets as datasets
from torchvision import transforms


np.random.seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# problem dimensions: ten digit classes, images flattened to 28*28 features
n_classes = 10
n_features = 28*28


def _load_mnist(train):
    """Return the requested MNIST split with images converted to normalized tensors."""
    to_normalized_tensor = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    return datasets.MNIST(root='./data', train=train, download=True,
                          transform=to_normalized_tensor)


# read in the dataset; the transform is applied every time an image is fetched
full_train_dataset = _load_mnist(train=True)
full_test_dataset = _load_mnist(train=False)

batch_size = 32

# the training loader reshuffles the data every epoch, the test loader keeps a fixed order
trainloader = torch.utils.data.DataLoader(
    full_train_dataset, batch_size=batch_size, shuffle=True)
testloader = torch.utils.data.DataLoader(
    full_test_dataset, batch_size=batch_size, shuffle=False)

# create the whole neural network as one module(inherit from nn.Module)
# inside, reuse existing PyTorch modules, like nn.Linear
class MyNet(torch.nn.Module):
    """
    Two-layer fully connected classifier: Linear -> tanh -> Linear -> log-softmax.

    The output holds log-probabilities, so it is meant to be paired with a
    negative log-likelihood loss.
    """

    # architecture of the network is specified in the constructor
    def __init__(self,dim_in,dim_out,dim_hidden):
        """
        Args:
            dim_in: number of input features.
            dim_out: number of classes (size of the output).
            dim_hidden: number of neurons in the hidden layer.
        """
        super().__init__()
        self.dim_in=dim_in
        self.dim_out=dim_out
        self.dim_hidden=dim_hidden
        self.lin1 = torch.nn.Linear(dim_in,dim_hidden)
        # torch.tanh is the supported spelling of the former nn.functional.tanh
        self.a1 = torch.tanh
        self.lin2 = torch.nn.Linear(dim_hidden,dim_out)
        self.a2 = torch.nn.functional.log_softmax

    # here we specify the computation (forward phase of training) how "x" is transferred into output "y"
    def forward(self, x):
        """
        Map a batch of inputs to per-class log-probabilities.

        Args:
            x: tensor whose last dimension has ``dim_in`` entries.

        Returns:
            Tensor of the same leading shape with ``dim_out`` log-probabilities
            in the last dimension.
        """
        x = self.lin1(x)
        x = self.a1(x)
        x = self.lin2(x)
        # Normalize explicitly over the class dimension (the last one produced by
        # the Linear layer); relying on the implicit dimension is deprecated.
        x = self.a2(x, dim=-1)
        return x

    def extra_repr(self):
        """Short description of the network, shown when the module is printed."""
        return 'out={}, in={}'.format(
            self.dim_out, self.dim_in
        )


# number of activations (neurons) in the hidden layer
n_hidden = 100;


# build the network and move its parameters to the selected device
model=MyNet(dim_in=n_features,dim_hidden=n_hidden,dim_out=n_classes).to(device)

# negative log-likelihood loss; it expects log-probabilities, which is what MyNet returns
criterion = torch.nn.NLLLoss().to(device)


# this optimizer will do gradient descent for us
# experiment with learning rate and optimizer type
learning_rate = 0.0001;
# model.parameters() yields the weights & biases of every layer registered in the module
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

# experiment with batch size (small batch size needs small learning rate)
# NOTE: the data loaders were already created earlier in this cell, so re-assigning batch_size here
# does not change the batches they produce; change the value used when the loaders are built instead
batch_size=32;
# number of full passes over the training set
n_epochs = 30;

# One epoch = one pass over the training loader, then an accuracy measurement on the test loader.
for i in range(n_epochs):

    for inputs, labels in trainloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # forget the gradients of the previous step
        optimizer.zero_grad()

        # flatten each image to a 28*28 vector and run the forward pass
        outputs = model(inputs.view(-1, 28*28))
        risk = criterion(outputs, labels)

        # backward pass followed by the parameter update
        risk.backward()
        optimizer.step()

        # remember the risk of the most recent batch for the report below
        batch_risk = risk.item()

    # evaluation needs no gradients
    with torch.no_grad():
        correct = 0
        for inputs, labels in testloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs.view(-1, 28*28))
            # index of the largest log-probability is the predicted class
            pred = outputs.max(1, keepdim=True)[1]
            correct += pred.eq(labels.view_as(pred)).sum().item()

    # epoch index, risk of the last training batch, test accuracy
    print(i, batch_risk, correct / len(testloader.dataset))


In most cases, we don't even need to create a class, we can just use nn.Sequential to contain the list of layers.

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd;
from scipy.stats import zscore
import torch as torch;

import numpy as np

import torchvision.datasets as datasets
from torchvision import transforms


# Seed NumPy's global RNG; note that no torch seed is set in this cell.
np.random.seed(42)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# read in the dataset; images are converted to tensors on access (no normalization in this cell)

n_classes = 10;
# each image is flattened into a vector of 28*28 values before it is fed to the model
n_features = 28*28;

full_train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor() )
full_test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor() )

batch_size=64;

# the training loader reshuffles the data every epoch, the test loader keeps a fixed order
trainloader = torch.utils.data.DataLoader(full_train_dataset, batch_size=batch_size,shuffle=True)
testloader = torch.utils.data.DataLoader(full_test_dataset, batch_size=batch_size,shuffle=False)



# width of the hidden layer (number of neurons)
n_hidden = 100

# In simple cases (layer after layer forming a sequence) we can use a PyTorch
# container module instead of manually defining a new module for our network.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=n_features, out_features=n_hidden),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=n_hidden, out_features=n_classes),
    torch.nn.LogSoftmax(dim=1),
)
model = model.to(device)

# the network outputs log-probabilities, so it is paired with the negative log-likelihood loss
criterion = torch.nn.NLLLoss().to(device)

# step size of the optimizer; worth experimenting with, together with the optimizer type
learning_rate = 0.001
# model.parameters() covers the weights and biases of both linear layers
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# number of full passes over the training set
n_epochs = 30

# Train for n_epochs passes over the data and report test accuracy after every pass.
for i in range(n_epochs):

    for batch in trainloader:
        inputs, labels = (part.to(device) for part in batch)

        # reset gradients, run the flattened images through the model, measure the risk
        optimizer.zero_grad()
        outputs = model(inputs.view(-1, 28*28))
        risk = criterion(outputs, labels)

        # back-propagate and update the parameters
        risk.backward()
        optimizer.step()

        # risk of the most recent batch, reported at the end of the epoch
        batch_risk = risk.item()

    # count correct test predictions with gradient tracking switched off
    with torch.no_grad():
        correct = 0
        for batch in testloader:
            inputs, labels = (part.to(device) for part in batch)
            outputs = model(inputs.view(-1, 28*28))
            # predicted class = position of the largest log-probability
            pred = outputs.max(1, keepdim=True)[1]
            correct += pred.eq(labels.view_as(pred)).sum().item()

    # epoch index, risk of the last training batch, test accuracy
    print(i, batch_risk, correct / len(testloader.dataset))



# Convolutional Networks for Images

Instead of using only linear layers, which ignore the spatial arrangement of pixels, we should use convolutional and pooling layers. We also add BatchNorm and Dropout.

We also added a learning rate Scheduler.

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd;
from scipy.stats import zscore
import torch as torch;

import numpy as np

import torchvision.datasets as datasets
from torchvision import transforms

import torch.nn as nn;
import torch.nn.functional as F;

# Seed NumPy's global RNG; note that no torch seed is set in this cell.
np.random.seed(42)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# read in the dataset; images are converted to tensors and normalized on access


# presumably (0.1307,) and (0.3081,) are the mean and standard deviation of MNIST pixel values — confirm
full_train_dataset = datasets.MNIST(root='./data', train=True, download=True, 
                             transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize(
                                 (0.1307,), (0.3081,))
                             ]) )
full_test_dataset = datasets.MNIST(root='./data', train=False, download=True,
                             transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize(
                                 (0.1307,), (0.3081,))
                             ]) )

batch_size=64;

# the training loader reshuffles the data every epoch, the test loader keeps a fixed order
trainloader = torch.utils.data.DataLoader(full_train_dataset, batch_size=batch_size,shuffle=True)
testloader = torch.utils.data.DataLoader(full_test_dataset, batch_size=batch_size,shuffle=False)

class ConvNetWithBatchNorm(nn.Module):
    """
    Small convolutional classifier for single-channel 28x28 images.

    Two convolution + average-pooling stages (the second with batch
    normalization) are followed by a two-layer fully connected classifier with
    dropout. The output holds log-probabilities for 10 classes.
    """

    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=3, kernel_size=5),  # (N, 1, 28, 28) -> (N, 3, 24, 24)
            nn.ReLU(),
            nn.AvgPool2d(kernel_size=2, stride=2),                    # (N, 3, 24, 24) -> (N, 3, 12, 12)
            nn.Conv2d(in_channels=3, out_channels=6, kernel_size=3),  # (N, 3, 12, 12) -> (N, 6, 10, 10)
            nn.BatchNorm2d(num_features=6)                            # shape unchanged
        )
        self.features1 = nn.Sequential(
            nn.ReLU(),
            nn.AvgPool2d(kernel_size=2, stride=2)                     # (N, 6, 10, 10) -> (N, 6, 5, 5)
        )
        self.classifier = nn.Sequential(
            nn.Linear(150, 50),         # (N, 6*5*5 = 150) -> (N, 50)
            nn.Dropout(p=0.2),
            nn.ReLU(),
            nn.Linear(50,10)            # (N, 50) -> (N, 10)
        )

    def forward(self, x):
        """
        Map a batch of images to per-class log-probabilities.

        Args:
            x: tensor of shape (N, 1, 28, 28).

        Returns:
            Tensor of shape (N, 10) with log-probabilities in each row.
        """
        x = self.features(x)
        x = self.features1(x)
        # flatten everything except the batch dimension
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        # Normalize over the class dimension explicitly; calling log_softmax
        # without `dim` is deprecated and emits a warning.
        return F.log_softmax(x, dim=1)

# build the network and move its parameters to the selected device
model=ConvNetWithBatchNorm().to(device);
# functional form of the negative log-likelihood loss; the model returns log-probabilities
criterion = F.nll_loss;


# this optimizer will do gradient descent for us
# experiment with learning rate and optimizer type
learning_rate = 0.001;
# model.parameters() yields the weights & biases of every layer registered in the module
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

n_epochs = 30;
# total number of optimizer steps: batches per epoch (rounded up) times the number of epochs
num_updates = n_epochs*int(np.ceil(len(trainloader.dataset)/batch_size))
print(num_updates)
# number of initial steps during which the learning rate is ramped up
warmup_steps=1000;
def warmup_linear(x):
    """
    Learning-rate multiplier for optimizer step ``x``.

    Rises linearly from 0 to 1 over the first ``warmup_steps`` steps, then
    falls linearly towards 0 at step ``num_updates`` (never below 0).
    """
    if x < warmup_steps:
        return x/warmup_steps
    remaining = (num_updates - x) / (num_updates - warmup_steps)
    return max(remaining, 0.)


# LambdaLR multiplies the optimizer's base learning rate by warmup_linear(step)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup_linear)

# experiment with batch size (small batch size needs small learning rate)


# Train for n_epochs passes over the data and report test accuracy after every pass.
for i in range(n_epochs):

    # Training mode: Dropout is active and BatchNorm normalizes with (and updates
    # its running estimates from) the statistics of the current batch.
    model.train()

    for j, data in enumerate(trainloader):

        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        # clear previous gradient calculations
        optimizer.zero_grad()

        outputs = model(inputs)
        risk = criterion(outputs, labels)

        # calculate gradients
        risk.backward()

        # take the gradient step, then advance the learning-rate schedule by one step
        optimizer.step()
        scheduler.step()

        # risk of the most recent batch, reported at the end of the epoch
        batch_risk = risk.item()

    # Evaluation mode: Dropout is switched off and BatchNorm uses its running
    # statistics, so the test predictions are deterministic and do not depend
    # on how the test set happens to be batched.
    model.eval()

    with torch.no_grad():
        correct = 0
        for j, data in enumerate(testloader):

            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            # predicted class = position of the largest log-probability
            pred = outputs.argmax(dim=1, keepdim=True)
            correct += pred.eq(labels.view_as(pred)).sum().item()

    # epoch index, risk of the last training batch, test accuracy
    print(i, batch_risk, correct / len(testloader.dataset))


Almost the same network, but on a more complicated CIFAR10 dataset.

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# train using GPU, if not available on your machine, use google colab.

import pandas as pd;
from scipy.stats import zscore
import torch as torch;

import numpy as np

import torchvision.datasets as datasets
from torchvision import transforms

import torch.nn as nn;
import torch.nn.functional as F;

# Seed NumPy's global RNG; note that no torch seed is set in this cell.
np.random.seed(42)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


#read in the dataset


num_classes=10;

# Convert images to tensors and normalize each of the three colour channels.
# NOTE(review): these mean/std values look like the commonly used ImageNet channel statistics
# rather than values computed on CIFAR-10 — confirm this is intended
transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])])

full_train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, 
                             transform=transform )
full_test_dataset = datasets.CIFAR10(root='./data', train=False, download=True,
                             transform=transform )

batch_size=64;

# the training loader reshuffles the data every epoch, the test loader keeps a fixed order
trainloader = torch.utils.data.DataLoader(full_train_dataset, batch_size=batch_size,shuffle=True)
testloader = torch.utils.data.DataLoader(full_test_dataset, batch_size=batch_size,shuffle=False)

# create a neural network (inherit from nn.Module)
class ConvNetWithBatchNorm(nn.Module):
    """
    Small convolutional classifier for three-channel 32x32 images.

    Two convolution + average-pooling stages (the second with batch
    normalization) are followed by a two-layer fully connected classifier with
    dropout. The output holds per-class log-probabilities.
    """

    # architecture of the network is specified in the constructor
    def __init__(self, n_classes=None):
        """
        Args:
            n_classes: number of output classes. When omitted, the notebook-level
                variable ``num_classes`` is used, as before.
        """
        super().__init__()
        if n_classes is None:
            n_classes = num_classes
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5),    # (N, 3, 32, 32) -> (N, 6, 28, 28)
            nn.ReLU(),
            nn.AvgPool2d(kernel_size=2, stride=2),                      # (N, 6, 28, 28) -> (N, 6, 14, 14)
            nn.Conv2d(in_channels=6, out_channels=12, kernel_size=3),   # (N, 6, 14, 14) -> (N, 12, 12, 12)
            nn.BatchNorm2d(num_features=12)                             # shape unchanged
        )
        self.features1 = nn.Sequential(
            nn.ReLU(),
            nn.AvgPool2d(kernel_size=2, stride=2)                       # (N, 12, 12, 12) -> (N, 12, 6, 6)
        )
        self.classifier = nn.Sequential(
            nn.Linear(12*6*6, 50),
            nn.Dropout(p=0.2),
            nn.ReLU(),
            nn.Linear(50,n_classes)
        )

    # here we specify the computation (forward phase of training) how "x" is transferred into output "y"
    def forward(self, x):
        """
        Map a batch of images to per-class log-probabilities.

        Args:
            x: tensor of shape (N, 3, 32, 32).

        Returns:
            Tensor with one row of log-probabilities per image.
        """
        x = self.features(x)
        x = self.features1(x)
        # flatten everything except the batch dimension
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        # Normalize over the class dimension explicitly; calling log_softmax
        # without `dim` is deprecated and emits a warning.
        return F.log_softmax(x, dim=1)

    # constructor and forward() - that is all we need, the rest is implemented in the nn.Module and we inherit it

    # constructor and forward() - that is all we need, the rest is implemented in the nn.Module and we inherit it

# create an instance of the network
# build the network and move its parameters to the selected device
model=ConvNetWithBatchNorm().to(device);
# functional form of the negative log-likelihood loss; the model returns log-probabilities
criterion = F.nll_loss;


# this optimizer will do gradient descent for us
# experiment with learning rate and optimizer type
learning_rate = 0.001;
# model.parameters() yields the weights & biases of every layer registered in the module
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

# we add a learning rate scheduler, which will modify the learning rate during training
# it will initially start low, then increase ("warm up"), and then gradually decrease
n_epochs = 30;
# total number of optimizer steps: batches per epoch (rounded up) times the number of epochs
num_updates = n_epochs*int(np.ceil(len(trainloader.dataset)/batch_size))
print(num_updates)
# number of initial steps during which the learning rate is ramped up
warmup_steps=1000;
def warmup_linear(x):
    """
    Return the factor applied to the base learning rate at optimizer step ``x``:
    a linear ramp from 0 to 1 during the first ``warmup_steps`` steps, followed
    by a linear decay that reaches 0 at step ``num_updates`` and is clipped at 0.
    """
    in_warmup = x < warmup_steps
    return x/warmup_steps if in_warmup else max((num_updates - x) / (num_updates - warmup_steps), 0.)


# the scheduler rescales the optimizer's learning rate by warmup_linear(step) on every scheduler.step()
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup_linear)



# Train for n_epochs passes over the data and report test accuracy after every pass.
for i in range(n_epochs):

    # Training mode: Dropout is active and BatchNorm normalizes with (and updates
    # its running estimates from) the statistics of the current batch.
    model.train()

    for j, data in enumerate(trainloader):

        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        # clear previous gradient calculations
        optimizer.zero_grad()

        # forward phase - predictions by the model
        outputs = model(inputs)
        # forward phase - risk/loss for the predictions
        risk = criterion(outputs, labels)

        # calculate gradients
        risk.backward()

        # take the gradient step, then advance the learning-rate schedule by one step
        optimizer.step()
        scheduler.step()

        # risk of the most recent batch, reported at the end of the epoch
        batch_risk = risk.item()

    # Evaluation mode: Dropout is switched off and BatchNorm uses its running
    # statistics, so the test predictions are deterministic and do not depend
    # on how the test set happens to be batched.
    model.eval()

    with torch.no_grad():
        correct = 0
        for j, data in enumerate(testloader):

            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            # predicted class = position of the largest log-probability
            pred = outputs.argmax(dim=1, keepdim=True)
            correct += pred.eq(labels.view_as(pred)).sum().item()

    # epoch index, risk of the last training batch, test accuracy
    print(i, batch_risk, correct / len(testloader.dataset))
    

Instead of training our own network, we can reuse pre-trained ResNet18, and just fine-tune it.

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd;
from scipy.stats import zscore
import torch as torch;

import numpy as np

import torchvision.datasets as datasets
import torchvision.models as models
from torchvision import transforms

import torch.nn as nn;
import torch.nn.functional as F;

# Seed NumPy's global RNG; note that no torch seed is set in this cell.
np.random.seed(42)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# read in the dataset; images are converted to tensors and normalized on access

num_classes=10;

# Convert images to tensors and normalize each of the three colour channels.
# presumably these are the channel statistics the pre-trained ResNet weights were trained with — confirm
transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])])

full_train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, 
                             transform=transform )
full_test_dataset = datasets.CIFAR10(root='./data', train=False, download=True,
                             transform=transform )

batch_size=64;

# the training loader reshuffles the data every epoch, the test loader keeps a fixed order
trainloader = torch.utils.data.DataLoader(full_train_dataset, batch_size=batch_size,shuffle=True)
testloader = torch.utils.data.DataLoader(full_test_dataset, batch_size=batch_size,shuffle=False)


# Load ResNet18 with pre-trained weights and move it to the selected device.
# NOTE(review): newer torchvision releases replace the `pretrained` flag with a `weights` argument — confirm against the installed version
model = models.resnet18(pretrained=True).to(device)

# Replace the final fully connected layer with a fresh one that has num_classes outputs;
# the new layer is created separately, so it has to be moved to the device as well.
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, num_classes).to(device)

print(model)

# functional negative log-likelihood loss; the training loop applies log_softmax to the model output first
criterion = F.nll_loss;


# this optimizer will do gradient descent for us
# experiment with learning rate and optimizer type
learning_rate = 0.0001;
# all parameters of the network are handed to the optimizer, so the whole model is fine-tuned, not only the new layer
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

n_epochs = 3;
# total number of optimizer steps: batches per epoch (rounded up) times the number of epochs
num_updates = n_epochs*int(np.ceil(len(trainloader.dataset)/batch_size))
print(num_updates)
# number of initial steps during which the learning rate is ramped up
warmup_steps=200;
def warmup_linear(x):
    """
    Multiplier for the base learning rate at optimizer step ``x``.

    Before ``warmup_steps`` the multiplier grows linearly from 0 to 1; from then
    on it shrinks linearly, hitting 0 at ``num_updates`` and staying at 0 afterwards.
    """
    if not (x < warmup_steps):
        decay = (num_updates - x) / (num_updates - warmup_steps)
        return max(decay, 0.)
    return x/warmup_steps


# every scheduler.step() sets the learning rate to base_lr * warmup_linear(step)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup_linear)

# experiment with batch size (small batch size needs small learning rate)


# Fine-tune for n_epochs passes over the data and report test accuracy after every pass.
for i in range(n_epochs):

    # Training mode: the BatchNorm layers of the network normalize with (and update
    # their running estimates from) the statistics of the current batch.
    model.train()

    for j, data in enumerate(trainloader):

        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        # clear previous gradient calculations
        optimizer.zero_grad()

        # the network returns raw class scores; turn them into log-probabilities for nll_loss
        outputs = F.log_softmax(model(inputs), dim=1)
        risk = criterion(outputs, labels)

        # calculate gradients
        risk.backward()

        # take the gradient step, then advance the learning-rate schedule by one step
        optimizer.step()
        scheduler.step()

        # risk of the most recent batch, reported at the end of the epoch
        batch_risk = risk.item()

    # Evaluation mode: BatchNorm switches to its running statistics, so the test
    # predictions do not depend on how the test set happens to be batched.
    model.eval()

    with torch.no_grad():
        correct = 0
        for j, data in enumerate(testloader):

            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = F.log_softmax(model(inputs), dim=1)
            # predicted class = position of the largest log-probability
            pred = outputs.argmax(dim=1, keepdim=True)
            correct += pred.eq(labels.view_as(pred)).sum().item()

    # epoch index, risk of the last training batch, test accuracy
    print(i, batch_risk, correct / len(testloader.dataset))


# Transformer Models for Text

Setup and installation

In [None]:
# Download the GLUE text-classification example script (run_glue.py) from the
# v4.5.1-release branch of the huggingface/transformers repository; -qq silences wget.
!wget https://raw.githubusercontent.com/huggingface/transformers/v4.5.1-release/examples/text-classification/run_glue.py -qq

In [None]:
# Install the packages used by the cells below: transformers, datasets and wandb.
# NOTE(review): no versions are pinned although run_glue.py is fetched from the v4.5.1 release —
# confirm that the installed transformers/datasets versions are compatible with that script.
!pip install transformers datasets -qq
!pip install wandb -qq


In [None]:
# Log in to Weights & Biases.
# presumably this lets the training script report its metrics there — confirm
import wandb
wandb.login()

Let's try our first Transformer model for text. We use the run_glue.py script to fine-tune on the MRPC task (paraphrase detection).

In [None]:
# Run run_glue.py on the task named by TASK_NAME (MRPC), starting from bert-base-cased,
# and write the result to the my_mrpc_init directory (overwriting any previous contents).
# The learning rate is 0 and a single epoch is run; presumably the purpose is to save a
# model whose weights have not been updated, as a baseline for the later run — confirm.
%env TASK_NAME=MRPC

!python run_glue.py \
  --model_name_or_path bert-base-cased \
  --task_name $TASK_NAME \
  --do_train \
  --do_eval \
  --max_seq_length 128 \
  --per_device_train_batch_size 32 \
  --learning_rate 0 \
  --num_train_epochs 1 \
  --output_dir my_mrpc_init \
  --overwrite_output_dir

Does it work?

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Tokenizer and classifier are read from the "my_mrpc_init" directory.
tokenizer = AutoTokenizer.from_pretrained("my_mrpc_init")
model = AutoModelForSequenceClassification.from_pretrained("my_mrpc_init")

# Label names used for printing
# (assumes output index 0 = not paraphrase, 1 = paraphrase -- TODO confirm).
classes = ["not paraphrase", "is paraphrase"]

sequence_0 = "Tall mountains frequently have snow on top"
sequence_1 = "It's rare to have a beach at the top of the mountain"
sequence_2 = "Snow is often at the top of high mountains"

# Each sentence pair is tokenized together and returned as PyTorch tensors.
paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")

# Forward pass for each pair.
paraphrase_classification_logits = model(**paraphrase).logits
not_paraphrase_classification_logits = model(**not_paraphrase).logits

# Softmax over dim 1 normalizes the logits; keep the first (only) row as a list.
paraphrase_results = torch.softmax(
    paraphrase_classification_logits, dim=1
).tolist()[0]
not_paraphrase_results = torch.softmax(
    not_paraphrase_classification_logits, dim=1
).tolist()[0]

# Should be paraphrase
for i, label in enumerate(classes):
    print(f"{label}: {int(round(paraphrase_results[i] * 100))}%")
# Should not be paraphrase
for i, label in enumerate(classes):
    print(f"{label}: {int(round(not_paraphrase_results[i] * 100))}%")

Let's train for a bit longer (3 epochs).

Go to Runtime->Change Runtime Type and select GPU. Try also running it on CPU, and observe the difference in running time.


In [None]:
# Fine-tune bert-base-cased on the GLUE task named by TASK_NAME (MRPC):
# learning rate 2e-5, 3 epochs, checkpoint written to my_mrpc.
%env TASK_NAME=MRPC

!python run_glue.py \
  --model_name_or_path bert-base-cased \
  --task_name $TASK_NAME \
  --do_train \
  --do_eval \
  --max_seq_length 128 \
  --per_device_train_batch_size 32 \
  --learning_rate 2e-5 \
  --num_train_epochs 3 \
  --output_dir my_mrpc \
  --overwrite_output_dir

Now we can load the trained model again and see how well it works


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Tokenizer and classifier are read from the "my_mrpc" directory.
tokenizer = AutoTokenizer.from_pretrained("my_mrpc")
model = AutoModelForSequenceClassification.from_pretrained("my_mrpc")

# Label names used for printing
# (assumes output index 0 = not paraphrase, 1 = paraphrase -- TODO confirm).
classes = ["not paraphrase", "is paraphrase"]

sequence_0 = "Tall mountains frequently have snow on top"
sequence_1 = "It's rare to have a beach at the top of the mountain"
sequence_2 = "Snow is often at the top of high mountains"

# Each sentence pair is tokenized together and returned as PyTorch tensors.
paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")

# Forward pass for each pair.
paraphrase_classification_logits = model(**paraphrase).logits
not_paraphrase_classification_logits = model(**not_paraphrase).logits

# Softmax over dim 1 normalizes the logits; keep the first (only) row as a list.
paraphrase_results = torch.softmax(
    paraphrase_classification_logits, dim=1
).tolist()[0]
not_paraphrase_results = torch.softmax(
    not_paraphrase_classification_logits, dim=1
).tolist()[0]

# Should be paraphrase
for i, label in enumerate(classes):
    print(f"{label}: {int(round(paraphrase_results[i] * 100))}%")
# Should not be paraphrase
for i, label in enumerate(classes):
    print(f"{label}: {int(round(not_paraphrase_results[i] * 100))}%")

For standard tasks, we don't have to train the model ourselves; there are pre-trained, fine-tuned models ready for download.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Tokenizer and classifier come from the "bert-base-cased-finetuned-mrpc" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

# Label names used for printing
# (assumes output index 0 = not paraphrase, 1 = paraphrase -- TODO confirm).
classes = ["not paraphrase", "is paraphrase"]

sequence_0 = "Tall mountains frequently have snow on top"
sequence_1 = "It's rare to have a beach at the top of the mountain"
sequence_2 = "Snow is often at the top of high mountains"

# Each sentence pair is tokenized together and returned as PyTorch tensors.
paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")

# Forward pass for each pair.
paraphrase_classification_logits = model(**paraphrase).logits
not_paraphrase_classification_logits = model(**not_paraphrase).logits

# Softmax over dim 1 normalizes the logits; keep the first (only) row as a list.
paraphrase_results = torch.softmax(
    paraphrase_classification_logits, dim=1
).tolist()[0]
not_paraphrase_results = torch.softmax(
    not_paraphrase_classification_logits, dim=1
).tolist()[0]

# Should be paraphrase
for i, label in enumerate(classes):
    print(f"{label}: {int(round(paraphrase_results[i] * 100))}%")
# Should not be paraphrase
for i, label in enumerate(classes):
    print(f"{label}: {int(round(not_paraphrase_results[i] * 100))}%")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Load the tokenizer and classifier from the "bert-base-cased-finetuned-mrpc" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
# Print the model's text representation to inspect its structure
print(model)

Let's try a different task: sentiment analysis. We'll use a pre-trained, fine-tuned model again. Observe the size of the downloaded model.

In [None]:
from transformers import pipeline

# No model name is given, so the pipeline picks its own default for this task.
nlp = pipeline("sentiment-analysis")

for sentence in (
    "The huggingface library is quite comprehensive and simple to use",
    "Coding it all in pytorch would be a dounting task",
):
    # the call returns a list of results; take the first entry
    result = nlp(sentence)[0]
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

Transformers can also be used to generate text, for example to create summaries. Again, observe the size of the downloaded model.

In [None]:
from transformers import pipeline
import textwrap

# Article to be summarized.
to_tokenize = "The idea that aliens had frequented our planet had been circulating among ufologists since the postwar years, when a Polish emigre, George Adamski, claimed to have rendezvoused with a race of kindly, Nordic-looking Venusians who were disturbed by the domestic and interplanetary effects of nuclear-bomb tests. In the summer of 1947, an alien spaceship was said to have crashed near Roswell, New Mexico. Conspiracy theorists believed that vaguely anthropomorphic bodies had been recovered there, and that the crash debris had been entrusted to private military contractors, who raced to unlock alien hardware before the Russians could. (Documents unearthed after the fall of the Soviet Union suggested that the anxiety about an arms race supercharged by alien technology was mutual.) All of this, ufologists claimed, had been covered up by Majestic 12, a clandestine, para-governmental organization convened under executive order by President Truman. President Kennedy was assassinated because he planned to level with Premier Khrushchev; Kennedy had confided in Marilyn Monroe, thereby sealing her fate. Representative Steven Schiff, of New Mexico, spent years trying to get to the bottom of the Roswell incident, only to die of 'cancer'."

# Build the HuggingFace summarization pipeline on top of the t5-base model and tokenizer.
summarizer = pipeline(
    "summarization",
    model="t5-base",
    tokenizer="t5-base",
)

# Summarize the article; min_length/max_length bound the length of the summary.
summarized = summarizer(
    to_tokenize,
    min_length=40,
    max_length=150,
)



In [None]:
# Print the original text, wrapped to at most 80 characters per line
print(textwrap.fill(to_tokenize, 80))


In [None]:
# Print summarized text
# The summarizer output is indexed like a list of dicts: take 'summary_text' of the first entry
summary=summarized[0]['summary_text']
# Wrap to at most 80 characters per line for readability
print(textwrap.fill(summary, 80))

Let's look at the code that actually uses the model (T5) directly.

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Article to be summarized.
to_tokenize = "The idea that aliens had frequented our planet had been circulating among ufologists since the postwar years, when a Polish emigre, George Adamski, claimed to have rendezvoused with a race of kindly, Nordic-looking Venusians who were disturbed by the domestic and interplanetary effects of nuclear-bomb tests. In the summer of 1947, an alien spaceship was said to have crashed near Roswell, New Mexico. Conspiracy theorists believed that vaguely anthropomorphic bodies had been recovered there, and that the crash debris had been entrusted to private military contractors, who raced to unlock alien hardware before the Russians could. (Documents unearthed after the fall of the Soviet Union suggested that the anxiety about an arms race supercharged by alien technology was mutual.) All of this, ufologists claimed, had been covered up by Majestic 12, a clandestine, para-governmental organization convened under executive order by President Truman. President Kennedy was assassinated because he planned to level with Premier Khrushchev; Kennedy had confided in Marilyn Monroe, thereby sealing her fate. Representative Steven Schiff, of New Mexico, spent years trying to get to the bottom of the Roswell incident, only to die of 'cancer'."

# T5 is an encoder-decoder model, so it is loaded through the seq2seq auto class.
# (AutoModelWithLMHead is deprecated in favour of the task-specific auto classes.)
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
tokenizer = AutoTokenizer.from_pretrained("t5-base")
# T5 uses a max_length of 512 so we cut the article to 512 tokens.
# The task is selected by the text prefix "summarize: ".
inputs = tokenizer.encode("summarize: " + to_tokenize, return_tensors="pt", max_length=512, truncation=True)
# Generate with beam search (4 beams); the output length is bounded by min_length/max_length.
outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

In [None]:
# Show the encoded input: the token ids returned by tokenizer.encode as a PyTorch tensor
print(inputs)

In [None]:
# Show the raw result of model.generate, before decoding back to text
print(outputs)

In [None]:
# Decode the first generated sequence back into text
print(tokenizer.decode(outputs[0]))

Finally, let's try translation.

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Article to be translated.
to_tokenize = "The idea that aliens had frequented our planet had been circulating among ufologists since the postwar years, when a Polish emigre, George Adamski, claimed to have rendezvoused with a race of kindly, Nordic-looking Venusians who were disturbed by the domestic and interplanetary effects of nuclear-bomb tests. In the summer of 1947, an alien spaceship was said to have crashed near Roswell, New Mexico. Conspiracy theorists believed that vaguely anthropomorphic bodies had been recovered there, and that the crash debris had been entrusted to private military contractors, who raced to unlock alien hardware before the Russians could. (Documents unearthed after the fall of the Soviet Union suggested that the anxiety about an arms race supercharged by alien technology was mutual.) All of this, ufologists claimed, had been covered up by Majestic 12, a clandestine, para-governmental organization convened under executive order by President Truman. President Kennedy was assassinated because he planned to level with Premier Khrushchev; Kennedy had confided in Marilyn Monroe, thereby sealing her fate. Representative Steven Schiff, of New Mexico, spent years trying to get to the bottom of the Roswell incident, only to die of 'cancer'."

# T5 is an encoder-decoder model, so it is loaded through the seq2seq auto class.
# (AutoModelWithLMHead is deprecated in favour of the task-specific auto classes.)
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
tokenizer = AutoTokenizer.from_pretrained("t5-base")
# T5 uses a max_length of 512 so we cut the article to 512 tokens.
# The task is selected by the text prefix "translate English to German: ".
inputs = tokenizer.encode("translate English to German: " + to_tokenize, return_tensors="pt", max_length=512, truncation=True)
# Generate with beam search (4 beams); the output may be up to 512 tokens long.
outputs = model.generate(inputs, max_length=512, min_length=40, num_beams=4, early_stopping=True)
# Decode the first generated sequence back into text
print(tokenizer.decode(outputs[0]))

# Tensorflow

We use the TensorFlow V1 compatibility API to illustrate a static computational graph.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import tensorflow as tf;
tf.compat.v1.disable_eager_execution()

# Unconstrained minimizer of f; its first coordinate is negative, i.e. outside
# the feasible region.
minimum=[-.25,2]

# This script performs projected gradient descent on the function f,
# assuming Q (the feasible region) is w1>=0, w2>=0.


def f(w):
    """Return the squared Euclidean distance between w and `minimum` as a scalar tensor."""
    shiftedW = w - np.array(minimum)
    return tf.reduce_sum(tf.multiply(shiftedW, shiftedW))

# starting value of w for gradient descent; here, w is a 2D vector
initialW = np.random.randn(2)

# a variable whose value persists between session.run calls
w = tf.Variable(initialW, name="w")

# output of applying f to w
# our goal is to minimize f(w), i.e. find w with the lowest possible f(w)
z = f(w)

# if you want a more accurate result, replace step size 0.01 with something smaller
optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.01)
train = optimizer.minimize(z)

# Build the "write the projected value back into w" op ONCE, fed through a
# placeholder. Calling tf.compat.v1.assign inside the loop would add a new op
# (and a new embedded constant) to the static graph on every iteration.
projected_w = tf.compat.v1.placeholder(dtype=w.dtype.base_dtype, shape=w.shape, name="projected_w")
new_w_assign = tf.compat.v1.assign(w, projected_w)

# initialize the tensorflow session
sess = tf.compat.v1.Session()
sess.run(tf.compat.v1.global_variables_initializer())

# `with sess` makes it the default session (needed by train.run / w.eval)
# and closes it on exit.
with sess:
    # hard-coded number of steps, could be too few, may need to be increased
    for i in range(300):
        # perform the gradient step
        train.run()
        # get the numpy vector with the current value of w
        w_value = w.eval()
        # project onto the feasible region (here simply: replace negative values with 0)
        new_w_value = np.maximum(w_value, 0)
        print((w_value,new_w_value))
        # write the projected numpy value back into the tensorflow variable
        sess.run(new_w_assign, feed_dict={projected_w: new_w_value})

print("True minimum: "+str(np.maximum(minimum,0)));
print("Found minimum:"+str(new_w_value));
