In [None]:
%run ../talktools.py

In [None]:
# do this step below to get lightning, lightning bolts, etc.
!pip install lightning-bolts torchvision

# Classification with pytorch Lightning

*AY 128/256 (UC Berkeley, 2018-2021)*

We saw last week how to do regression problems with neural nets in `keras`. Here we'll work with the pytorch equivalent, called pytorch lightning. Let's now explore classification, on images. Let's introduce the [FashionMNIST](https://github.com/zalandoresearch/fashion-mnist#labels) dataset: 70k small (28$\times$28) images of 10 different types of clothing.

<img src="https://github.com/zalandoresearch/fashion-mnist/blob/master/doc/img/fashion-mnist-sprite.png?raw=true" width="80%">

Each training and test example is assigned to one of the following labels:

| Label | Description |
| --- | --- |
| 0 | T-shirt/top |
| 1 | Trouser |
| 2 | Pullover |
| 3 | Dress |
| 4 | Coat |
| 5 | Sandal |
| 6 | Shirt |
| 7 | Sneaker |
| 8 | Bag |
| 9 | Ankle boot |

We will download this data locally as CSV files further below. First, a small helper that turns a numeric label into its description:

In [None]:
def output_label(label):
    """Return the human-readable FashionMNIST class name for a numeric label.

    Parameters
    ----------
    label : int, torch.Tensor, or NumPy scalar / 0-d array
        Class index (0-9). Any object exposing ``.item()`` is unwrapped to a
        plain Python number first.

    Returns
    -------
    str
        The description of the class.

    Raises
    ------
    KeyError
        If ``label`` is not one of the ten known class indices.
    """
    output_mapping = {
                 0: "T-shirt/Top",
                 1: "Trouser",
                 2: "Pullover",
                 3: "Dress",
                 4: "Coat",
                 5: "Sandal",
                 6: "Shirt",
                 7: "Sneaker",
                 8: "Bag",
                 9: "Ankle Boot"
                 }
    # Duck-type on .item() instead of comparing the exact type: this covers
    # tensors (and their subclasses) as well as NumPy scalars / 0-d arrays,
    # and avoids shadowing the builtin `input`.
    key = label.item() if hasattr(label, "item") else label
    return output_mapping[key]

In [None]:
import datetime, os
import numpy as np
import warnings
import pandas as pd
import matplotlib.pyplot as plt

from IPython.external import mathjax

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torchmetrics.functional import accuracy
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger

import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix

# use a GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print("pytorch version:", torch.__version__)

Let's get the training and testing data

In [None]:
%%bash
# Download the FashionMNIST train and test CSV files from GitHub and store them
# in the current working directory under fixed names (read back with pandas below).
wget "https://github.com/fpleoni/fashion_mnist/blob/master/fashion-mnist_train.csv?raw=true" --output-document=fashion-mnist_train.csv
wget "https://github.com/fpleoni/fashion_mnist/blob/master/fashion-mnist_test.csv?raw=true" --output-document=fashion-mnist_test.csv

In [None]:
train_csv = pd.read_csv("./fashion-mnist_train.csv")
test_csv = pd.read_csv("./fashion-mnist_test.csv")

In [None]:
batch_size = 128

class FashionDataset(Dataset):
    """Map-style PyTorch dataset built from a FashionMNIST CSV table.

    Each row of the table holds the class label in its first column, followed
    by the 784 pixel values of one 28x28 single-channel image (assumed to be
    8-bit intensities in 0-255).
    """

    def __init__(self, data, transform=None):
        """Split the table into labels and images.

        Parameters
        ----------
        data : pandas.DataFrame
            One example per row: label first, then 28*28 pixel values.
        transform : callable, optional
            Applied to each (28, 28, 1) float32 image in ``__getitem__``.
        """
        values = np.asarray(data.values)
        # Kept for backward compatibility: the raw rows, one array per example.
        self.fashion_MNIST = list(values)
        self.transform = transform

        # First column holds the labels; int64 is what the loss expects as targets.
        self.labels = values[:, 0].astype('int64')
        # Dimension of images = 28 * 28 * 1, where height = width = 28 and color_channels = 1.
        # torchvision's ToTensor only rescales uint8 input, so float32 pixels are
        # brought into the 0-1 range explicitly here.
        self.images = values[:, 1:].reshape(-1, 28, 28, 1).astype('float32') / 255.0

    def __getitem__(self, index):
        """Return ``(image, label)`` for one example, applying the transform if set."""
        label = self.labels[index]
        image = self.images[index]

        if self.transform is not None:
            image = self.transform(image)

        return image, label

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.images)

# ToTensor converts each (H, W, C) image array into a (C, H, W) torch tensor
train_set = FashionDataset(train_csv, transform=transforms.Compose([transforms.ToTensor()]))
test_set = FashionDataset(test_csv, transform=transforms.Compose([transforms.ToTensor()]))

train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=2)
# The held-out loader must read from test_set; building it from train_set would
# make the validation metrics a second measurement of training performance.
test_loader = DataLoader(test_set, batch_size=batch_size, num_workers=2)

In [None]:
a = next(iter(train_loader))
a[0].size()

In [None]:
len(train_set)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

image, label = next(iter(train_set))
plt.axis('off')

plt.imshow(image.squeeze(), cmap=plt.cm.gray_r, interpolation='nearest')
output_label(label)

To learn a model to predict the class of a given image, we could treat this 28$\times$28 image as 1d input, like a stellar spectrum:

In [None]:
ind=20
_ = plt.plot(a[0][ind].numpy().reshape(-1))
plt.ylabel("normalized flux")
plt.xlabel("1D pixel index")

But this *clearly* jumbles the inherent spatial structure and local correlations found in natural images. Using just Dense layers in a NN we'd effectively be asking the network to learn these correlations.

## Convolutional Neural Nets (ConvNets)

NNs built for images (or more generally, inputs with spatial structure).

### Key Ideas: 
  - layers see only parts of each image (effectively all other weights are zero).
  - some layers do simple operations on previous layers to reduce dimensionality (e.g., take the largest value in a 3x3 range)
  - "Every Layer has a simple API: It transforms an input 3D volume to an output 3D volume with some differentiable function that may or may not have parameters."
 
<img src="http://cs231n.github.io/assets/cnn/cnn.jpeg">

<img src="http://cs231n.github.io/assets/cnn/depthcol.jpeg">

"An example input volume in red (e.g. a 32x32x3 CIFAR-10 image), and an example volume of neurons in the first Convolutional layer. Each neuron in the convolutional layer is connected only to a local region in the input volume spatially, but to the full depth (i.e. all color channels). Note, there are multiple neurons (5 in this example) along the depth, all looking at the same region in the input - see discussion of depth columns in text below. "

cf. http://cs231n.github.io/convolutional-networks/

<img src="data/f2.png">
Source: http://www.nature.com/nature/journal/v521/n7553/fig_tab/nature14539_F2.html

### Filter banks

  http://setosa.io/ev/image-kernels/

### Pooling

<img src="http://cs231n.github.io/assets/cnn/pool.jpeg" width="40%">
<img src="http://cs231n.github.io/assets/cnn/maxpool.jpeg" width="40%">
Source: http://cs231n.github.io/convolutional-networks/

In [None]:
nb_classes = 10

demo_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size)

batch = next(iter(demo_loader))
images, labels = batch
print(type(images), type(labels))
print(images.shape, labels.shape)

In [None]:
# Pass the number of classes explicitly: without it, one_hot infers the width
# from the largest label present in this particular batch.
y = torch.nn.functional.one_hot(labels, num_classes=nb_classes)
y

We want our output predictions to look like a "probability" of belonging to one of the 10 classes. And, importantly, we'd like to make sure that the probability over all classes sums to unity. One way to do this is to scale the outputs of the last layer using a [`softmax`](https://en.wikipedia.org/wiki/Softmax_function):

$$
{\rm softmax}(\vec s)_i = \frac{e^{s_i}}{\sum_j e^{s_j}}
$$

So if the (unnormalized) prediction from an NN for an image is:

In [None]:
s = np.random.normal(size=(10,))
s

Then the softmax scaling gives us:

In [None]:
def softmax(x):
    """Softmax of a vector of scores: exp(x_i) / sum_j exp(x_j).

    The maximum score is subtracted before exponentiating. This leaves the
    result mathematically unchanged but prevents overflow (and the resulting
    NaNs) for large inputs.

    Parameters
    ----------
    x : array_like
        Unnormalized scores.

    Returns
    -------
    numpy.ndarray
        Non-negative values of the same shape as ``x`` that sum to one.
    """
    x = np.asarray(x)
    if x.size == 0:
        # nothing to normalize; np.max would raise on an empty array
        return np.exp(x)
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps)

print(softmax(s))
np.testing.assert_almost_equal(softmax(s).sum(), 1.0)

In [None]:
from scipy.special import softmax as sp_softmax
sp_softmax(s)

We'll use the "categorical cross-entropy" loss:

<img src="https://gombru.github.io/assets/cross_entropy_loss/softmax_CE_pipeline.png">
Source: https://gombru.github.io/2018/05/23/cross_entropy_loss/

In [None]:
# perfect match ... use a small ϵ to avoid taking log(0) since lim x log x -> 0 as x->0
print("loss with a perfect match:", -(y[0].double() @ np.log(y[0].double() + 1e-16)).numpy())
print("loss with a predicted match:", -(y[0].double() @ np.log(softmax(s) + 1e-16)).numpy())

## Building a CNN

In [None]:
import pytorch_lightning as pl

In [None]:
class mycnn(pl.LightningModule):
    """Small convolutional classifier for 1x28x28 images with 10 output classes.

    Three convolutional stages feed three fully connected layers. ``forward``
    returns log-probabilities (``log_softmax``), which are paired with
    ``nn.NLLLoss`` for training and evaluation.
    """

    def __init__(self):
        super().__init__()

        # set this to an example input size to see a summary
        # see https://pytorch-lightning.readthedocs.io/en/latest/common/debugging.html
        self._example_input_array = torch.randn((1, 1, 28, 28))

        # define the layers here
        # Conv2d(in_channels, out_channels, kernel_size)
        # see https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
        # spatial size: 28 -> 26 (conv) -> 13 (pool)
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3),

            # see https://github.com/sksq96/pytorch-summary/issues/55#issuecomment-471844028
            # to understand why pytorch and keras differ here
            nn.BatchNorm2d(32, affine=False),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        # spatial size: 13 -> 11 (conv) -> 5 (pool)
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        # spatial size: 5 -> 3 (conv)
        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3),
            nn.ReLU(),
        )

        # 128 channels * 3 * 3 pixels = 1152 flattened features
        self.fc1 = torch.nn.Linear(1152, 128)
        self.fc2 = torch.nn.Linear(128, 32)
        self.fc3 = torch.nn.Linear(32, 10)

        # NLLLoss expects log-probabilities, which forward() produces
        self.loss = nn.NLLLoss()

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        # Flatten
        x = x.view(x.size(0), -1)

        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = F.log_softmax(self.fc3(x), dim=-1)
        return x

    def configure_optimizers(self):
        """Adam optimizer plus a plateau scheduler driven by ``val_accuracy``."""
        optimizer = torch.optim.Adam(self.parameters())

        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            # The monitored quantity is an accuracy, so "better" means larger:
            # with mode='min' the learning rate would be cut while the model
            # is still improving.
            mode='max',
            factor=0.75,
            patience=2,
            min_lr=1e-6,
            verbose=True
        )

        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_accuracy"}

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        x, y = batch
        log_probs = self.forward(x)
        loss = self.loss(log_probs, y)
        self.log('train_loss', loss)
        return loss

    def _evaluate(self, batch, batch_idx, stage=None):
        """Compute loss and accuracy for one batch.

        When ``stage`` is given, both are logged as ``{stage}_loss`` and
        ``{stage}_accuracy``. Returns the tuple ``(loss, acc)``.
        """
        x, y = batch
        log_probs = self.forward(x)
        loss = self.loss(log_probs, y)
        preds = torch.argmax(log_probs, dim=-1)
        acc = accuracy(preds, y)

        if stage:
            self.log(f'{stage}_loss', loss, prog_bar=True)
            self.log(f'{stage}_accuracy', acc, prog_bar=True)

        return loss, acc

    def validation_step(self, batch, batch_idx):
        """Log validation loss/accuracy for one batch and return the loss."""
        return self._evaluate(batch, batch_idx, 'val')[0]

    def train_dataloader(self):
        """Use the notebook-level ``train_loader``."""
        return train_loader

    def val_dataloader(self):
        """Use the notebook-level ``test_loader`` for validation."""
        return test_loader

In [None]:
# Tag this run with the current UTC time (minute resolution) so its outputs get unique names.
run_time_string = datetime.datetime.utcnow().isoformat(timespec='minutes')
filename = f'datalab_nn_pytorch_{run_time_string}'

# Stop training once val_accuracy has failed to improve by at least 0.001 for 3 checks.
early_stop_callback = EarlyStopping(
   monitor='val_accuracy',
   min_delta=0.001,
   patience=3,
   verbose=True,
   mode='max'
)

# Keep only the single best checkpoint (highest val_accuracy) in nn_results/.
checkpoint_callback = ModelCheckpoint(
    monitor='val_accuracy',
    mode='max',
    dirpath='nn_results',
    filename=filename,
    verbose=True,
    save_top_k=1
)

# CSV metrics go under nn_results1/, TensorBoard event files under nn_results/.
logger = [CSVLogger("nn_results1", name=filename), TensorBoardLogger("nn_results", name=filename)]

# Seed before the model is constructed so that weight initialization is covered.
pl.seed_everything(42)

# NOTE(review): gpus=-1 / accelerator='dp' / auto_select_gpus presumably request all
# available GPUs via an older pytorch-lightning Trainer API -- confirm against the
# installed version, and check the behavior on a CPU-only machine.
myTrainer=pl.Trainer(callbacks=[early_stop_callback, checkpoint_callback], logger=logger,
                     gpus=-1, accelerator='dp', auto_select_gpus=True, max_epochs=20)
model=mycnn()
# No dataloaders are passed here: the model supplies them through its
# train_dataloader() / val_dataloader() methods.
myTrainer.fit(model)

### Aside: Dropout 

You'll notice above that the `accuracy` is much higher than the `val_accuracy`. That is, we overfit on the training data. One way to help protect against this is to introduce `Dropout`

<img src="https://cdn-images-1.medium.com/max/1600/1*iWQzxhVlvadk6VAJjsgXgg.png">

Srivastava, Nitish, et al. ”Dropout: a simple way to prevent neural networks from
overfitting”, JMLR 2014

```python
        x = self.layer3(x)
        # Flatten
        x = x.view(x.size(0), -1) 
        x = F.dropout(x, p=0.1, training=self.training) # 10% chance of dropping an output connection (only while training)
```

### Aside: Visualization of the layers

From François Chollet (“DEEP LEARNING with Python”):

Intermediate activations are “useful for understanding how successive convnet layers transform their input, and for getting a first idea of the meaning of individual convnet filters.”

“The representations learned by convnets are highly amenable to visualization, in large part because they’re representations of visual concepts. Visualizing intermediate activations consists of displaying the feature maps that are output by various convolution and pooling layers in a network, given a certain input (the output of a layer is often called its activation, the output of the activation function). This gives a view into how an input is decomposed into the different filters learned by the network. Each channel encodes relatively independent features, so the proper way to visualize these feature maps is by independently plotting the contents of every channel as a 2D image.”

Following from https://github.com/gabrielpierobon/cnnshapes/blob/master/README.md

In [None]:
# Visualize feature maps
activation = {}
def get_activation(name):
    """Build a forward hook that stores a layer's detached output under ``name``."""
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

data, _ = next(iter(train_set))
data.unsqueeze_(0)  # add the batch dimension expected by the model

layers_to_show = [('layer1', model.layer1),
                  ('layer2', model.layer2), ('layer3', model.layer3)]

# Register every hook, run ONE forward pass, then remove the hooks again so
# that re-running this cell does not pile up duplicate hooks on the model.
hook_handles = [layer.register_forward_hook(get_activation(layer_name))
                for layer_name, layer in layers_to_show]
was_training = model.training
model.eval()  # inference behavior: do not update BatchNorm running statistics
try:
    with torch.no_grad():
        model(data.to(next(model.parameters()).device))
finally:
    for handle in hook_handles:
        handle.remove()
    model.train(was_training)  # restore whatever mode the model was in

images_per_row = 15
for layer_name, _layer in layers_to_show:
    # (1, n_features, size, size) -> (n_features, size, size) as a NumPy array
    layer_activation = activation[layer_name].squeeze(0).cpu().numpy()

    n_features = layer_activation.shape[0]   # Number of features in the feature map
    size = layer_activation.shape[1] # The feature map has shape (n_features, size, size).
    # Tiles the activation channels in this matrix; channels beyond the last
    # complete row of `images_per_row` are not shown.
    n_cols = n_features // images_per_row
    display_grid = np.zeros((size * n_cols, images_per_row * size))
    for col in range(n_cols): # Tiles each filter into a big horizontal grid
        for row in range(images_per_row):
            # work on a copy so the stored activations are left untouched
            channel_image = layer_activation[col * images_per_row + row,
                                              :, :].copy()
            channel_image -= channel_image.mean() # Post-processes the feature to make it visually palatable
            # small epsilon: a channel that is constant (e.g. all zeros after
            # ReLU) has zero spread and would otherwise turn into NaNs
            channel_image /= channel_image.std(ddof=1) + 1e-5
            channel_image *= 64
            channel_image += 128
            channel_image = np.clip(channel_image, 0, 255)
            display_grid[col * size : (col + 1) * size, # Displays the grid
                          row * size : (row + 1) * size] = channel_image
    scale = 1. / size
    plt.figure(figsize=(scale * display_grid.shape[1],
                        scale * display_grid.shape[0]))
    plt.title(layer_name)
    plt.grid(False)
    plt.imshow(display_grid, aspect='auto', cmap='viridis')

# Data Augmentation

Another way to avoid overfitting, aside from `Dropout`, is to increase the number of examples used to train the model.  Data augmentation is a generic term for methods used to expand the effective training set size by generating more data from the original training set. In images, this is pretty natural: scale changes, rotations, flips, etc. should still give us the same label. This method has the benefit of usually increasing test-time accuracy.

<img src="https://cdn-images-1.medium.com/max/1200/1*C8hNiOqur4OJyEZmC7OnzQ.png">

In Pytorch see https://pytorch-lightning.readthedocs.io/en/stable/extensions/datamodules.html and https://pytorch.org/vision/stable/transforms.html

In [None]:
# Random augmentations are applied to the training images only.
train_transforms = transforms.Compose([
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.RandomAffine(degrees=15, shear=0.1),
        torchvision.transforms.RandomHorizontalFlip(),
        torchvision.transforms.ToTensor(),
])

train_set = FashionDataset(train_csv, transform=train_transforms)
test_set = FashionDataset(test_csv, transform=transforms.Compose([transforms.ToTensor()]))

train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=2)
# Validate on the un-augmented held-out set; building this loader from
# train_set would report metrics on (augmented) training data instead.
test_loader = DataLoader(test_set, batch_size=batch_size, num_workers=2)

In [None]:
image, label = next(iter(train_set))
plt.axis('off')

plt.imshow(image.squeeze(), cmap=plt.cm.gray_r, interpolation='nearest')
output_label(label)

In [None]:
class mycnn_dropout(pl.LightningModule):
    """Convolutional classifier for 1x28x28 images (10 classes) with dropout.

    Same layout as the plain CNN, with dropout after the second convolutional
    stage (p=0.1) and on the flattened features (p=0.2). ``forward`` returns
    log-probabilities, which are paired with ``nn.NLLLoss``.
    """

    def __init__(self):
        super().__init__()

        # set this to an example input size to see a summary
        # see https://pytorch-lightning.readthedocs.io/en/latest/common/debugging.html
        self._example_input_array = torch.randn((1, 1, 28, 28))

        # define the layers here
        # Conv2d(in_channels, out_channels, kernel_size)
        # see https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
        # spatial size: 28 -> 26 (conv) -> 13 (pool)
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3),

            # see https://github.com/sksq96/pytorch-summary/issues/55#issuecomment-471844028
            # to understand why pytorch and keras differ here
            nn.BatchNorm2d(32, affine=False),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        # spatial size: 13 -> 11 (conv) -> 5 (pool)
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(p=0.1)
        )

        # spatial size: 5 -> 3 (conv)
        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3),
            nn.ReLU(),
        )

        # Registered as a submodule so that it follows train()/eval(). A Dropout
        # created inside forward() is always in training mode and would keep
        # dropping activations during validation and prediction.
        self.dropout = nn.Dropout(p=0.2)

        # 128 channels * 3 * 3 pixels = 1152 flattened features
        self.fc1 = torch.nn.Linear(1152, 128)
        self.fc2 = torch.nn.Linear(128, 32)
        self.fc3 = torch.nn.Linear(32, 10)

        # NLLLoss expects log-probabilities, which forward() produces
        self.loss = nn.NLLLoss()

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        # Flatten
        x = x.view(x.size(0), -1)

        # add dropout (active only while the module is in training mode)
        x = self.dropout(x)

        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = F.log_softmax(self.fc3(x), dim=-1)
        return x

    def configure_optimizers(self):
        """Adam optimizer plus a plateau scheduler driven by ``val_accuracy``."""
        optimizer = torch.optim.Adam(self.parameters())

        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            # The monitored quantity is an accuracy, so "better" means larger:
            # with mode='min' the learning rate would be cut while the model
            # is still improving.
            mode='max',
            factor=0.75,
            patience=2,
            min_lr=1e-6,
            verbose=True
        )

        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_accuracy"}

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        x, y = batch
        log_probs = self.forward(x)
        loss = self.loss(log_probs, y)
        self.log('train_loss', loss)
        return loss

    def _evaluate(self, batch, batch_idx, stage=None):
        """Compute loss and accuracy for one batch.

        When ``stage`` is given, both are logged as ``{stage}_loss`` and
        ``{stage}_accuracy``. Returns the tuple ``(loss, acc)``.
        """
        x, y = batch
        log_probs = self.forward(x)
        loss = self.loss(log_probs, y)
        preds = torch.argmax(log_probs, dim=-1)
        acc = accuracy(preds, y)

        if stage:
            self.log(f'{stage}_loss', loss, prog_bar=True)
            self.log(f'{stage}_accuracy', acc, prog_bar=True)

        return loss, acc

    def validation_step(self, batch, batch_idx):
        """Log validation loss/accuracy for one batch and return the loss."""
        return self._evaluate(batch, batch_idx, 'val')[0]

    def train_dataloader(self):
        """Use the notebook-level ``train_loader``."""
        return train_loader

    def val_dataloader(self):
        """Use the notebook-level ``test_loader`` for validation."""
        return test_loader

In [None]:
# Tag this run with the current UTC time (minute resolution) so its outputs get unique names.
run_time_string = datetime.datetime.utcnow().isoformat(timespec='minutes')
filename = f'datalab_nn_pytorch_dropout_{run_time_string}'

# Stop training once val_accuracy has failed to improve by at least 0.001 for 3 checks.
early_stop_callback = EarlyStopping(
   monitor='val_accuracy',
   min_delta=0.001,
   patience=3,
   verbose=True,
   mode='max'
)

# Keep only the single best checkpoint (highest val_accuracy) in nn_results/.
checkpoint_callback = ModelCheckpoint(
    monitor='val_accuracy',
    mode='max',
    dirpath='nn_results',
    filename=filename,
    verbose=True,
    save_top_k=1
)

# CSV metrics go under nn_results1/, TensorBoard event files under nn_results/.
logger = [CSVLogger("nn_results1", name=filename), TensorBoardLogger("nn_results", name=filename)]

# Seed before the model is constructed so that weight initialization is covered.
pl.seed_everything(42)

# NOTE(review): gpus=-1 / accelerator='dp' / auto_select_gpus presumably request all
# available GPUs via an older pytorch-lightning Trainer API -- confirm against the
# installed version, and check the behavior on a CPU-only machine.
myTrainer=pl.Trainer(callbacks=[early_stop_callback, checkpoint_callback], logger=logger,
                     gpus=-1, accelerator='dp', auto_select_gpus=True, max_epochs=20)
model_dropout=mycnn_dropout()
# No dataloaders are passed here: the model supplies them through its
# train_dataloader() / val_dataloader() methods.
myTrainer.fit(model_dropout)

In [None]:
ls -t1 nn_results

In [None]:
import pandas as pd

latest_log_file = !ls -t1 nn_results1/{filename}/version_*/metrics.csv | head -1
latest_model_file = !ls -t1 nn_results/{filename}.ckpt | head -1

hist_df = pd.read_csv(latest_log_file[0])
hist_df

In [None]:
latest_model_file

In [None]:
# reload the best model
from tensorflow.keras.models import load_model
import tensorflow as tf

# The checkpoint was written by pytorch-lightning's ModelCheckpoint callback, so
# it has to be restored through the LightningModule API; Keras' load_model
# cannot read it.
model = mycnn_dropout.load_from_checkpoint(latest_model_file[0])

In [None]:
model_dropout.eval()  # set model to evaluation mode

In [None]:
image, label = next(iter(test_set))

In [None]:
prediction = (np.e**model_dropout(image.unsqueeze(0))).detach().numpy()

In [None]:
prediction.sum()

In [None]:
np.argmax(prediction)

In [None]:
plt.axis('off')
_ = plt.imshow(image[0,:,:], cmap=plt.cm.gray_r, interpolation='nearest')

In [None]:
print(output_label(np.argmax(prediction)))