In [1]:
# do this step below to get lightning, lightning bolts, etc.
# !pip install lightning-bolts torchvision torchsummary

# Classification Revisited, with ResNets

*AY 128/256 (UC Berkeley, 2018–2024)*

Previously, we used CNNs in our own custom model to classify images. You are asked to stack up your own model to classify galaxies in Lab 3, and you should definitely explore different CNN depths, kernel sizes, and filters to get a feel for how this works. Many of them will be able to perform the functions you need for Lab 3 adequately. That said, some architectures are better than others, and an architecture we are going to introduce today, **ResNets (Residual neural Networks)** are particularly efficient at image classification.

More generally, PyTorch comes with a bunch of models and pre-trained weights that come from fitting to some generic (in this case classification) data sets. As it turns out, many of the features in these trained CNNs are widely applicable and can be easily repurposed for another task. In this lecture, we will explore how to do that.

Again, we'll make use of the Fashion MNIST labeled dataset, which you may recall looked something like this:

<img src="https://github.com/zalandoresearch/fashion-mnist/blob/master/doc/img/fashion-mnist-sprite.png?raw=true" width="80%">

With labels: 

In [1]:
lookup = {0: "T-shirt/top",
          1: "Trouser",
          2: "Pullover",
          3: "Dress",
          4: "Coat",
          5: "Sandal",
          6: "Shirt",
          7: "Sneaker",
          8: "Bag",
          9: "Ankle boot"}

def output_label(label):
    input = (label.item() if type(label) == torch.Tensor else label)
    return lookup[input]

nb_classes = len(lookup)

In [2]:
import datetime, os
import numpy as np
import warnings
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torchmetrics.functional import accuracy
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, ModelSummary
from pytorch_lightning.loggers import CSVLogger

import torchvision
from torchvision import models, transforms
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix

import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision.datasets import FashionMNIST
from torch.utils.data import random_split, DataLoader
import time
import copy

# use a GPU or MPS (Mac) if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if torch.backends.mps.is_available():
    device = "mps"

print("pytorch version:", torch.__version__)
print("device:", device)

pytorch version: 2.3.0+cu121
device: cuda:0


In [3]:
train_csv = pd.read_csv("../Lecture20_lab3_pytorch/fashion-mnist_train.csv.gz")
test_csv = pd.read_csv("../Lecture20_lab3_pytorch/fashion-mnist_test.csv.gz")

In [4]:
%%writefile fashion_dataset.py

import numpy as np
from torch.utils.data import Dataset

class FashionDataset(Dataset):
    """User defined class to build a datset using Pytorch class Dataset."""
    
    def __init__(self, data, transform = None):
        """Method to initilaize variables.""" 
        self.fashion_MNIST = list(data.values)
        self.transform = transform
        
        label = []
        image = []
        
        for i in self.fashion_MNIST:
             # first column is of labels.
            label.append(i[0])
            image.append(i[1:])
        self.labels = np.asarray(label)
        # Dimension of Images = 28 * 28 * 1. where height = width = 28 and color_channels = 1.
        self.images = np.asarray(image).reshape(-1, 28, 28, 1).astype('float32')

    def __getitem__(self, index):
        label = self.labels[index]
        image = self.images[index]
        
        if self.transform is not None:
            image = self.transform(image)

        return image, label

    def __len__(self):
        return len(self.images)

Overwriting fashion_dataset.py


In [5]:
from fashion_dataset import FashionDataset

In [6]:
batch_size = 128

## Transform data into Tensor that has a range from 0 to 1
data_transforms = {
    'train': transforms.Compose([
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.RandomAffine(degrees=15, shear=0.1),
        #transforms.Resize(28),
        transforms.RandomHorizontalFlip(),
        transforms.Grayscale(3), 
        transforms.ToTensor(), 
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ]),
    'test': transforms.Compose([
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.RandomAffine(degrees=15, shear=0.1),
        #transforms.Resize(28),
        transforms.Grayscale(3),
        transforms.ToTensor(), 
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ])
}

train_set = FashionDataset(train_csv, transform=data_transforms['train'])
test_set = FashionDataset(test_csv, transform=data_transforms['test'])

train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=2)
test_loader = DataLoader(train_set, batch_size=batch_size, num_workers=2)

train_label = torch.tensor([train_set[i][1] for i in range(nb_classes)])

## Our (Previous) CNN

Let's inspect the model we created:

In [7]:
class mycnn_dropout(pl.LightningModule):

    def __init__(self):
        super().__init__()

        # define the layers here
        # Conv2d(in_channels, out_channels, kernel_size)
        # see https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
        self.layer1 = nn.Sequential(
            #nn.Conv2d(1, 32, kernel_size=3),
            nn.Conv2d(3, 32, kernel_size=3),
            
            # see https://github.com/sksq96/pytorch-summary/issues/55#issuecomment-471844028
            # to understand why pytorch and keras differ here
            nn.BatchNorm2d(32, affine=False),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(p=0.1)
        )
        
        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3),
            nn.ReLU(),
        )
        
        self.fc1=torch.nn.Linear(1152, 32)
        self.fc2=torch.nn.Linear(32, 10)
    
        self.loss = nn.NLLLoss()
        
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        
        # Flatten
        x = x.view(x.size(0), -1)
        
        # add dropout 
        x = nn.Dropout(p=0.2)(x)

        x=torch.relu(self.fc1(x))
        x=F.log_softmax(self.fc2(x), dim=-1)
        return x

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters())
        
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.75,
            patience=2,
            min_lr=1e-6,
            verbose=True
        )
        
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_accuracy"}
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = self.loss(logits, y)
        self.log('train_loss', loss)
        return loss
    
    def _evaluate(self, batch, batch_idx, stage=None):
        x, y = batch
        logits = self.forward(x)
        loss = self.loss(logits, y)
        preds = torch.argmax(logits, dim=-1)
        acc = accuracy(preds, y, task="multiclass", num_classes=nb_classes)

        if stage:
            self.log(f'{stage}_loss', loss, prog_bar=True)
            self.log(f'{stage}_accuracy', acc, prog_bar=True)

        return loss, acc
    
    def validation_step(self, batch, batch_idx):
        return self._evaluate(batch, batch_idx, 'val')[0]
    
    def train_dataloader(self):
        return train_loader
    
    def val_dataloader(self):
        return test_loader

In [8]:
run_time_string = datetime.datetime.utcnow().isoformat(timespec='minutes')
filename = f'datalab_nn_pytorch_dropout_{run_time_string}'

early_stop_callback = EarlyStopping(
   monitor='val_accuracy',
   min_delta=0.001,
   patience=3,
   verbose=True,
   mode='max'
)

checkpoint_callback = ModelCheckpoint(
    monitor='val_accuracy',
    mode='max',
    dirpath='nn_results',
    filename=filename,
    verbose=True,
    save_top_k=1
)

logger = [CSVLogger("nn_results1", name=filename), TensorBoardLogger("nn_results1", name=filename)]

pl.seed_everything(42)

if device.type == "cuda":
    myTrainer=pl.Trainer(callbacks=[early_stop_callback, checkpoint_callback], logger=logger,
                     gpus=-1, accelerator='cuda', auto_select_gpus=True, max_epochs=5)
else:
    myTrainer=pl.Trainer(callbacks=[early_stop_callback, checkpoint_callback], logger=logger,
                         max_epochs=5)
if True:
    cpt = !ls -td nn_results/*
    cpt = cpt[0]
    print('Reading from:', cpt)
    model_dropout = mycnn_dropout.load_from_checkpoint(cpt).to(device)
else:
    model_dropout = mycnn_dropout().to(device)

#summary(model_dropout.to(device), input_size=(1, 28, 28))
summary(model_dropout, input_size=(3, 28, 28))

Global seed set to 42
  rank_zero_deprecation(
  rank_zero_deprecation(
Auto select gpus: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Reading from: nn_results/datalab_nn_pytorch_resnet_2024-11-25T18:02


IsADirectoryError: [Errno 21] Is a directory: '/home/aparsons/projects/pedagogy/ucb-datalab/course_materials_fall2024/lectures/Lecture22_lab3_resnet/nn_results/datalab_nn_pytorch_resnet_2024-11-25T18:02'

In [None]:
#myTrainer.fit(model_dropout)

In [None]:
model_dropout.eval()  # Set the model to evaluation mode
# get a batch of data from the test set
data = next(iter(test_loader))
example_input = data[0].to(device)

# Get predictions
with torch.no_grad():  # Disable gradient calculation
    predictions = model_dropout(example_input)

# If you want the class with the highest probability
predicted_class = torch.argmax(predictions, dim=1)

print(predicted_class)

# ResNets
ResNets are a neural net architecture designed for image classification tasks, but they have a long (for machine learning) history dating back to the 1980s.
Generally, they are a solution to the problem that deep neural nets can "lose track" of image information in nets that aren't already well-trained. This makes it hard
for gradients to be propagated deep into the network to identify useful features. The solution was to introduce 
skip connections (residual connections, or identity maps) between stacks of convolutional layers to allow gradients to flow directly through the network, mitigating the vanishing gradient problem in very deep networks.
The residual blocks allow the model to learn identity mappings easily, making it more robust for deep architectures.

<img src="ResBlock.png" width="20%">

<img src="Original-ResNet-18-Architecture.png" width="50%">

And of course, there are many flavors of residual networks with varying depths and numbers of parameters:

<img src="resnet_param_counts.png" width="50%">

This general architecture, whereby earlier layers are fed forward with intermediate layers providing augmented context, is also very widely used, including
in U-Nets (used for image segmentation), Transformer networks (e.g. the "T" in GPT), and many others. But because we are interested in image classification,
we'll start with ResNets here.

Here's an example of some of the intermediate layers of a ResNet-18 that was trained on a very large and generic set of millions of labeled images, and is now being shown a hops berry (I think?).

<img src="resnet_feature_visual.png" width="50%">

What's interesting here is how generic the filter shapes are, especially in the first few layers.

In a previous lecture, we examined how important it was to have large training sets, and to augment our training sets with transformations in order to avoid overfitting.
Well, it turns out that training a ResNet on a huge number of images that have nothing to do with galaxies or fashion (or both?) can lead it to identify features
that are generally usefull for image classification, and these features can help jump-start galaxy classification.

Even better, someone else spend a huge amount of computing time getting you these weights, so you can leverage trainings that might have taken months on large GPU clusters to run.

Harnessing pre-trained networks for new tasks is called **learning transfer**, and it can be quite powerful.
The trick is, knowning where to create the splice between your pre-trained network and a custom network that focuses on your particular task.

In [None]:

#class myresnet(pl.LightningModule):
#    def __init__(self, num_classes=nb_classes):
#        super(myresnet, self).__init__()
#        
#        self.model = models.resnet18(pretrained=True)
#        # Freeze all layers initially (so their weights don't update)
#        #for param in self.model.parameters():
#        #    param.requires_grad = False
#
#        # Modify the final fully connected layer to match the number of classes
#        self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)
#
#        # Unfreeze specific layers (e.g., the final fully connected layer)
#        #for param in self.model.fc.parameters():
#        #    param.requires_grad = True
#
#        #self.loss = nn.CrossEntropyLoss()
#        self.loss = nn.NLLLoss()
#
#
#    def forward(self, x):
#        x = self.model(x)
#        #x = F.log_softmax(x, dim=-1)
#        return x
#
#    def configure_optimizers(self):
#        optimizer = torch.optim.Adam(self.parameters())
#        
#        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
#            optimizer,
#            mode='min',
#            factor=0.75,
#            patience=2,
#            min_lr=1e-6,
#            verbose=True
#        )
#        
#        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_accuracy"}
#    
#    def training_step(self, batch, batch_idx):
#        x, y = batch
#        logits = self.forward(x)
#        loss = self.loss(logits, y)
#        self.log('train_loss', loss)
#        return loss
#    
#    def _evaluate(self, batch, batch_idx, stage=None):
#        x, y = batch
#        logits = self.forward(x)
#        loss = self.loss(logits, y)
#        preds = torch.argmax(logits, dim=-1)
#        acc = accuracy(preds, y, task="multiclass", num_classes=nb_classes)
#
#        if stage:
#            self.log(f'{stage}_loss', loss, prog_bar=True)
#            self.log(f'{stage}_accuracy', acc, prog_bar=True)
#
#        return loss, acc
#    
#    def validation_step(self, batch, batch_idx):
#        return self._evaluate(batch, batch_idx, 'val')[0]
#    
#    def train_dataloader(self):
#        return train_loader
#    
#    def val_dataloader(self):
#        return test_loader

In [None]:
#run_time_string = datetime.datetime.utcnow().isoformat(timespec='minutes')
#filename = f'datalab_nn_pytorch_resnet_{run_time_string}'
#
#early_stop_callback = EarlyStopping(
#   monitor='val_accuracy',
#   min_delta=0.001,
#   patience=3,
#   verbose=True,
#   mode='max'
#)
#
#checkpoint_callback = ModelCheckpoint(
#    monitor='val_accuracy',
#    mode='max',
#    dirpath='nn_results',
#    filename=filename,
#    verbose=True,
#    save_top_k=1
#)
#
#logger = [CSVLogger("nn_results2", name=filename), TensorBoardLogger("nn_results", name=filename)]
#
#pl.seed_everything(42)
#
#if device == "gpu":
#    myTrainer=pl.Trainer(callbacks=[early_stop_callback, checkpoint_callback], logger=logger,
#                     gpus=-1, accelerator='dp', auto_select_gpus=True, max_epochs=5)
#else:
#    myTrainer=pl.Trainer(callbacks=[early_stop_callback, checkpoint_callback], logger=logger,
#                         max_epochs=5)
#    
#model_resnet = myresnet().to(device)
#summary(model_resnet, input_size=(3, 28, 28))

In [None]:
#myTrainer.fit(model_resnet)

In [12]:
def train_model(model, criterion, optimizer, scheduler, val_acc, 
                val_loss, train_acc, train_loss,epoch, 
                num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    list = {'train': {'acc': train_acc, 'loss': train_loss}, 
        'val':{'acc': val_acc, 'loss': val_loss}}
    next = epoch
    for epoch in range(next, next+num_epochs):
        print('Epoch {}/{}'.format(epoch, next + num_epochs - 1))
        print('-' * 10)
        
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                #scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
        
            running_loss = 0.0
            running_corrects = 0
        
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
        
                # zero the parameter gradients
                optimizer.zero_grad()
        
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
        
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        scheduler.step()  # XXX
        
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            list[phase]['loss'].append(epoch_loss)
            list[phase]['acc'].append(epoch_acc.item())
        
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            
            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                
        
        print()
        
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    
    # load best model weights
    model.load_state_dict(best_model_wts)
    
        
    return model, epoch + 1

In [9]:
model = models.resnet18(pretrained=True)
#for param in model.parameters():
#    param.requires_grad = False

# Parameters of newly constructed modules have requires_grad=True by default
model.fc = nn.Linear(model.fc.in_features, nb_classes)

model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1) # Decay LR by a factor of 0.1 every 7 epochs

# load in previous save state, if available
if False:
    checkpoint = torch.load('./FMNIST_ResNet18_noresize.tar')
    model.load_state_dict(checkpoint['model_state_dict'])  
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    train_loss = checkpoint['train_loss']
    train_acc = checkpoint['train_acc']
    val_loss = checkpoint['val_loss']
    val_acc = checkpoint['val_acc']
    epoch = checkpoint['epoch']



In [10]:
epoch = 0
val_acc = []
val_loss = []
train_acc = []
train_loss = []
dataloaders = {'train': train_loader, 'val': test_loader}
dataset_sizes = {'train': len(train_loader.dataset.images), 'val': len(test_loader.dataset.images)}

In [None]:
model, epoch = train_model(model, criterion, optimizer, scheduler, val_acc, val_loss,  train_acc, train_loss, epoch, num_epochs=5)

Epoch 0/4
----------


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


In [47]:
torch.save({'epoch' : epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss, 'val_loss': val_loss, 
            'train_acc': train_acc, 'val_acc': val_acc}, 
           './FMNIST_ResNet18_noresize.tar')