# IMPORTS

In [1]:
# !pip install scikit-learn tqdm
import torch
import torch.nn.functional as F
import torch.nn as nn
import torchvision
import torchvision.transforms as T
import numpy as np
from sklearn.linear_model import Lasso
from tqdm import tqdm
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import numpy as np
from sklearn.linear_model import Lasso
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  import pynvml  # type: ignore[import]


In [2]:
model_cifar10 = torch.hub.load("chenyaofo/pytorch-cifar-models", "cifar10_vgg16_bn", pretrained=True).to(device)
model_cifar100 = torch.hub.load("chenyaofo/pytorch-cifar-models", "cifar100_vgg16_bn", pretrained=True).to(device)

Using cache found in /home/foxunderground/.cache/torch/hub/chenyaofo_pytorch-cifar-models_master
Using cache found in /home/foxunderground/.cache/torch/hub/chenyaofo_pytorch-cifar-models_master


In [3]:
from torch.profiler import profile, ProfilerActivity

model_cifar10.eval()
inputs = torch.randn(256, 3, 32, 32).to(device)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA] if torch.cuda.is_available() else [ProfilerActivity.CPU]) as prof:
    with torch.no_grad():
        model_cifar10(inputs)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

model_cifar100.eval()
inputs = torch.randn(256, 3, 32, 32).to(device)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA] if torch.cuda.is_available() else [ProfilerActivity.CPU]) as prof:
    with torch.no_grad():
        model_cifar100(inputs)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::conv2d         0.02%      36.250us        67.29%     111.156ms       8.550ms       0.000us         0.00%      35.086ms       2.699ms            13  
                                      aten::convolution         0.06%      94.199us        67.27%     111.120ms       8.548ms       0.000us         0.00%      35.086ms       2.699ms            13  
         

In [4]:
transform_train_c10 = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                        (0.2023, 0.1994, 0.2010))
])

transform_test_c10 = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                         (0.2023, 0.1994, 0.2010))
])

trainset_c10 = torchvision.datasets.CIFAR10(root='./data/cifar10', train=False, download=True, transform=transform_train_c10)
testset_c10 = torchvision.datasets.CIFAR10(root='./data/cifar10', train=False, download=True, transform=transform_test_c10)
trainloader_c10 = DataLoader(trainset_c10, batch_size=64, shuffle=True, num_workers=0)
testloader_c10 = DataLoader(testset_c10, batch_size=64, shuffle=False, num_workers=0)

In [5]:
mean_c100 = [0.5070, 0.4865, 0.4409]
std_c100 = [0.2673, 0.2564, 0.2761]

train_transform_c100 = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean_c100, std_c100)
])

test_transform_c100 = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean_c100, std_c100)
])

trainset_c100 = torchvision.datasets.CIFAR100(root="./data/cifar100", train=True, download=True, transform=train_transform_c100)
testset_c100 = torchvision.datasets.CIFAR100(root="./data/cifar100", train=False, download=True, transform=test_transform_c100)

trainloader_c100 = DataLoader(trainset_c100, batch_size=64, shuffle=True, num_workers=0)
testloader_c100 = DataLoader(testset_c100, batch_size=64, shuffle=False, num_workers=0)

# Pruning Modules

In [6]:
def get_conv_modules(model):
    convs = []
    names = []
    for n, m in model.named_modules():
        if isinstance(m, nn.Conv2d):
            convs.append(m)
            names.append(n)
    return convs, names

def find_following_bn(model, conv_name):
    modlist = list(model.named_modules())
    for i, (n, m) in enumerate(modlist):
        if n == conv_name:
            for j in range(i+1, min(i+6, len(modlist))):
                if isinstance(modlist[j][1], nn.BatchNorm2d):
                    return modlist[j][0], modlist[j][1]
    return None, None

def get_parent_module(model, layer_name):
    parent_module = model
    name_parts = layer_name.split('.')
    for part in name_parts[:-1]:
        if part.isdigit():
            parent_module = parent_module[int(part)]
        else:
            parent_module = getattr(parent_module, part)
    return parent_module, name_parts[-1]

In [7]:
def prune_layer_regression(layer, inputs, keep_ratio=0.3, device='cpu'):
    layer_cpu = layer.cpu()
    inputs_cpu = inputs.cpu()
    N, C_in, H, W = inputs_cpu.shape
    kh, kw = layer_cpu.kernel_size
    stride = layer_cpu.stride
    padding = layer_cpu.padding
    C_out = layer_cpu.out_channels
    X_unf = F.unfold(inputs_cpu, kernel_size=(kh, kw), padding=padding, stride=stride)
    L = X_unf.shape[-1]
    X_unf = X_unf.permute(0, 2, 1).reshape(-1, C_in * kh * kw)
    W = layer_cpu.weight.data
    W_mat = W.view(C_out, -1).t()
    Y = X_unf @ W_mat
    scores = torch.zeros(C_in, dtype=torch.float64)
    for i in range(C_in):
        start = i * kh * kw
        end = (i + 1) * kh * kw
        Xi = X_unf[:, start:end]
        Wi = W[:, i, :, :].reshape(C_out, -1)
        Zi = Xi @ Wi.t()
        scores[i] = torch.abs((Zi.double() * Y.double()).sum())
    num_keep = max(1, int(C_in * keep_ratio))
    _, keep_idx = torch.topk(scores, num_keep, largest=True)
    keep_idx_sorted, _ = torch.sort(keep_idx)
    cols = []
    for i in keep_idx_sorted.tolist():
        cols.extend(range(i * kh * kw, (i + 1) * kh * kw))
    cols = torch.tensor(cols, dtype=torch.long)
    X_reduced = X_unf[:, cols]
    sol = torch.linalg.lstsq(X_reduced.float(), Y.float()).solution
    W_recon = sol.t().reshape(C_out, num_keep, kh, kw).contiguous()
    new_layer = nn.Conv2d(
        in_channels=num_keep,
        out_channels=C_out,
        kernel_size=layer_cpu.kernel_size,
        stride=layer_cpu.stride,
        padding=layer_cpu.padding,
        bias=(layer_cpu.bias is not None)
    )
    new_layer.weight.data = W_recon
    if layer_cpu.bias is not None:
        new_layer.bias.data = layer_cpu.bias.data.clone()
    return new_layer.to(device), keep_idx_sorted.to(device)


In [8]:
def prune_output_channels(layer, keep_idx, device):
    keep_idx_sorted, _ = torch.sort(keep_idx)
    new_layer = nn.Conv2d(
        in_channels=layer.in_channels,
        out_channels=len(keep_idx_sorted),
        kernel_size=layer.kernel_size,
        stride=layer.stride,
        padding=layer.padding,
        bias=(layer.bias is not None)
    )
    new_layer.weight.data = layer.weight.data[keep_idx_sorted].clone()
    if layer.bias is not None:
        new_layer.bias.data = layer.bias.data[keep_idx_sorted].clone()
    return new_layer.to(device)

In [9]:
def prune_batchnorm(bn_layer, keep_idx, device):
    keep_idx_sorted, _ = torch.sort(keep_idx)
    new_bn = nn.BatchNorm2d(len(keep_idx_sorted))
    new_bn.weight.data = bn_layer.weight.data[keep_idx_sorted].clone()
    new_bn.bias.data = bn_layer.bias.data[keep_idx_sorted].clone()
    new_bn.running_mean = bn_layer.running_mean[keep_idx_sorted].clone()
    new_bn.running_var = bn_layer.running_var[keep_idx_sorted].clone()
    new_bn.num_batches_tracked = bn_layer.num_batches_tracked.clone()
    return new_bn.to(device)

In [10]:
def sequential_prune(model, data_loader, device, overall_sparsity=0.7, calib_batches=1):
    if overall_sparsity <= 0:
        return model

    print(f"Starting sequential pruning with target sparsity: {overall_sparsity*100:.1f}%")
    model.eval()
    conv_layers = [(n, m) for n, m in model.named_modules() if isinstance(m, nn.Conv2d)]
    keep_ratio = 1.0 - overall_sparsity
    data_iter = iter(data_loader)
    calib_images, _ = next(data_iter)
    calib_images = calib_images.to(device)
    original_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    original_conv_params = sum(m.weight.numel() for _, m in conv_layers)
    for idx, (layer_name, layer) in enumerate(conv_layers):
        if idx == 0:
            print(f"Skip first layer (takes raw image input)\n")
            continue
        print("=========================================================================================")
        print(f"\nProcessing layer {idx+1}: {layer_name}")

        print(f"  Input channels: {layer.in_channels}, Output channels: {layer.out_channels}")
        prev_layer_name, prev_layer = conv_layers[idx - 1]
        layer_inputs = []
        def capture_input(module, inp, outp):
            layer_inputs.append(inp[0].detach().cpu())
        handle = layer.register_forward_hook(capture_input)
        with torch.no_grad():
            _ = model(calib_images)
        handle.remove()
        inputs = layer_inputs[0]

        print(f"  Pruning with keep_ratio={keep_ratio:.2f} (target sparsity={overall_sparsity:.2f})")
        new_layer, keep_idx = prune_layer_regression(layer, inputs, keep_ratio=keep_ratio, device=device)
        parent_module, attr_name = get_parent_module(model, layer_name)
        setattr(parent_module, attr_name, new_layer)

        print(f"  Current layer: {layer.in_channels} -> {new_layer.in_channels} input channels")
        prev_parent, prev_attr = get_parent_module(model, prev_layer_name)
        prev_layer = getattr(prev_parent, prev_attr)
        pruned_prev_layer = prune_output_channels(prev_layer, keep_idx, device)
        setattr(prev_parent, prev_attr, pruned_prev_layer)

        print(f"  Previous layer: {prev_layer.out_channels} -> {pruned_prev_layer.out_channels} output channels")
        bn_name, bn_layer = find_following_bn(model, prev_layer_name)
        if bn_name is not None and bn_layer is not None:
            print(f"  Pruning BatchNorm layer: {bn_name}")
            bn_parent, bn_attr = get_parent_module(model, bn_name)
            pruned_bn = prune_batchnorm(bn_layer, keep_idx, device)
            setattr(bn_parent, bn_attr, pruned_bn)
            print(f"  BatchNorm: {bn_layer.num_features} -> {pruned_bn.num_features} channels")
        print("=========================================================================================")
        print()

    # Calculate final statistics for debugging
    final_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    final_conv_params = sum(m.weight.numel() for _, m in model.named_modules() if isinstance(m, nn.Conv2d))
    nonzero_params = sum((p != 0).sum().item() for p in model.parameters() if p.requires_grad)
    print(f"{'='*70}")
    print(f"=== Pruning Results ===")
    print(f"Original total params: {original_params:,}")
    print(f"Final total params: {final_params:,}")
    print(f"Reduction: {(1 - final_params/original_params)*100:.2f}%")
    print(f"\nOriginal conv params: {original_conv_params:,}")
    print(f"Final conv params: {final_conv_params:,}")
    print(f"Conv reduction: {(1 - final_conv_params/original_conv_params)*100:.2f}%")
    print(f"\nTarget sparsity: {overall_sparsity*100:.1f}%")
    print(f"Achieved reduction: {(1 - final_params/original_params)*100:.2f}%")
    print(f"{'='*70}\n")
    print("Sequential pruning finished.")
    return model

# Testing - CIFAR10



In [15]:
pruned_c10 = sequential_prune(model_cifar10, trainloader_c10, device, overall_sparsity=0.7, calib_batches=6)
# torch.save(pruned_c10.state_dict(), 'vgg16bn_pruned_10.pth')

Starting sequential pruning with target sparsity: 70.0%
Skip first layer (takes raw image input)


Processing layer 2: features.3
  Input channels: 64, Output channels: 64
  Pruning with keep_ratio=0.30 (target sparsity=0.70)
  Current layer: 64 -> 19 input channels
  Previous layer: 64 -> 19 output channels
  Pruning BatchNorm layer: features.1
  BatchNorm: 64 -> 19 channels


Processing layer 3: features.7
  Input channels: 64, Output channels: 128
  Pruning with keep_ratio=0.30 (target sparsity=0.70)
  Current layer: 64 -> 19 input channels
  Previous layer: 64 -> 19 output channels
  Pruning BatchNorm layer: features.4
  BatchNorm: 64 -> 19 channels


Processing layer 4: features.10
  Input channels: 128, Output channels: 128
  Pruning with keep_ratio=0.30 (target sparsity=0.70)
  Current layer: 128 -> 38 input channels
  Previous layer: 128 -> 38 output channels
  Pruning BatchNorm layer: features.8
  BatchNorm: 128 -> 38 channels


Processing layer 5: features.14
  Input channels

In [None]:
optimizer = torch.optim.SGD(pruned.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()

num_epochs = 20

In [None]:
pruned_c10.train()

for epoch in range(num_epochs):
    pruned_c10.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for imgs, labels in tqdm(trainloader_c10):
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        out = pruned_c10(imgs)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * imgs.size(0)
        _, predicted = torch.max(out.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    train_loss = running_loss / total
    train_acc = 100.0 * correct / total

    pruned_c10.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for imgs, labels in testloader_c10:
            imgs, labels = imgs.to(device), labels.to(device)
            out = pruned_c10(imgs)
            loss = criterion(out, labels)
            val_loss += loss.item() * imgs.size(0)
            _, predicted = torch.max(out.data, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()
    val_loss /= val_total
    val_acc = 100.0 * val_correct / val_total

    print(f"Epoch [{epoch+1}/{num_epochs}] | "
          f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")


100%|██████████| 157/157 [00:06<00:00, 24.17it/s]


Epoch [1/20] | Train Loss: 2.0352 | Train Acc: 24.22% | Val Loss: 1.8417 | Val Acc: 28.18%


100%|██████████| 157/157 [00:06<00:00, 25.81it/s]


Epoch [2/20] | Train Loss: 1.6911 | Train Acc: 33.45% | Val Loss: 1.6119 | Val Acc: 38.69%


100%|██████████| 157/157 [00:07<00:00, 21.50it/s]


Epoch [3/20] | Train Loss: 1.5610 | Train Acc: 41.37% | Val Loss: 1.5131 | Val Acc: 42.22%


100%|██████████| 157/157 [00:06<00:00, 23.75it/s]


Epoch [4/20] | Train Loss: 1.4229 | Train Acc: 46.92% | Val Loss: 1.2982 | Val Acc: 52.03%


100%|██████████| 157/157 [00:06<00:00, 24.93it/s]


Epoch [5/20] | Train Loss: 1.3201 | Train Acc: 52.68% | Val Loss: 1.1988 | Val Acc: 56.58%


100%|██████████| 157/157 [00:06<00:00, 26.02it/s]


Epoch [6/20] | Train Loss: 1.2241 | Train Acc: 56.71% | Val Loss: 1.4083 | Val Acc: 49.69%


100%|██████████| 157/157 [00:06<00:00, 24.21it/s]


Epoch [7/20] | Train Loss: 1.1424 | Train Acc: 59.75% | Val Loss: 1.0781 | Val Acc: 61.32%


100%|██████████| 157/157 [00:06<00:00, 23.30it/s]


Epoch [8/20] | Train Loss: 1.0936 | Train Acc: 61.48% | Val Loss: 1.0255 | Val Acc: 64.20%


100%|██████████| 157/157 [00:06<00:00, 23.56it/s]


Epoch [9/20] | Train Loss: 1.0544 | Train Acc: 63.04% | Val Loss: 0.9010 | Val Acc: 69.00%


100%|██████████| 157/157 [00:06<00:00, 25.81it/s]


Epoch [10/20] | Train Loss: 1.0053 | Train Acc: 65.14% | Val Loss: 1.0244 | Val Acc: 64.76%


100%|██████████| 157/157 [00:06<00:00, 25.76it/s]


Epoch [11/20] | Train Loss: 0.9689 | Train Acc: 66.80% | Val Loss: 0.9897 | Val Acc: 65.97%


100%|██████████| 157/157 [00:06<00:00, 23.40it/s]


Epoch [12/20] | Train Loss: 0.9570 | Train Acc: 67.53% | Val Loss: 0.9175 | Val Acc: 67.34%


100%|██████████| 157/157 [00:06<00:00, 23.68it/s]


Epoch [13/20] | Train Loss: 0.9043 | Train Acc: 69.59% | Val Loss: 0.9594 | Val Acc: 67.09%


100%|██████████| 157/157 [00:06<00:00, 25.36it/s]


Epoch [14/20] | Train Loss: 0.8987 | Train Acc: 69.57% | Val Loss: 0.8752 | Val Acc: 69.83%


100%|██████████| 157/157 [00:06<00:00, 25.43it/s]


Epoch [15/20] | Train Loss: 0.8658 | Train Acc: 70.89% | Val Loss: 0.8662 | Val Acc: 71.40%


100%|██████████| 157/157 [00:06<00:00, 23.09it/s]


Epoch [16/20] | Train Loss: 0.8593 | Train Acc: 70.94% | Val Loss: 0.8908 | Val Acc: 69.81%


100%|██████████| 157/157 [00:06<00:00, 23.30it/s]


Epoch [17/20] | Train Loss: 0.8359 | Train Acc: 72.54% | Val Loss: 1.0518 | Val Acc: 67.26%


100%|██████████| 157/157 [00:06<00:00, 23.63it/s]


Epoch [18/20] | Train Loss: 0.8383 | Train Acc: 72.38% | Val Loss: 0.7594 | Val Acc: 74.72%


100%|██████████| 157/157 [00:06<00:00, 25.02it/s]


Epoch [19/20] | Train Loss: 0.8079 | Train Acc: 72.99% | Val Loss: 0.7975 | Val Acc: 72.81%


100%|██████████| 157/157 [00:06<00:00, 24.70it/s]


Epoch [20/20] | Train Loss: 0.7709 | Train Acc: 74.32% | Val Loss: 0.7344 | Val Acc: 75.37%


In [None]:
from torch.profiler import profile, ProfilerActivity, record_function
imgs, labels = next(iter(testloader_c10))
imgs = imgs.to(device)
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    with record_function("inference"):
        _ = pruned_c10(imgs)
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                              inference         0.00%       0.000us         0.00%       0.000us       0.000us       2.611ms       113.81%       2.611ms       2.611ms             1  
                                              inference        16.99%       1.067ms        99.59%       6.256ms       6.256ms       0.000us         0.00%       2.294ms       2.294ms             1  
         

# Testing - CIFAR100

In [16]:
pruned_c100 = sequential_prune(model_cifar100, trainloader_c100, device, overall_sparsity=0.7, calib_batches=6)
torch.save(pruned_c100.state_dict(), 'vgg16bn_pruned_100.pth')

Starting sequential pruning with target sparsity: 70.0%
Skip first layer (takes raw image input)


Processing layer 2: features.3
  Input channels: 64, Output channels: 64
  Pruning with keep_ratio=0.30 (target sparsity=0.70)
  Current layer: 64 -> 19 input channels
  Previous layer: 64 -> 19 output channels
  Pruning BatchNorm layer: features.1
  BatchNorm: 64 -> 19 channels


Processing layer 3: features.7
  Input channels: 64, Output channels: 128
  Pruning with keep_ratio=0.30 (target sparsity=0.70)
  Current layer: 64 -> 19 input channels
  Previous layer: 64 -> 19 output channels
  Pruning BatchNorm layer: features.4
  BatchNorm: 64 -> 19 channels


Processing layer 4: features.10
  Input channels: 128, Output channels: 128
  Pruning with keep_ratio=0.30 (target sparsity=0.70)
  Current layer: 128 -> 38 input channels
  Previous layer: 128 -> 38 output channels
  Pruning BatchNorm layer: features.8
  BatchNorm: 128 -> 38 channels


Processing layer 5: features.14
  Input channels

In [None]:
optimizer = torch.optim.SGD(pruned_c100.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()

num_epochs = 20

In [None]:
pruned_c100.train()

for epoch in range(num_epochs):
    pruned_c100.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for imgs, labels in tqdm(trainloader_c100):
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        out = pruned_c100(imgs)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * imgs.size(0)
        _, predicted = torch.max(out.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    train_loss = running_loss / total
    train_acc = 100.0 * correct / total

    pruned_c100.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for imgs, labels in testloader_c100:
            imgs, labels = imgs.to(device), labels.to(device)
            out = pruned_c100(imgs)
            loss = criterion(out, labels)
            val_loss += loss.item() * imgs.size(0)
            _, predicted = torch.max(out.data, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()
    val_loss /= val_total
    val_acc = 100.0 * val_correct / val_total

    print(f"Epoch [{epoch+1}/{num_epochs}] | "
          f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")


100%|██████████| 782/782 [00:31<00:00, 24.78it/s]


Epoch [1/20] | Train Loss: 4.2003 | Train Acc: 4.89% | Val Loss: 3.8325 | Val Acc: 8.81%


100%|██████████| 782/782 [00:32<00:00, 24.38it/s]


Epoch [2/20] | Train Loss: 3.6694 | Train Acc: 11.89% | Val Loss: 3.3167 | Val Acc: 17.91%


100%|██████████| 782/782 [00:32<00:00, 24.25it/s]


Epoch [3/20] | Train Loss: 3.3202 | Train Acc: 17.36% | Val Loss: 3.0671 | Val Acc: 22.87%


100%|██████████| 782/782 [00:32<00:00, 24.32it/s]


Epoch [4/20] | Train Loss: 3.0837 | Train Acc: 21.87% | Val Loss: 2.9157 | Val Acc: 25.41%


100%|██████████| 782/782 [00:31<00:00, 24.56it/s]


Epoch [5/20] | Train Loss: 2.9086 | Train Acc: 25.06% | Val Loss: 2.6867 | Val Acc: 28.71%


100%|██████████| 782/782 [00:31<00:00, 24.50it/s]


Epoch [6/20] | Train Loss: 2.7867 | Train Acc: 27.64% | Val Loss: 2.7608 | Val Acc: 29.28%


100%|██████████| 782/782 [00:31<00:00, 24.80it/s]


Epoch [7/20] | Train Loss: 2.6798 | Train Acc: 30.07% | Val Loss: 2.5965 | Val Acc: 31.79%


100%|██████████| 782/782 [00:31<00:00, 24.93it/s]


Epoch [8/20] | Train Loss: 2.5894 | Train Acc: 32.41% | Val Loss: 2.3547 | Val Acc: 36.54%


100%|██████████| 782/782 [00:31<00:00, 24.85it/s]


Epoch [9/20] | Train Loss: 2.5192 | Train Acc: 33.72% | Val Loss: 2.4645 | Val Acc: 35.23%


100%|██████████| 782/782 [00:32<00:00, 24.43it/s]


Epoch [10/20] | Train Loss: 2.4545 | Train Acc: 35.30% | Val Loss: 2.4040 | Val Acc: 36.39%


100%|██████████| 782/782 [00:32<00:00, 23.98it/s]


Epoch [11/20] | Train Loss: 2.4048 | Train Acc: 36.38% | Val Loss: 2.4090 | Val Acc: 36.81%


100%|██████████| 782/782 [00:32<00:00, 24.39it/s]


Epoch [12/20] | Train Loss: 2.3582 | Train Acc: 37.69% | Val Loss: 2.3166 | Val Acc: 38.54%


100%|██████████| 782/782 [00:31<00:00, 24.44it/s]


Epoch [13/20] | Train Loss: 2.3112 | Train Acc: 38.97% | Val Loss: 2.3250 | Val Acc: 38.81%


100%|██████████| 782/782 [00:31<00:00, 24.77it/s]


Epoch [14/20] | Train Loss: 2.2669 | Train Acc: 39.69% | Val Loss: 2.3088 | Val Acc: 40.04%


100%|██████████| 782/782 [00:32<00:00, 24.32it/s]


Epoch [15/20] | Train Loss: 2.2232 | Train Acc: 40.83% | Val Loss: 2.2803 | Val Acc: 39.79%


100%|██████████| 782/782 [00:31<00:00, 24.84it/s]


Epoch [16/20] | Train Loss: 2.1975 | Train Acc: 41.57% | Val Loss: 2.0889 | Val Acc: 43.96%


100%|██████████| 782/782 [00:32<00:00, 23.76it/s]


Epoch [17/20] | Train Loss: 2.1613 | Train Acc: 42.42% | Val Loss: 2.0702 | Val Acc: 44.06%


100%|██████████| 782/782 [00:31<00:00, 24.84it/s]


Epoch [18/20] | Train Loss: 2.1314 | Train Acc: 43.21% | Val Loss: 2.2170 | Val Acc: 41.97%


100%|██████████| 782/782 [00:32<00:00, 24.34it/s]


Epoch [19/20] | Train Loss: 2.1023 | Train Acc: 44.03% | Val Loss: 2.1594 | Val Acc: 43.25%


100%|██████████| 782/782 [00:33<00:00, 23.68it/s]


Epoch [20/20] | Train Loss: 2.0783 | Train Acc: 44.58% | Val Loss: 2.1320 | Val Acc: 43.82%


In [None]:
from torch.profiler import profile, ProfilerActivity, record_function
imgs, labels = next(iter(testloader_c100))
imgs = imgs.to(device)
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    with record_function("inference"):
        _ = pruned_c100(imgs)
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                              inference         0.00%       0.000us         0.00%       0.000us       0.000us       1.047ms       104.14%       1.047ms       1.047ms             1  
                                              inference        16.79%       1.000ms        98.55%       5.872ms       5.872ms       0.000us         0.00%       1.006ms       1.006ms             1  
         

# Final Results

In [17]:
import torch
import time
import io
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetTotalEnergyConsumption
import thop.profile
from torch.profiler import profile, record_function, ProfilerActivity

In [18]:
def evaluate_model_efficiency(model, dataloader, device, runs=100):
    model.to(device)
    model.eval()

    data_iter = iter(dataloader)
    images, labels = next(data_iter)
    images, labels = images.to(device), labels.to(device)

    nvmlInit()
    gpu = nvmlDeviceGetHandleByIndex(0)
    start_energy = nvmlDeviceGetTotalEnergyConsumption(gpu)

    total_time, total_mem, correct1, correct5 = 0, 0, 0, 0
    for _ in range(runs):
        torch.cuda.reset_peak_memory_stats()
        start_t = time.time()
        with torch.no_grad():
            outputs = model(images)
        torch.cuda.synchronize()
        end_t = time.time()
        total_time += (end_t - start_t) * 1000
        total_mem += torch.cuda.memory_allocated(device) / (1024 ** 2)
        top5_pred = outputs.topk(5, dim=1).indices
        top1_pred = outputs.argmax(dim=1)
        correct1 += (top1_pred == labels).sum().item()
        correct5 += sum(labels[i] in top5_pred[i] for i in range(len(labels)))
    total = len(labels)
    top1_acc = 100 * correct1 / (total * runs)
    top5_acc = 100 * correct5 / (total * runs)

    end_energy = nvmlDeviceGetTotalEnergyConsumption(gpu)
    avg_energy = (end_energy - start_energy) / runs
    avg_latency = total_time / runs
    avg_mem = total_mem / runs
    peak_mem = torch.cuda.max_memory_allocated(device) / (1024 ** 2)

    buf = io.BytesIO()
    torch.save(model.state_dict(), buf)
    model_size = len(buf.getvalue()) / (1024 ** 2)

    try:
        macs, _ = thop.profile(model, inputs=(images,), verbose=False)
    except Exception:
        macs = float("nan")
    print("\n========== Profiling Summary ==========")
    print(f"Runs:            {runs}")
    print(f"Model Size:      {model_size:.2f} MB")
    print(f"Average Latency: {avg_latency:.2f} ms")
    print(f"Peak Memory:     {peak_mem:.2f} MB")
    print(f"Mean Memory:     {avg_mem:.2f} MB")
    print(f"Energy / Run:    {avg_energy:.2f} mJ")
    print(f"Top-1 Accuracy:  {top1_acc:.2f}%")
    print(f"Top-5 Accuracy:  {top5_acc:.2f}%")
    print(f"MACs / Batch:    {macs / 1e6:.2f} M")
    print("=======================================\n")

    return {
        "model_size_mb": model_size,
        "latency_ms": avg_latency,
        "avg_mem_mb": avg_mem,
        "peak_mem_mb": peak_mem,
        "energy_mJ": avg_energy,
        "top1_acc": top1_acc,
        "top5_acc": top5_acc,
        "macs": macs
    }


In [19]:
pruned_c10.load_state_dict(torch.load("vgg16bn_pruned_10.pth"))
pruned_c100.load_state_dict(torch.load("vgg16bn_pruned_100.pth"))

<All keys matched successfully>

In [20]:
device = torch.device("cuda")
evaluation_c10 = evaluate_model_efficiency(pruned_c10, testloader_c10, device)

pruned_c10.eval()
inputs = torch.randn(256, 3, 32, 32).to(device)
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA] if torch.cuda.is_available() else [ProfilerActivity.CPU]) as prof:
    with torch.no_grad():
        pruned_c10(inputs)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))


time.sleep(3)
evaluation_c100 = evaluate_model_efficiency(pruned_c100, testloader_c100, device)

pruned_c100.eval()
inputs = torch.randn(256, 3, 32, 32).to(device)
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA] if torch.cuda.is_available() else [ProfilerActivity.CPU]) as prof:
    with torch.no_grad():
        pruned_c100(inputs)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))


Runs:            100
Model Size:      8.97 MB
Average Latency: 2.19 ms
Peak Memory:     76.75 MB
Mean Memory:     39.78 MB
Energy / Run:    735.93 mJ
Top-1 Accuracy:  48.44%
Top-5 Accuracy:  82.81%
MACs / Batch:    1980.52 M

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::conv2d         0.25%      19.350us        81.02%       6.161ms     473.953us       0.000us         0.00%       3.236ms     248.90