In [1]:
import os
import sys
# Make the directory two levels above the working directory importable.
# NOTE(review): the path is relative, so it resolves against the kernel's
# current working directory - presumably this is what exposes the `trailmet`
# package imported later in the notebook; confirm the notebook's location.
sys.path.append("../../")

In [2]:
import torch
from torchvision import transforms as transforms
from torchvision import datasets as datasets
# Select CUDA device index 7 as the current device for this process.
# NOTE(review): the index is hard-coded (the same 7 is repeated as 'cuda:7' and
# 'GPU_ID' further down), so this cell needs a machine exposing at least 8 GPUs.
torch.cuda.set_device(7)

In [3]:
def build_imagenet_data(data_path: str = '', input_size: int = 224, batch_size: int = 64, workers: int = 4,
                        dist_sample: bool = False):
    """Build training and validation ``DataLoader`` objects for an ImageFolder-style dataset.

    ``data_path`` must contain ``train`` and ``val`` sub-directories, each
    readable by ``torchvision.datasets.ImageFolder``.

    Args:
        data_path: Root directory holding the ``train`` and ``val`` folders.
        input_size: Side length of the square crop fed to the model.
        batch_size: Batch size used by both loaders.
        workers: Number of worker processes per loader.
        dist_sample: When True, wrap both datasets in a ``DistributedSampler``
            (this needs an initialised ``torch.distributed`` process group).

    Returns:
        Tuple ``(train_loader, val_loader)``.
    """
    print('==> Using Imagenet Dataset')

    traindir = os.path.join(data_path, 'train')
    valdir = os.path.join(data_path, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # Keep the 256 -> 224 (0.875) resize/crop ratio when a larger crop is
    # requested. With a fixed Resize(256), any input_size above 256 would make
    # CenterCrop zero-pad the image. The floor of 256 keeps the result
    # identical to the previous behaviour for input_size <= 224.
    val_resize = max(256, int(round(input_size / 0.875)))

    # Training pipeline: random crop + horizontal flip augmentation.
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    # Validation pipeline: deterministic resize + center crop.
    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(val_resize),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            normalize,
        ]))

    if dist_sample:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        # Evaluation order should be deterministic; DistributedSampler shuffles
        # by default, so disable it explicitly for the validation split.
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False)
    else:
        train_sampler = None
        val_sampler = None

    # DataLoader forbids shuffle=True together with a sampler, so only shuffle
    # the training set when no distributed sampler is in use.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=(train_sampler is None),
        num_workers=workers, pin_memory=True, sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False,
        num_workers=workers, pin_memory=True, sampler=val_sampler)
    return train_loader, val_loader

In [4]:
# load data
# build_imagenet_data returns (train_loader, val_loader); pair them with their
# split names directly instead of pre-filling a placeholder dict.
dataloaders = dict(zip(
    ('train', 'val'),
    build_imagenet_data(data_path='/workspace/code/Akash/ImageNet'),
))

==> Using Imagenet Dataset


In [5]:
# import libraries
from trailmet.models import resnet
from trailmet.algorithms.quantize.brecq import BRECQ
from trailmet.algorithms.quantize.q_utils import validate_model

In [6]:
# load model
# Build a ResNet-50 through trailmet's resnet helper.
# NOTE(review): the positional 1000 and 224 are presumably the number of
# classes and the input resolution, and pretrained=True presumably loads
# pretrained weights - confirm against get_resnet_model's signature.
cnn = resnet.get_resnet_model('resnet50', 1000, 224, pretrained=True)

In [7]:
# test full precision model
# Baseline run of the unquantized model over the validation loader; the cell's
# logged output reports Acc@1 / Acc@5. The device index is hard-coded to 7,
# matching torch.cuda.set_device(7) earlier in the notebook.
validate_model(dataloaders['val'], cnn, device=torch.device('cuda:7'))

Test: [  0/754]	Time  2.949 ( 2.949)	Acc@1  95.31 ( 95.31)	Acc@5  96.88 ( 96.88)
Test: [100/754]	Time  0.057 ( 0.141)	Acc@1  92.19 ( 86.80)	Acc@5 100.00 ( 97.26)
Test: [200/754]	Time  0.066 ( 0.125)	Acc@1  96.88 ( 85.85)	Acc@5  98.44 ( 97.21)
Test: [300/754]	Time  0.062 ( 0.124)	Acc@1  84.38 ( 85.55)	Acc@5  95.31 ( 97.23)
Test: [400/754]	Time  0.060 ( 0.122)	Acc@1  64.06 ( 83.44)	Acc@5  82.81 ( 96.35)
Test: [500/754]	Time  0.062 ( 0.121)	Acc@1  31.25 ( 82.07)	Acc@5  85.94 ( 95.80)
Test: [600/754]	Time  0.066 ( 0.120)	Acc@1  78.12 ( 81.32)	Acc@5  96.88 ( 95.47)
Test: [700/754]	Time  0.061 ( 0.120)	Acc@1  71.88 ( 80.47)	Acc@5  93.75 ( 95.16)
 * Acc@1 80.538 Acc@5 95.224


tensor(80.5382, device='cuda:7')

In [7]:
# quantize model
# NOTE: increase iterations (ITERS_W / ITERS_A) from 2000 to 20000 to achieve
# full optimization potential.
# NOTE(review): W_BITS / A_BITS are presumably the weight / activation
# bit-widths - confirm against the BRECQ implementation.
kwargs = dict(
    W_BITS=4,
    A_BITS=8,
    CHANNEL_WISE=True,
    ACT_QUANT=True,
    NUM_SAMPLES=1024,
    ITERS_W=2000,
    ITERS_A=2000,
    CALIB_BS=64,
    GPU_ID=7,
)
qnn = BRECQ(cnn, dataloaders, **kwargs)
qnn.compress_model()

==> Setting the first and the last layer to 8-bit
==> Initializing weight quantization parameters
Test: [  0/754]	Time  1.203 ( 1.203)	Acc@1  85.94 ( 85.94)	Acc@5  96.88 ( 96.88)
Test: [100/754]	Time  0.234 ( 0.129)	Acc@1  90.62 ( 76.07)	Acc@5  96.88 ( 92.30)
Test: [200/754]	Time  0.068 ( 0.119)	Acc@1  92.19 ( 75.48)	Acc@5  95.31 ( 92.52)
Test: [300/754]	Time  0.078 ( 0.120)	Acc@1  75.00 ( 75.60)	Acc@5  95.31 ( 92.81)
Test: [400/754]	Time  0.066 ( 0.117)	Acc@1  56.25 ( 73.09)	Acc@5  79.69 ( 91.09)
Test: [500/754]	Time  0.059 ( 0.116)	Acc@1  37.50 ( 71.63)	Acc@5  81.25 ( 90.03)
Test: [600/754]	Time  0.066 ( 0.115)	Acc@1  65.62 ( 70.68)	Acc@5  84.38 ( 89.40)
Test: [700/754]	Time  0.065 ( 0.115)	Acc@1  70.31 ( 69.69)	Acc@5  93.75 ( 88.76)
 * Acc@1 69.719 Acc@5 88.735
Quantized accuracy before brecq: 69.7188949584961
==> Starting weight calibration
Ignore reconstruction of layer conv1
Reconstruction for block 0
Init alpha to be FP32
Init alpha to be FP32
Init alpha to be FP32
Init alpha to

In [17]:
# Release unused cached blocks held by PyTorch's CUDA caching allocator so the
# memory figures reported by external tools (e.g. nvidia-smi) are not inflated.
torch.cuda.empty_cache()

In [18]:
# Shell out to nvidia-smi to inspect per-GPU memory usage and utilization.
!nvidia-smi

Fri Jul 15 19:23:23 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   51C    P0   212W / 300W |  31301MiB / 32510MiB |     99%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   57C    P0   199W / 300W |    670MiB / 32510MiB |     93%      Default |
|       