In [1]:
import os
import random

import torch
import torch.nn as nn
import torchvision

import time
import copy
import numpy as np
from torchvision import transforms
from tqdm import tqdm

def set_random_seeds(random_seed=0):

    torch.manual_seed(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)

def memory_check():
    print(f"  Allocated: {round(torch.cuda.memory_allocated()/1024**3,2)} GB")
    print(f"  Cached:    {round(torch.cuda.memory_reserved()/1024**3,2)} GB\n")

print(f"torch = {torch.__version__}")
print(f"torchvision = {torchvision.__version__}")
set_random_seeds(42)

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  # Arrange GPU devices starting from 0
os.environ["CUDA_VISIBLE_DEVICES"]= "1"  # Set the GPU 1 to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Device:', device)
print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

torch = 1.12.1
torchvision = 0.13.1
Device: cuda
Current cuda device: 0
Count of using GPUs: 1


In [2]:
def time_test(model, device, input_size = (1,3,256,256),num_tests=100):
    model.to(device)
    model.eval()

    x = torch.rand(size=input_size).to(device)

    with torch.no_grad():
        for _ in range(10):
            _ = model(x)
    torch.cuda.synchronize()

    with torch.no_grad():
        start_time = time.time()

        for _ in range(num_tests):
            _ = model(x)
            torch.cuda.synchronize()
        total_time = time.time() - start_time

    aver_time = total_time / num_tests
    return total_time, aver_time

In [3]:
def model_eq_check(model1, model2, device, rtol=1e-04, atol=1e-08, num_tests=100, input_size=(1,3,256,256)):

    model1.to(device)
    model2.to(device)

    for _ in range(num_tests):
        x = torch.rand(size=input_size).to(device)
        y1 = model1(x).detach().cpu().numpy()
        y2 = model2(x).detach().cpu().numpy()
        # 배열이 허용 오차범위 abs(a - b) <= (atol + rtol * absolute(b)) 이내면 True
        if np.allclose(a=y1, b=y2, rtol=rtol, atol=atol, equal_nan=False) == False:
            print("Model equivalence test fail")
            print(f"output size : {y1.size}")
            print(y1)
            print(y2)
            return False
    print("Two models equal")
    return True

In [4]:
from models import mobilenet_v2, MobileNet_V2_Weights
mobilenet = mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1,activation_layer=nn.ReLU)
mobilenet.classifier.append(nn.Dropout(0.2))
mobilenet.classifier.append(nn.Linear(1000, 10))
print(mobilenet.classifier)
mobilenet.load_state_dict(torch.load("./models/mobilenetv2_cifar10.pt"))
from torchsummary import summary
summary(mobilenet,(3,32,32), device='cpu')

Sequential(
  (0): Dropout(p=0.2, inplace=False)
  (1): Linear(in_features=1280, out_features=1000, bias=True)
  (2): Dropout(p=0.2, inplace=False)
  (3): Linear(in_features=1000, out_features=10, bias=True)
)
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 16, 16]             864
       BatchNorm2d-2           [-1, 32, 16, 16]              64
              ReLU-3           [-1, 32, 16, 16]               0
            Conv2d-4           [-1, 32, 16, 16]             288
       BatchNorm2d-5           [-1, 32, 16, 16]              64
              ReLU-6           [-1, 32, 16, 16]               0
            Conv2d-7           [-1, 16, 16, 16]             512
       BatchNorm2d-8           [-1, 16, 16, 16]              32
  InvertedResidual-9           [-1, 16, 16, 16]               0
           Conv2d-10           [-1, 96, 16, 16]           1,536
      BatchNorm2d-11 

In [5]:
for name, param in mobilenet.named_parameters():
    if "features" in name:
        param.requires_grad = False
    else:
        param.requires_grad = True
    print(name, param.requires_grad)

features.0.0.weight False
features.0.1.weight False
features.0.1.bias False
features.1.conv.0.0.weight False
features.1.conv.0.1.weight False
features.1.conv.0.1.bias False
features.1.conv.1.weight False
features.1.conv.2.weight False
features.1.conv.2.bias False
features.2.conv.0.0.weight False
features.2.conv.0.1.weight False
features.2.conv.0.1.bias False
features.2.conv.1.0.weight False
features.2.conv.1.1.weight False
features.2.conv.1.1.bias False
features.2.conv.2.weight False
features.2.conv.3.weight False
features.2.conv.3.bias False
features.3.conv.0.0.weight False
features.3.conv.0.1.weight False
features.3.conv.0.1.bias False
features.3.conv.1.0.weight False
features.3.conv.1.1.weight False
features.3.conv.1.1.bias False
features.3.conv.2.weight False
features.3.conv.3.weight False
features.3.conv.3.bias False
features.4.conv.0.0.weight False
features.4.conv.0.1.weight False
features.4.conv.0.1.bias False
features.4.conv.1.0.weight False
features.4.conv.1.1.weight False
fea

In [6]:
import warnings
def bit2float(b, num_e_bits=8, num_m_bits=23, bias=127.):
  """Turn input tensor into float.
      Args:
          b : binary tensor. The last dimension of this tensor should be the
          the one the binary is at.
          num_e_bits : Number of exponent bits. Default: 8.
          num_m_bits : Number of mantissa bits. Default: 23.
          bias : Exponent bias/ zero offset. Default: 127.
      Returns:
          Tensor: Float tensor. Reduces last dimension.
  """
  expected_last_dim = num_m_bits + num_e_bits + 1
  assert b.shape[-1] == expected_last_dim, "Binary tensors last dimension " \
                                           "should be {}, not {}.".format(
    expected_last_dim, b.shape[-1])

  # check if we got the right type
  dtype = torch.float32
  if expected_last_dim > 32: dtype = torch.float64
  if expected_last_dim > 64:
    warnings.warn("pytorch can not process floats larger than 64 bits, keep"
                  " this in mind. Your result will be not exact.")

  s = torch.index_select(b, -1, torch.arange(0, 1))
  e = torch.index_select(b, -1, torch.arange(1, 1 + num_e_bits))
  m = torch.index_select(b, -1, torch.arange(1 + num_e_bits,
                                             1 + num_e_bits + num_m_bits))
  # SIGN BIT
  out = ((-1) ** s).squeeze(-1).type(dtype)
  # EXPONENT BIT
  exponents = -torch.arange(-(num_e_bits - 1.), 1.)
  exponents = exponents.repeat(b.shape[:-1] + (1,))
  e_decimal = torch.sum(e * 2 ** exponents, dim=-1) - bias
  out *= 2 ** e_decimal
  # MANTISSA
  matissa = (torch.Tensor([2.]) ** (
    -torch.arange(1., num_m_bits + 1.))).repeat(
    m.shape[:-1] + (1,))
  out *= 1. + torch.sum(m * matissa, dim=-1)
  return out

def remainder2bit(remainder, num_bits=127):
  """Turn a tensor with remainders (floats < 1) to mantissa bits.
      Args:
          remainder : torch.Tensor, tensor with remainders
          num_bits : Number of bits to specify the precision. Default: 127.
      Returns:
          Tensor: Binary tensor. Adds last dimension to original tensor for
          bits.
  """
  dtype = remainder.type()
  exponent_bits = torch.arange(num_bits).type(dtype)
  exponent_bits = exponent_bits.repeat(remainder.shape + (1,))
  out = (remainder.unsqueeze(-1) * 2 ** exponent_bits) % 1
  return torch.floor(2 * out)


def integer2bit(integer, num_bits=8):
  """Turn integer tensor to binary representation.
      Args:
          integer : torch.Tensor, tensor with integers
          num_bits : Number of bits to specify the precision. Default: 8.
      Returns:
          Tensor: Binary tensor. Adds last dimension to original tensor for
          bits.
  """
  dtype = integer.type()
  exponent_bits = -torch.arange(-(num_bits - 1), 1).type(dtype)
  exponent_bits = exponent_bits.repeat(integer.shape + (1,))
  out = integer.unsqueeze(-1) / 2 ** exponent_bits
  return (out - (out % 1)) % 2

def float2bit(f, num_e_bits=8, num_m_bits=23, bias=127., dtype=torch.float32):
  """Turn input tensor into binary.
      Args:
          f : float tensor.
          num_e_bits : Number of exponent bits. Default: 8.
          num_m_bits : Number of mantissa bits. Default: 23.
          bias : Exponent bias/ zero offset. Default: 127.
          dtype : This is the actual type of the tensor that is going to be
          returned. Default: torch.float32.
      Returns:
          Tensor: Binary tensor. Adds last dimension to original tensor for
          bits.
  """
  ## SIGN BIT
  s = torch.sign(f)
  f = f * s
  # turn sign into sign-bit
  s = (s * (-1) + 1.) * 0.5
  s = s.unsqueeze(-1)

  ## EXPONENT BIT
  e_scientific = torch.floor(torch.log2(f))
  e_decimal = e_scientific + bias
  e = integer2bit(e_decimal, num_bits=num_e_bits)

  ## MANTISSA
  m1 = integer2bit(f - f % 1, num_bits=num_e_bits)
  m2 = remainder2bit(f % 1, num_bits=bias)
  m = torch.cat([m1, m2], dim=-1)
  
  dtype = f.type()
  idx = torch.arange(num_m_bits).unsqueeze(0).type(dtype) \
        + (8. - e_scientific).unsqueeze(-1)
  idx = idx.long()
  m = torch.gather(m, dim=-1, index=idx)

  return torch.cat([s, e, m], dim=-1).type(dtype)

In [7]:
int_model = copy.deepcopy(mobilenet)
for name, param in int_model.named_parameters():
    print(type(param.data),param.dtype,param.shape)
    new_param = param.half()
    with torch.no_grad():
        round_param = torch.round(param.data,decimals=10)
    for i in range(len(param)):    
        # print(param[p])
        p = param.data[i][0][0]
        bit = float2bit(p)
        print(bit.shape)
        # print(bit)
        f = bit2float(bit)
        if np.allclose(a=p, b=f,rtol=1e-4,atol=1e-4) == False:
                print("Model equivalence test fail")
                print(p,end='\n\n')
                print(f)
        
    break

<class 'torch.Tensor'> torch.float32 torch.Size([32, 3, 3, 3])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])
torch.Size([3, 32])


In [8]:
M = 0
m = 0
for name,param in int_model.named_parameters():
    M = max(M,param.max())
    m = min(m,param.min())
print(M,m)

tensor(2.5196) tensor(-2.2460)


MobileNet은 fusion 시 속도가 느려짐
+
output 결과가 다름

In [9]:
a = torch.ones((1,3,32,32))
print(mobilenet.forward(a).shape)

torch.Size([1, 10])


In [10]:
fuse_model = copy.deepcopy(mobilenet)
fuse_model.eval()
for module_name, module in fuse_model.named_children():
    if module_name in "features":
        for block_name, block in module.named_children():
            if isinstance(block,torchvision.ops.misc.ConvNormActivation):
                torch.ao.quantization.fuse_modules(block,[["0","1"]],inplace=True)
            else:
                for name, conv in block.named_children():
                    # try:
                    #     torch.ao.quantization.fuse_modules(conv,[["1","2"]],inplace=True,fuser_func=(nn.Conv2d, nn.BatchNorm2d))
                    # except:
                    #     torch.ao.quantization.fuse_modules(conv,[["2","3"]],inplace=True,fuser_func=(nn.Conv2d, nn.BatchNorm2d))
                                                           
                    for cell_name, cell in conv.named_children():
                        if isinstance(cell,torchvision.ops.misc.ConvNormActivation):
                            torch.ao.quantization.fuse_modules(cell,[["0","1"]],inplace=True,fuser_func=(nn.Conv2d, nn.BatchNorm2d, nn.ReLU))

print(fuse_model)

MobileNetV2(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): Identity()
      (2): ReLU(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32)
          (1): Identity()
          (2): ReLU(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1))
          (1): Identity()
          (2): ReLU(inplace=True)
        )
        (1): Conv2dNormActivation(
          (0): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96)
          (1):

In [11]:
print(f"-- Equal Test --")
model_eq_check(mobilenet, fuse_model, device=torch.device("cpu:0"))


print(f"-- Infer Time Test --")
ori_cpu_time,_ = time_test(mobilenet,torch.device("cpu"))
fus_cpu_time,_ = time_test(fuse_model,torch.device("cpu"))

print(f"origin model infer time {ori_cpu_time:.3f}s")
print(f"fusion model infer time {fus_cpu_time:.3f}s")

-- Equal Test --
Model equivalence test fail
output size : 10
[[-0.10295074 -0.2944044   0.2564814   0.74343145  0.14422883  1.3580137
  -0.2582492  -0.03915124 -0.29986712 -0.027831  ]]
[[ 0.36052272  0.04072542 -0.7131027  -0.3688231  -0.29279172 -0.01099667
   1.3875433  -0.32531443  0.09175906 -0.10298437]]
-- Infer Time Test --
origin model infer time 1.613s
fusion model infer time 1.520s


In [12]:
class QConvBnReLUModel(nn.Module):
    def __init__(self):
        super(QConvBnReLUModel,self).__init__()
        self.conv = nn.Conv2d(3,5,3,bias=True).to(dtype=torch.float)
        self.bn = nn.BatchNorm2d(5).to(dtype=torch.float)
        self.relu = torch.nn.ReLU(inplace=True)
        self.quant = torch.quantization.QuantStub()
        self.dequant = torch.quantization.DeQuantStub()
    
    def forward(self,x):
        print(f"before quant: {x.dtype}, Max : {torch.max(x)}, min : {torch.min(x)}")
        x = self.quant(x)
        print(f"after quant : {x.dtype}, Max : {torch.max(x)}, min : {torch.min(x)}")
    
        print(f"self.conv dtype : {self.conv.state_dict()}")
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.dequant(x)
        return x
    
class ConvBnModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3,5,3,bias=True)
        self.bn = nn.BatchNorm2d(5)
    
    def forward(self,x):
        x = self.conv(x)
        x = self.bn(x)
        return x
    
class ConvBnReLUModel(nn.Module):
    def __init__(self):
        super(ConvBnReLUModel,self).__init__()
        self.conv = nn.Conv2d(3,5,3,bias=True).to(dtype=torch.float)
        self.bn = nn.BatchNorm2d(5).to(dtype=torch.float)
        self.relu = torch.nn.ReLU(inplace=True)
    
    def forward(self,x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        return x

model = ConvBnReLUModel()
# model = QConvBnReLUModel().to(device=torch.device("cpu:0"))
model.eval()
print(model)
# "fbgemm" for server , "qnnpack" for mobile 
# model.qconfig = torch.quantization.get_default_qconfig('fbgemm')

# # torch.quantization.fuse_module or myModel.fuse_model()
fuse_model = torch.ao.quantization.fuse_modules(model,[['conv','bn','relu']], inplace=False)
# fuse_model = model.fuse_model()
print(fuse_model)

print(f"-- Equal Test --")
model_eq_check(model, fuse_model, device=torch.device("cpu:0"))


print(f"-- Infer Time Test --")
ori_cpu_time,_ = time_test(model,torch.device("cpu"))
fus_cpu_time,_ = time_test(fuse_model,torch.device("cpu"))

print(f"origin model infer time {ori_cpu_time:.3f}s")
print(f"fusion model infer time {fus_cpu_time:.3f}s")



ConvBnReLUModel(
  (conv): Conv2d(3, 5, kernel_size=(3, 3), stride=(1, 1))
  (bn): BatchNorm2d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
)
ConvBnReLUModel(
  (conv): ConvReLU2d(
    (0): Conv2d(3, 5, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU(inplace=True)
  )
  (bn): Identity()
  (relu): Identity()
)
-- Equal Test --
Model equivalence test fail
output size : 322580
[[[[0.         0.04978545 0.18221086 ... 0.19953066 0.12825319
    0.2680617 ]
   [0.302618   0.25694257 0.         ... 0.03463042 0.
    0.11867654]
   [0.2761687  0.         0.         ... 0.38341272 0.24167342
    0.11720394]
   ...
   [0.36444092 0.         0.11769357 ... 0.25166047 0.23247495
    0.56167454]
   [0.07713657 0.31332463 0.17682478 ... 0.         0.31133115
    0.18637848]
   [0.11211896 0.35029468 0.         ... 0.1000894  0.
    0.20824143]]

  [[0.         0.         0.         ... 0.         0.
    0.        ]
   [0.         0.         0.     

In [13]:
M = QConvBnReLUModel()
M.train()
M.qconfig=torch.quantization.get_default_qconfig("fbgemm")
print(M.qconfig)
torch.quantization.prepare(M,inplace=True)
torch.quantization.convert(M,inplace=True)
M.eval()
print(M)
M(torch.rand(1,3,5,5))

QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
QConvBnReLUModel(
  (conv): QuantizedConv2d(3, 5, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0)
  (bn): QuantizedBatchNorm2d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (quant): Quantize(scale=tensor([1.]), zero_point=tensor([0]), dtype=torch.quint8)
  (dequant): DeQuantize()
)
before quant: torch.float32, Max : 0.9950127601623535, min : 0.05630141496658325
after quant : torch.quint8, Max : 1, min : 0
self.conv dtype : OrderedDict([('weight', tensor([[[[-0.1309, -0.1755,  0.1160],
          [ 0.1398, -0.1116,  0.1740],
          [ 0.0238, -0.0907,  0.1889]],

         [[ 0.0283,  0.0729,  0.0312],
          [-0.0476, -0.0833, -0.0178],
          [-0.0045



tensor([[[[0., 1., 0.],
          [1., 1., 0.],
          [0., 1., 1.]],

         [[0., 0., 0.],
          [0., 0., 1.],
          [1., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 1.],
          [0., 0., 0.]],

         [[0., 1., 0.],
          [1., 0., 0.],
          [1., 0., 0.]],

         [[0., 0., 0.],
          [0., 1., 1.],
          [0., 0., 0.]]]])

In [14]:
def Cifar10_Dataloader():
    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding = 4),
        transforms.RandomHorizontalFlip(0.5),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    test_transform = transforms.Compose([
        transforms.Resize((32,32)),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])


    train_dataset = torchvision.datasets.CIFAR10(root="data", train=True, download=True, transform=train_transform) 
    # We will use test set for validation and test in this project.
    # Do not use test set for validation in practice!
    test_dataset = torchvision.datasets.CIFAR10(root="data", train=False, download=True, transform=test_transform)
    print(f"Train data set = {len(train_dataset)}, Test = {len(test_dataset)}")

    train_sampler = torch.utils.data.RandomSampler(train_dataset)
    test_sampler = torch.utils.data.SequentialSampler(test_dataset)

    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset, batch_size=128,
        sampler=train_sampler)

    test_loader = torch.utils.data.DataLoader(
        dataset=test_dataset, batch_size=128,
        sampler=test_sampler)
    return train_loader, test_loader


In [37]:
def Evaluating(model, test_loader, device, criterion=None):
    model.to(device)
    model.eval()

    running_loss = 0
    running_corrects = 0

    for inputs, labels in tqdm(iter(test_loader)):
        
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)

        _, preds = torch.max(outputs, 1)
        if criterion is not None:
            loss = criterion(outputs, labels).item()
        else:
            loss = 0
        # statistics
        running_loss += loss * labels.size(0)
        running_corrects += (preds == labels).sum().item()

    eval_loss = running_loss / len(test_loader.dataset)
    eval_accuracy = 100 * running_corrects / len(test_loader.dataset)

    return eval_loss, eval_accuracy

def Training(model, train_loader, test_loader, device, optimizer, scheduler, epochs=100,model_name="test"):
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    print("Before Training")
    torch.cuda.memory_reserved()
    memory_check()
    count = 0
    best_loss = np.Inf
    # Training
    model.to(device)
    for epoch in range(epochs):

        running_loss = 0
        running_corrects = 0
        model.train()

        for inputs, labels in tqdm(iter(train_loader)):

            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)

            loss = criterion(outputs, labels)
 
            loss.backward()
            optimizer.step()
            
            _, preds = torch.max(outputs, 1)
            # statistics
            running_loss += loss.item() * labels.size(0)
            running_corrects += (preds == labels).sum().item()

            del inputs
            del outputs
            del loss
            del preds
        # Set learning rate scheduler
        if scheduler is not None:
            scheduler.step()
        train_loss = running_loss / len(train_loader.dataset)
        train_accuracy = 100 * running_corrects / len(train_loader.dataset) 

        # Evaluation
        val_loss, val_acc = Evaluating(model,test_loader,device=device,criterion=criterion)
        print(f"--------{epoch}----------")
        print(f"Train {train_loss:.4f} Loss, {train_accuracy:.2f} Acc")
        print(f"Validation {val_loss:.4f} Loss, {val_acc:.2f} Acc")

        if best_loss > val_loss:
            best_loss = val_loss
            count = 0
            torch.save(model.state_dict(), f"./models/{model_name}.pt")
        else:
            count +=1
            if count > 10:
                break
    model.load_state_dict(torch.load(f"./models/{model_name}.pt")) 
    return model

from models import mobilenet_v2, MobileNet_V2_Weights,quat_mobilenet_v2
model1 = mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1,activation_layer=nn.ReLU)
model1.classifier.append(nn.Dropout(0.2))
model1.classifier.append(nn.Linear(1000, 10))
train_loader, test_loader = Cifar10_Dataloader()
optimizer = torch.optim.SGD(model1.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()

val_loss, val_acc = Evaluating(model1,test_loader,device=device,criterion=criterion)
print(f"--------before----------")
print(f"Validation {val_loss:.4f} Loss, {val_acc:.2f} Acc")
for i in range(10):
    # Training
    running_loss1 = 0
    running_loss2 = 0
    running_corrects = 0
    model1.to(device)
    model1.train()
    for inputs, labels in tqdm(iter(train_loader)):
        with torch.no_grad():
            model2 = copy.deepcopy(model1)
        
            inputs1 = inputs.to(device)
            inputs2 = inputs.to(device)
        
            labels = labels.to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()
        outputs1 = model1(inputs1)
        # forward + backward + optimize
        outputs2 = model2(inputs2)
        loss1 = criterion(outputs1, labels)
        loss2 = criterion(outputs2,labels)

        loss2.backward()
        optimizer.step()
        
        _, preds = torch.max(outputs2, 1)
        # statistics
        running_loss1 += loss1.item() * labels.size(0)
        running_corrects += (preds == labels).sum().item()
        running_loss2 += loss2.item() * labels.size(0)
    # Set learning rate scheduler
    train_loss = running_loss1 / len(train_loader.dataset)
    train_loss2 = running_loss2 / len(train_loader.dataset)
    train_accuracy = 100 * running_corrects / len(train_loader.dataset) 

    # Evaluation
    val_loss, val_acc = Evaluating(model1,test_loader,device=device,criterion=criterion)
    print(f"--------{i}----------")
    print(f"Train {train_loss:.4f} Loss, {train_accuracy:.2f} Acc")
    print(f"Train2 {train_loss2:.4f} Loss, {train_accuracy:.2f} Acc")
    print(f"Validation {val_loss:.4f} Loss, {val_acc:.2f} Acc")


Files already downloaded and verified
Files already downloaded and verified
Train data set = 50000, Test = 10000


100%|██████████| 79/79 [00:02<00:00, 36.34it/s]


--------before----------
Validation 6.5055 Loss, 7.67 Acc


  5%|▍         | 19/391 [00:01<00:34, 10.84it/s]


KeyboardInterrupt: 

In [45]:
model = quat_mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1,activation_layer=nn.ReLU)
model.classifier.append(nn.Dropout(0.2))
model.classifier.append(nn.Linear(1000, 10))
quat_model = model
quat_model.fuse_model()

quat_model.train()
quat_model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
quat_model = torch.quantization.prepare_qat(quat_model)
copy_model = copy.deepcopy(quat_model)
for name, param in copy_model.named_parameters():
    if "classifier" in name:
        print(torch.max(param))
 
optimizer = torch.optim.SGD(quat_model.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30,60,90], gamma=0.5)

quat_model = Training(quat_model,train_loader=train_loader,test_loader=test_loader,device="cuda",optimizer=optimizer,scheduler=scheduler,epochs=1)

for name, param in quat_model.named_parameters():
    if "classifier" in name:
        print(torch.max(param))



tensor(0.3308, grad_fn=<MaxBackward1>)
tensor(0.1551, grad_fn=<MaxBackward1>)
tensor(0.0316, grad_fn=<MaxBackward1>)
tensor(0.0272, grad_fn=<MaxBackward1>)
Before Training
  Allocated: 0.97 GB
  Cached:    2.61 GB



100%|██████████| 391/391 [00:56<00:00,  6.93it/s]
100%|██████████| 79/79 [00:07<00:00, 10.58it/s]


--------0----------
Train 1.4654 Loss, 62.71 Acc
Validation 1.0464 Loss, 77.65 Acc


100%|██████████| 391/391 [00:56<00:00,  6.91it/s]
100%|██████████| 79/79 [00:07<00:00, 10.79it/s]


--------1----------
Train 1.0267 Loss, 78.02 Acc
Validation 0.9416 Loss, 81.37 Acc


100%|██████████| 391/391 [00:56<00:00,  6.87it/s]
100%|██████████| 79/79 [00:07<00:00, 10.75it/s]


--------2----------
Train 0.9454 Loss, 81.40 Acc
Validation 0.8949 Loss, 83.29 Acc


100%|██████████| 391/391 [00:56<00:00,  6.88it/s]
100%|██████████| 79/79 [00:07<00:00, 10.67it/s]


--------3----------
Train 0.8934 Loss, 83.70 Acc
Validation 0.8944 Loss, 83.46 Acc


100%|██████████| 391/391 [00:56<00:00,  6.91it/s]
100%|██████████| 79/79 [00:07<00:00, 10.45it/s]


--------4----------
Train 0.8609 Loss, 85.01 Acc
Validation 0.8476 Loss, 85.63 Acc
tensor(0.3275, device='cuda:0', grad_fn=<MaxBackward1>)
tensor(0.1534, device='cuda:0', grad_fn=<MaxBackward1>)
tensor(0.0499, device='cuda:0', grad_fn=<MaxBackward1>)
tensor(0.0300, device='cuda:0', grad_fn=<MaxBackward1>)


In [83]:
with torch.no_grad():
    copy_model.eval()
    for name, param in copy_model.named_parameters():
        print(name)
        if "classifier" in name:
            print(name,param.shape)
            ma = torch.max(param)
            mi = torch.min(param)
            print(ma, mi)
            print(param)
            cal = torch.round((127*2*(param-mi/(ma-mi))-127),decimals=0)
            print(cal)
            print(torch.max(cal), torch.min(cal))
        else:
            if "features.0.0.bn.weight" in name:
                print(param)
            elif "features.0.0.bn.bias" in name:
                print(param)
            
    print("-"*50)
    for name, param in quat_model.named_parameters():
        if "classifier" in name:
            print(torch.max(param),torch.min(param))

features.0.0.weight
features.0.0.bn.weight
Parameter containing:
tensor([0.0381, 0.1872, 0.1975, 0.2451, 0.1313, 0.1590, 0.0881, 0.2552, 0.0870,
        0.0119, 0.4129, 0.1137, 0.2245, 0.3014, 0.0114, 0.0104, 0.0128, 0.0043,
        0.0668, 0.4108, 0.3578, 0.1278, 0.5250, 0.0039, 0.4444, 0.1298, 0.3284,
        0.2453, 0.3565, 0.3066, 0.4146, 0.1419], requires_grad=True)
features.0.0.bn.bias
Parameter containing:
tensor([-8.4354e-02,  5.6023e-01,  3.5002e-01,  2.8363e-01,  9.7327e-01,
         6.4774e-01,  4.9481e-01,  5.5817e-01,  6.1756e-01, -4.2980e-04,
        -3.0858e-01,  9.5334e-01,  4.4609e-01, -3.8414e-01, -9.3045e-04,
         5.7470e-03, -4.2064e-02, -1.7965e-02,  3.3821e-01,  1.1017e-01,
        -2.5284e-01,  5.0251e-01,  3.7990e-01, -1.5532e-02, -4.6869e-01,
         5.1056e-01, -2.8880e-01,  6.4006e-01, -1.0935e-01, -5.9483e-02,
         3.7479e-01,  2.6511e-01], requires_grad=True)
features.1.conv.0.0.weight
features.1.conv.0.0.bn.weight
features.1.conv.0.0.bn.bias
featu

In [None]:
optimizer = torch.optim.SGD(model.parameters())
for i in range(epoch):
    for inputs, labels in tqdm(iter(train_loader)):
        optimizer.zero_grad()
        
        with torch.no_grad():
            model = weight_quant(model)
            input = weight_quant(input)
        
        # zero the parameter gradients
        outputs = model(inputs)
        with torch.no_grad():
            model = weight_dequant(model)
        # forward + backward + optimize
        loss = criterion(outputs, labels) * scale

        loss.backward() 
        optimizer.step()
        
        _, preds = torch.max(outputs, 1)
        # statistics
        running_loss += loss.item() * labels.size(0)
        running_corrects += (preds == labels).sum().item()

    # Set learning rate scheduler
    train_loss = running_loss1 / len(train_loader.dataset)
    train_accuracy = 100 * running_corrects / len(train_loader.dataset) 

    # Evaluation
    val_loss, val_acc = Evaluating(model1,test_loader,device=device,criterion=criterion)
    print(f"--------{i}----------")
    print(f"Train {train_loss:.4f} Loss, {train_accuracy:.2f} Acc")
    print(f"Train2 {train_loss2:.4f} Loss, {train_accuracy:.2f} Acc")
    print(f"Validation {val_loss:.4f} Loss, {val_acc:.2f} Acc")

In [None]:
optimizer = torch.optim.SGD(model.parameters())
for i in range(epoch):
    for inputs, labels in tqdm(iter(train_loader)):
        with torch.no_grad():
            quant_model = quantize(model)
            inputs1 = inputs.to(device)
            inputs2 = inputs.to(device)
            labels = labels.to(device)
        
        optimizer.zero_grad()
        
        outputs1 = model(inputs1)
        outputs2 = quant_model(inputs2)
        # forward + backward + optimize
        loss1 = criterion(outputs1, labels)
        loss2 = criterion(outputs2,labels)
        
        total_loss = total(loss1,loss2)
        loss.backward()
        optimizer.step()
        
        _, preds = torch.max(outputs2, 1)
        # statistics
        running_loss1 += loss1.item() * labels.size(0)
        running_corrects += (preds == labels).sum().item()
        running_loss2 += loss2.item() * labels.size(0)
    # Set learning rate scheduler
    train_loss = running_loss1 / len(train_loader.dataset)
    train_loss2 = running_loss2 / len(train_loader.dataset)
    train_accuracy = 100 * running_corrects / len(train_loader.dataset) 

    # Evaluation
    val_loss, val_acc = Evaluating(model,test_loader,device=device,criterion=criterion)
    print(f"--------{i}----------")
    print(f"Train {train_loss:.4f} Loss, {train_accuracy:.2f} Acc")
    print(f"Train2 {train_loss2:.4f} Loss, {train_accuracy:.2f} Acc")
    print(f"Validation {val_loss:.4f} Loss, {val_acc:.2f} Acc")
