In [1]:
import os
import random

import torch
import torch.nn as nn
import torchvision

import time
import copy
import numpy as np
from torchvision import transforms
from tqdm import tqdm
from torchsummary import summary

def set_random_seeds(random_seed=0):

    torch.manual_seed(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)

def memory_check():
    print(f"  Allocated: {round(torch.cuda.memory_allocated()/1024**3,2)} GB")
    print(f"  Cached:    {round(torch.cuda.memory_reserved()/1024**3,2)} GB\n")

print(f"torch = {torch.__version__}")
print(f"torchvision = {torchvision.__version__}")

torch = 1.12.1
torchvision = 0.13.1


In [2]:
from models import quat_mobilenet_v2
tiny_model = quat_mobilenet_v2(cifar10=True)
tiny_model.load_state_dict(torch.load("./models/tiny_mobilenetv2_cifar.pt"))
summary(tiny_model,(3,32,32),device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
         QuantStub-1            [-1, 3, 32, 32]               0
            Conv2d-2           [-1, 32, 16, 16]             864
       BatchNorm2d-3           [-1, 32, 16, 16]              64
              ReLU-4           [-1, 32, 16, 16]               0
            Conv2d-5           [-1, 32, 16, 16]             288
       BatchNorm2d-6           [-1, 32, 16, 16]              64
              ReLU-7           [-1, 32, 16, 16]               0
            Conv2d-8           [-1, 16, 16, 16]             512
       BatchNorm2d-9           [-1, 16, 16, 16]              32
QuantizableInvertedResidual-10           [-1, 16, 16, 16]               0
           Conv2d-11           [-1, 64, 16, 16]           1,024
      BatchNorm2d-12           [-1, 64, 16, 16]             128
             ReLU-13           [-1, 64, 16, 16]               0
           Conv2d-14         

In [3]:
tiny_model.to("cpu")
with torch.no_grad():
    total_tensor = torch.tensor([])
    state = tiny_model.state_dict()
    for i in state.keys():
        new_param = state[i].view(-1)
        total_tensor = torch.cat((total_tensor,new_param),0)
        
    print(total_tensor.shape)
    total_tensor,_ = total_tensor.sort()
    print(f"Max {torch.max(total_tensor)}")
    print(f"Min {torch.min(total_tensor)}")
    number = int(len(total_tensor)*0.01)
    print(number)
    
    M = total_tensor[-number]
    m = total_tensor[number]
    print(M,m)
    print(total_tensor[:20])

    

torch.Size([670508])
Max 32846.0
Min -0.9091643691062927
6705
tensor(0.5107) tensor(-0.1579)
tensor([-0.9092, -0.8394, -0.8371, -0.7841, -0.7512, -0.7318, -0.7111, -0.7075,
        -0.7015, -0.6907, -0.6876, -0.6641, -0.6532, -0.6381, -0.6341, -0.6314,
        -0.6302, -0.6222, -0.6174, -0.6107])


In [4]:
with torch.no_grad():
    new_M = 0
    new_m = 0 
    for name, param in tiny_model.named_parameters():
        new_param = param.clamp(m,M)
        new_param = torch.round(254*(new_param-m)/(M-m)-127)
        new_param = new_param/1000
        
        new_M = max(new_M,torch.max(new_param))
        new_m = max(new_m,torch.max(new_param))
    print(new_M,new_m)


tensor(0.1270) tensor(0.1270)


In [5]:
def custom_quant_model(model:nn.Module,inplace:bool=True):
    model.eval()
    model.to("cpu")
    with torch.no_grad():
        state = model.state_dict()
        
        if not inplace:
            new_model = copy.deepcopy(model)
        else:
            new_model = model
    
        total_tensor = torch.tensor([])
        for i in state.keys():
            new_param = state[i].view(-1)
            total_tensor = torch.cat((total_tensor,new_param),0)

        total_tensor,_ = total_tensor.sort()
        number = int(len(total_tensor)*0.05)
        
        # M = total_tensor[-number]
        # m = total_tensor[number]
        M = torch.max(total_tensor)
        m = torch.min(total_tensor)
        print(f"Max weight : {M}")
        print(f"Min weight : {m}")
        for i in state.keys():
            param = state[i]
            new_param = param.clamp(m,M)
            new_param = torch.round(254*(new_param-m)/(M-m)-127)
            new_param = new_param
            # param.data = torch.quantize_per_tensor(new_param, 0.1, 10, torch.quint8)
            state[i] = new_param
        
        new_model.load_state_dict(state)
    return new_model

In [6]:
from utils import Train

quat_model = copy.deepcopy(tiny_model)
quat_model.fuse_model()
for name, param in quat_model.named_parameters():
    print(param.dtype,torch.max(param), torch.min(param))
    break
quat_model = custom_quant_model(quat_model)
train_loader, test_loader = Train.Cifar10_Dataloader()

total_tensor = torch.tensor([])
state= quat_model.state_dict()
for i in state.keys():
    new_param = state[i].view(-1)
    total_tensor = torch.cat((total_tensor,new_param),0)
M = torch.max(total_tensor)
m = torch.min(total_tensor)
print(f"Max weight : {M}")
print(f"Min weight : {m}")

def calibrate_model(model, loader, device=torch.device("cpu")):
    print("calibrating ...")
    model.to(device)
    model.eval()

    for inputs, labels in tqdm((loader),leave=False):
        inputs = inputs.to(device)
        labels = labels.to(device)
        _ = model(inputs)
        
calibrate_model(quat_model,test_loader)
_,val_acc = Train.Evaluating(quat_model,test_loader,device='cuda')
print(val_acc)

torch.float32 tensor(0.5626, grad_fn=<MaxBackward1>) tensor(-0.4744, grad_fn=<MinBackward1>)
Max weight : 32846.0
Min weight : -0.9091643691062927
Files already downloaded and verified
Files already downloaded and verified
Train data set = 50000, Test = 10000
Max weight : 127.0
Min weight : -127.0
calibrating ...


                                               

10.0




In [7]:
for img, label in iter(train_loader):
    with torch.no_grad():
        print(f"Max tensor : {torch.max(img)}")
        print(f"Min tensor : {torch.min(img)}")
    break

Max tensor : 2.7537312507629395
Min tensor : -2.429065704345703


In [8]:
from utils import Train
train_dataset, test_dataset = Train.Cifar10_Dataloader(quantize=True,only_dataset=True)


Files already downloaded and verified
Files already downloaded and verified
Train data set = 50000, Test = 10000


In [9]:
fuse_model = copy.deepcopy(tiny_model)
fuse_model.fuse_model()
state = fuse_model.state_dict()
total_tensor = torch.tensor([])
for i in state.keys():
    new_param = state[i].view(-1)
    total_tensor = torch.cat((total_tensor,new_param),0)
length = len(total_tensor)
total_tensor,_ = total_tensor.sort()
print(length)
print(torch.max(total_tensor), torch.min(total_tensor))
print(total_tensor[:20])
print(total_tensor[-20:])

670508
tensor(32846.) tensor(-0.9092)
tensor([-0.9092, -0.8394, -0.8371, -0.7841, -0.7512, -0.7318, -0.7111, -0.7075,
        -0.7015, -0.6907, -0.6876, -0.6641, -0.6532, -0.6381, -0.6341, -0.6314,
        -0.6302, -0.6222, -0.6174, -0.6107])
tensor([32846., 32846., 32846., 32846., 32846., 32846., 32846., 32846., 32846.,
        32846., 32846., 32846., 32846., 32846., 32846., 32846., 32846., 32846.,
        32846., 32846.])


In [10]:
tiny_model.to("cpu")
tiny_model.eval()
with torch.no_grad():
    for img, label in iter(test_loader):
        for name, module in tiny_model.named_modules():
            print(name)
        break



features
features.0
features.0.0
features.0.1
features.0.2
features.1
features.1.conv
features.1.conv.0
features.1.conv.0.0
features.1.conv.0.1
features.1.conv.0.2
features.1.conv.1
features.1.conv.2
features.1.skip_add
features.1.skip_add.activation_post_process
features.2
features.2.conv
features.2.conv.0
features.2.conv.0.0
features.2.conv.0.1
features.2.conv.0.2
features.2.conv.1
features.2.conv.1.0
features.2.conv.1.1
features.2.conv.1.2
features.2.conv.2
features.2.conv.3
features.2.skip_add
features.2.skip_add.activation_post_process
features.3
features.3.conv
features.3.conv.0
features.3.conv.0.0
features.3.conv.0.1
features.3.conv.0.2
features.3.conv.1
features.3.conv.1.0
features.3.conv.1.1
features.3.conv.1.2
features.3.conv.2
features.3.conv.3
features.3.skip_add
features.3.skip_add.activation_post_process
features.4
features.4.conv
features.4.conv.0
features.4.conv.0.0
features.4.conv.0.1
features.4.conv.0.2
features.4.conv.1
features.4.conv.1.0
features.4.conv.1.1
featur

In [4]:
from models import Quant_ReLU
from models import quat_mobilenet_v2
import torch
test_model = quat_mobilenet_v2(cifar10=True, activation_layer = Quant_ReLU)
x = torch.rand(1,3,32,32)
print(x.shape)
print(x.dim)
test_model.eval()
test_model.to('cpu')
y = test_model(x,check=True)

torch.Size([1, 3, 32, 32])
<built-in method dim of Tensor object at 0x7f734a4deb30>
Before quant torch.Size([1, 3, 32, 32])
tensor([[[[0.7015, 0.0094, 0.6844,  ..., 0.9708, 0.9056, 0.4092],
          [0.4952, 0.8349, 0.2106,  ..., 0.1093, 0.3152, 0.3218],
          [0.9619, 0.2924, 0.7515,  ..., 0.0749, 0.6031, 0.9446],
          ...,
          [0.6778, 0.7219, 0.6975,  ..., 0.5404, 0.7204, 0.2027],
          [0.4839, 0.5764, 0.8945,  ..., 0.7231, 0.9630, 0.8795],
          [0.2107, 0.4932, 0.2533,  ..., 0.0445, 0.0238, 0.9571]],

         [[0.3503, 0.7456, 0.3864,  ..., 0.5104, 0.3343, 0.2511],
          [0.6302, 0.4099, 0.0470,  ..., 0.6817, 0.0322, 0.0287],
          [0.2448, 0.3077, 0.5786,  ..., 0.6606, 0.3641, 0.2507],
          ...,
          [0.2450, 0.5552, 0.5145,  ..., 0.0156, 0.4594, 0.5732],
          [0.6668, 0.1426, 0.4182,  ..., 0.2259, 0.1909, 0.5003],
          [0.1236, 0.4521, 0.3786,  ..., 0.4592, 0.4758, 0.9741]],

         [[0.6547, 0.3612, 0.9795,  ..., 0.2076, 0

In [5]:
from models import quat_mobilenet_v2,mobilenet_v2
from models import Quant_ReLU
from torchsummary import summary
from utils import Data
from utils import Train
import torch
import numpy as np
from tqdm import tqdm
from utils.Train import Evaluating
from utils.Train import custom_quant_weights,custom_dequant_weights
from models.mobilenetv2 import replace_Qrelu, replace_relu, MobileNet_V2_Weights
from utils import set_random_seeds
# device 
if torch.cuda.is_available():
    gpu_device = torch.device("cuda")
cpu_device = torch.device("cpu")

# set random 
set_random_seeds(42)

# model load
model = quat_mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1,activation_layer=torch.nn.ReLU)
model.classifier.append(torch.nn.Dropout(0.2))
model.classifier.append(torch.nn.Linear(1000, 10))
model.load_state_dict(torch.load("./models/mobilenetv2_cifar10.pt"))
# model = mobilenet_v2(cifar10=True)
# summary(model,(3,32,32),device="cpu")
# data load
train_loader, test_loader = Data.Cifar10_Dataloader()


Files already downloaded and verified
Files already downloaded and verified
Train data set = 50000, Test = 10000


In [5]:

# optimizer 
optimizer = torch.optim.SGD(model.parameters(), lr=1e-5, momentum=0.9, weight_decay=5e-4)

# scheduler 
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20,60,90], gamma=0.5)

# train model
criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.0)
count = 0
best_loss = np.Inf
# Training
model.to(gpu_device)
val_loss, val_acc = Evaluating(model,test_loader,device=gpu_device,criterion=criterion)
print("Before Training")
print(f"Validation {val_loss:.4f} Loss, {val_acc:.2f} Acc")

# with torch.autograd.set_detect_anomaly(True):
for epoch in range(100):

    running_loss = 0
    running_corrects = 0
    model.train()
    for inputs, labels in tqdm(iter(train_loader),leave=False):
        inputs = inputs.to(gpu_device)
        labels = labels.to(gpu_device)

        with torch.no_grad():
            check = []
            for i in range(inputs.size(0)):
                M = torch.max(inputs[i])
                m = torch.min(inputs[i])
                check.append([m,M])
                inputs[i] = torch.round(254*(inputs[i]-m)/(M-m)-127)/1000
            
            for i in range(inputs.size(0)):
                m = check[i][0]
                M = check[i][1]
                inputs[i] = (1000*inputs[i]+127)*(M-m)/254+m

        optimizer.zero_grad()
        # forward + backward + optimize
        # model,backup = custom_quant_weights(model)
        # model = custom_dequant_weights(model,backup)
        outputs = model(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        _, preds = torch.max(outputs, 1)
        # statistics
        running_loss += loss.item() * labels.size(0)
        running_corrects += (preds == labels).sum().item()
        
    # Set learning rate scheduler
    if scheduler is not None:
        scheduler.step()
    train_loss = running_loss / len(train_loader.dataset)
    train_accuracy = 100 * running_corrects / len(train_loader.dataset) 

    # Evaluation
    val_loss, val_acc = Evaluating(model,test_loader,device=gpu_device,criterion=criterion)
    print(f"--------{epoch+1}----------")
    print(f"Train {train_loss:.4f} Loss, {train_accuracy:.2f} Acc")
    print(f"Validation {val_loss:.4f} Loss, {val_acc:.2f} Acc")
    if best_loss > val_loss:
        best_loss = val_loss
        count = 0
        torch.save(model.state_dict(), f"./models/test.pt")
    else:
        count +=1
        if count > 10:
            break

Files already downloaded and verified
Files already downloaded and verified
Train data set = 50000, Test = 10000
Before Training


                                               

Validation 0.3452 Loss, 90.60 Acc


                                                 

--------1----------
Train 0.1846 Loss, 95.85 Acc
Validation 0.3071 Loss, 90.73 Acc


                                                 

--------2----------
Train 0.1479 Loss, 95.67 Acc
Validation 0.3043 Loss, 90.78 Acc


                                                 

--------3----------
Train 0.1310 Loss, 95.89 Acc
Validation 0.3083 Loss, 90.93 Acc


                                                 

--------4----------
Train 0.1292 Loss, 95.85 Acc
Validation 0.3151 Loss, 90.82 Acc


                                                 

--------5----------
Train 0.1275 Loss, 95.84 Acc
Validation 0.3189 Loss, 90.85 Acc


                                                 

--------6----------
Train 0.1235 Loss, 95.92 Acc
Validation 0.3217 Loss, 90.74 Acc


                                                 

--------7----------
Train 0.1227 Loss, 96.05 Acc
Validation 0.3224 Loss, 90.81 Acc


                                                 

--------8----------
Train 0.1217 Loss, 96.01 Acc
Validation 0.3212 Loss, 90.88 Acc


                                                 

--------9----------
Train 0.1198 Loss, 96.09 Acc
Validation 0.3238 Loss, 90.81 Acc


                                                 

--------10----------
Train 0.1231 Loss, 95.92 Acc
Validation 0.3203 Loss, 90.83 Acc


                                                 

--------11----------
Train 0.1214 Loss, 96.02 Acc
Validation 0.3218 Loss, 90.81 Acc


                                                 

--------12----------
Train 0.1225 Loss, 95.93 Acc
Validation 0.3218 Loss, 90.77 Acc


                                                 

--------13----------
Train 0.1184 Loss, 96.08 Acc
Validation 0.3219 Loss, 90.74 Acc




In [8]:
from models import quantize_model,quat_mobilenet_v2,MobileNet_V2_Weights
import copy
import torch
train_loader, test_loader
model = quat_mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1,activation_layer=torch.nn.ReLU)
model.classifier.append(torch.nn.Dropout(0.2))
model.classifier.append(torch.nn.Linear(1000, 10))
model.to('cpu')
model.load_state_dict(torch.load("./models/test.pt"))
quat_model = copy.deepcopy(model)
quantize_model(quat_model, data= test_loader)
_,int8_acc = Evaluating(quat_model,test_loader,"cpu")
print(f"post int8_model acc :{int8_acc:.2f} %")


Q config = QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, quant_min=0, quant_max=127){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
calibrating...


                                               

post int8_model acc :80.32 %




In [9]:

model.load_state_dict(torch.load("./models/mobilenetv2_cifar10.pt"))
quat_model = copy.deepcopy(model)
quantize_model(quat_model, data= test_loader)
_,int8_acc = Evaluating(quat_model,test_loader,"cpu")
print(f"post int8_model acc :{int8_acc:.2f} %")

Q config = QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, quant_min=0, quant_max=127){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
calibrating...


                                               

post int8_model acc :68.55 %


