In [1]:
import torch
from torch.utils.cpp_extension import load
import os

os.environ['CUDA_LAUNCH_BLOCKING']="1"

torch integer data flow 는 gpu에서 사용 불가

In [2]:
# int_data = torch.randint(0,255,(1,3,24,24), dtype=torch.uint8)
# weight = torch.randint(0,255,(1,3,3,3), dtype=torch.uint8)

# # b = torch.nn.functional.conv2d(int_data.cuda(), weight=weight.cuda(),stride=1)
# b = torch.nn.functional.conv2d(int_data, weight=weight,stride=1,bias=None, padding=1, dtype=torch.uint8)
# print(b.shape)

In [3]:
import int8mm_cuda
from torch.nn.modules import Module

class IntLinear(Module):
    def __init__(self, in_channels, out_channels):
        super(IntLinear,self).__init__()
        self.weight = torch.randint(-127,127,(out_channels, in_channels), dtype=torch.int8)

    def forward(self,x):
        # weight [OUT, IN} - > [IN, OUT]
        # input [BATCH, IN]
        y = int8mm_cuda.int8_mm(x,self.weight.transpose(1,0).contiguous())
        y = (y > 127).int()*5
        y = y.type(torch.int8)
        return y
    
    def cuda(self):
        self.weight = self.weight.cuda()
        

mm = IntLinear(4,12)
x = torch.randint(-127,127,(1,4), dtype=torch.int8).cuda()
print(x.dtype, mm.weight.dtype)
with torch.no_grad():
    mm.cuda()
    y = mm(x)
print(x)
print(mm.weight)
print(y)

torch.int8 torch.int8
tensor([[  20,   -4,  -20, -100]], device='cuda:0', dtype=torch.int8)
tensor([[  42,  -94,   85,  -93],
        [ 103,   33,   60,  -19],
        [  57,   27, -103,  -85],
        [-112,   55,  109,  112],
        [  56,  -43,   47,  -10],
        [ 105,   18, -101,  -84],
        [ -21, -118,   23, -121],
        [ -22,  -98,    6,  -19],
        [  19,   73,  -94,   26],
        [  88, -101,   -8,  -38],
        [   8,  -61,  -39,   16],
        [  86,   26,  -93,   22]], device='cuda:0', dtype=torch.int8)
tensor([[5, 5, 5, 0, 5, 5, 5, 5, 0, 5, 0, 5]], device='cuda:0',
       dtype=torch.int8)


In [4]:
import numpy as np 
print(mm.weight.transpose(1,0),end="\n\n")
x_data = x.detach().cpu().numpy()
mm_data = mm.weight.detach().cpu().numpy()
print(f"x data - {x_data}\n")
print(f"mm data - {mm_data}\nmm Trans - {mm_data.T}\n")
y = x_data @ mm_data.T
print(f"y - {y}")

tensor([[  42,  103,   57, -112,   56,  105,  -21,  -22,   19,   88,    8,   86],
        [ -94,   33,   27,   55,  -43,   18, -118,  -98,   73, -101,  -61,   26],
        [  85,   60, -103,  109,   47, -101,   23,    6,  -94,   -8,  -39,  -93],
        [ -93,  -19,  -85,  112,  -10,  -84, -121,  -19,   26,  -38,   16,   22]],
       device='cuda:0', dtype=torch.int8)

x data - [[  20   -4  -20 -100]]

mm data - [[  42  -94   85  -93]
 [ 103   33   60  -19]
 [  57   27 -103  -85]
 [-112   55  109  112]
 [  56  -43   47  -10]
 [ 105   18 -101  -84]
 [ -21 -118   23 -121]
 [ -22  -98    6  -19]
 [  19   73  -94   26]
 [  88 -101   -8  -38]
 [   8  -61  -39   16]
 [  86   26  -93   22]]
mm Trans - [[  42  103   57 -112   56  105  -21  -22   19   88    8   86]
 [ -94   33   27   55  -43   18 -118  -98   73 -101  -61   26]
 [  85   60 -103  109   47 -101   23    6  -94   -8  -39  -93]
 [ -93  -19  -85  112  -10  -84 -121  -19   26  -38   16   22]]

y - [[ 112   68   72   32   72  -96  -84  

Pooling layer


In [5]:
import int8pool_cuda

class IntPool(Module):
    def __init__(self,kernel_size = 2, stride = 2, padding=0, mode=0):
        super(IntPool,self).__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.mode = mode
    
    def forward(self,x):
        y = int8pool_cuda.int8_pool(x,self.kernel_size, self.stride, self.padding, self.mode)
        y = (y > 127).int()*5
        y = y.type(torch.int8)
        return y

pool = IntPool()
x = torch.randint(-127, 127,(4,32,32,4), dtype=torch.int8).cuda()

In [6]:
with torch.no_grad():
    y = pool(x)

In [7]:
print(f"x - {x.shape} \n{x}\n")
print(f"y - {y.shape}\n{y}")

x - torch.Size([4, 32, 32, 4]) 
tensor([[[[ -26,   80,  -42,   88],
          [-107,  -43,  -32,  -61],
          [  27,   17,  -28,  109],
          ...,
          [-122,   76,  -96,   50],
          [ -21,   95,   41,   58],
          [ 107,   24, -111,  -94]],

         [[  11,  -73,   34,   97],
          [  40,  -90,  -18,   99],
          [ -14, -119,    7,  -90],
          ...,
          [ -35,    4,   25,  111],
          [ 122,  -48,  -29,   25],
          [-110,  108, -105,  -73]],

         [[  95,  101,   48,  -72],
          [   0,  -35, -109,  -59],
          [ -25,  -39,    2,  -12],
          ...,
          [  89,  -72, -117,   54],
          [  86,   96,   -9,  -56],
          [   9,  -28,   52,   64]],

         ...,

         [[  71,   47,  -39,    3],
          [   7, -107,    7,  -48],
          [  83,  -28,   25,   47],
          ...,
          [ 124,   40,   11,   24],
          [ 109,   17,   33,  110],
          [  95,  107, -104,  107]],

         [[  44,  -76

pytorch conv layer parmeter shape 보기

In [8]:
import torchvision
model = torchvision.models.vgg.vgg16(pretrained=True)
print(model.features[0].weight.shape)
c = torch.nn.Conv2d(4,12,3,1,1)
print(c.weight.shape) # NCHW



torch.Size([64, 3, 3, 3])
torch.Size([12, 4, 3, 3])


In [9]:
import cutlassconv
from torch.nn.modules import Module

class IntConv2d(Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride =1, padding =1):
        super(IntConv2d,self).__init__()
        self.weight = torch.randint(-127,127,(out_channels, kernel_size, kernel_size, in_channels), dtype=torch.int8)
        self.stride = stride
        self.padding = padding

    def forward(self,x):
        # trans_weight = torch.flip(self.weight,[1,2]).transpose(0,3).contiguous()
        # trans_weight = self.weight.permute(0,2,3,1).contiguous()
        trans_weight = self.weight
        return cutlassconv.int8_conv(x,trans_weight)
    
    def cuda(self):
        self.weight = self.weight.cuda()
        
## cutlass는 16의 배수만
input_channel= 16
conv = IntConv2d(input_channel,32,3,1,1)
print(conv.weight.shape)
x = torch.randint(0,127,(1,32,32,input_channel), dtype=torch.int8).cuda()


torch.Size([32, 3, 3, 16])


In [10]:
with torch.no_grad():
    conv.cuda()
    y = conv(x)


In [11]:
import numpy as np 
print(f"x data - {x.shape} \n{x_data}\n")
print(f"conv data - {conv.weight.shape}\n{conv.weight}\n")
print(f"y data - {y.shape}\n{y}")

x data - torch.Size([1, 32, 32, 16]) 
[[  20   -4  -20 -100]]

conv data - torch.Size([32, 3, 3, 16])
tensor([[[[ -34,   39, -102,  ...,   91, -123, -122],
          [  82,  -92,  120,  ...,   66,  113,  106],
          [ 102,  -17,  -38,  ...,   26,   82,  115]],

         [[   2, -122,    0,  ..., -105,  -98,  -72],
          [  90,  -27,   30,  ...,  120,  -79,  -57],
          [ -88,  -56,   55,  ...,   20,  -87,  122]],

         [[  45,   30,  -16,  ...,  112,   78,  -28],
          [ 105,   99,    8,  ...,   -9,   23, -115],
          [ -88,  -54, -113,  ...,   21,   95,  -29]]],


        [[[-115,  -12, -100,  ...,   83,   90,   18],
          [  68,  -94,   33,  ...,   99,  -90, -100],
          [-101,  -18, -106,  ...,  118,  -10,   56]],

         [[-111,  -41,   49,  ...,  104,   83, -123],
          [-107, -121,   41,  ...,  -55, -126,  -79],
          [  16,   52,  -90,  ...,  -88,  -75,   50]],

         [[ -19,    1, -118,  ...,  124,  -26,  -84],
          [ -95,  -44,

In [12]:
import int8conv_cuda
from torch.nn.modules import Module

class IntConv2d(Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride =1, padding =1):
        super(IntConv2d,self).__init__()
        self.weight = torch.randint(0,127,(out_channels, kernel_size, kernel_size, in_channels), dtype=torch.int8)
        self.stride = stride
        self.padding = padding

    def forward(self,x):
        # trans_weight = torch.flip(self.weight,[1,2]).transpose(0,3).contiguous()
        # trans_weight = self.weight.permute(0,2,3,1).contiguous()
        trans_weight = self.weight
        y = int8conv_cuda.cu_int8_conv(x,trans_weight,self.stride, self.padding,1)
        y = (y > 127).int()*5
        y = y.type(torch.int8)
        return y
    
    def cuda(self):
        self.weight = self.weight.cuda()
    
# cudnn은 4의 배수만
input_channel= 4
conv = IntConv2d(input_channel,32,3,1,1)
print(conv.weight.shape)
x = torch.randint(0,127,(1,32,32,input_channel), dtype=torch.int8).cuda()


torch.Size([32, 3, 3, 4])


In [13]:
with torch.no_grad():
    conv.cuda()
    y = conv(x)
print(y.shape)

torch.Size([1, 32, 32, 32])


In [14]:
import numpy as np 
conv_data = conv.weight.detach().cpu().numpy()
x_data = x.detach().cpu().numpy()
y_data = y.detach().cpu().numpy()
print(f"x data - {x.shape} \n{x_data}\n")
print(f"conv data - {conv_data.shape}\n{conv_data}\n")
print(f"y data - {y.shape}\n{y}")

x data - torch.Size([1, 32, 32, 4]) 
[[[[115 113 121  50]
   [ 20   9 125  21]
   [ 95  41 112   2]
   ...
   [ 85  94  28  66]
   [ 38  10  73  42]
   [110  92  59  29]]

  [[ 37 112  41   5]
   [107  49  35 115]
   [ 66  37 116  34]
   ...
   [ 22  78  79  67]
   [  3  52   3  98]
   [ 82  35 117  57]]

  [[ 40  72  39  88]
   [ 97  56   4  74]
   [ 84  66  45  85]
   ...
   [ 32 103  72 107]
   [ 28  41  97 116]
   [119  74  43  53]]

  ...

  [[ 99  80  93  44]
   [ 97 109  19  29]
   [ 40  57  70  85]
   ...
   [ 97   0  84 120]
   [ 58  62  68 120]
   [  4  10  79  35]]

  [[118  37  56 115]
   [ 92  18 115  11]
   [ 51 122  36  16]
   ...
   [106  66 112  31]
   [  8  80  60 125]
   [ 14 123  80 120]]

  [[124  57  44  51]
   [ 80  46  54 110]
   [  6  32   7 107]
   ...
   [ 51 122   4  17]
   [ 91   1  81 117]
   [ 72  60  69  92]]]]

conv data - (32, 3, 3, 4)
[[[[ 90  38  94 101]
   [ 46  17   6  59]
   [ 77  53  57 113]]

  [[104  24  76  50]
   [ 59  31  46  38]
   [ 40  45

VGG 모델 테스트

In [15]:
import torch.nn as nn

class VGG(nn.Module):
    def __init__(
        self, features: nn.Module, num_classes: int = 100, dropout: float = 0.5) -> None:
        super().__init__()
        self.features = features
        self.avgpool = IntPool(7,1,0,1)
        self.classifier = nn.Sequential(
            IntLinear(512,4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            IntLinear(4096,4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            IntLinear(4096,num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x
    
    def cuda(self):
        for layer in model.modules():
            if 'Int' in str(type(layer)):
                layer.cuda()

def make_layers(cfg, batch_norm: bool = False) -> nn.Sequential:
    layers = []
    in_channels = 4
    for vs in cfg:
        for v in vs:
            v = int(v)
            conv2d = IntConv2d(in_channels, v, kernel_size=3, padding=1)
            layers += [conv2d, nn.ReLU()]
            in_channels = v
        layers += [IntPool(kernel_size=2, stride=2)]
    return nn.Sequential(*layers)


cfgs = {
    "D": [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512],[512, 512, 512]],
    # "D": [64, 64, "M", 128, 128, "M", 256, 256, 256, "M", 512, 512, 512, "M", 512, 512, 512, "M"],
}

def int_vgg(cfg: str, **kwargs) -> VGG:
    model = VGG(make_layers(cfgs[cfg]), **kwargs)
    return model

In [16]:
modules = []
before_l = []
after_l = []
hooks = []

def hook_fn(module, input, output):
    modules.append(module)
    before_l.append(input[0])
    after_l.append(output)

def add_forward_hook(net, hooks):
    for name, layer in net._modules.items():
        if isinstance(layer, nn.Sequential) or isinstance(layer, torchvision.models.vgg.VGG):
            add_forward_hook(layer, hooks)
        else:
            hook = layer.register_forward_hook(hook_fn)
            hooks.append(hook)
            
    return hooks

def remove_forward_hook(hooks):
    for i in hooks:
        i.remove()
# out = model((torch.randn(1,3,32,32)))

In [17]:
# class test_module(Module):
#     def __init__(self,num_classes= 100):
#         super(test_module,self).__init__()
#         self.layers = nn.Sequential(
#             IntLinear(512,4096),
#             nn.ReLU(),
#             nn.Dropout(0.5),
#             IntLinear(4096,4096),
#             nn.ReLU(),
#             nn.Dropout(0.5),
#             IntLinear(4096,num_classes),
#         )
#     def forward(self,x):
#         for l in self.layers:
#             x = l(x)
#             print(x.shape, x.dtype)
#         return x
#     def cuda(self):
#         for layer in model.modules():
#             if 'Int' in str(type(layer)):
#                 layer.cuda()

# model = test_module()
# x = torch.randint(-127,127,(1,512)).cuda()
# model.eval()
# model.cuda()
# print(model.layers[0].weight)
# print(x.dtype)
# with torch.no_grad():
#     y = model(x)



In [20]:
model = int_vgg("D")
model.eval()
hooks = add_forward_hook(model, hooks)
# remove hook, hook works at once
remove_forward_hook(hooks)
model.cuda()
with torch.no_grad():
    x = torch.randint(-127,127,(1,224,224,4), dtype=torch.int8).cuda()
    y = model(x)
    print(len(hooks), len(modules), len(before_l), len(after_l))
    remove_forward_hook(hooks)
    hooks=[]
print(y.dtype, y.shape, y)
    

78 0 0 0
torch.int8 torch.Size([1, 100]) tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]], device='cuda:0', dtype=torch.int8)


In [19]:
i = torch.randint(-127, 127,(1,4,4,3), dtype=torch.int8).cuda()
lay = nn.Dropout(0.5)

lay.cuda()
with torch.no_grad():
    lay.eval()
    y = lay(i)
    k = nn.functional.relu(y)
print(y)
print(k)

tensor([[[[-108,  117,   34],
          [  60,   15,  -27],
          [  -1, -124,    6],
          [  67,   70,  116]],

         [[  28,  -51,    8],
          [  86,   71,  -30],
          [  68,  122,  111],
          [-103,   33,  -62]],

         [[-102,   31, -122],
          [  71, -110,   -3],
          [  65,   78,   35],
          [  50,  101,   -6]],

         [[  19,  125,  -79],
          [ 113,   37,  -89],
          [ -86,  -91,   72],
          [-114,   86,   95]]]], device='cuda:0', dtype=torch.int8)
tensor([[[[  0, 117,  34],
          [ 60,  15,   0],
          [  0,   0,   6],
          [ 67,  70, 116]],

         [[ 28,   0,   8],
          [ 86,  71,   0],
          [ 68, 122, 111],
          [  0,  33,   0]],

         [[  0,  31,   0],
          [ 71,   0,   0],
          [ 65,  78,  35],
          [ 50, 101,   0]],

         [[ 19, 125,   0],
          [113,  37,   0],
          [  0,   0,  72],
          [  0,  86,  95]]]], device='cuda:0', dtype=torch.int8)
