In [1]:
import torch
from torch.utils.cpp_extension import load
import os

os.environ['CUDA_LAUNCH_BLOCKING']="1"

torch integer data flow 는 gpu에서 사용 불가

In [2]:
# int_data = torch.randint(0,255,(1,3,24,24), dtype=torch.uint8)
# weight = torch.randint(0,255,(1,3,3,3), dtype=torch.uint8)

# # b = torch.nn.functional.conv2d(int_data.cuda(), weight=weight.cuda(),stride=1)
# b = torch.nn.functional.conv2d(int_data, weight=weight,stride=1,bias=None, padding=1, dtype=torch.uint8)
# print(b.shape)

In [3]:
import int8mm_cuda
from torch.nn.modules import Module

class IntLinear(Module):
    def __init__(self, in_channels, out_channels):
        super(IntLinear,self).__init__()
        self.weight = torch.randint(-127,127,(out_channels, in_channels), dtype=torch.int8)

    def forward(self,x):
        # weight [OUT, IN} - > [IN, OUT]
        # input [BATCH, IN]
        y = int8mm_cuda.int8_mm(x,self.weight.transpose(1,0).contiguous())
        y = (y > 127).int()*5
        y = y.type(torch.int8)
        return y
    
    def cuda(self):
        self.weight = self.weight.cuda()
        

mm = IntLinear(4,12)
x = torch.randint(-127,127,(1,4), dtype=torch.int8).cuda()
print(x.dtype, mm.weight.dtype)
with torch.no_grad():
    mm.cuda()
    y = mm(x)
print(x)
print(mm.weight)
print(y)

torch.int8 torch.int8
tensor([[ 110,   85,    1, -126]], device='cuda:0', dtype=torch.int8)
tensor([[ -77,   -5, -112,  -78],
        [ -65,  -61,   65, -103],
        [ -59,   37, -124,   77],
        [ -10,  123,   75,  -58],
        [ -40,   95,   83,   22],
        [  40,  -69,   -4,  -49],
        [ -72,   86, -105,  -54],
        [ -48,  -75,  -16,   23],
        [ 122,  -28,   29,  105],
        [  19,   57,   27, -115],
        [  69,  111,   33,   86],
        [ -97,  -19,   87,   50]], device='cuda:0', dtype=torch.int8)
tensor([[5, 5, 0, 5, 5, 5, 5, 0, 0, 5, 5, 0]], device='cuda:0',
       dtype=torch.int8)


In [4]:
import numpy as np 
print(mm.weight.transpose(1,0),end="\n\n")
x_data = x.detach().cpu().numpy()
mm_data = mm.weight.detach().cpu().numpy()
print(f"x data - {x_data}\n")
print(f"mm data - {mm_data}\nmm Trans - {mm_data.T}\n")
y = x_data @ mm_data.T
print(f"y - {y}")

tensor([[ -77,  -65,  -59,  -10,  -40,   40,  -72,  -48,  122,   19,   69,  -97],
        [  -5,  -61,   37,  123,   95,  -69,   86,  -75,  -28,   57,  111,  -19],
        [-112,   65, -124,   75,   83,   -4, -105,  -16,   29,   27,   33,   87],
        [ -78, -103,   77,  -58,   22,  -49,  -54,   23,  105, -115,   86,   50]],
       device='cuda:0', dtype=torch.int8)

x data - [[ 110   85    1 -126]]

mm data - [[ -77   -5 -112  -78]
 [ -65  -61   65 -103]
 [ -59   37 -124   77]
 [ -10  123   75  -58]
 [ -40   95   83   22]
 [  40  -69   -4  -49]
 [ -72   86 -105  -54]
 [ -48  -75  -16   23]
 [ 122  -28   29  105]
 [  19   57   27 -115]
 [  69  111   33   86]
 [ -97  -19   87   50]]
mm Trans - [[ -77  -65  -59  -10  -40   40  -72  -48  122   19   69  -97]
 [  -5  -61   37  123   95  -69   86  -75  -28   57  111  -19]
 [-112   65 -124   75   83   -4 -105  -16   29   27   33   87]
 [ -78 -103   77  -58   22  -49  -54   23  105 -115   86   50]]

y - [[  53  -60 -115   98  -38   97  -55  

Pooling layer


In [5]:
import int8pool_cuda

class IntPool(Module):
    def __init__(self,kernel_size = 2, stride = 2, padding=0, mode=0):
        super(IntPool,self).__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.mode = mode
    
    def forward(self,x):
        y = int8pool_cuda.int8_pool(x,self.kernel_size, self.stride, self.padding, self.mode)
        # y = (y > 10).int()*5
        # y = y.type(torch.int8)
        return y

pool = IntPool()
x = torch.randint(0, 127,(4,32,32,4), dtype=torch.int8).cuda()

In [6]:
with torch.no_grad():
    y = pool(x)

In [7]:
print(f"x - {x.shape} \n{x}\n")
print(f"y - {y.shape}\n{y}")

x - torch.Size([4, 32, 32, 4]) 
tensor([[[[ 43,  72,  59,  47],
          [ 45,  93,  37,  86],
          [ 74,  44, 114,  98],
          ...,
          [  0, 115,  17,  18],
          [  6,  78, 116,  16],
          [ 82,  91,  43,  26]],

         [[ 98,  55,  83,  59],
          [  0,   3,   2,  76],
          [ 90,  39, 119, 115],
          ...,
          [ 53,  27,  39,  88],
          [ 70,  90, 101,   3],
          [110,  60,  20,  33]],

         [[ 49,  27,  99,  17],
          [ 86,  13,  66,  64],
          [  6, 108,   9,   9],
          ...,
          [ 82,   0,  59,  38],
          [ 69,  89,   7,  68],
          [ 27,  98, 111, 106]],

         ...,

         [[ 13,  24,   3,  63],
          [117, 120,   9,  66],
          [ 19,  26,  91,  96],
          ...,
          [110,  27,   9,  35],
          [ 61,  73, 100,  51],
          [ 13,  70,  29,  21]],

         [[ 42,  19,  15, 117],
          [ 49, 116,  85,  44],
          [123,  33,  86,  24],
          ...,
      

In [8]:
avg_pool = IntPool(mode=1)
x = torch.randint(0, 127,(4,32,32,4), dtype=torch.int8).cuda()
with torch.no_grad():
    y = pool(x)
print(f"x - {x.shape} \n{x}\n")
print(f"y - {y.shape}\n{y}")

x - torch.Size([4, 32, 32, 4]) 
tensor([[[[125, 124,  67,  85],
          [ 36,  84, 102,  20],
          [ 99,  11, 123,  88],
          ...,
          [ 77,  36, 126,  56],
          [106,   8,  54, 105],
          [126,  83,  56,  49]],

         [[126,  36,  35,  83],
          [ 39,  87,  18,  11],
          [ 92,   2,   8, 119],
          ...,
          [ 50,  32,  62,  16],
          [ 16,  23,  45,  25],
          [ 23,  44, 113,  67]],

         [[ 77,  67, 113,  44],
          [ 11,  92, 105,  40],
          [ 64,  34,  64,  20],
          ...,
          [103,  68,  61,  40],
          [ 61,  58,  20,   5],
          [ 94,  64,   2,  66]],

         ...,

         [[ 34,  62,  61, 111],
          [ 79,  57,  39,  46],
          [ 79,  36,  29,  48],
          ...,
          [ 63,   0,  81,  93],
          [108, 104,   5,  27],
          [ 53,  60,  79,  63]],

         [[117,  64,  80,  48],
          [ 80,   0,  15,  66],
          [ 25,  84,  55,   5],
          ...,
      

pytorch conv layer parmeter shape 보기

In [9]:
import torchvision
model = torchvision.models.vgg.vgg16(pretrained=True)
print(model.features[0].weight.shape)
c = torch.nn.Conv2d(4,12,3,1,1)
print(c.weight.shape) # NCHW



torch.Size([64, 3, 3, 3])
torch.Size([12, 4, 3, 3])


In [10]:
import cutlassconv
from torch.nn.modules import Module

class IntConv2d(Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride =1, padding =1):
        super(IntConv2d,self).__init__()
        self.weight = torch.randint(-127,127,(out_channels, kernel_size, kernel_size, in_channels), dtype=torch.int8)
        self.stride = stride
        self.padding = padding

    def forward(self,x):
        # trans_weight = torch.flip(self.weight,[1,2]).transpose(0,3).contiguous()
        # trans_weight = self.weight.permute(0,2,3,1).contiguous()
        trans_weight = self.weight
        return cutlassconv.int8_conv(x,trans_weight)
    
    def cuda(self):
        self.weight = self.weight.cuda()
        
## cutlass는 16의 배수만
input_channel= 16
conv = IntConv2d(input_channel,32,3,1,1)
print(conv.weight.shape)
x = torch.randint(0,127,(1,32,32,input_channel), dtype=torch.int8).cuda()


torch.Size([32, 3, 3, 16])


In [11]:
with torch.no_grad():
    conv.cuda()
    y = conv(x)


In [12]:
import numpy as np 
print(f"x data - {x.shape} \n{x_data}\n")
print(f"conv data - {conv.weight.shape}\n{conv.weight}\n")
print(f"y data - {y.shape}\n{y}")

x data - torch.Size([1, 32, 32, 16]) 
[[ 110   85    1 -126]]

conv data - torch.Size([32, 3, 3, 16])
tensor([[[[ -73,   33,   35,  ...,   81, -114,  126],
          [ 110,  -57, -125,  ..., -109,  -84,   23],
          [  38, -110,  -36,  ...,   12,  -24,   80]],

         [[ -88,  106,  -18,  ...,   31,    7,    9],
          [  -7,  -64, -108,  ...,    8,   89,    3],
          [ -83, -109,   21,  ...,   37,  -32,   64]],

         [[ -28,  -42, -106,  ...,   82,   72,  -61],
          [   2,  -32,  121,  ...,  -43,   77,   27],
          [ -10,   71,  -45,  ...,  -19,   65,  -58]]],


        [[[  81,  126, -122,  ...,  -17,  -15, -118],
          [ -38,   95,  106,  ...,  120,  -54,   81],
          [ -20,   53,  -51,  ...,  -52,  -81,   70]],

         [[-111,   75,   61,  ...,   54,   52, -119],
          [ -78,    2,  -62,  ..., -127,  -95,   70],
          [   4,  -62, -106,  ...,  125,    8,  102]],

         [[ -89,   22,   50,  ...,  -85,  -67,  -89],
          [ -93,  -30,

In [13]:
import int8conv_cuda
from torch.nn.modules import Module

class IntConv2d(Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride =1, padding =1):
        super(IntConv2d,self).__init__()
        self.weight = torch.randint(0,127,(out_channels, kernel_size, kernel_size, in_channels), dtype=torch.int8)
        self.stride = stride
        self.padding = padding

    def forward(self,x):
        # trans_weight = torch.flip(self.weight,[1,2]).transpose(0,3).contiguous()
        # trans_weight = self.weight.permute(0,2,3,1).contiguous()
        trans_weight = self.weight
        y = int8conv_cuda.cu_int8_conv(x,trans_weight,self.stride, self.padding,1)
        y = (y > 127).int()*5
        y = y.type(torch.int8)
        return y
    
    def cuda(self):
        self.weight = self.weight.cuda()
    
# cudnn은 4의 배수만
input_channel= 4
conv = IntConv2d(input_channel,32,3,1,1)
print(conv.weight.shape)
x = torch.randint(0,127,(1,32,32,input_channel), dtype=torch.int8).cuda()


torch.Size([32, 3, 3, 4])


In [14]:
with torch.no_grad():
    conv.cuda()
    y = conv(x)
print(y.shape, y.device)

torch.Size([1, 32, 32, 32]) cuda:0


In [15]:
import numpy as np 
conv_data = conv.weight.detach().cpu().numpy()
x_data = x.detach().cpu().numpy()
y_data = y.detach().cpu().numpy()
print(f"x data - {x.shape} \n{x_data}\n")
print(f"conv data - {conv_data.shape}\n{conv_data}\n")
print(f"y data - {y.shape}\n{y}")

x data - torch.Size([1, 32, 32, 4]) 
[[[[ 68  85 106 123]
   [ 79  70  53   9]
   [ 96 100  14  18]
   ...
   [ 94   8  24 100]
   [ 68  56 125  12]
   [ 92  53  48  14]]

  [[ 26  61  37 118]
   [ 29  51  30  64]
   [ 90 103  46  87]
   ...
   [103  16  30  27]
   [ 67  42 120  88]
   [ 75  30 106  26]]

  [[109 122  47 110]
   [ 30 104   5  87]
   [115  98  46   6]
   ...
   [ 13  22  34 118]
   [  3 104  26  11]
   [ 90  52  37 117]]

  ...

  [[117  46  36  43]
   [ 72  21 105 103]
   [ 91 104 124  87]
   ...
   [ 78  50  59  12]
   [ 75 117  47  11]
   [ 90  66  46  46]]

  [[ 11  98  87  96]
   [ 45  95  20  74]
   [117  56 104  68]
   ...
   [ 28  15  89  81]
   [ 46 123  91  70]
   [102  58  10  93]]

  [[ 54 107  69 116]
   [ 57  58  77   1]
   [117  38  68 107]
   ...
   [ 64  70 123   4]
   [  7  48  29 117]
   [ 94  28  60  27]]]]

conv data - (32, 3, 3, 4)
[[[[ 65  29  93  47]
   [ 10  80  54   8]
   [ 13  49  10  51]]

  [[ 79  64  87 126]
   [ 12 118  55  28]
   [ 93  60

VGG 모델 테스트

In [16]:
import torch.nn as nn

class VGG(nn.Module):
    def __init__(
        self, features: nn.Module, num_classes: int = 100, dropout: float = 0.5) -> None:
        super().__init__()
        self.features = features
        self.avgpool = IntPool(7,1,0,1)
        self.classifier = nn.Sequential(
            IntLinear(512,4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            IntLinear(4096,4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            IntLinear(4096,num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x
    
    def cuda(self):
        for layer in model.modules():
            if 'Int' in str(type(layer)):
                layer.cuda()

def make_layers(cfg, batch_norm: bool = False) -> nn.Sequential:
    layers = []
    in_channels = 4
    for vs in cfg:
        for v in vs:
            v = int(v)
            conv2d = IntConv2d(in_channels, v, kernel_size=3, padding=1)
            layers += [conv2d, nn.ReLU()]
            in_channels = v
        layers += [IntPool(kernel_size=2, stride=2)]
    return nn.Sequential(*layers)


cfgs = {
    "D": [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512],[512, 512, 512]],
    # "D": [64, 64, "M", 128, 128, "M", 256, 256, 256, "M", 512, 512, 512, "M", 512, 512, 512, "M"],
}

def int_vgg(cfg: str, **kwargs) -> VGG:
    model = VGG(make_layers(cfgs[cfg]), **kwargs)
    return model

In [17]:
modules = []
before_l = []
after_l = []
hooks = []

def hook_fn(module, input, output):
    modules.append(module)
    before_l.append(input[0])
    after_l.append(output)

def add_forward_hook(net, hooks):
    for name, layer in net._modules.items():
        if isinstance(layer, nn.Sequential) or isinstance(layer, torchvision.models.vgg.VGG):
            add_forward_hook(layer, hooks)
        else:
            hook = layer.register_forward_hook(hook_fn)
            hooks.append(hook)
            
    return hooks

def remove_forward_hook(hooks):
    for i in hooks:
        i.remove()
# out = model((torch.randn(1,3,32,32)))

In [18]:
# class test_module(Module):
#     def __init__(self,num_classes= 100):
#         super(test_module,self).__init__()
#         self.layers = nn.Sequential(
#             IntLinear(512,4096),
#             nn.ReLU(),
#             nn.Dropout(0.5),
#             IntLinear(4096,4096),
#             nn.ReLU(),
#             nn.Dropout(0.5),
#             IntLinear(4096,num_classes),
#         )
#     def forward(self,x):
#         for l in self.layers:
#             x = l(x)
#             print(x.shape, x.dtype)
#         return x
#     def cuda(self):
#         for layer in model.modules():
#             if 'Int' in str(type(layer)):
#                 layer.cuda()

# model = test_module()
# x = torch.randint(-127,127,(1,512)).cuda()
# model.eval()
# model.cuda()
# print(model.layers[0].weight)
# print(x.dtype)
# with torch.no_grad():
#     y = model(x)



In [19]:
model = int_vgg("D")
model.eval()
hooks = add_forward_hook(model, hooks)
# remove hook, hook works at once
remove_forward_hook(hooks)
model.cuda()
with torch.no_grad():
    x = torch.randint(-127,127,(1,224,224,4), dtype=torch.int8).cuda()
    y = model(x)
    print(len(hooks), len(modules), len(before_l), len(after_l))
    remove_forward_hook(hooks)
    hooks=[]
print(y.dtype, y.shape, y)
    

39 0 0 0
torch.int8 torch.Size([1, 100]) tensor([[5, 5, 5, 0, 0, 0, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0, 5, 0, 0,
         0, 0, 5, 5, 0, 5, 0, 0, 5, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 5, 5, 0, 5,
         5, 0, 0, 0, 0, 5, 0, 5, 0, 0, 0, 5, 5, 0, 0, 5, 0, 5, 5, 5, 0, 5, 0, 0,
         5, 0, 0, 0, 5, 0, 0, 5, 0, 5, 0, 0, 5, 0, 0, 5, 0, 0, 5, 0, 0, 0, 0, 0,
         0, 0, 0, 5]], device='cuda:0', dtype=torch.int8)


In [20]:
i = torch.randint(-127, 127,(1,4,4,3), dtype=torch.int8).cuda()
lay = nn.Dropout(0.5)

lay.cuda()
with torch.no_grad():
    lay.eval()
    y = lay(i)
    k = nn.functional.relu(y)
print(y)
print(k)

tensor([[[[  19,  -55,   61],
          [ 125,   74,    9],
          [ -34,   89,   48],
          [  43,  112,  114]],

         [[ -94,  116, -108],
          [  87,   57,   17],
          [  61,    4, -125],
          [  51,   65,   43]],

         [[ -21,   66,    4],
          [  69,  -53,    0],
          [  12,  -33,   98],
          [ -49, -120,   71]],

         [[  92,   99, -122],
          [-118,  -58,   37],
          [-107,   28, -108],
          [  -1,   97,   95]]]], device='cuda:0', dtype=torch.int8)
tensor([[[[ 19,   0,  61],
          [125,  74,   9],
          [  0,  89,  48],
          [ 43, 112, 114]],

         [[  0, 116,   0],
          [ 87,  57,  17],
          [ 61,   4,   0],
          [ 51,  65,  43]],

         [[  0,  66,   4],
          [ 69,   0,   0],
          [ 12,   0,  98],
          [  0,   0,  71]],

         [[ 92,  99,   0],
          [  0,   0,  37],
          [  0,  28,   0],
          [  0,  97,  95]]]], device='cuda:0', dtype=torch.int8)


In [21]:
from models import vgg

model = vgg.int_vgg16("D")
x = torch.randint(-128,127,(1,224,224,4),dtype=torch.int8).cuda()
model.eval()
model.cuda()
with torch.no_grad():
    y = model(x)
    print(y.shape)
print(y)

torch.Size([1, 100])
tensor([[  196347,  -401627,     8226,  -749283,   398182,  -476397,  -342389,
           -84538,   -37475,   230175,  -402695,   -94966,   -78212,   605970,
           489143,   484344,  -577307,  -554651,   518771,   190319,  -764553,
           105657,  -397995,  -566682,  -268697,  -148393,   284067,   794409,
           244869,   808711,  -913071,   306214,   -27984,   614078,   412797,
           217105,   -44410,   601753,  -344748,   430561,   701011,   -87553,
          -756884,  -371892, -1108798,    93179,  -787099,    85379,   191883,
          -233575,   756532,  -350879,   773189,   -57080,  -147580,   132845,
          -881600,   391286,  -334424,  -316748,   422377,   559127,   146402,
          -228850,   309907,  -322196,  -597101,  -194499,  -233162, -1152029,
          -740085,  -489113,  -626800,  -354718,   -25873,  -864417, -1064336,
          -756213,  -478563,  -197071,   450960,  1422805,   392720,   181673,
          -489042,   680798,   

In [22]:
p = nn.MaxPool2d(kernel_size=2, stride=2)

x = torch.nn