# GPU Porting

## Manually check:

* `AT_CHECK` just have 2 args: condition and message
* Check order of height, width in parameters of functions such as: resize,  

## Warnings:

* not garanteed for nested parentheses

In [1]:
import os
import re
import shutil

In [61]:
home_path = os.environ['HOME']
output_path = '/tmp/pytorch/output/gpu/'

os.makedirs(output_path, exist_ok=True)

pytorch_path = os.path.join(
    home_path,
    'dev/quansight/pytorch-project/pytorch'
)
thcunn_path = os.path.join(pytorch_path, 'aten/src/THCUNN')
at_cuda_path = os.path.join(pytorch_path, 'aten/src/ATen/native/cuda')

thcunn_files = [
    'Col2Im.cu',
]
thcunn_h_files = ['im2col.h']

In [58]:
!cp {pytorch_path}/.clang-format {output_path} 

In [39]:
# Remove `aten/src/THNN/generic/*.c` files that is being ported
# Remove functions to be ported from:
# `/aten/src/THCUNN/CMakeLists.txt`
# `/aten/src/THCUNN/generic/THCUNN.h`
# `/aten/src/THNN/init.cpp`
# `/aten/src/ATen/nn.yaml`

In [62]:
def _remove_ext(v):
    if '.' in v:
        return v.split('.')[0]
    return v

def _get_ext(v):
    if '.' in v:
        return '.' + v.split('.')[-1]
    return ''
    
RULES_NAME = [
    lambda v, w='Temporal': (
        _remove_ext(v).replace(w, '') + '1d' + _get_ext(v)
        if v.startswith(w)
        else v
    ),
    lambda v, w='Spatial': (
        _remove_ext(v).replace(w, '') + '2d' + _get_ext(v)
        if v.startswith(w)
        else v
    ),
    lambda v, w='Volumetric': (
        _remove_ext(v).replace(w, '') + '3d' + _get_ext(v)
        if v.startswith(w)
        else v
    ),
]

RULES_NAME_EXTRA = RULES_NAME + []

RULES = [] + RULES_NAME_EXTRA


def apply_rules(rules, text):
    _fn = text
    for r in rules:
        _fn = r(_fn)
    return _fn


def convert_filenames(filenames, extra_rules: list = []):
    rules = RULES + extra_rules
    
    result = []
    for fn in filenames:
        result.append(apply_rules(rules, fn))
    return result


# test
at_cuda_files = convert_filenames(thcunn_files)
at_cuda_files

['Col2Im.cu']

In [63]:
def create_aten_cuda_files(
    output_path: str,
    thcunn_path: str,
    at_cuda_path: str,
    th_at_filenames: list,
    just_gpu_porting: bool = True
): 
    """Porting code from `/aten/src/THCUNN/generic` and `/aten/src/THCUNN`
    to `/aten/src/ATen/native/cuda/`
    
    """
    for th_fn, at_fn in th_at_filenames:
        # get file data from THCUNN
        path_src = os.path.join(thcunn_path, th_fn)
        at_file_output_path = os.path.join(output_path, at_fn)
        # copy also properties and metadata
        shutil.copy2(path_src, at_file_output_path)
        # write output file
        with open(at_file_output_path, 'a') as f_dst:
            # get file data from THCUNN/generic
            f_dst.write('\n')
            path_src = os.path.join(thcunn_path, 'generic', th_fn) 
            if not os.path.isfile(path_src):
                continue
            with open(path_src, 'r') as f_src:
                f_dst.write('\n// THCUNN/generic\n')
                f_dst.write(f_src.read())
            
            # get file data from ATen/native/cuda
            # expetec a initial gpu porting after a `just cpu porting`
            if just_gpu_porting:
                f_dst.write('\n')
                path_src = os.path.join(at_cuda_path, at_fn)
                if os.path.isfile(path_src):
                    with open(path_src, 'r') as f_src:
                        f_dst.write('\n// ATen/native/cuda\n')
                        f_dst.write(f_src.read())

In [64]:
# test
create_aten_cuda_files(
    output_path, 
    thcunn_path,
    at_cuda_path,
    zip(thcunn_files, at_cuda_files)
)

print(output_path)
!ls -lah {output_path}

/tmp/pytorch/output/gpu/
total 48K
drwxrwxr-x 2 xmn xmn 4,0K may 17 16:14 .
drwxrwxr-x 4 xmn xmn 4,0K may 15 22:32 ..
-rw-rw-r-- 1 xmn xmn 2,6K may 16 01:02 .clang-format
-rw-rw-r-- 1 xmn xmn 5,3K may 17 16:14 Col2Im.cu
-rw-rw-r-- 1 xmn xmn  11K may 16 01:02 Im2Col.cu
-rw-rw-r-- 1 xmn xmn 5,9K may 16 01:02 im2col.cuh
-rw-rw-r-- 1 xmn xmn 5,8K may 16 00:52 im2col.h


In [70]:
def add_replace_rule(by, to):
    return lambda v: v.replace(by, to)


def th2at(text: str, extra_rules: list = []):
    # replace rules
    rules = [
        ('#include <THCUNN/THCUNN.h>', 
         '/* TODO: remove duplicated includes */\n'
         '#include <ATen/ATen.h>\n'
         '#include <ATen/AccumulateType.h>\n'
         '#include <ATen/NativeFunctions.h>\n'
         '#include <ATen/TensorUtils.h>\n'
         '#include <ATen/Utils.h>\n'
         '#include <ATen/cuda/CUDAContext.h>\n'
         '#include <ATen/cuda/CUDAApplyUtils.cuh>\n'
        ),
        ('getSize(', 'size('),
        ('Acctype', 'accscalar_t'),
        ('Dtype', 'scalar_t'),
        ('ScalarConvert<scalar_t, accscalar_t>::to',
         'static_cast<accscalar_t>'),
        ('ScalarConvert<accscalar_t, scalar_t>::to',
         'static_cast<scalar_t>'),
        ('THCNumerics<scalar_t>::min()',
         'at::numeric_lmits<scalar_t>::lowest()'),
        ('THCUNN_argCheck', '/* TODO: AT_CHECK just have 2 args */\n   AT_CHECK'),
        ('THAssert', 'AT_ASSERT'),
        ('THCTensor ', 'Tensor '),
        ('THCTensor*', 'Tensor*'),
        ('putDepth', 'put_depth'),
        ('putHeight', 'put_height'),
        ('putWidth', 'put_width'),
        ('putPlane', 'put_plane'),
        ('putLength', 'put_length'),
        ('gradOut', 'grad_out'),
        ('gradIn', 'grad_in'),
        ('nBatch', 'nbatch'),
        ('nChannel', 'nchannel'),
        ('THCState *state,', ''),
        ('THCState* state,', ''),
        ('THCDeviceTensor', 'PackedTensorAccessor'),
        ('state, ', ''),
        ('THCState_getCurrentStream(state)', 'at::cuda::getCurrentCUDAStream()'),
        ('THArgCheck(', '/* TODO: AT_CHECK just have 2 args */\n   AT_CHECK('),
        ('THCudaCheck(cudaGetLastError())',
         'AT_CUDA_CHECK(cudaGetLastError())'),
        ('NULL,', 'Tensor(),'),
        ('THCNumerics<scalar_t>::min()', 'at::numeric_limits<scalar_t>::lowest()'),
        ('->dim()', '.dim()'),
        ('->size(', '.size('),
        ('THCeilDiv', 'cuda::ATenCeilDiv'),
        ('nInput', 'n_input'),
        ('nOutput', 'n_output'),
        ('THCTensor_(new)(state)', 'Tensor()'),
        ('batchSize', 'batch_size'),
        ('THError', 'AT_ERROR'),
        ('THCTensor_(free)', '// TODO: remove it: THCTensor_(free)'),
        ('#if', '// #if'),
        ('#def', '// #def'),
        ('#else', '// #else'),
        ('#endif', '// #endif'),
        ('updateOutput', 'out_cuda'),
        ('updateGradInput', 'backward_out_cuda')
    ] + extra_rules
    
    for by, to in rules:
        text = text.replace(by, to)
        
    # regex rules
    # TODO:
    # - toDeviceTensor
    rules = (
        # rule, output pattern 
        (r'THNN_\((.*)\)', None),
        (r'THCTensor_\(size\)\(\s*([^,]*),\s*(.*)\s*\)', '{}.size({})'),
        (r'THCTensor_\(resize([0-9]*)d\)\(\s*([^,]*),\s*(.*)\s*\)', '{1}.resize_({{ {2} }})'),
        (r'THCTensor_\(nDimensionLegacyNoScalars\)\(\s*(.*)\s*\)', '{}.ndimension()'),
        (r'THCTensor_\(zero\)\(\s*(.*)\s*\)', '{0}.zero_()'),
        (r'THCTensor_\(select\)\(\s*([^,]*),\s*(.*)\s*\)', '{0}.select({1})'),
        (r'THCTensor_\(data\)\(\s*(.*)\s*\)', '{0}.data()'),
        (r'[!](.*)->is_empty\(\)', '{}.numel() != 0'),
        (r'(\w)\s*!=\s*NULL', '{}.defined()'),
        (r'THCUNN_assertSameGPU\([0-9]*,\s*(.*)\s*\);', 
         '/* TODO: TensorArg tensorname_arg{{tensorname, "tensorname", 1}}; */\n'
         '/* TODO: checkAllSameGPU should use TensorArg */\n'
         'checkAllSameGPU(\n'
         '  "/* TODO: use the name of the function as description here */",'
         '  {{ {} }});'), 
        (r'(.*)=\s*THCTensor_\(newContiguous\)\(\s*(.*)\s*\);', 
         'Tensor {0} = {1}_.contiguous(); /* TODO: add _ to the arg definition above */'),
        (r'accscalar_t\(\s*(.*)\s*\)', 'static_cast<accscalar_t>({})'),
        (r'THCNumerics\<scalar_t\>::ne\(\s*(.*),\s*(.*)\s*\)\s*', '{} != {}'),
    )
    
    for rule, output_format in rules:
        result = re.finditer(rule, text, re.MULTILINE)
        for r in result:
            _in = r.group(0)
            if output_format is None:
                _out = r.group(1)
            else:
                _out = output_format.format(*r.groups())
            text = text.replace(_in, apply_rules(RULES_NAME_EXTRA, _out))

    return text


def cuda_th2at(files_path: list, extra_rules: list = []):
    for f_path in files_path:
        with open(f_path, 'r') as f:
            f_content = th2at(f.read(), extra_rules)
        
        with open(f_path, 'w') as f:
            f.write(f_content)
            

In [71]:
# test
# refresh output files
create_aten_cuda_files(
    output_path, 
    thcunn_path,
    at_cuda_path,
    zip(thcunn_files, at_cuda_files)
)

at_cuda_files_path = [
    os.path.join(output_path, fn) 
    for fn in at_cuda_files
]

extra_rules = [
    ('#include <THCUNN/Im2Col.h>', '#include <ATen/cuda/Im2Col.h>'),
    ('kH', 'kernel_height'),
    ('kW', 'kernel_width'),
    ('dH', 'dilation_height'),
    ('dW', 'dilation_width'),
    ('padH', 'pad_height'),
    ('padW', 'pad_width'),
    ('sH', 'stride_height'),
    ('sW', 'stride_width'),
    ('nBlocksH', 'n_blocks_height'),
    ('nBlocksW', 'n_blocks_width'),
    ('shapeCheck', 'shape_check')
]
cuda_th2at(at_cuda_files_path, extra_rules)
for fn in at_cuda_files:
    !clang-format -i {output_path}/{fn}
!cat {output_path}/{at_cuda_files[0]}

/* TODO: remove duplicated includes */
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/NativeFunctions.h>
#include <ATen/TensorUtils.h>
#include <ATen/Utils.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>

#include <THC/THCStorage.hpp>
#include <THC/THCTensor.hpp>
#include <THCUNN/common.h>

#include <TH/THHalf.h>
#include <THCUNN/THCHalfAutoNumerics.cuh>

#include <THC/THCGenerateFloatTypes.h>
#include <THCUNN/generic/Col2Im.cu>

#include <ATen/native/cuda/im2col.cuh>

// THCUNN/generic
// #ifndef THC_GENERIC_FILE
// #define THC_GENERIC_FILE "THCUNN/generic/Col2Im.cu"
// #else

#include <ATen/div_rtn.h>

static inline void Col2Im_shape_check(

    Tensor* input,
    Tensor* grad_output,
    int64_t output_height,
    int64_t output_width,
    int64_t kernel_height,
    int64_t kernel_width,
    int64_t dilation_height,
    int64_t dilation_width,
    int64_t padilation_height,
    int64_t padilation_width,
    int64_t stride_height,