# GPU Porting

Missing functions:
 
* THCUNN_check_dim_size

In [93]:
import os
import re
import shutil

In [77]:
# Filesystem locations used by the whole notebook.
# The defaults are the original author's local checkout; set the environment
# variables below to run the notebook on another machine without editing it.
output_path = os.environ.get(
    'GPU_PORTING_OUTPUT_PATH',
    '/home/xmn/dev/quansight/tmp/pytorch/output'
)
os.makedirs(output_path, exist_ok=True)

pytorch_path = os.environ.get(
    'GPU_PORTING_PYTORCH_PATH',
    '/home/xmn/dev/quansight/pytorch-project/pytorch'
)
# Source (THCUNN) and destination (ATen native CUDA) trees inside the checkout.
thcunn_path = os.path.join(pytorch_path, 'aten/src/THCUNN')
at_cuda_path = os.path.join(pytorch_path, 'aten/src/ATen/native/cuda')

# THCUNN files to port; the same file name is read from both `thcunn_path`
# and `thcunn_path/generic` by `create_aten_cuda_files`.
thcunn_files = [
    'SpatialUpSamplingBicubic.cu',
    'SpatialUpSamplingBilinear.cu',
    'SpatialUpSamplingNearest.cu',
    'TemporalUpSamplingLinear.cu',
    'TemporalUpSamplingNearest.cu',
    'VolumetricUpSamplingNearest.cu',
    'VolumetricUpSamplingTrilinear.cu'
]
# Not used by any cell visible in this notebook.
thcunn_h_files = []

In [47]:
# Remove the `aten/src/THNN/generic/*.c` files that are being ported
# Remove functions to be ported from:
# `/aten/src/THCUNN/CMakeLists.txt`
# `/aten/src/THCUNN/generic/THCUNN.h`
# `/aten/src/THNN/init.cpp`
# `/aten/src/ATen/nn.yaml`

In [173]:
def _remove_ext(v):
    if '.' in v:
        return v.split('.')[0]
    return v

def _get_ext(v):
    if '.' in v:
        return '.' + v.split('.')[-1]
    return ''
    
def _make_dim_rule(prefix, suffix):
    """Build a renaming rule for names that start with `prefix`.

    The returned callable leaves other names untouched. For a matching name
    it removes every occurrence of the prefix from the extension-less name,
    appends `suffix`, and puts the extension back.
    """
    def _rule(v, w=prefix):
        if not v.startswith(w):
            return v
        return _remove_ext(v).replace(w, '') + suffix + _get_ext(v)
    return _rule


# One rule per THNN dimensionality prefix, in the order they are applied.
RULES_NAME = [
    _make_dim_rule(prefix, suffix)
    for prefix, suffix in (
        ('Temporal', '1d'),
        ('Spatial', '2d'),
        ('Volumetric', '3d'),
    )
]

# Name rules plus the 'Sampling' -> 'Sample' spelling change.
RULES_NAME_EXTRA = [
    *RULES_NAME,
    lambda v: v.replace('Sampling', 'Sample'),
]

# Default rule set for file names: an independent copy of RULES_NAME_EXTRA.
RULES = list(RULES_NAME_EXTRA)


def apply_rules(rules, text):
    """Feed `text` through each callable in `rules`, in order.

    Every rule receives the previous rule's result; the final result is
    returned. With no rules, `text` comes back unchanged.
    """
    transformed = text
    for rule in rules:
        transformed = rule(transformed)
    return transformed


def convert_filenames(filenames, extra_rules: list = []):
    """Return the names in `filenames` rewritten by RULES, then `extra_rules`.

    The output list has one entry per input name, in the same order.
    """
    # Concatenation builds a new list, so neither RULES nor the default
    # argument is modified.
    pipeline = RULES + extra_rules
    return [apply_rules(pipeline, name) for name in filenames]


# test: derive the ATen-style file name for every entry of `thcunn_files`;
# `at_cuda_files` is reused by the cells below.
at_cuda_files = convert_filenames(thcunn_files)
at_cuda_files

['UpSampleBicubic2d.cu',
 'UpSampleBilinear2d.cu',
 'UpSampleNearest2d.cu',
 'UpSampleLinear1d.cu',
 'UpSampleNearest1d.cu',
 'UpSampleNearest3d.cu',
 'UpSampleTrilinear3d.cu']

In [330]:
def create_aten_cuda_files(
    output_path: str,
    thcunn_path: str,
    at_cuda_path: str,
    th_at_filenames: list,
    just_gpu_porting: bool = True
):
    """Assemble one draft file per (THCUNN name, ATen name) pair.

    For each pair the file written to `output_path/<ATen name>` contains:

    1. a copy of `thcunn_path/<THCUNN name>` (made with `shutil.copy2`);
    2. the content of `thcunn_path/generic/<THCUNN name>`, after a
       `// THCUNN/generic` marker;
    3. when `just_gpu_porting` is true, the content of
       `at_cuda_path/<ATen name>`, after a `// ATen/native/cuda` marker.
       That file is expected to exist already, left by an earlier
       CPU-only port.

    `th_at_filenames` is any iterable of `(th_fn, at_fn)` pairs.
    """
    def _append_section(f_dst, header, path_src):
        # The separating newline is written before the source is opened.
        f_dst.write('\n')
        with open(path_src, 'r') as f_src:
            f_dst.write(header)
            f_dst.write(f_src.read())

    for th_fn, at_fn in th_at_filenames:
        at_file_output_path = os.path.join(output_path, at_fn)
        # Start from the top-level THCUNN file, keeping its metadata.
        shutil.copy2(os.path.join(thcunn_path, th_fn), at_file_output_path)

        with open(at_file_output_path, 'a') as f_dst:
            _append_section(
                f_dst,
                '\n// THCUNN/generic\n',
                os.path.join(thcunn_path, 'generic', th_fn),
            )
            if just_gpu_porting:
                _append_section(
                    f_dst,
                    '\n// ATen/native/cuda\n',
                    os.path.join(at_cuda_path, at_fn),
                )

# test
# `just_gpu_porting` is left at its default (True), so a file with each target
# name must already exist under `at_cuda_path`.
create_aten_cuda_files(
    output_path, 
    thcunn_path,
    at_cuda_path,
    zip(thcunn_files, at_cuda_files)
)

# List the generated files.
print(output_path)
!ls -lah {output_path}

/home/xmn/dev/quansight/tmp/pytorch/output
total 92K
drwxrwxr-x 2 xmn xmn 4,0K abr 17 21:05 .
drwxrwxr-x 3 xmn xmn 4,0K abr 17 20:36 ..
-rw-rw-r-- 1 xmn xmn  11K abr 18 15:46 UpSampleBicubic2d.cu
-rw-rw-r-- 1 xmn xmn  11K abr 18 15:46 UpSampleBilinear2d.cu
-rw-rw-r-- 1 xmn xmn 8,6K abr 18 15:46 UpSampleLinear1d.cu
-rw-rw-r-- 1 xmn xmn 7,4K abr 18 15:46 UpSampleNearest1d.cu
-rw-rw-r-- 1 xmn xmn 8,6K abr 18 15:46 UpSampleNearest2d.cu
-rw-rw-r-- 1 xmn xmn 9,9K abr 18 15:46 UpSampleNearest3d.cu
-rw-rw-r-- 1 xmn xmn  14K abr 18 15:46 UpSampleTrilinear3d.cu


In [339]:
def add_replace_rule(by, to):
    """Return a one-argument rule that replaces every `by` with `to`."""
    def _rule(v):
        return v.replace(by, to)
    return _rule


def th2at(text: str, extra_rules: list = []):
    """Rewrite THCUNN-style source text towards ATen conventions.

    Two passes run in order:

    1. Literal `str.replace` substitutions: the built-in pairs below,
       followed by `extra_rules` (a list of `(old, new)` string pairs).
       Order matters, because later pairs match the output of earlier ones
       (e.g. `Dtype` is renamed before the `ScalarConvert<scalar_t, ...>`
       pairs are tried, and `state, ` is stripped before the regex pass).
    2. Regex rewrites of `THNN_(name)` and a few `THCTensor_(...)` calls.
       Each replacement is additionally passed through `RULES_NAME_EXTRA`.

    The result is a starting point for a manual port, not finished code.
    Returns the rewritten text.
    """
    # replace rules
    replace_pair = [
        ('#include <THCUNN/THCUNN.h>', '#include <ATen/ATen.h>'),
        ('getSize(', 'size('),
        (' int ', ' int64_t '),
        ('Acctype', 'accscalar_t'),
        ('Dtype', 'scalar_t'),
        ('ScalarConvert<scalar_t, accscalar_t>::to',
         'static_cast<accscalar_t>'),
        ('ScalarConvert<accscalar_t, scalar_t>::to',
         'static_cast<scalar_t>'),
        # was misspelled `numeric_lmits`, which does not exist
        ('THCNumerics<scalar_t>::min()',
         'at::numeric_limits<scalar_t>::lowest()'),
        ('THCUNN_argCheck', 'AT_CHECK'),
        ('THCTensor ', 'Tensor '),
        ('THCTensor*', 'Tensor*'),
        ('putWidth', 'put_width'),
        ('putHeight', 'put_height'),
        ('gradOut', 'grad_out'),
        ('gradIn', 'grad_in'),
        ('nBatch', 'nbatch'),
        ('nChannel', 'nchannel'),
        ('THCState *state,', ''),
        ('THCDeviceTensor', 'PackedTensorAccessor'),
        ('state, ', ''),
        ('THCState_getCurrentStream(state)', 'at::cuda::getCurrentCUDAStream()'),
        ('THArgCheck(', 'AT_CHECK('),
    ] + list(extra_rules)
    for by, to in replace_pair:
        text = text.replace(by, to)

    # regex rules
    # TODO:
    # - THCCeilDiv
    # - toDeviceTensor
    # - ::ne
    # - ::min
    # - THCTensor_(zero)

    # Argument text that may contain one level of nested calls, e.g.
    # `input.size(0), h, w`; it may span several lines.
    nested_args = r'(?:[^()]|\([^()]*\))*?'
    rules = (
        # pattern, output template (formatted with the match groups)
        # `[^()]*` stops at the first ')' so a following argument list on the
        # same line is not swallowed.
        (r'THNN_\(([^()]*)\)', '{0}'),
        (r'THCTensor_\(size\)\(\s*([^,()]+?)\s*,\s*([^()]*?)\s*\)',
         '{0}.size({1})'),
        # A block comment keeps a trailing ';' alive, and ATen's in-place
        # resize is spelled `resize_`.
        (r'THCTensor_\(resize(\d*)d\)\(\s*([^,()]+?)\s*,\s*('
         + nested_args + r')\s*\)',
         '{1}.resize_({{ {2} }}) /* {0}d */'),
        # Literal parentheses must be escaped to match the macro call.
        (r'THCTensor_\(nDimensionLegacyNoScalars\)\(\s*([^,()]+?)\s*\)',
         '{0}.ndimension()'),
        # Consume the `()` of the call so it is not left behind.
        (r'!\s*(\w+)->is_empty\(\)', '{0}.numel() != 0'),
        (r'(\w+)\s*!=\s*NULL', '{0}.defined()'),
    )

    for pattern, output_format in rules:
        def _rewrite(match, fmt=output_format):
            return apply_rules(RULES_NAME_EXTRA, fmt.format(*match.groups()))

        text = re.sub(pattern, _rewrite, text)

    return text


def cuda_th2at(files_path: list, extra_rules: list = []):
    """Run `th2at` over each file in `files_path`, rewriting it in place.

    `extra_rules` is forwarded unchanged to `th2at` for every file.
    """
    for f_path in files_path:
        # Read and convert everything first; reopening with 'w' truncates.
        with open(f_path, 'r') as src:
            converted = th2at(src.read(), extra_rules)

        with open(f_path, 'w') as dst:
            dst.write(converted)
            

# test
# refresh output files
# (cuda_th2at rewrites files in place, so they are regenerated before each run)
create_aten_cuda_files(
    output_path, 
    thcunn_path,
    at_cuda_path,
    zip(thcunn_files, at_cuda_files)
)

# Full paths of the generated files that will be converted.
at_cuda_files_path = [
    os.path.join(output_path, fn) 
    for fn in at_cuda_files
]

# Additional literal (old, new) pair, applied after th2at's built-in pairs.
extra_rules = [
    ('#include <THCUNN/upsampling.h>', '#include <ATen/cuda/UpSample.h>')
]
cuda_th2at(at_cuda_files_path, extra_rules)

In [338]:
# experimental
# Dry run: print the regex replacements that would be made in an output file.
# Nothing is written back to disk.
for fn in at_cuda_files:
    print('=' * 80)
    print(fn)
    with open(os.path.join(output_path, fn), 'r') as f:
        text = f.read()

    # NOTE(review): these (pattern, output template) pairs duplicate the regex
    # rules inside `th2at`; keep the two lists in sync.
    _rules = [
        # None as template means "use capture group 1 as the replacement".
        ('THNN_\((.*)\)', None),
        ('THCTensor_\(size\)\(([^,]*),\s*(.*)\)', '{}.size({})'),
        ('THCTensor_\(resize([0-9]*)d\)\(([^,]*),\s*\n*(.*)\)', '{1}.resize({{ {2} }})  # {0}d'),
        # NOTE(review): the parentheses in the next two patterns are not
        # escaped, so they form capture groups instead of matching a literal
        # '(' / ')' in the source text -- confirm whether that is intended.
        ('THCTensor_(nDimensionLegacyNoScalars)(\s*(.*))', '{}.ndimension()'),
        ('[!](.*)->is_empty()', '{}.numel() != 0'),
        # NOTE(review): `(\w)` captures a single character only.
        ('(\w)\s*!=\s*NULL', '{}.defined()')
    ]

    for rule, output_format in _rules:
        print('-' * 80)
        print('rule: ', rule )
        result = re.finditer(rule, text, re.MULTILINE)

        for r in result:
            # Full matched text that would be replaced.
            _in = r.group(0)

            if output_format is None:
                _out = r.group(1)
            else:
                _out = output_format.format(*r.groups())

            # Uses RULES_NAME here, whereas `th2at` applies RULES_NAME_EXTRA.
            print('replace: ', _in, ' by: ', apply_rules(RULES_NAME, _out))
    # Only the first file is inspected.
    break

UpSampleBicubic2d.cu
--------------------------------------------------------------------------------
rule:  THNN_\((.*)\)
replace:  THNN_(SpatialUpSamplingBicubic_shapeCheck)  by:  UpSamplingBicubic_shapeCheck2d
replace:  THNN_(SpatialUpSamplingBicubic_updateOutput)  by:  UpSamplingBicubic_updateOutput2d
replace:  THNN_(SpatialUpSamplingBicubic_shapeCheck)  by:  UpSamplingBicubic_shapeCheck2d
replace:  THNN_(SpatialUpSamplingBicubic_updateGradInput)  by:  UpSamplingBicubic_updateGradInput2d
replace:  THNN_(SpatialUpSamplingBicubic_shapeCheck)  by:  UpSamplingBicubic_shapeCheck2d
--------------------------------------------------------------------------------
rule:  THCTensor_\(size\)\(([^,]*),\s*(.*)\)
replace:  THCTensor_(size)(state, input, 0)  by:  state.size(input, 0)
replace:  THCTensor_(size)(state, input, 1)  by:  state.size(input, 1)
replace:  THCTensor_(size)(state, input, 2)  by:  state.size(input, 2)
replace:  THCTensor_(size)(state, input, 3)  by:  state.size(input, 3)
rep

In [340]:
# Show the first generated file.
!cat {output_path}/{at_cuda_files[0]}

#include <ATen/ATen.h>
#include <THC/THCTensor.hpp>
#include <THCUNN/common.h>
#include <ATen/cuda/UpSample.h>
#include <THC/PackedTensorAccessor.cuh>
#include <THC/PackedTensorAccessorUtils.cuh>
#include <THC/THCDeviceUtils.cuh>
#include <TH/THHalf.h>
#include <THCUNN/THCHalfAutoNumerics.cuh>
#include <THC/THCAtomics.cuh>

template<typename scalar_t, typename accscalar_t>
#if defined(__HIP_PLATFORM_HCC__)
__launch_bounds__(1024)
#endif
__global__ void bicubic_interp2d_kernel(
  const int64_t num_elements,
  const accscalar_t height_scale,
  const accscalar_t width_scale,
  const PackedTensorAccessor<scalar_t, 4> in_data,
  PackedTensorAccessor<scalar_t, 4> out_data
) {

  int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
  const int64_t batchsize = in_data.size(0);
  const int64_t channels = in_data.size(1);
  const int64_t input_height = in_data.size(2);
  const int64_t input_width = in_data.size(3);
  const int64_t output_height = out_data.size(2);
  const int64_t output_width 

## Manually check:

* `AT_CHECK` takes just 2 args: condition and message
* Check order of height, width in parameters of functions such as: resize,  