# GPU Porting

## Manually check:

* `AT_CHECK` takes just 2 args: condition and message
* Check the order of height and width in the parameters of functions such as `resize`.

## Warnings:
 
* Missing THCUNN_check_dim_size
* Not guaranteed to work for nested parentheses

In [5]:
import os
import re
import shutil

In [6]:
# Home directory of the current user; a KeyError is raised if HOME is unset.
home_path = os.environ['HOME']

# Scratch directory that receives the generated files (created on demand).
output_path = '/tmp/pytorch/output'
os.makedirs(output_path, exist_ok=True)

# Local PyTorch checkout and the two source trees involved in the port.
pytorch_path = os.path.join(home_path, 'dev/quansight/pytorch-project/pytorch')
thcunn_path, at_cuda_path = (
    os.path.join(pytorch_path, subdir)
    for subdir in ('aten/src/THCUNN', 'aten/src/ATen/native/cuda')
)

# THCUNN source files to port.
thcunn_files = ['Im2Col.cu']
# NOTE(review): presumably the header files to port; unused in the code shown.
thcunn_h_files = []

In [4]:
# Remove the `aten/src/THNN/generic/*.c` files that are being ported
# Remove functions to be ported from:
# `/aten/src/THCUNN/CMakeLists.txt`
# `/aten/src/THCUNN/generic/THCUNN.h`
# `/aten/src/THNN/init.cpp`
# `/aten/src/ATen/nn.yaml`

In [7]:
def _remove_ext(v):
    if '.' in v:
        return v.split('.')[0]
    return v

def _get_ext(v):
    if '.' in v:
        return '.' + v.split('.')[-1]
    return ''
    
def _make_dimension_rule(prefix, suffix):
    """Build a rename rule for names that start with `prefix`.

    The returned callable removes `prefix` from the name (extension
    excluded), appends `suffix`, then re-attaches the extension. Names
    that do not start with `prefix` are returned unchanged.
    """
    def _rule(v, w=prefix):
        if not v.startswith(w):
            return v
        return _remove_ext(v).replace(w, '') + suffix + _get_ext(v)
    return _rule


# Dimensionality renames: Temporal* -> *1d, Spatial* -> *2d, Volumetric* -> *3d.
RULES_NAME = [
    _make_dimension_rule(prefix, suffix)
    for prefix, suffix in (
        ('Temporal', '1d'),
        ('Spatial', '2d'),
        ('Volumetric', '3d'),
    )
]

# Same renames plus the `Sampling` -> `Sample` spelling change.
RULES_NAME_EXTRA = [*RULES_NAME, lambda v: v.replace('Sampling', 'Sample')]

# Default rule set used for file names; a separate list object, so changing
# it in place does not affect RULES_NAME_EXTRA.
RULES = list(RULES_NAME_EXTRA)


def apply_rules(rules, text):
    """Run `text` through every rule, in order, and return the final value.

    Each rule is a callable that takes the current value and returns the
    next one. With no rules the input is returned unchanged.
    """
    for transform in rules:
        text = transform(text)
    return text


def convert_filenames(filenames, extra_rules: list = None):
    """Convert THCUNN file names to their ATen names.

    Every name is passed through the module-level `RULES`, followed by
    any `extra_rules`.

    Args:
        filenames: iterable of file names to convert.
        extra_rules: optional iterable of one-argument callables applied
            after `RULES`. Defaults to no extra rules; `None` is used as
            the default instead of a shared mutable list.

    Returns:
        A new list with one converted name per input name, in order.
    """
    rules = RULES + list(extra_rules or ())
    return [apply_rules(rules, fn) for fn in filenames]


# Smoke test: map the THCUNN file names to their converted names.
# The bare expression on the last line makes the notebook display the result.
at_cuda_files = convert_filenames(thcunn_files)
at_cuda_files

['Im2Col.cu']

In [10]:
def create_aten_cuda_files(
    output_path: str,
    thcunn_path: str,
    at_cuda_path: str,
    th_at_filenames: list,
    just_gpu_porting: bool = True
):
    """Assemble one merged working file per THCUNN source being ported.

    For every `(th_fn, at_fn)` pair the file `<output_path>/<at_fn>` is
    built from, in this order:

    1. a copy of `<thcunn_path>/<th_fn>` (made with `shutil.copy2`);
    2. the content of `<thcunn_path>/generic/<th_fn>`, under a
       `// THCUNN/generic` banner;
    3. when `just_gpu_porting` is true and `<at_cuda_path>/<at_fn>`
       exists, its content under a `// ATen/native/cuda` banner (this is
       the expected state after an earlier CPU-only port).

    Args:
        output_path: existing directory that receives the merged files.
        thcunn_path: directory of the THCUNN sources.
        at_cuda_path: directory of the ATen native CUDA sources.
        th_at_filenames: iterable of `(th_fn, at_fn)` name pairs.
        just_gpu_porting: whether to append the existing ATen file.
    """
    def _append_file(dst, banner, src_path):
        # The source is opened before the banner is written, so a missing
        # source leaves no stray banner behind.
        with open(src_path, 'r') as src:
            dst.write(banner)
            dst.write(src.read())

    for th_name, at_name in th_at_filenames:
        merged_path = os.path.join(output_path, at_name)
        # Start from the THCUNN file itself (copy2 also copies metadata).
        shutil.copy2(os.path.join(thcunn_path, th_name), merged_path)

        generic_src = os.path.join(thcunn_path, 'generic', th_name)
        aten_src = os.path.join(at_cuda_path, at_name)

        with open(merged_path, 'a') as merged:
            merged.write('\n')
            _append_file(merged, '\n// THCUNN/generic\n', generic_src)

            if not just_gpu_porting:
                continue

            # Expect an initial GPU port that follows a CPU-only port.
            merged.write('\n')
            if os.path.isfile(aten_src):
                _append_file(merged, '\n// ATen/native/cuda\n', aten_src)

In [11]:
# Smoke test: build the merged working files in `output_path`.
# `zip` pairs every THCUNN file name with its converted ATen name.
create_aten_cuda_files(
    output_path, 
    thcunn_path,
    at_cuda_path,
    zip(thcunn_files, at_cuda_files)
)

# Show where the files were written and list them (IPython shell magic).
print(output_path)
!ls -lah {output_path}

/tmp/pytorch/output
total 16K
drwxrwxr-x 2 xmn xmn 4,0K may 14 18:58 .
drwxrwxr-x 3 xmn xmn 4,0K may 14 18:57 ..
-rw-rw-r-- 1 xmn xmn 4,8K may 14 19:00 Im2Col.cu


In [38]:
def add_replace_rule(by, to):
    """Return a one-argument rule replacing every `by` in its input with `to`."""
    def _replace(v):
        return v.replace(by, to)
    return _replace


def th2at(text: str, extra_rules: list = None):
    """Rewrite THCUNN-style CUDA source text towards ATen-style code.

    The conversion runs in two passes:

    1. Literal replacements, applied in list order. Order matters: later
       rules see the output of earlier ones (e.g. `Dtype` becomes
       `scalar_t` before the `ScalarConvert<scalar_t, ...>` rules run).
    2. Regex rules. Every match is replaced, everywhere in the text, by
       the rule's output format filled with the match groups (or by group
       1 when the format is `None`), after the result has been passed
       through `RULES_NAME_EXTRA`.

    The regexes are greedy and line based, so the result is a starting
    point to be reviewed by hand; nested parentheses in particular are
    not handled reliably.

    Args:
        text: source text to convert.
        extra_rules: optional iterable of `(old, new)` literal pairs
            applied after the built-in literal rules. Defaults to none;
            `None` is used as the default instead of a shared list.

    Returns:
        The converted text.
    """
    # replace rules
    literal_rules = [
        ('#include <THCUNN/THCUNN.h>',
         '/* TODO: remove duplicated includes */\n'
         '#include <ATen/ATen.h>\n'
         '#include <ATen/AccumulateType.h>\n'
         '#include <ATen/NativeFunctions.h>\n'
         '#include <ATen/TensorUtils.h>\n'
         '#include <ATen/Utils.h>\n'
         '#include <ATen/cuda/CUDAContext.h>\n'
         '#include <ATen/cuda/CUDAApplyUtils.cuh>\n'
        ),
        ('getSize(', 'size('),
        ('Acctype', 'accscalar_t'),
        ('Dtype', 'scalar_t'),
        ('ScalarConvert<scalar_t, accscalar_t>::to',
         'static_cast<accscalar_t>'),
        ('ScalarConvert<accscalar_t, scalar_t>::to',
         'static_cast<scalar_t>'),
        # Spelling fixed: this rule used to emit `at::numeric_lmits`, and
        # the correctly spelled duplicate further down could never fire
        # because this rule had already consumed every occurrence.
        ('THCNumerics<scalar_t>::min()',
         'at::numeric_limits<scalar_t>::lowest()'),
        ('THCUNN_argCheck', '/* TODO: AT_CHECK just have 2 args*/ AT_CHECK'),
        ('THAssert', 'AT_ASSERT'),
        ('THCTensor ', 'Tensor '),
        ('THCTensor*', 'Tensor*'),
        ('putDepth', 'put_depth'),
        ('putHeight', 'put_height'),
        ('putWidth', 'put_width'),
        ('gradOut', 'grad_out'),
        ('gradIn', 'grad_in'),
        ('nBatch', 'nbatch'),
        ('nChannel', 'nchannel'),
        ('THCState *state,', ''),
        ('THCDeviceTensor', 'PackedTensorAccessor'),
        ('state, ', ''),
        ('THCState_getCurrentStream(state)', 'at::cuda::getCurrentCUDAStream()'),
        ('THArgCheck(', 'AT_CHECK('),
        ('THCudaCheck(cudaGetLastError())',
         'AT_CUDA_CHECK(cudaGetLastError())'),
        ('NULL,', 'Tensor(),'),
        ('->dim()', '.dim()'),
        ('->size(', '.size('),
        # NOTE(review): the original rule only matched the `THCeilDiv`
        # spelling, which is not a substring of `THCCeilDiv`; the latter
        # is added as well. The old spelling is kept for compatibility.
        ('THCCeilDiv', 'cuda::ATenCeilDiv'),
        ('THCeilDiv', 'cuda::ATenCeilDiv'),
        ('nInput', 'n_input'),
        ('nOutput', 'n_output'),
        ('putPlane', 'put_plane'),
        ('THCTensor_(new)(state)', 'Tensor()'),
        ('batchSize', 'batch_size')
    ] + list(extra_rules or ())

    for by, to in literal_rules:
        text = text.replace(by, to)

    # regex rules
    # TODO:
    # - toDeviceTensor
    regex_rules = (
        # rule, output pattern
        (r'THNN_\((.*)\)', None),
        (r'THCTensor_\(size\)\(\s*([^,]*),\s*(.*)\s*\)', '{}.size({})'),
        (r'THCTensor_\(resize([0-9]*)d\)\(\s*([^,]*),\s*(.*)\s*\)', '{1}.resize_({{ {2} }})'),
        (r'THCTensor_\(nDimensionLegacyNoScalars\)\(\s*(.*)\s*\)', '{}.ndimension()'),
        (r'THCTensor_\(zero\)\(\s*(.*)\s*\)', '{0}.zero_()'),
        (r'THCTensor_\(data\)\(\s*(.*)\s*\)', '{0}.data()'),
        (r'[!](.*)->is_empty\(\)', '{}.numel() != 0'),
        (r'(\w)\s*!=\s*NULL', '{}.defined()'),
        (r'THCUNN_assertSameGPU\([0-9]*,\s*(.*)\s*\);',
         '/* TODO: TensorArg tensorname_arg{{tensorname, "tensorname", 1}}; */\n'
         '/* TODO: checkAllSameGPU should use TensorArg */\n'
         'checkAllSameGPU(\n'
         '  "/* TODO: use the name of the function as description here */",'
         '  {{ {} }});'),
        (r'(.*)=\s*THCTensor_\(newContiguous\)\(\s*(.*)\s*\);',
         'Tensor {0} = {1}_.contiguous(); /* TODO: add _ to the arg definition above */'),
        (r'accscalar_t\(\s*(.*)\s*\)', 'static_cast<accscalar_t>({})'),
        (r'THCNumerics\<scalar_t\>::ne\(\s*(.*),\s*(.*)\s*\)\s*', '{} != {}'),
    )

    for rule, output_format in regex_rules:
        # The matches are computed on the text as it is when this rule
        # starts; the replacements below rebind `text` without affecting
        # the running iterator.
        for match in re.finditer(rule, text, re.MULTILINE):
            matched = match.group(0)
            if output_format is None:
                replacement = match.group(1)
            else:
                replacement = output_format.format(*match.groups())
            text = text.replace(
                matched, apply_rules(RULES_NAME_EXTRA, replacement)
            )

    return text


def cuda_th2at(files_path: list, extra_rules: list = None):
    """Convert each listed file in place with `th2at`.

    Every file is read completely, converted, and then overwritten with
    the converted text.

    Args:
        files_path: iterable of paths of the files to convert.
        extra_rules: optional list of `(old, new)` literal pairs forwarded
            to `th2at`. Defaults to no extra rules; `None` is used as the
            default instead of a shared mutable list.
    """
    # Always hand `th2at` a real list, never `None`.
    rules = [] if extra_rules is None else extra_rules

    for f_path in files_path:
        with open(f_path, 'r') as f:
            f_content = th2at(f.read(), rules)

        with open(f_path, 'w') as f:
            f.write(f_content)
            

In [39]:
# Smoke test for the text conversion.
# Rebuild the merged files first: `cuda_th2at` rewrites them in place, so
# this makes the conversion start from unconverted content.
create_aten_cuda_files(
    output_path, 
    thcunn_path,
    at_cuda_path,
    zip(thcunn_files, at_cuda_files)
)

# Full paths of the merged files that were just written.
at_cuda_files_path = [
    os.path.join(output_path, fn) 
    for fn in at_cuda_files
]

# Extra literal (old, new) replacements applied after the built-in ones.
extra_rules = [
    ('#include <THCUNN/upsampling.h>', '#include <ATen/cuda/UpSample.h>'),
    ('#include <THCUNN/im2col.h>', '#include <ATen/cuda/Im2Col.h>')
]
cuda_th2at(at_cuda_files_path, extra_rules)
# Print the first converted file (IPython shell magic).
!cat {output_path}/{at_cuda_files[0]}

/* TODO: remove duplicated includes */
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/NativeFunctions.h>
#include <ATen/TensorUtils.h>
#include <ATen/Utils.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>

#include <THCUNN/common.h>
#include <ATen/cuda/Im2Col.h>

#include <TH/THHalf.h>
#include <THCUNN/THCHalfAutoNumerics.cuh>
#include <THC/THCTensor.hpp>
#include <THC/THCStorage.hpp>

#include <THCUNN/generic/Im2Col.cu>
#include <THC/THCGenerateFloatTypes.h>


// THCUNN/generic
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "THCUNN/generic/Im2Col.cu"
#else

#include <ATen/div_rtn.h>

static inline void Im2Col_shapeCheck(
                         
                         Tensor *input,
                         Tensor *grad_output,
                         int64_t kH, int64_t kW, int64_t dH, int64_t dW,
                         int64_t padH, int64_t padW, int64_t sH, int64_t sW) {

  AT_CHECK(kW > 0 && kH > 0, 4,
             "ker

## Experimental

In [7]:
# Scratch cell for trying out regex rules: every rule is run against a small
# sample text and each replacement it performs is printed.
for fn in at_cuda_files:
    print('=' * 80)
    print(fn)
    
    # To try the rules on a generated file instead of the sample below:
    # with open(os.path.join(output_path, fn), 'r') as f:
    #     text = f.read()
    text = '''
      dim3 grid(THCCeilDiv(outputPlaneSize, 128),
            devInput.getSize(1),
            devInput.getSize(0));
    '''
    _rules = [
        (r'THNN_\((.*)\)', None),
        (r'THCTensor_\(size\)\(\s*([^,]*),\s*(.*)\s*\)', '{}.size({})'),
        (r'THCTensor_\(nDimensionLegacyNoScalars\)\(\s*(.*)\s*\)', '{}.ndimension()'),
        (r'[!](.*)->is_empty\(\)', '{}.numel() != 0'),
        (r'(\w)\s*!=\s*NULL', '{}.defined()'),
        (r'THCUNN_assertSameGPU\([0-9]*,\s*(.*)\s*\);', 
         'checkAllSameGPU("check_all_same_gpu", {{ {} }}); // TODO: use the name of the function as description here'),
        (r'THCTensor_\(zero\)\(\s*(.*)\s*\)', '{0}.zero_()'),
        (r'THCTensor_\(newContiguous\)\(\s*(.*)\s*\);', 
         'auto {0} = {0}_.contiguous(); // TODO: add _ to the variable definition above'),
        (r'accscalar_t\(\s*(.*)\s*\)', 'static_cast<accscalar_t>({})'),
        (r'THCNumerics\<scalar_t\>::ne\(\s*(.*),\s*(.*)\s*\)\s*', '{} != {}'),
        (r'THCCeilDiv\(\s*([^()]*),\s*([^()]*)\s*\)', '({0} + {1}-1) / {1}')
    ]

    for rule, output_format in _rules:
        print('-' * 80)
        print('rule: ', rule )

        for r in re.finditer(rule, text, re.MULTILINE):
            _in = r.group(0)

            if output_format is None:
                _out = r.group(1)
            else:
                try:
                    _out = output_format.format(*r.groups())
                except (IndexError, KeyError):
                    # The format string does not fit the captured groups.
                    # Report it and skip this match; falling through would
                    # replace using a stale (or undefined) `_out`.
                    print('[EE]', r.groups())
                    continue

            _out = apply_rules(RULES_NAME_EXTRA, _out)
            text = text.replace(_in, _out)

            print('replace: ', _in, ' by: ', _out)
    # Only the first file is processed.
    break

UpSampleBicubic2d.cu
--------------------------------------------------------------------------------
rule:  THNN_\((.*)\)
--------------------------------------------------------------------------------
rule:  THCTensor_\(size\)\(\s*([^,]*),\s*(.*)\s*\)
--------------------------------------------------------------------------------
rule:  THCTensor_\(nDimensionLegacyNoScalars\)\(\s*(.*)\s*\)
--------------------------------------------------------------------------------
rule:  [!](.*)->is_empty\(\)
--------------------------------------------------------------------------------
rule:  (\w)\s*!=\s*NULL
--------------------------------------------------------------------------------
rule:  THCUNN_assertSameGPU\([0-9]*,\s*(.*)\s*\);
--------------------------------------------------------------------------------
rule:  THCTensor_\(zero\)\(\s*(.*)\s*\)
--------------------------------------------------------------------------------
rule:  THCTensor_\(newContiguous\)\(\s*(.*)\s*\);
-----

In [12]:
# Convert one header of the local checkout with the built-in rules only.
# NOTE: `cuda_th2at` overwrites the file in place.
fn = '/home/xmn/dev/quansight/pytorch-project/pytorch/aten/src/ATen/native/cuda/UpSample.h'

at_cuda_files_path = [fn]

cuda_th2at(at_cuda_files_path)
# Print the converted file (IPython shell magic).
!cat {fn}