# GPU Porting

In [93]:
import os
import re
import shutil

In [77]:
# Scratch directory that receives the merged draft files. The default is the
# location used when this notebook was written; set PYTORCH_PORT_OUTPUT to
# redirect it without editing the cell.
output_path = (
    os.environ.get('PYTORCH_PORT_OUTPUT')
    or '/home/xmn/dev/quansight/tmp/pytorch/output'
)
os.makedirs(output_path, exist_ok=True)

# Base directory that the THCUNN and ATen source paths below are joined onto.
# Set PYTORCH_PATH to point at a different checkout.
pytorch_path = (
    os.environ.get('PYTORCH_PATH')
    or '/home/xmn/dev/quansight/pytorch-project/pytorch'
)
thcunn_path = os.path.join(pytorch_path, 'aten/src/THCUNN')
at_cuda_path = os.path.join(pytorch_path, 'aten/src/ATen/native/cuda')

# THCUNN CUDA source files processed by the cells below.
thcunn_files = [
    'SpatialUpSamplingBicubic.cu',
    'SpatialUpSamplingBilinear.cu',
    'SpatialUpSamplingNearest.cu',
    'TemporalUpSamplingLinear.cu',
    'TemporalUpSamplingNearest.cu',
    'VolumetricUpSamplingNearest.cu',
    'VolumetricUpSamplingTrilinear.cu'
]
# Left empty here; NOTE(review): not referenced by the cells visible in this
# notebook section - confirm whether it is still needed.
thcunn_h_files = []

In [47]:
# Remove the `aten/src/THNN/generic/*.c` files that are being ported
# Remove the functions being ported from:
# `/aten/src/THCUNN/CMakeLists.txt`
# `/aten/src/THCUNN/generic/THCUNN.h`
# `/aten/src/THNN/init.cpp`
# `/aten/src/ATen/nn.yaml`

In [141]:
def _remove_ext(v):
    """Return `v` without its last extension (`'a.b.cu'` -> `'a.b'`).

    Only the final dot-separated component is dropped, mirroring `_get_ext`,
    so `_remove_ext(v) + _get_ext(v) == v` holds for every string `v`.
    """
    if '.' in v:
        return v.rsplit('.', 1)[0]
    return v


def _get_ext(v):
    """Return the last extension of `v` including its dot, or `''` if none."""
    if '.' in v:
        return '.' + v.rsplit('.', 1)[1]
    return ''


def _make_dim_rule(prefix, suffix):
    """Build a rule that renames `<prefix>Name.ext` to `Name<suffix>.ext`.

    The returned callable takes the name `v` and leaves it unchanged when it
    does not start with `prefix`. The optional `w` keyword is kept so the
    rule has the same call signature as the lambdas it replaces.
    """
    def rule(v, w=prefix):
        if not v.startswith(w):
            return v
        # Strip only the leading occurrence: a later repeat of the same word
        # inside the name is part of the name and must survive.
        return _remove_ext(v)[len(w):] + suffix + _get_ext(v)
    return rule


# One rule per dimensionality prefix, paired with the suffix that replaces it.
RULES_NAME = [
    _make_dim_rule('Temporal', '1d'),
    _make_dim_rule('Spatial', '2d'),
    _make_dim_rule('Volumetric', '3d'),
]

# Default rule set: an independent copy of the renaming rules, so that later
# changes to RULES do not alter RULES_NAME.
RULES = list(RULES_NAME)


def apply_rules(rules, text):
    """Feed `text` through each callable in `rules`, in order.

    Every rule receives the value returned by the previous one; the value
    returned by the last rule is the result. With no rules, `text` is
    returned unchanged.
    """
    current = text
    for rule in rules:
        current = rule(current)
    return current


def convert_filenames(filenames, extra_rules: list = None):
    """Apply the renaming rules to every name in `filenames`.

    `filenames` is an iterable of names. `extra_rules` is an optional
    iterable of callables that are applied after the module-level `RULES`.
    Returns a list with the converted names, in input order.
    """
    # `None` replaces the original `[]` default: a mutable default object is
    # created once and shared by every call. Copying through `list(...)` also
    # lets callers pass a tuple or a generator of rules.
    rules = RULES + list(extra_rules or [])
    return [apply_rules(rules, fn) for fn in filenames]


# test: derive the ATen file names from the THCUNN ones; the extra rule
# shortens `Sampling` to `Sample` after the default rules have run.
at_cuda_files = convert_filenames(
    thcunn_files,
    extra_rules=[lambda name: name.replace('Sampling', 'Sample')],
)
at_cuda_files

['UpSampleBicubic2d.cu',
 'UpSampleBilinear2d.cu',
 'UpSampleNearest2d.cu',
 'UpSampleLinear1d.cu',
 'UpSampleNearest1d.cu',
 'UpSampleNearest3d.cu',
 'UpSampleTrilinear3d.cu']

In [84]:
def _append_section(f_dst, label, path_src):
    """Append a blank line, a `// <label>` marker line and the text of `path_src`."""
    with open(path_src, 'r') as f_src:
        # The marker gets a line of its own. Without the trailing newline the
        # first line of the appended file would become part of the comment.
        f_dst.write('\n// ' + label + '\n')
        f_dst.write(f_src.read())


def create_aten_cuda_files(
    output_path: str,
    thcunn_path: str,
    at_cuda_path: str,
    th_at_filenames: list
):
    """Porting code from `/aten/src/THCUNN/generic` and `/aten/src/THCUNN`
    to `/aten/src/ATen/native/cuda/`

    For every `(th_fn, at_fn)` pair in `th_at_filenames` (any iterable of
    pairs, e.g. a `zip`), a file `output_path/at_fn` is written containing,
    in this order:

    1. a copy of `thcunn_path/th_fn` (made with `shutil.copy2`),
    2. a `// THCUNN/generic` marker followed by `thcunn_path/generic/th_fn`,
    3. a `// ATen/native/cuda` marker followed by `at_cuda_path/at_fn`.

    An existing output file is overwritten. The errors raised by `open` and
    `shutil.copy2` for a missing source file or output directory are not
    caught.
    """
    for th_fn, at_fn in th_at_filenames:
        at_file_output_path = os.path.join(output_path, at_fn)
        # start from the THCUNN file; copy2 also copies properties and metadata
        shutil.copy2(os.path.join(thcunn_path, th_fn), at_file_output_path)
        with open(at_file_output_path, 'a') as f_dst:
            _append_section(
                f_dst,
                'THCUNN/generic',
                os.path.join(thcunn_path, 'generic', th_fn),
            )
            _append_section(
                f_dst,
                'ATen/native/cuda',
                os.path.join(at_cuda_path, at_fn),
            )

# test
create_aten_cuda_files(
    output_path, 
    thcunn_path,
    at_cuda_path,
    zip(thcunn_files, at_cuda_files)
)

print(output_path)
!ls -lah {output_path}

/home/xmn/dev/quansight/tmp/pytorch/output
total 92K
drwxrwxr-x 2 xmn xmn 4,0K abr 17 21:05 .
drwxrwxr-x 3 xmn xmn 4,0K abr 17 20:36 ..
-rw-rw-r-- 1 xmn xmn  11K abr 17 22:18 UpSampleBicubic2d.cu
-rw-rw-r-- 1 xmn xmn  11K abr 17 22:18 UpSampleBilinear2d.cu
-rw-rw-r-- 1 xmn xmn 8,6K abr 17 22:18 UpSampleLinear1d.cu
-rw-rw-r-- 1 xmn xmn 7,4K abr 17 22:18 UpSampleNearest1d.cu
-rw-rw-r-- 1 xmn xmn 8,6K abr 17 22:18 UpSampleNearest2d.cu
-rw-rw-r-- 1 xmn xmn 9,9K abr 17 22:18 UpSampleNearest3d.cu
-rw-rw-r-- 1 xmn xmn  14K abr 17 22:18 UpSampleTrilinear3d.cu


In [91]:
def cuda_th2at(files_path: list):
    """Rewrite TH/THC spellings to their ATen equivalents, in place.

    Each path in `files_path` is read in full, passed through a fixed list of
    plain-text replacements, and written back to the same path.
    """
    # Order matters: the two patterns that contain `Dtype`/`Acctype` must run
    # before the generic `Dtype`/`Acctype` renames, otherwise those renames
    # consume the tokens first and the longer patterns never match.
    replacements = [
        ('ScalarConvert<Dtype, Acctype>::to', 'static_cast<accscalar_t>'),
        ('THCNumerics<Dtype>::min()', 'at::numeric_limits<scalar_t>::lowest()'),
        (' int ', ' int64_t '),
        ('Acctype', 'accscalar_t'),
        ('Dtype', 'scalar_t'),
        ('output.getSize', 'output.size'),
    ]

    for f_path in files_path:
        with open(f_path, 'r') as f:
            f_content = f.read()

        for old, new in replacements:
            f_content = f_content.replace(old, new)

        with open(f_path, 'w') as f:
            f.write(f_content)

# Run the TH -> ATen text replacements over every merged draft, in place.
draft_paths = [os.path.join(output_path, fn) for fn in at_cuda_files]
cuda_th2at(draft_paths)

In [142]:
# experimental: print every `THNN_(name)` macro use found in the first draft
# next to the name produced by the dimension-renaming rules.
with open(os.path.join(output_path, at_cuda_files[0]), 'r') as f:
    text = f.read()

# Raw string, so `\(` reaches the regex engine instead of being an invalid
# string escape. `[^)]*` stops at the first closing parenthesis, where a
# greedy `.*` would run to the last one on the line. `re.MULTILINE` was
# dropped: it only changes `^` and `$`, which this pattern does not use.
result = re.finditer(r'THNN_\(([^)]*)\)', text)
for r in result:
    print(r.group(0), apply_rules(RULES_NAME, r.group(1)))

THNN_(SpatialUpSamplingBicubic_shapeCheck) UpSamplingBicubic_shapeCheck2d
THNN_(SpatialUpSamplingBicubic_updateOutput) UpSamplingBicubic_updateOutput2d
THNN_(SpatialUpSamplingBicubic_shapeCheck) UpSamplingBicubic_shapeCheck2d
THNN_(SpatialUpSamplingBicubic_updateGradInput) UpSamplingBicubic_updateGradInput2d
THNN_(SpatialUpSamplingBicubic_shapeCheck) UpSamplingBicubic_shapeCheck2d


In [92]:
!cat {output_path}/{aten_cuda_files[0]}

#include <THCUNN/THCUNN.h>
#include <THC/THCTensor.hpp>
#include <THCUNN/common.h>
#include <THCUNN/upsampling.h>
#include <THC/THCDeviceTensor.cuh>
#include <THC/THCDeviceTensorUtils.cuh>
#include <THC/THCDeviceUtils.cuh>
#include <TH/THHalf.h>
#include <THCUNN/THCHalfAutoNumerics.cuh>
#include <THC/THCAtomics.cuh>

template<typename scalar_t, typename accscalar_t>
#if defined(__HIP_PLATFORM_HCC__)
__launch_bounds__(1024)
#endif
__global__ void bicubic_interp2d_kernel(
  const int64_t num_elements,
  const accscalar_t height_scale,
  const accscalar_t width_scale,
  const THCDeviceTensor<scalar_t, 4> in_data,
  THCDeviceTensor<scalar_t, 4> out_data
) {

  int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
  const int64_t batchsize = in_data.getSize(0);
  const int64_t channels = in_data.getSize(1);
  const int64_t input_height = in_data.getSize(2);
  const int64_t input_width = in_data.getSize(3);
  const int64_t output_height = out_data.getSize(2);
  const int64_t output_width = 