---
Setup

In [1]:
!cat /home/project/ml/pytorch/torch/version.py

__version__ = '1.9.0a0+gitb5647dd'
debug = False
cuda = None
git_version = 'b5647dd52b48a51fec1387382deb5b59c7651512'
hip = None


In [2]:
!PYTHONPATH=/home/project/ml/pytorch/ python -c "import torch; print(torch.__file__, torch._C._GLIBCXX_USE_CXX11_ABI)"

  _dtype_to_storage = {data_type(0).dtype: data_type for data_type in _storages}
/home/project/ml/pytorch/torch/__init__.py True


In [3]:
// Point cling at the headers and shared libraries of the local PyTorch tree
// (the same /home/project/ml/pytorch checkout whose version.py is printed above).
// Header search paths: the top-level include dir and the torch/csrc/api include dir.
#pragma cling add_include_path("/home/project/ml/pytorch/torch/include")
#pragma cling add_include_path("/home/project/ml/pytorch/torch/include/torch/csrc/api/include")
// Directory that is searched for the shared libraries loaded below.
#pragma cling add_library_path("/home/project/ml/pytorch/torch/lib")
// Load the PyTorch shared libraries into the interpreter so ATen symbols resolve at run time.
#pragma cling load("libtorch")
#pragma cling load("libtorch_cpu")
#pragma cling load("libc10")

----

Test

In [4]:
#include <iostream>
#include <ATen/ATen.h>

// Type handle for float tensors on the CPU; streaming it prints its name (CPUFloatType).
auto p = at::CPU(at::kFloat);
std::cout << p << '\n';

// A 3x10 tensor of ones created through that handle, printed with its type/shape footer.
auto t = at::ones({3, 10}, p);
std::cout << t << '\n';

CPUFloatType
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
[ CPUFloatType{3,10} ]


In [5]:
// Number of entries in t's size list, i.e. its number of dimensions (2 for the 3x10 tensor).
t.sizes().size()

2

----

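Iterate over input (B, C, H, w), output (B, C, h, w), indices (1, 1, h, 1) and weights (1, 1, h * M, 1), where M is the number of weights per output position (`weights_size` in the code below)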

In [6]:
#include <vector>
#include <ATen/native/TensorIterator.h>

In [26]:
// Source tensor: 2*3*10*5 consecutive floats viewed as (2, 3, 10, 5).
at::Tensor input = at::arange(2 * 3 * 10 * 5, at::CPU(at::kFloat)).reshape({2, 3, 10, 5});
// Destination tensor; only dimension 2 differs in size from the input (4 instead of 10).
at::Tensor output = at::zeros({2, 3, 4, 5});

int ndims = input.dim();
int reshape_dim = 2;
int output_size = output.sizes()[reshape_dim];

// Restride input: every dim from 2 onwards takes the output's extent with stride 0,
// so the view has the output's shape but never advances along those dims by itself.
{
    auto out_sizes = output.sizes();
    auto view_sizes = input.sizes().vec();
    auto view_strides = input.strides().vec();

    for (int d = 2; d < ndims; ++d) {
        view_sizes[d] = out_sizes[d];
        view_strides[d] = 0;
    }
    input = input.as_strided(view_sizes, view_strides);
}

// Define indices: shape is 1 everywhere except output_size along reshape_dim.
std::vector<int64_t> new_shape(ndims, 1);
new_shape[reshape_dim] = output_size;

at::Tensor indices = at::arange(new_shape[reshape_dim], at::CPU(at::kLong)).reshape(new_shape);
// Turn positions 0, 1, 2, ... into byte offsets of consecutive float elements.
indices *= (int64_t) sizeof(float);

In [27]:
// Define and restride weights.
// Each of the output_size positions along reshape_dim owns weights_size consecutive floats.
int weights_size = 3;
auto new_shape = std::vector<int64_t>(ndims, 1);
new_shape[reshape_dim] = output_size * weights_size;

// Flat buffer 0, 1, ..., output_size * weights_size - 1 laid out along reshape_dim.
at::Tensor weights = at::arange(new_shape[reshape_dim], at::CPU(at::kFloat)).reshape(new_shape);

// Shrink reshape_dim back to output_size and give it stride 0: the view always points
// at the start of the buffer along that dim.
auto strides = weights.strides().vec();
new_shape[reshape_dim] = output_size;
strides[reshape_dim] = 0;
weights = weights.as_strided(new_shape, strides);

In [28]:
// Strides of the restrided input view (in elements); dims 2 and 3 were set to 0 by as_strided.
std::cout << "-- Input strides: " << input.strides() << std::endl;

-- Input strides: [150, 50, 0, 0]


In [29]:
// Strides of the index tensor (in elements); it is a plain reshape, so they are the contiguous ones.
std::cout << "-- Indices strides: " << indices.strides() << std::endl;

-- Indices strides: [4, 4, 1, 1]


In [30]:
// Strides of the weights view (in elements); the 0 on reshape_dim comes from the as_strided call.
std::cout << "-- Weights strides: " << weights.strides() << std::endl;

-- Weights strides: [12, 12, 0, 1]


In [31]:
// Build a TensorIterator over the four tensors. The order of the add_output/add_input
// calls is the operand order the loop callback sees: data[0]/strides[0] = output,
// [1] = input, [2] = indices, [3] = weights.
auto iter = at::TensorIteratorConfig()
    // indices is int64 while input/weights are float, so the same-dtype check is switched off.
    .check_all_same_dtype(false)
    // NOTE(review): dtype and device are declared up front from `input`, presumably so the
    // iterator does no dtype inference across the mixed operands; confirm against the ATen docs.
    .declare_static_dtype_and_device(input.scalar_type(), input.device())
    .add_output(output)
    .add_input(input)
    .add_input(indices)    
    .add_input(weights)
    .build();

In [34]:
// Debug callback for iter.for_each: for every chunk handed over by the iterator it prints
// the per-operand strides and the input / index / weight values reachable from that chunk.
//
// data    - base pointers of the operands in the order they were added to the iterator
//           (0: output, 1: input, 2: indices, 3: weights); the output is not touched here.
// strides - per-operand step, in bytes, between consecutive elements of the chunk.
// n       - number of elements in the chunk.
//
// Assumes float input/weights and int64 indices that hold byte offsets.
auto test_loop = [&](char **data, const int64_t* strides, int64_t n) {

    std::cout << "n : " << n << std::endl;
    std::cout << "Output stride: " << strides[0] << std::endl;
    std::cout << "Input stride: " << strides[1] << std::endl;
    std::cout << "Indices stride: " << strides[2] << std::endl;
    std::cout << "Weights stride: " << strides[3] << std::endl;

    char * in = data[1];
    char * idx = data[2];
    char * wts = data[3];

    // assume float data type for this example.
    std::cout << " - input data: " << std::endl;
    for (int64_t i = 0; i < n; i++) {
        // Read the whole 64-bit index. Indexing the char pointer directly would pick up a
        // single byte of it, which is wrong for offsets above 127 and on big-endian hosts.
        const int64_t offset = *reinterpret_cast<const int64_t*>(&idx[i * strides[2]]);
        // The index is a byte offset that is added on top of the iterator-provided position.
        std::cout << *reinterpret_cast<float*>(&in[i * strides[1] + offset]) << " ";
    }
    std::cout << std::endl;

    std::cout << " - indices data: " << std::endl;
    for (int64_t i = 0; i < n; i++) {
        std::cout << *reinterpret_cast<const int64_t*>(&idx[i * strides[2]]) << " ";
    }
    std::cout << std::endl;

    std::cout << " - weights data: " << std::endl;
    for (int64_t i = 0; i < n; i++) {
        const int64_t offset = *reinterpret_cast<const int64_t*>(&idx[i * strides[2]]);
        // offset addresses one float per position; each position owns weights_size floats,
        // so scaling by weights_size lands on the first weight of that position.
        const int64_t first_weight = i * strides[3] + weights_size * offset;
        for (int j = 0; j < weights_size; j++) {
            std::cout << *reinterpret_cast<float*>(&wts[first_weight + j * static_cast<int64_t>(sizeof(float))]) << " ";
        }
        std::cout << "| ";

    }
    std::cout << std::endl;

    std::cout << std::endl;
};



iter.for_each(test_loop);

n : 5
Output stride: 4
Input stride: 0
Indices stride: 0
Weights stride: 0
 - input data: 
0 0 0 0 0 
 - indices data: 
0 0 0 0 0 
 - weights data: 
0 1 2 | 0 1 2 | 0 1 2 | 0 1 2 | 0 1 2 | 

n : 5
Output stride: 4
Input stride: 0
Indices stride: 0
Weights stride: 0
 - input data: 
1 1 1 1 1 
 - indices data: 
4 4 4 4 4 
 - weights data: 
3 4 5 | 3 4 5 | 3 4 5 | 3 4 5 | 3 4 5 | 

n : 5
Output stride: 4
Input stride: 0
Indices stride: 0
Weights stride: 0
 - input data: 
2 2 2 2 2 
 - indices data: 
8 8 8 8 8 
 - weights data: 
6 7 8 | 6 7 8 | 6 7 8 | 6 7 8 | 6 7 8 | 

n : 5
Output stride: 4
Input stride: 0
Indices stride: 0
Weights stride: 0
 - input data: 
3 3 3 3 3 
 - indices data: 
12 12 12 12 12 
 - weights data: 
9 10 11 | 9 10 11 | 9 10 11 | 9 10 11 | 9 10 11 | 

n : 5
Output stride: 4
Input stride: 0
Indices stride: 0
Weights stride: 0
 - input data: 
50 50 50 50 50 
 - indices data: 
0 0 0 0 0 
 - weights data: 
0 1 2 | 0 1 2 | 0 1 2 | 0 1 2 | 0 1 2 | 

n : 5
Output stride: 4
In