In [1]:
#pragma cling add_include_path("./libtorch/include")
#pragma cling add_include_path("./libtorch/include/torch/csrc/api/include")
#pragma cling add_library_path("./libtorch/lib")
#pragma cling load("libtorch")

In [2]:
#include <iostream>
#include <tuple>
#include <string>
#include <vector>
#include <memory>
#include <torch/torch.h>

# 1 Overview of at::Tensor

## 1.1 the raw input is stored at contiguous memory addresses
> `std::vector::data()` is a C++ STL member function that returns a direct pointer to the memory array used internally by the vector to store its owned elements.

In [3]:
// Source buffer for the tensors built below: 18 floats held in a single std::vector.
std::vector<float> flatted_data_vector1d = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18};
// Streams the whole vector; the recorded output lists the 18 values separated by spaces.
std::cout << "data vector 1d: \n" << flatted_data_vector1d << std::endl;

data vector 1d: 
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18


In [4]:
// data() yields a raw pointer to the vector's first element.
float* float_ptr = flatted_data_vector1d.data();
// Streaming a float* prints the address it holds, not the pointed-to value.
std::cout << float_ptr << std::endl;

0x55aeee3cd0e0


In [5]:
// Dereferencing reads the first element (1).
std::cout << *float_ptr << std::endl;
// Post-increment yields the OLD pointer value, so this prints the first element
// again (see the recorded output "1 / 1"). As a side effect float_ptr now
// points at the second element for any later cell that reads it.
std::cout << *(float_ptr++) << std::endl;

1
1


## 1.2 create an at::Tensor with the from_blob function

In [6]:
// Tensor metadata shared by the following cells: float32 elements, strided layout,
// CPU device, autograd tracking disabled.
torch::TensorOptions options = torch::TensorOptions().dtype(torch::kFloat32).layout(torch::kStrided).device(torch::kCPU).requires_grad(false);

In [7]:
// Interprets the vector's buffer as a 3x6 tensor using `options`.
// NOTE(review): from_blob is given only the raw pointer and no deleter, so the
// tensor presumably does not own the memory; flatted_data_vector1d must outlive
// tensor2d and must not reallocate — confirm against the from_blob documentation.
at::Tensor tensor2d = at::from_blob(/*void**/flatted_data_vector1d.data(), /*IntArrayRef*/{3,6}, /*const TensorOptions&*/options);

In [8]:
// Prints the tensor; the recorded output shows the 18 source values laid out as 3 rows of 6.
std::cout << tensor2d << std::endl;

  1   2   3   4   5   6
  7   8   9  10  11  12
 13  14  15  16  17  18
[ CPUFloatType{3,6} ]


![tensor](./tensor_hierarchy.jpg)

https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/ops/from_blob.h

~~~
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/ops/from_blob.h
// Builder that collects everything needed to wrap an existing memory blob in a
// Tensor (data pointer, sizes, optional strides/offset, release hook, options)
// and produces the tensor in make_tensor(). Instances are created through the
// friend factory for_blob(); the constructor is private.
//
// `sizes_` and `strides_` are array references, i.e. they do not copy the
// caller's arrays; the arrays must stay alive until make_tensor() has run.
class TORCH_API TensorMaker {
  friend TensorMaker for_blob(void* data, IntArrayRef sizes) noexcept;

 public:
  using ContextDeleter = DeleterFnPtr;

  // Sets explicit strides; when left unset make_tensor() uses contiguous strides.
  TensorMaker& strides(OptionalIntArrayRef value) noexcept {
    strides_ = value;
    return *this;
  }

  // Sets the storage offset applied to the created tensor.
  TensorMaker& storage_offset(optional<int64_t> value) noexcept {
    storage_offset_ = value;
    return *this;
  }

  // Sets the callback stored in deleter_. make_tensor() takes the
  // makeDataPtrFromDeleter() path whenever this callback is non-empty.
  TensorMaker& deleter(std::function<void(void*)> value) noexcept {
    deleter_ = std::move(value);
    return *this;
  }

  // Attaches an opaque context pointer together with the function used to
  // release it; a null `deleter` is replaced by detail::noopDelete.
  TensorMaker& context(void* value, ContextDeleter deleter = nullptr) noexcept {
    ctx_ = std::unique_ptr<void, ContextDeleter>{
        value, deleter != nullptr ? deleter : detail::noopDelete};

    return *this;
  }

  // Fixes the device of the blob. When unset, make_tensor() derives the device
  // from the data pointer and the device type found in the options.
  TensorMaker& target_device(optional<Device> value) noexcept {
    device_ = value;
    return *this;
  }

  // Sets the TensorOptions (dtype, device, ...) used for the created tensor.
  TensorMaker& options(TensorOptions value) noexcept {
    opts_ = value;

    return *this;
  }

  // Builds the Tensor from the collected settings.
  Tensor make_tensor();

 private:
  explicit TensorMaker(void* data, IntArrayRef sizes) noexcept : data_{data}, sizes_{sizes} {}
  std::size_t computeStorageSize() const noexcept;
  // Used by make_tensor() when a deleter callback was supplied.
  DataPtr makeDataPtrFromDeleter() const;
  // Used by make_tensor() when no deleter callback was supplied.
  DataPtr makeDataPtrFromContext() noexcept;
  IntArrayRef makeTempSizes() const noexcept;

  void* data_;
  IntArrayRef sizes_;
  OptionalIntArrayRef strides_{};
  optional<int64_t> storage_offset_{};
  std::function<void(void*)> deleter_{};
  std::unique_ptr<void, ContextDeleter> ctx_{nullptr, detail::noopDelete};
  optional<Device> device_{};
  TensorOptions opts_{};
};

// Factory for TensorMaker (whose constructor is private): records the blob
// pointer and the sizes, nothing else. The returned builder keeps `sizes` as an
// IntArrayRef, so the array behind it must outlive the builder.
inline TensorMaker for_blob(void* data, IntArrayRef sizes) noexcept {
  return TensorMaker{data, sizes};
}

// Convenience wrapper: creates a TensorMaker for `data`/`sizes`, applies
// `options`, and immediately builds the tensor. No strides, offset, deleter or
// context are configured, so the builder's defaults apply.
inline Tensor from_blob(
    void* data,
    IntArrayRef sizes,
    const TensorOptions& options = {}) {
  TensorMaker maker = for_blob(data, sizes);
  maker.options(options);
  return maker.make_tensor();
}
~~~

In [9]:
// the implementation is in https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/templates/Functions.cpp

~~~
namespace at {

// Turns the settings collected by the builder into a Tensor:
//   1. resolve the device if none was set,
//   2. wrap the raw pointer in a DataPtr (deleter- or context-based),
//   3. create a Storage of the computed byte size around that DataPtr,
//   4. create a TensorImpl on the storage and set sizes/strides/offset.
Tensor TensorMaker::make_tensor() {
  AutoDispatchBelowADInplaceOrView guard{}; // TODO: Remove.
  tracer::impl::NoTracerDispatchMode tracer_guard{};

  if (device_ == nullopt) {
    device_ = globalContext().getDeviceFromPtr(data_, opts_.device().type());
  }

  std::size_t nbytes = computeStorageSize();

  // A user-supplied deleter callback takes precedence over the context pointer.
  DataPtr blob = deleter_ ? makeDataPtrFromDeleter() : makeDataPtrFromContext();

  Storage storage{Storage::use_byte_size_t{}, nbytes, std::move(blob)};

  Tensor tensor = detail::make_tensor<TensorImpl>(
      std::move(storage), opts_.computeDispatchKey(), opts_.dtype());

  TensorImpl* impl = tensor.unsafeGetTensorImpl();
  if (!strides_) {
    impl->set_sizes_contiguous(sizes_);
  } else {
    impl->set_sizes_and_strides(sizes_, *strides_);
  }
  if (storage_offset_) {
    impl->set_storage_offset(*storage_offset_);
  }
  return tensor;
}
~~~

In [10]:
// entry_point to create at::Tensor

~~~
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/templates/TensorBody.h

namespace detail {
// Helper that creates a Tensor without requiring the caller to build the
// intrusive_ptr: the arguments are perfectly forwarded to the constructor of
// the requested impl type T, and the resulting intrusive_ptr is handed to Tensor.

template <typename T, typename... Args>
Tensor make_tensor(Args&&... args) {
  auto impl = c10::make_intrusive<T>(std::forward<Args>(args)...);
  return Tensor(std::move(impl));
}
~~~

In [11]:
// for_blob() only records its arguments: the returned TensorMaker keeps `sizes`
// as a non-owning IntArrayRef (the sizes_ member in the class excerpt). A braced
// list passed inline would be destroyed at the end of this statement and leave
// that reference dangling, so the shape is kept in a named vector that lives as
// long as tensor_maker does.
std::vector<int64_t> tensor_maker_sizes{3, 6};
at::TensorMaker tensor_maker = at::for_blob(/*void**/flatted_data_vector1d.data(), /*IntArrayRef*/tensor_maker_sizes);

In [12]:
//std::size_t size_bytes = tensor_maker.computeStorageSize();
//std::cout << "size_bytes: " << size_bytes << std::endl;

In [13]:
// Size in bytes of one element of the dtype selected in `options` (recorded output: 4).
std::size_t itemsize = options.dtype().itemsize();
std::cout << "itemsize: " << itemsize << std::endl;

itemsize: 4


In [14]:
// Reads back the metadata of tensor2d and recomputes the storage size from it.
// tensor_sizes / tensor_strides are views into tensor2d's metadata, not copies.
at::IntArrayRef tensor_sizes = tensor2d.sizes();
std::cout << tensor_sizes << std::endl;

at::IntArrayRef tensor_strides = tensor2d.strides();
std::cout << tensor_strides << std::endl;

// Re-declares `itemsize` from the previous cell, this time taken from the tensor itself.
std::size_t itemsize = tensor2d.dtype().itemsize();
std::cout << itemsize << std::endl;

// Printed for reference only; it is not passed to computeStorageNbytes below.
std::size_t storage_offset = tensor2d.storage_offset();
std::cout << storage_offset << std::endl;

// Recorded output is 72, i.e. 3 * 6 elements of 4 bytes each.
// `size_bytes` is reused later when the Storage is created by hand.
auto size_bytes = at::detail::computeStorageNbytes(tensor_sizes, tensor_strides, itemsize);
std::cout << "storage_n_bytes: " << size_bytes << std::endl;

[3, 6]
[6, 1]
4
0
storage_n_bytes: 72


PyTorch引入了一个叫做步伐（Stride）的概念，其本质上是逻辑索引的一个相对距离，表明当你从某个元素沿着某一维度移动一个元素时的距离。图中是一个二维矩阵，所以其Stride是个size为2的一维向量。当我们从左往右移动的时候（1.0 -> 1.1），由于这两个数在内存中紧挨着，所以我们只移动了一次，Stride在这一维度的值是1；当我们从上往下移动的时候（1.0 -> 2.0），在内存中这两个数之间隔着2个数，所以我们移动了3次，Stride在这一维度的值是3。最终对于这个Tensor来说，它的Stride是（3，1）
![](logic_physics_mapping.jpg)

## 1.3 create an at::Tensor from scratch

In [15]:
// Second 18-float buffer; it backs the tensor that is assembled step by step below.
std::vector<float> raw_flatted_data_vector = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18};
std::cout << "data vector 1d: \n" << raw_flatted_data_vector << std::endl;

data vector 1d: 
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18


In [16]:
// step1: create DataPtr
// Mirrors the context-based path of TensorMaker from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/ops/from_blob.h
using DeleterFnPtr = void (*)(void*);
using ContextDeleter = DeleterFnPtr;
// Empty context (nullptr) paired with at::detail::noopDelete as its deleter.
std::unique_ptr<void, ContextDeleter> ctx_{nullptr, at::detail::noopDelete};
at::DataPtr data_ptr{};
// DataPtr receives: the vector's buffer, the released context (nullptr here), the
// context deleter, and the device from `options`.
// NOTE(review): with a no-op deleter the DataPtr presumably never frees the buffer,
// so raw_flatted_data_vector must outlive every tensor built on it — confirm.
data_ptr = at::DataPtr{raw_flatted_data_vector.data(), ctx_.release(), ctx_.get_deleter(), options.device()};

In [17]:
// step2: create storage
// https://github.com/pytorch/pytorch/blob/master/c10/core/Storage.h
// `size_bytes` is the global computed earlier for tensor2d (72); the shape and dtype
// are the same here. data_ptr is moved into the storage and must not be used afterwards.
c10::Storage storage{/*use_byte_size*/c10::Storage::use_byte_size_t{}, /*size_bytes*/size_bytes, /*data_ptr*/std::move(data_ptr)};

In [18]:
// Byte size recorded in the storage (recorded output: 72).
std::cout << storage.nbytes() << std::endl;

72


In [19]:
// Dispatch key derived from `options` (recorded output: CPU); it is passed to TensorImpl below.
std::cout << options.computeDispatchKey() << std::endl;

CPU


In [20]:
// step3: create TensorImpl ptr
// https://github.com/pytorch/pytorch/blob/master/c10/core/TensorImpl.h
// Same call that detail::make_tensor<TensorImpl> performs internally. `storage` is
// moved into the TensorImpl and must not be used afterwards.
c10::intrusive_ptr<c10::TensorImpl> impl_ = c10::make_intrusive<c10::TensorImpl>(std::move(storage),options.computeDispatchKey(), options.dtype());

In [21]:
// step4: create at::Tensor
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/templates/TensorBody.h
// impl_ is passed as an lvalue, so a_tensor and impl_ both keep a reference to the
// same TensorImpl (the use_count printed later is 2).
at::Tensor a_tensor = at::Tensor(/*tensor_impl*/impl_);

In [22]:
// No sizes have been set on the TensorImpl yet: the recorded output shows an empty
// tensor with sizes [0] and strides [1], even though the storage holds 72 bytes.
std::cout << a_tensor << std::endl;
std::cout << "current sizes: " << a_tensor.sizes() << std::endl;
std::cout << "current stride: " << a_tensor.strides() << std::endl;

[ CPUFloatType{0} ]
current sizes: [0]
current stride: [1]


In [23]:
// Non-owning raw pointer to a_tensor's TensorImpl; it does not add a reference.
at::TensorImpl* tensor_impl = a_tensor.unsafeGetTensorImpl();

In [24]:
// c10::IntArrayRef is a non-owning view. Building it from a braced list makes it
// point at a temporary array that is destroyed at the end of the declaration, so
// reading it in the next statement would be undefined behaviour. Keep the shape in
// an owning vector instead; it converts implicitly to IntArrayRef at the call.
const std::vector<int64_t> target_size{3, 6};
// Sets sizes to 3x6 and derives contiguous strides (recorded output: [6, 1]).
tensor_impl->set_sizes_contiguous(target_size);

In [25]:
// After set_sizes_contiguous the same storage is shown as a 3x6 tensor with strides [6, 1].
std::cout << a_tensor << std::endl;
std::cout << "current sizes: " << a_tensor.sizes() << std::endl;
std::cout << "current stride: " << a_tensor.strides() << std::endl;

  1   2   3   4   5   6
  7   8   9  10  11  12
 13  14  15  16  17  18
[ CPUFloatType{3,6} ]
current sizes: [3, 6]
current stride: [6, 1]


In [26]:
// Current value of the TensorImpl's version counter (recorded output: 0).
std::cout << a_tensor.unsafeGetTensorImpl()->version_counter().current_version() << std::endl;

0


In [27]:
// Recorded output is 2: the TensorImpl is referenced by both impl_ and a_tensor.
std::cout << a_tensor.use_count() << std::endl;

2


In [28]:
// Copying a Tensor copies the handle, not the data: both handles report the same
// use count, which goes up by one (recorded output: 3 and 3).
at::Tensor test_tensor = a_tensor;
std::cout << test_tensor.use_count() << std::endl;
std::cout << a_tensor.use_count() << std::endl;

3
3


## 1.4 what happens when reshape and slice

当我们有了Stride后就可以快速的计算出一个元素的物理地址。熟悉C/C++的朋友会发现这其实和C/C++里的多级指针寻址方式非常类似。当我们在Tensor中查找索引是[1，1]的数时，我们通过计算索引和Stride的点积就可以得到逻辑表示的相对位置。而此元素的物理地址就是Tensor的第一个元素地址加上逻辑相对位置与字节数的乘积。最终我们计算出Tensor [1,1] 的物理地址是0x3f, 值是2.1。当然Stride在PyTorch里不仅仅是用来为某一个特定元素寻址。


前面也说了Tensor是支持Python的切片操作，所以我们经常会有个需求是查看当前Tensor里的部分元素，当我们使用切片操作时就会获得一个新的Tensor。如果我们为每个切片操作都开辟一个新的内存空间，那势必会降低程序的运行效率以及造成内存空间浪费，所以PyTorch引入了一个叫做视图的概念，所有的视图都会共享相同的内存空间，而Stride就是创建新Tensor视图的关键所在。当我们想要访问Tensor[0,1:] 时，我们创建了一个包含2个元素的一维Tensor视图，这两个元素在内存中是相邻的，所以其Stride是1。但当我们想要访问Tensor[:, 0] 时,我们虽然也创建了一个包含2个元素的一维Tensor，但是这两个数在内存中却相隔2个元素，所以其Stride是3. 正是由于Stride的存在，PyTorch才可以便捷地使所有Tensor视图共享一块内存空间。那PyTorch是怎么做到这些的呢？
![](share_same_memory.jpg)

In [29]:
// Address of a_tensor's data as reported by its TensorImpl; the views created in the
// following cells print the same address.
std::cout << tensor_impl->data() << std::endl;

0x55aef6a3adf0


In [30]:
// Reinterprets the 3x6 tensor as 6x3. The data address printed two cells below is the
// same as a_tensor's, i.e. in this case reshape produced a view rather than a copy.
at::Tensor another_tensor_from_reshape = a_tensor.reshape({6,3});

In [31]:
// The reshaped tensor reports sizes [6, 3] and strides [3, 1] (see recorded output).
std::cout << another_tensor_from_reshape << std::endl;
std::cout << "current sizes: " << another_tensor_from_reshape.sizes() << std::endl;
std::cout << "current stride: " << another_tensor_from_reshape.strides() << std::endl;

  1   2   3
  4   5   6
  7   8   9
 10  11  12
 13  14  15
 16  17  18
[ CPUFloatType{6,3} ]
current sizes: [6, 3]
current stride: [3, 1]


In [32]:
// Same address as printed for a_tensor above: both tensors read the same memory.
std::cout << another_tensor_from_reshape.unsafeGetTensorImpl()->data() << std::endl;

0x55aef6a3adf0


In [33]:
// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which
// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr.
//
// For example:
//
// void func(Tensor a) {
//   Tensor b = a;
//   ...
// }
//
// In this example, when we say Tensor b = a, we are creating a new object that points to the
// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the
// destructor decrements the reference count by calling release() on the TensorImpl it points to.
// The existing constructors, operator overloads, etc. take care to implement the correct semantics.
//
// Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and
// special care must be taken to handle this.

In [34]:
// Indexing API equivalent of Python's a_tensor[:2, :].
using torch::indexing::Slice;
using torch::indexing::None;
// Slice(None,2) keeps the first two rows; Slice() keeps every column.
at::Tensor tensor_from_slice = a_tensor.index({Slice(None,2),Slice()});
std::cout << tensor_from_slice << std::endl;
std::cout << "current sizes: " << tensor_from_slice.sizes() << std::endl;
std::cout << "current stride: " << tensor_from_slice.strides() << std::endl;
// Prints the same address as a_tensor's data: the slice starts at element 0 and
// shares the underlying memory.
std::cout << tensor_from_slice.unsafeGetTensorImpl()->data() << std::endl;

  1   2   3   4   5   6
  7   8   9  10  11  12
[ CPUFloatType{2,6} ]
current sizes: [2, 6]
current stride: [6, 1]
0x55aef6a3adf0


# 2 how to slice a tensor?

## 2.1 the underlying method in at::Tensor

In [35]:
// Reference print of the full 3x6 tensor that the slice() calls below operate on.
std::cout << a_tensor << std::endl;

  1   2   3   4   5   6
  7   8   9  10  11  12
 13  14  15  16  17  18
[ CPUFloatType{3,6} ]


In [36]:
// Rows [0, 2) with step 1 along dim 0: the first two rows (see recorded output).
std::cout << a_tensor.slice(/*dim*/0, /*start*/0, /*end*/2, /*step*/1) << std::endl;

  1   2   3   4   5   6
  7   8   9  10  11  12
[ CPUFloatType{2,6} ]


In [37]:
// Columns [0, 3) with step 1 along dim 1: the first three columns of every row.
std::cout << a_tensor.slice(/*dim*/1, /*start*/0, /*end*/3, /*step*/1) << std::endl;

  1   2   3
  7   8   9
 13  14  15
[ CPUFloatType{3,3} ]


In [38]:
// Columns [0, 5) with step 2 along dim 1: columns 0, 2 and 4 of every row.
std::cout << a_tensor.slice(/*dim*/1, /*start*/0, /*end*/5, /*step*/2) << std::endl;

  1   3   5
  7   9  11
 13  15  17
[ CPUFloatType{3,3} ]


## 2.2 high-level wrapper

~~~
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/TensorIndexing.cpp
// Entry point of the C++ indexing API: rejects an empty index list, installs a
// device guard for this tensor's device, then delegates to at::indexing::get_item.
Tensor Tensor::index(ArrayRef<at::indexing::TensorIndex> indices) const {
  TORCH_CHECK(indices.size() > 0, "Passing an empty index list to Tensor::index() is not valid syntax");
  OptionalDeviceGuard device_guard(device_of(*this));
  return at::indexing::get_item(*this, indices);
}
// Overload that accepts a braced list such as {Slice(None, 2), Slice()}; it wraps
// the list in an ArrayRef and forwards to the ArrayRef overload.
Tensor Tensor::index(std::initializer_list<at::indexing::TensorIndex> indices) const {
  return index(ArrayRef<at::indexing::TensorIndex>(indices));
}
~~~

~~~
//https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/TensorIndexing.h
static inline Tensor get_item(
    const Tensor& self,
    const ArrayRef<TensorIndex>& indices,
    bool disable_slice_optimization = false) {
  at::Device self_device = self.device();
  // NOTE [nested tensor size for indexing]
  // nested tensor does not have a size (yet) so for now we represent its size
  // as null may need to be changed after we reach a better solution for nested
  // tensor size
  c10::optional<SymIntArrayRef> self_sizes = self.is_nested()
      ? c10::optional<SymIntArrayRef>(c10::nullopt)
      : c10::optional<SymIntArrayRef>(self.sym_sizes());

  // handle simple types: integers, slices, none, ellipsis, bool
  if (indices.size() == 1) {
    const TensorIndex& index = indices[0];
    if (index.is_integer()) {
      return impl::applySelect(
          self, 0, index.integer(), 0, self_device, self_sizes);
    } else if (index.is_slice()) {
      return impl::applySlice(
          self,
          0,
          index.slice().start(),
          index.slice().stop(),
          index.slice().step(),
          /*disable_slice_optimization=*/true,
          self_device,
          self_sizes);
    } else if (index.is_none()) {
      return self.unsqueeze(0);
    } else if (index.is_ellipsis()) {
      return at::alias(self);
    } else if (index.is_boolean()) {
      Tensor result = self.unsqueeze(0);
      return dispatch_index(
          result,
          std::vector<Tensor>{impl::boolToIndexingTensor(
              result, index.boolean(), self_device)});
    }
  }
~~~

~~~
// Applies one slice index along `dim`. Only positive steps are accepted.
// When the slice optimization is enabled and the slice covers the whole
// dimension (start 0, stop == length, step 1), `self` is returned unchanged;
// otherwise the call is forwarded to Tensor::slice.
static inline Tensor applySlice(
    const Tensor& self,
    int64_t dim,
    int64_t start,
    int64_t stop,
    int64_t step,
    bool disable_slice_optimization,
    const at::Device& self_device,
    const c10::optional<SymIntArrayRef>& self_sizes) {
  // TODO: implement negative step
  TORCH_CHECK_VALUE(step > 0, "step must be greater than zero");

  // See NOTE [nested tensor size for indexing]
  if (!self_sizes.has_value()) {
    return self.slice(dim, start, stop, step);
  }

  // Skip the shortcut below if we are tracing, as the trace may be polymorphic
  // over the shape of the `self` tensor, and we still want to record
  // the slice.
  SymInt length = (self_device == at::kCPU || self_device == at::kCUDA)
      ? (*self_sizes)[dim]
      : self.sym_size(dim);
  if (!disable_slice_optimization) {
    if (start == 0 && length == stop && step == 1) {
      return self;
    }
  }
  return self.slice(dim, start, stop, step);
}
~~~

## 2.3 how is Tensor.slice implemented?

https://github.com/pytorch/pytorch/tree/master/aten/src/ATen/native

ATen "native" functions are the modern mechanism for adding operators and functions to ATen. Native functions are declared in native_functions.yaml and have implementations defined in one of the cpp files in this directory.

Like all ATen methods/functions, native functions are made available from both ATen's C++ and Python APIs. In C++, they are made available either as methods on Tensor (t.mymeth()) and functions in the ATen namespace (at::myfunc()). In PyTorch, they are made available as methods on Variable or as functions on torch._C._FunctionBase. (It is the user's responsibility to re-export these functions in a more user-facing module.)

### Registering a function in native_functions.yaml
Every native function must have an entry in native_functions.yaml
### the low-level operation behind reshape is as_strided
~~~
- func: as_strided(Tensor(a) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a)
  variants: function, method
  dispatch:
    ZeroTensor, CPU, CUDA: as_strided_tensorimpl
    Meta: as_strided_tensorimpl_meta_symint
    MPS: as_strided_tensorimpl_mps
    QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl
  device_check: NoCheck
  device_guard: False
  
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorShape.cpp

// Creates a new tensor that shares `self`'s storage, key set and dtype but has
// the given sizes and strides. If no storage offset is supplied, the offset of
// `self` is used.
Tensor as_strided_tensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional<int64_t> storage_offset_) {
  const auto offset = storage_offset_.value_or(self.storage_offset());
  Tensor view = at::detail::make_tensor<TensorImpl>(
      c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype());
  setStrided(view, size, stride, offset);
  return view;
}

// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Resize.h

// Writes sizes, strides and storage offset into `self`'s TensorImpl in one call.
// `size` and `stride` must have the same number of entries.
template <typename T>
inline void setStrided(
    const Tensor& self,
    ArrayRef<T> size,
    ArrayRef<T> stride,
    T storage_offset) {
  TORCH_CHECK(size.size() == stride.size(), "mismatch in length of strides and shape");
  /* the storage offset is applied together with the sizes and strides */
  self.unsafeGetTensorImpl()->set_sizes_and_strides(
      size, stride, c10::make_optional(storage_offset));
}
~~~

#### what happens when Tensor.permute()?

~~~
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorShape.cpp

// Reorders the dimensions of `self` according to `dims`. The permuted sizes and
// strides come from _permute_size_stride_estimation; the result is created with
// as_strided, so only metadata changes.
Tensor permute(const Tensor& self, IntArrayRef dims) {
  auto estimate = _permute_size_stride_estimation(self, dims);
  // The third tuple element is not needed here.
  DimVector permuted_sizes = std::move(std::get<0>(estimate));
  DimVector permuted_strides = std::move(std::get<1>(estimate));
  return self.as_strided(permuted_sizes, permuted_strides);
}
~~~

#### what happens when Tensor.reshape()?

~~~
// Thin forwarder: all of the work happens in view_impl.
Tensor view(const Tensor& self,
            at::IntArrayRef size) {
  return view_impl(self, size);
}

// Creates a view of `self` with the requested sizes.
//   1. infer_size_dv resolves `size` against self.numel().
//   2. computeStride tries to find strides that express the new shape on the
//      existing memory layout; it returns an empty optional when none exist.
//   3. The result aliases self's storage with the new sizes and strides.
// Unlike reshape, view never copies: an incompatible shape is reported as an error.
Tensor view_impl(const Tensor& self, IntArrayRef size) {

  at::DimVector inferred_size = at::infer_size_dv(size, self.numel());
  auto stride = at::detail::computeStride(self.sizes(),
                                          self.strides(),
                                          inferred_size);
  // Dereferencing an empty optional is undefined behaviour, so fail loudly instead.
  TORCH_CHECK(stride.has_value(), "view size is "
    "not compatible with input tensor's size and stride (at least one dimension"
    " spans across two contiguous subspaces). Use .reshape(...) instead.");
  return alias_with_sizes_and_strides(self, inferred_size, *stride);
}

// Builds a tensor that shares `self`'s storage, key set, dtype and storage
// offset, but carries the given sizes and strides. Quantized inputs get a
// QTensorImpl that reuses self's quantizer; everything else gets a plain
// TensorImpl. Names are propagated from `self` to the result.
template <typename Vec>
Tensor alias_with_sizes_and_strides(
    const Tensor& self,
    const Vec& sizes,
    const Vec& strides) {
  //caller should make sure that sizes and strides are valid for self
  //(storage is sufficient, strides are non-negative, strides and sizes array size is the same)
  Tensor alias = self.is_quantized()
      ? at::detail::make_tensor<QTensorImpl>(
            c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype(), get_qtensorimpl(self)->quantizer())
      : at::detail::make_tensor<TensorImpl>(
            c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype());

  // The metadata update is the same for both impl types.
  auto* impl = alias.unsafeGetTensorImpl();
  impl->set_storage_offset(self.storage_offset());
  impl->set_sizes_and_strides(sizes, strides);

  namedinference::propagate_names(alias, self);
  return alias;
}

// Returns a tensor with the proposed shape.
// If the new shape can be expressed on the existing memory (computeStride
// yields strides), the result aliases `self`; otherwise `self` is cloned into
// contiguous memory and the clone is viewed with the new shape.
// Sparse tensors are rejected.
Tensor reshape(const Tensor& self, IntArrayRef proposed_shape) {
  if (self.is_sparse()) {
    AT_ERROR("reshape is not implemented for sparse tensors");
  }
  DimVector shape = infer_size_dv(proposed_shape, self.numel());

  auto stride = at::detail::computeStride(self.sizes(), self.strides(), shape);
  if (!stride.has_value()) {
    // No compatible strides: a copy is unavoidable.
    return at::_unsafe_view(self.clone(at::MemoryFormat::Contiguous), shape);
  }

  // Temporary check to revert to the old behavior/view in cases where the
  // device is not supported (e.g. for XLA the operation is not supported
  // so we use `view` instead).
  //
  // We need to do the checks here instead of in `native_functions.yaml`
  // to preserve backwards compatibility.
  const bool use_plain_view = self.is_xla() || self.is_lazy() || self.is_ipu();
  if (use_plain_view) {
    return self.view(shape);
  }
  return self._reshape_alias(shape, stride.value());
}

~~~

#### understand Tensor.slice()

~~~
- func: slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
  variants: function, method
  device_check: NoCheck
  device_guard: False
  dispatch:
    CompositeExplicitAutograd: slice
  tags: canonical

// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorShape.cpp

// Returns the elements of `self` in [start, end) along `dim`, taking every
// `step`-th element. The result is built with as_strided on the same storage:
// only the size and stride of `dim` and the storage offset differ from `self`.
// Missing start/end default to 0 / INT64_MAX; negative values count from the end
// of the dimension; out-of-range values are clamped. `step` must be positive.
Tensor slice(
    const Tensor& self,
    int64_t dim,
    c10::optional<int64_t> start,
    c10::optional<int64_t> end,
    int64_t step) {
  int64_t ndim = self.dim();
  if (ndim == 0) {
    TORCH_CHECK_INDEX(false, "slice() cannot be applied to a 0-dim tensor.");
  }
  dim = maybe_wrap_dim(dim, ndim);
  // Mutable copies of the metadata; only the entry for `dim` is modified below.
  DimVector sizes(self.sizes().begin(), self.sizes().end());
  DimVector strides(self.strides().begin(), self.strides().end());
  // handle optional parameters
  int64_t start_val = start.has_value() ? start.value() : 0;
  int64_t end_val = end.has_value() ? end.value() : INT64_MAX;

  // TODO: support negative strides
  TORCH_CHECK(step > 0, "slice step must be positive");

  // Negative bounds are relative to the end of the dimension.
  if (start_val < 0) {
    start_val += sizes[dim];
  }
  if (end_val < 0) {
    end_val += sizes[dim];
  }
  // Clamp start into [0, size].
  if (start_val < 0) {
    start_val = 0;
  } else if (start_val >= sizes[dim]) {
    start_val = sizes[dim];
  }
  // Clamp end into [start, size]; end < start yields an empty slice.
  if (end_val < start_val) {
    end_val = start_val;
  } else if (end_val >= sizes[dim]) {
    end_val = sizes[dim];
  }
  // The first selected element sits start_val steps along `dim` from self's first element.
  auto storage_offset = self.storage_offset() + start_val * strides[dim];
  auto len = end_val - start_val;
  sizes[dim] = (len + step - 1) / step; // round-up
  // Consecutive result elements are `step` source elements apart.
  strides[dim] *= step;

  Tensor result;
  if (self.is_quantized()) {
    auto quantizer = create_subtensor_quantizer(self, false, start_val, end_val, dim, step);
    result = as_strided_qtensorimpl(self, sizes, strides, storage_offset, std::move(quantizer));
  } else {
    // NB: it is extremely important to perform a redispatch here for
    // the MPS backend; if you call directly to as_strided_tensorimpl,
    // the necessary metadata for MPS will not get setup and you will
    // get silently wrong results
    result = self.as_strided(sizes, strides, storage_offset);
  }
  namedinference::propagate_names(result, self);
  return result;
}
~~~

## 2.5 how do tensor operations behave when tensors share the same data pointer?

In [39]:
// Address of a_tensor's first element.
std::cout << a_tensor.data_ptr() << std::endl;

0x55aef6a3adf0


In [40]:
// Same address as a_tensor.data_ptr(): the slice begins at row 0 and shares the memory.
std::cout << tensor_from_slice.data_ptr() << std::endl;

0x55aef6a3adf0


In [41]:
// Unlike data_ptr(), data() yields a tensor: the recorded output is the 2x6 contents, not an address.
std::cout << tensor_from_slice.data() << std::endl;

  1   2   3   4   5   6
  7   8   9  10  11  12
[ CPUFloatType{2,6} ]


# reference

* https://zhuanlan.zhihu.com/p/569278062

~~~
struct C10_API TensorImpl : public c10::intrusive_ptr_target {

 protected:
  Storage storage_;
  c10::VariableVersion version_counter_;

  c10::impl::SizesAndStrides sizes_and_strides_;
  int64_t storage_offset_ = 0;
  int64_t numel_ = 1;


  TensorImpl() = delete;
  virtual ~TensorImpl() override;
  
  TensorImpl(
      Storage&& storage,
      DispatchKey dispatch_key,
      const caffe2::TypeMeta data_type)
      : TensorImpl(
            std::move(storage),
            DispatchKeySet(dispatch_key),
            data_type) {}
            
  public:
  TensorImpl(const TensorImpl&) = delete;
  TensorImpl& operator=(const TensorImpl&) = delete;
  TensorImpl(TensorImpl&&) = delete;
  TensorImpl& operator=(TensorImpl&&) = delete;

  IntArrayRef sizes() const {
    return sizes_and_strides_.sizes_arrayref();
  }

  IntArrayRef strides() const {
    return sizes_and_strides_.strides_arrayref();
  }
  
  int64_t storage_offset() const {
    return storage_offset_;
  }
  
  int64_t numel() const {
    return numel_;
  }
  
  void refresh_numel() {
      numel_ = c10::multiply_integers(sizes_and_strides_.sizes_arrayref());
  }

  // Replaces the tensor's sizes and strides, refreshes the cached element count
  // and, when provided, updates the storage offset.
  // A negative entry in new_stride is replaced by a derived value: 1 for the
  // innermost dimension, otherwise max(size, 1) * stride of the next inner dimension.
  // NOTE(review): this excerpt does not check that new_size and new_stride have
  // the same length; callers are presumably expected to guarantee it.
  void set_sizes_and_strides(
      IntArrayRef new_size,
      IntArrayRef new_stride,
      c10::optional<int64_t> storage_offset = c10::nullopt) {

    const auto new_dim = new_size.size();

    sizes_and_strides_.set_sizes(new_size);

    if (new_dim > 0) {
      // Walks from the innermost dimension outwards so that a derived stride can
      // read the already-written stride of dim + 1. The index is unsigned, hence
      // the explicit break at dim == 0 instead of a `dim >= 0` loop condition.
      for (size_t dim = new_dim - 1;; dim--) {
        if (new_stride[dim] >= 0) {
          sizes_and_strides_.stride_at_unchecked(dim) = new_stride[dim];
        } else {
          // XXX: This behavior is surprising and may need to be removed to
          // support negative strides. Some pytorch functions rely on it:
          // for example, torch.cat (run TestTorch.test_cat_empty).
          if (dim == new_dim - 1) {
            sizes_and_strides_.stride_at_unchecked(dim) = 1;
          } else {
            // Keep stride monotonically increasing to match NumPy.
            sizes_and_strides_.stride_at_unchecked(dim) =
                std::max<int64_t>(
                    sizes_and_strides_.size_at_unchecked(dim + 1), 1) *
                sizes_and_strides_.stride_at_unchecked(dim + 1);
          }
        }
        if (dim == 0)
          break;
      }
    }

    // numel_ is derived from the sizes, so it must be recomputed after they change.
    refresh_numel();
    //refresh_contiguous();

    // The offset is left untouched when the caller does not pass one.
    if (storage_offset.has_value()) {
      storage_offset_ = *storage_offset;
    }
  }
  //
  template <typename T>
  inline T* data_ptr_impl() const {
    // Caller does the type check.
    return storage_.unsafe_data<T>() + storage_offset_;
  }
~~~

In [42]:
~~~
class TensorBase {

protected:
  c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
  
public:
  TensorBase() = default;
  // This constructor should not be used by end users and is an implementation
  // detail invoked by autogenerated code.
  explicit TensorBase(
      c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
      : impl_(std::move(tensor_impl)) {
    if (impl_.get() == nullptr) {
      throw std::runtime_error("TensorImpl with nullptr is not supported");
    }
  }
  TensorBase(const TensorBase&) = default;
  TensorBase(TensorBase&&) = default;
  
  TensorImpl* unsafeGetTensorImpl() const {
    return impl_.get();
  }
  
  // Returns the untyped address of this tensor's first element.
  // TensorImpl::data_ptr_impl is a template over the element type and cannot be
  // called without a template argument, so the untyped accessor
  // TensorImpl::data() is used here instead.
  void* data_ptr() const {
    return this->unsafeGetTensorImpl()->data();
  }
  
  
  int64_t numel() const {
    return impl_->numel();
  }
  
  IntArrayRef sizes() const {
    return impl_->sizes();
  }

  IntArrayRef strides() const {
    return impl_->strides();
  }
  
  int64_t storage_offset() const {
    return impl_->storage_offset();
  }

~~~

input_line_48:3:1: error: expected expression
class TensorBase {
^
<<< cling interactive line includer >>>:1:1: error: expected '}'
input_line_48:1:43: note: to match this '{'
void __cling_Un1Qu337(void* vpClingValue) {
                                          ^


Interpreter Error: 