In [None]:
import torch
import torch.utils.dlpack
import open3d as o3d
import numpy as np

## Tensor creation

```cpp
// Three alternative ways to construct a tensor of shape {2, 3} and dtype
// Int32. Every statement declares `a`, so they are independent alternatives
// and are not meant to be pasted together into a single scope.

// With initial values
// (six values are supplied for the 2 * 3 = 6 elements of the tensor)
Tensor a(std::vector<int>{0, 1, 2, 3, 4, 5}, // init_vals
         {2, 3},                             // shape
         Dtype::Int32);                      // dtype

// Without initial values
// NOTE(review): element contents are presumably unspecified until written --
// confirm against the Tensor constructor.
Tensor a({2, 3},                             // shape
         Dtype::Int32);                      // dtype

// Optionally, specify a device
// (the two forms above omit the device argument and rely on the constructor's
// default)
Tensor a({2, 3},                             // shape
         Dtype::Int32,                       // dtype
         Device("CUDA:0"));                  // device
```

## Properties of a tensor

```cpp
// Read back a tensor's metadata through its getters. The shape, dtype and
// device queried here are the ones passed to the constructor on the first line.
Tensor a({2, 3}, Dtype::Int32, Device("CPU:0"));
SizeVector shape   = a.GetShape();    // constructed with shape {2, 3}
SizeVector strides = a.GetStrides();  // NOTE(review): stride units (elements vs. bytes) not shown here -- confirm
Dtype dtype        = a.GetDtype();    // constructed with Dtype::Int32
Device device      = a.GetDevice();   // constructed with Device("CPU:0")
```

## Copy & device transfer

```cpp
// Copy(Device) is called on `a` with the destination device and its result is
// stored in `b`. The three pairs below are independent examples: each one
// redeclares `a` and `b`, so use only one pair per scope.

// Host -> Device
Tensor a({2, 3}, Dtype::Int32, Device("CPU:0"));
Tensor b = a.Copy(Device("CUDA:0"));

// Device -> Host
Tensor a({2, 3}, Dtype::Int32, Device("CUDA:0"));
Tensor b = a.Copy(Device("CPU:0"));

// Device -> Device
// NOTE(review): "CUDA:1" presumably requires a second CUDA device -- confirm.
Tensor a({2, 3}, Dtype::Int32, Device("CUDA:0"));
Tensor b = a.Copy(Device("CUDA:1"));
```

## Type casting

```cpp
// E.g. Float32 -> Int32
// To(Dtype) is called on the Float32 tensor `a`; the converted result is
// stored in `b`.
Tensor a({2, 3}, Dtype::Float32, Device("CPU:0"));
Tensor b = a.To(Dtype::Int32);
```

## Numpy I/O with direct memory map

```cpp
// N/A in C++, see Python docs.
```

## PyTorch I/O with DLPack memory map

```cpp
// N/A in C++, see Python docs.
```

## Binary element-wise operation: add, sub, mul, div, ...

```cpp
// Element-wise arithmetic between two Float32 tensors of identical shape {3}.
Tensor a(std::vector<float>{1, 1, 1}, {3}, Dtype::Float32, Device("CPU:0"));
Tensor b(std::vector<float>{2, 2, 2}, {3}, Dtype::Float32, Device("CPU:0"));
Tensor c = a + b;  // add
Tensor d = a - b;  // sub
Tensor e = a * b;  // mul
Tensor f = a / b;  // div

// Automated broadcasting
// Independent example (it redeclares a, b and c): the operands have different
// shapes, {2, 3} and {3}.
Tensor a({2, 3}, Dtype::Int32, Device("CPU:0"));
Tensor b({3}, Dtype::Int32, Device("CPU:0"));
Tensor c = a + b;
```

## Unary element-wise operation: sqrt, sin, cos, ...

```cpp
// Element-wise unary operations on a Float32 tensor of shape {3}. Each call
// is made on `a` and its result is stored in a new tensor.
Tensor a(std::vector<float>{4, 9, 16}, {3}, Dtype::Float32, Device("CPU:0"));
Tensor b = a.Sqrt();
Tensor c = a.Sin();
Tensor d = a.Cos();
```

## Reduction: sum, prod, min, max

```cpp
std::vector<float> vals{0,  1,  2,  3,  4,  5,  
                        6,  7,  8,  9,  10, 11,
                        12, 13, 14, 15, 16, 17, 
                        18, 19, 20, 21, 22, 23};
Tensor a(vals, {2, 3, 4}, Dtype::Float32, device);

Tensor b = a.Sum()
Tensor c = a.Prod()
Tensor d = a.Min()
Tensor e = a.Max()
```

```cpp
// With specified dimension
Tensor b = a.Sum({0})
Tensor c = a.Sum({0, 2})
Tensor c = a.Sum({0, 2}, /*keepdim*/True)
```

## Slicing, indexing, getitem (returns a view), and setitem

```cpp
std::vector<float> vals{0,  1,  2,  3,  4,  5,  
                        6,  7,  8,  9,  10, 11,
                        12, 13, 14, 15, 16, 17, 
                        18, 19, 20, 21, 22, 23};
Tensor a(vals, {2, 3, 4}, Dtype::Float32, device);

// Slicing __getitem__
// a[1:]
Tensor b = a.IndexGet(TensorKey::Slice(1, None, None))

// Indexing __getitem__
// a[1, 2]
Tensor c = a.IndexGet({TensorKey::Index(1), TensorKey::Index(2)})

// Combined __getitem__
// a[:-1, 0:3:2, 2]
Tensor d = a.IndexGet({TensorKey::Slice(None, None, -1), 
                       TensorKey::Slice(0, 3, 2),
                       TensorKey::Index(2)})
```

```cpp
// Setitem: assign `val` to the selection obtained from IndexGet. `a` and
// `device` come from the preceding getitem snippet.
// a[:, :, 2] = val
// `val` is built from a single value (100) with an empty shape {}.
Tensor val(std::vector<float>{100}, {}, Dtype::Float32, device);
// Slice(None, None, None) corresponds to a bare `:` for that dimension.
a.IndexGet({TensorKey::Slice(None, None, None), 
            TensorKey::Slice(None, None, None),
            TensorKey::Index(2)}) = val;
```

## Advanced indexing

```cpp
std::vector<float> vals{0,  1,  2,  3,  4,  5,  
                        6,  7,  8,  9,  10, 11,
                        12, 13, 14, 15, 16, 17, 
                        18, 19, 20, 21, 22, 23};
Tensor a(vals, {2, 3, 4}, Dtype::Float32, device);

// a[:, [1, 2], [1, 2]]
std::vector<Tensor> indices = {
        Tensor(SizeVector(), Dtype::Int64, device),
        Tensor(std::vector<int64_t>({1, 2}), {2}, Dtype::Int64, device),
        Tensor(std::vector<int64_t>({1, 2}), {2}, Dtype::Int64, device)};
Tensor b = a.IndexGet(indices);


// a[:. [1, 2], [1, 2]] = value
a.IndexSet(indices, value);
```