In [1]:
import open3d as o3d
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from pointnet2_ops import pointnet2_utils
from knn_cuda import KNN

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [6]:
print(device)

cuda


In [13]:
print("Load a ply point cloud, print it, and render it")
path = "../dataset/OldDataset/Barn_is/Barn/Barn01.ply"
pcd = o3d.io.read_point_cloud(path)
print(pcd)

voxel_grid = o3d.geometry.VoxelGrid.create_from_point_cloud(pcd,
                                                        voxel_size=0.01)
voxels = voxel_grid.get_voxels()  # returns list of voxels
indices = np.stack(list(vx.grid_index for vx in voxels))
colors = np.stack(list(vx.color for vx in voxels))
# print(indices[0:10])
# print(voxel_grid)
o3d.visualization.draw_geometries([voxel_grid])

voxel_grid = o3d.geometry.VoxelGrid.create_from_point_cloud(pcd,
                                                    voxel_size=0.2)
voxels = voxel_grid.get_voxels()  # returns list of voxels
indices = np.stack(list(vx.grid_index for vx in voxels))
colors = np.stack(list(vx.color for vx in voxels))
# print(indices[0:10])
# print(voxel_grid)
o3d.visualization.draw_geometries([voxel_grid])


Load a ply point cloud, print it, and render it
PointCloud with 0 points.


RPly: Unable to open file


ValueError: need at least one array to stack

In [15]:
class Tokenizer (nn.Module):
    def __init__(self, num_group, group_size):
        super().__init__()
        self.num_group = num_group
        self.group_size = group_size
        self.knn = KNN(k=self.group_size, transpose_mode=True)
        self.mask_ratio = 0.6

    def forward(self, xyz):
            '''
                input: B N 3
                ---------------------------
                output: B G M 3
                center : B G 3
            '''
            batch_size, num_points, _ = xyz.shape
            # fps the centers out
            center = self.fps(xyz, self.num_group) # B G 3
            # knn to get the neighborhood
            _, idx = self.knn(xyz, center) # B G M
            assert idx.size(1) == self.num_group
            assert idx.size(2) == self.group_size
            idx_base = torch.arange(0, batch_size, device=xyz.device).view(-1, 1, 1) * num_points
            idx = idx + idx_base
            idx = idx.view(-1)
            neighborhood = xyz.view(batch_size * num_points, -1)[idx, :]
            neighborhood = neighborhood.view(batch_size, self.num_group, self.group_size, 3).contiguous()
            # normalize
            neighborhood = neighborhood - center.unsqueeze(2)
            return neighborhood, center
    
    def masking(self, center, noaug = False):
        '''
            center : B G 3
            --------------
            mask : B G (bool)
        '''
        B, G, _ = center.shape
        # skip the mask
        if noaug or self.mask_ratio == 0:
            return torch.zeros(center.shape[:2]).bool()

        self.num_mask = int(self.mask_ratio * G)

        overall_mask = np.zeros([B, G])
        for i in range(B):
            mask = np.hstack([
                np.zeros(G-self.num_mask),
                np.ones(self.num_mask),
            ])
            np.random.shuffle(mask)
            overall_mask[i, :] = mask
        overall_mask = torch.from_numpy(overall_mask).to(torch.bool)

        return overall_mask.to(center.device) # B G
    
    def fps(self, data, number):
        '''
            data B N 3
            number int
        '''
        # print(number)
        # print("yoyoyo",data.scalar_type())
        fps_idx = pointnet2_utils.furthest_point_sample(data, number) 
        fps_data = pointnet2_utils.gather_operation(data.transpose(1, 2).contiguous(), fps_idx).transpose(1,2).contiguous()
        print(fps_data)
        return fps_data
    
def farthest_point_sample(point, npoint):
    """
    Input:
        xyz: pointcloud data, [N, D]
        npoint: number of samples
    Return:
        centroids: sampled pointcloud index, [npoint, D]
    """
    N, D = point.shape
    xyz = point[:,:3]
    centroids = np.zeros((npoint,))
    distance = np.ones((N,)) * 1e10
    farthest = np.random.randint(0, N)
    for i in range(npoint):
        centroids[i] = farthest
        centroid = xyz[farthest, :]
        dist = np.sum((xyz - centroid) ** 2, -1)
        mask = dist < distance
        distance[mask] = dist[mask]
        farthest = np.argmax(distance, -1)
    point = point[centroids.astype(np.int32)]
    return point


In [16]:
tokenizer = Tokenizer(32,128)

In [17]:
from torchvision import transforms

class PointcloudScaleAndTranslate(object):
    def __init__(self, scale_low=2. / 3., scale_high=3. / 2., translate_range=0.2):
        self.scale_low = scale_low
        self.scale_high = scale_high
        self.translate_range = translate_range

    def __call__(self, pc):
        bsize = pc.size()[0]
        for i in range(bsize):
            xyz1 = np.random.uniform(low=self.scale_low, high=self.scale_high, size=[3])
            xyz2 = np.random.uniform(low=-self.translate_range, high=self.translate_range, size=[3])
            
            pc[i, :, 0:3] = torch.mul(pc[i, :, 0:3], torch.from_numpy(xyz1).float().cuda()) + torch.from_numpy(xyz2).float().cuda()
            
        return pc


train_transforms = transforms.Compose(
    [
        # data_transforms.PointcloudScale(),
        # data_transforms.PointcloudRotate(),
        # data_transforms.PointcloudRotatePerturbation(),
        # data_transforms.PointcloudTranslate(),
        # data_transforms.PointcloudJitter(),
        # data_transforms.PointcloudRandomInputDropout(),
        PointcloudScaleAndTranslate(),
    ]
)



In [18]:
torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [20]:
path = "../dataset/TanksAndTemples/pcd/Barn01.npy"
np_pcd = np.load(path)

print(np_pcd.shape)
tensor_pcd = torch.from_numpy(np.reshape(np_pcd, (1, np_pcd.shape[0], np_pcd.shape[1]))).to(device).cuda().float()

(16384, 3)


In [21]:
tensor_pcd = train_transforms(tensor_pcd)

In [22]:
print(tensor_pcd)

tensor([[[  3.6213,  -3.8718, -30.1582],
         [-18.8036,  -3.0308, -27.6767],
         [ -3.9251,   4.7235, -23.9526],
         ...,
         [  2.2797,  -3.9714, -30.3885],
         [  2.3024,  -3.9701, -28.3812],
         [  3.6038,  -3.4454, -30.5889]]], device='cuda:0')


In [23]:
test, centers = tokenizer.forward(tensor_pcd)

tensor([[[  3.6213,  -3.8718, -30.1582],
         [-18.8036,  -3.0308, -27.6767],
         [ -3.9251,   4.7235, -23.9526],
         [ -7.8995,  -3.7772, -31.2021],
         [  4.4982,   5.7477, -31.1705],
         [ -2.3236,  -2.9521, -24.9997],
         [  1.6275,   1.9024, -26.8119],
         [-13.8521,  -3.8867, -31.2086],
         [ -2.0782,  -2.9137, -31.1197],
         [ -9.7824,  -2.8209, -26.6483],
         [  4.3347,   0.8771, -31.1768],
         [ -5.4326,  -2.8684, -27.8820],
         [  1.5097,  -2.3424, -26.8467],
         [-13.9261,  -2.5668, -27.1863],
         [ -1.9032,   1.0804, -25.0669],
         [ -0.1743,   4.7108, -25.8325],
         [  3.9415,   4.2433, -28.1107],
         [-17.2273,  -3.3755, -31.1153],
         [  3.7811,  -0.2665, -28.1048],
         [ -5.6008,  -2.9063, -24.5514],
         [-10.7339,  -2.8871, -29.7989],
         [  0.6886,  -4.3652, -31.2128],
         [ -4.8332,  -4.2503, -31.1903],
         [ -2.0804,  -3.0258, -28.0697],
         [ -4.11

In [25]:
print(test.shape)

# print(test[0].shape)
# print(test[1].shape)

torch.Size([1, 32, 128, 3])


In [26]:
pcd_patch = test.cpu().detach().numpy()

In [27]:
print(pcd_patch.shape)

(1, 32, 128, 3)


In [18]:
import open3d as o3d


# Pass xyz to Open3D.o3d.geometry.PointCloud and visualize
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(pcd_patch[0][31])
o3d.visualization.draw_geometries([pcd])

In [24]:
bool_masked_pos = tokenizer.masking(centers, False)

In [25]:
print(bool_masked_pos)

tensor([[False,  True,  True,  True,  True, False,  True, False,  True, False,
          True, False,  True, False, False,  True, False,  True, False,  True,
          True,  True,  True,  True,  True, False, False, False,  True,  True,
         False,  True]], device='cuda:0')


In [None]:
group_input_tokens = self.encoder(neighborhood)  #  B G C

batch_size, seq_len, C = group_input_tokens.size()

x_vis = group_input_tokens[~bool_masked_pos].reshape(batch_size, -1, C)
# add pos embedding
# mask pos center
masked_center = center[~bool_masked_pos].reshape(batch_size, -1, 3)
pos = self.pos_embed(masked_center)

# transformer
x_vis = self.blocks(x_vis, pos)
x_vis = self.norm(x_vis)

In [4]:
from transformers import AutoImageProcessor, ViTModel
import torch
from datasets import load_dataset

dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
image = dataset["test"]["image"][0]

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

cats-image.py:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/173k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

[1, 197, 768]