Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Because the KITTI Semantic dataset's points are 4D (xyz + brightness), I tried to write a 4D version of the chamfer kernel by imitation — could you help me check whether my code is correct? #18

Open
LeopoldACC opened this issue Sep 20, 2021 · 1 comment

Comments

@LeopoldACC
Copy link

LeopoldACC commented Sep 20, 2021

Hi @ThibaultGROUEIX, thanks for your work! My 4D version, written by imitating the existing kernels, is below.

#include <stdio.h>
#include <ATen/ATen.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <vector>



// Brute-force nearest-neighbour kernel for batches of 4D points (x, y, z/r, brightness/g).
// For each query point j in xyz (b x n x 4, flattened row-major) it finds the minimum
// squared L2 distance to any point of xyz2 (b x m x 4) and writes it to result[i*n+j],
// storing the index of the winner in result_i[i*n+j].
//
// Launch layout: blockIdx.x strides over batches, (blockIdx.y, threadIdx.x) form a
// grid-stride loop over the n query points. xyz2 is staged through shared memory in
// tiles of `batch` points; all threads of the block cooperate to fill each tile.
__global__ void NmDistanceKernel(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i){
	const int batch=2048;          // xyz2 points staged per shared-memory tile
	__shared__ float buf[batch*4]; // 2048 points * 4 floats = 32 KB tile
	for (int i=blockIdx.x;i<b;i+=gridDim.x){
		for (int k2=0;k2<m;k2+=batch){
			int end_k=min(m,k2+batch)-k2; // number of valid points in this tile
			// Cooperative copy of the tile (end_k points, 4 floats each).
			for (int j=threadIdx.x;j<end_k*4;j+=blockDim.x){
				buf[j]=xyz2[(i*m+k2)*4+j];
			}
			__syncthreads(); // tile fully loaded before anyone reads it
			for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
				float x1=xyz[(i*n+j)*4+0];
				float y1=xyz[(i*n+j)*4+1];
				float r1=xyz[(i*n+j)*4+2];
				float g1=xyz[(i*n+j)*4+3];
				int best_i=0;
				float best=0;
				// BUGFIX: round end_k down to a multiple of 4 with &3 (low two
				// bits). The original `end_k&4` tested bit 2 instead, so the
				// 4-way unrolled loop and the remainder loop mis-partitioned
				// the tile (points skipped or double-counted).
				int end_ka=end_k-(end_k&3);
				if (end_ka==batch){
					// Full tile: 4-way unrolled scan over all `batch` points.
					for (int k=0;k<batch;k+=4){
						{
							float x2=buf[k*4+0]-x1;
							float y2=buf[k*4+1]-y1;
							float r2=buf[k*4+2]-r1;
							float g2=buf[k*4+3]-g1;
							float d=x2*x2+y2*y2+r2*r2+g2*g2;
							if (k==0 || d<best){ // k==0 seeds `best`
								best=d;
								best_i=k+k2;
							}
						}
						{
							float x2=buf[k*4+4]-x1;
							float y2=buf[k*4+5]-y1;
							float r2=buf[k*4+6]-r1;
							float g2=buf[k*4+7]-g1;
							float d=x2*x2+y2*y2+r2*r2+g2*g2;
							if (d<best){
								best=d;
								best_i=k+k2+1;
							}
						}
						{
							float x2=buf[k*4+8]-x1;
							float y2=buf[k*4+9]-y1;
							float r2=buf[k*4+10]-r1;
							float g2=buf[k*4+11]-g1;
							float d=x2*x2+y2*y2+r2*r2+g2*g2;
							if (d<best){
								best=d;
								best_i=k+k2+2;
							}
						}
						{
							float x2=buf[k*4+12]-x1;
							float y2=buf[k*4+13]-y1;
							float r2=buf[k*4+14]-r1;
							float g2=buf[k*4+15]-g1;
							float d=x2*x2+y2*y2+r2*r2+g2*g2;
							if (d<best){
								best=d;
								best_i=k+k2+3;
							}
						}
					}
				}else{
					// Partial tile: unrolled scan over the multiple-of-4 prefix.
					for (int k=0;k<end_ka;k+=4){
						{
							float x2=buf[k*4+0]-x1;
							float y2=buf[k*4+1]-y1;
							float r2=buf[k*4+2]-r1;
							float g2=buf[k*4+3]-g1;
							float d=x2*x2+y2*y2+r2*r2+g2*g2;
							if (k==0 || d<best){
								best=d;
								best_i=k+k2;
							}
						}
						{
							float x2=buf[k*4+4]-x1;
							float y2=buf[k*4+5]-y1;
							float r2=buf[k*4+6]-r1;
							float g2=buf[k*4+7]-g1;
							float d=x2*x2+y2*y2+r2*r2+g2*g2;
							if (d<best){
								best=d;
								best_i=k+k2+1;
							}
						}
						{
							float x2=buf[k*4+8]-x1;
							float y2=buf[k*4+9]-y1;
							float r2=buf[k*4+10]-r1;
							float g2=buf[k*4+11]-g1;
							float d=x2*x2+y2*y2+r2*r2+g2*g2;
							if (d<best){
								best=d;
								best_i=k+k2+2;
							}
						}
						{
							// BUGFIX: stride is 4 floats per point, not 5 — the
							// original buf[k*5+12..15] read the wrong elements.
							float x2=buf[k*4+12]-x1;
							float y2=buf[k*4+13]-y1;
							float r2=buf[k*4+14]-r1;
							float g2=buf[k*4+15]-g1;
							float d=x2*x2+y2*y2+r2*r2+g2*g2;
							if (d<best){
								best=d;
								best_i=k+k2+3;
							}
						}
					}
				}
				// Remainder: up to 3 points left over after the unrolled scan.
				for (int k=end_ka;k<end_k;k++){
					// BUGFIX: stride 4, not 5 (points are packed as 4 floats).
					float x2=buf[k*4+0]-x1;
					float y2=buf[k*4+1]-y1;
					float r2=buf[k*4+2]-r1;
					float g2=buf[k*4+3]-g1;
					float d=x2*x2+y2*y2+r2*r2+g2*g2;
					if (k==0 || d<best){ // k==0 only when end_ka==0 (tiny tile)
						best=d;
						best_i=k+k2;
					}
				}
				// Merge this tile's best with the running best across tiles.
				if (k2==0 || result[(i*n+j)]>best){
					result[(i*n+j)]=best;
					result_i[(i*n+j)]=best_i;
				}
			}
			__syncthreads(); // don't overwrite the tile while others still read it
		}
	}
}
// int chamfer_cuda_forward(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i,float * result2,int * result2_i, cudaStream_t stream){
// Launches the nearest-neighbour kernel in both directions (A->B and B->A).
// xyz1: (b, n, 4) float tensor, xyz2: (b, m, 4) float tensor.
// dist1/idx1 receive, for each point of xyz1, the squared distance to and index
// of its nearest neighbour in xyz2; dist2/idx2 the symmetric result.
// Returns 1 on success, 0 if a CUDA error was pending after the launches.
int chamfer_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2){
	const auto batch_size = xyz1.size(0);
	const auto n = xyz1.size(1); // number of points in cloud A
	const auto m = xyz2.size(1); // number of points in cloud B

	const dim3 grid(32, 16, 1);
	const int threads = 512;

	// Direction A -> B, then B -> A with the roles of the clouds swapped.
	NmDistanceKernel<<<grid, threads>>>(batch_size, n, xyz1.data<float>(), m, xyz2.data<float>(), dist1.data<float>(), idx1.data<int>());
	NmDistanceKernel<<<grid, threads>>>(batch_size, m, xyz2.data<float>(), n, xyz1.data<float>(), dist2.data<float>(), idx2.data<int>());

	const cudaError_t err = cudaGetLastError();
	if (err == cudaSuccess) {
		return 1;
	}
	printf("error in nnd updateOutput: %s\n", cudaGetErrorString(err));
	//THError("aborting");
	return 0;
}
// Backward pass for the 4D nearest-neighbour distance.
// For each point j of xyz1 with nearest neighbour j2 = idx1[i*n+j] in xyz2,
// the forward distance is d = |p1 - p2|^2, so dL/dp1 = 2*grad_dist1*(p1-p2)
// and dL/dp2 = -dL/dp1. Gradients are accumulated with atomicAdd because
// several points of xyz1 may share the same nearest neighbour in xyz2.
// Both point arrays are packed as 4 consecutive floats per point.
__global__ void NmDistanceGradKernel(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,float * grad_xyz1,float * grad_xyz2){
	for (int i=blockIdx.x;i<b;i+=gridDim.x){
		for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
			// BUGFIX: points are stored with stride 4 (4 floats each); the
			// original read with stride 5, fetching garbage / out-of-bounds
			// values while the writes below correctly used stride 4.
			float x1=xyz1[(i*n+j)*4+0];
			float y1=xyz1[(i*n+j)*4+1];
			float r1=xyz1[(i*n+j)*4+2];
			float g1=xyz1[(i*n+j)*4+3];
			int j2=idx1[i*n+j]; // index of nearest neighbour found in forward pass
			float x2=xyz2[(i*m+j2)*4+0];
			float y2=xyz2[(i*m+j2)*4+1];
			float r2=xyz2[(i*m+j2)*4+2];
			float g2=xyz2[(i*m+j2)*4+3];
			float g=grad_dist1[i*n+j]*2; // chain rule: d(|p1-p2|^2)/dp1 = 2*(p1-p2)
			atomicAdd(&(grad_xyz1[(i*n+j)*4+0]),g*(x1-x2));
			atomicAdd(&(grad_xyz1[(i*n+j)*4+1]),g*(y1-y2));
			atomicAdd(&(grad_xyz1[(i*n+j)*4+2]),g*(r1-r2));
			atomicAdd(&(grad_xyz1[(i*n+j)*4+3]),g*(g1-g2));
			atomicAdd(&(grad_xyz2[(i*m+j2)*4+0]),-(g*(x1-x2)));
			atomicAdd(&(grad_xyz2[(i*m+j2)*4+1]),-(g*(y1-y2)));
			atomicAdd(&(grad_xyz2[(i*m+j2)*4+2]),-(g*(r1-r2)));
			atomicAdd(&(grad_xyz2[(i*m+j2)*4+3]),-(g*(g1-g2)));
		}
	}
}
// int chamfer_cuda_backward(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,const float * grad_dist2,const int * idx2,float * grad_xyz1,float * grad_xyz2, cudaStream_t stream){
// Launches the gradient kernel for both directions of the chamfer loss.
// graddist1/idx1 belong to the A->B term and graddist2/idx2 to the B->A term;
// both launches accumulate into gradxyz1 and gradxyz2 (with swapped roles).
// Returns 1 on success, 0 if a CUDA error was pending after the launches.
int chamfer_cuda_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2){
	// cudaMemset(grad_xyz1,0,b*n*3*4);
	// cudaMemset(grad_xyz2,0,b*m*3*4);

	const auto batch_size = xyz1.size(0);
	const auto n = xyz1.size(1); // number of points in cloud A
	const auto m = xyz2.size(1); // number of points in cloud B

	const dim3 grid(1, 16, 1);
	const int threads = 256;

	NmDistanceGradKernel<<<grid, threads>>>(batch_size,n,xyz1.data<float>(),m,xyz2.data<float>(),graddist1.data<float>(),idx1.data<int>(),gradxyz1.data<float>(),gradxyz2.data<float>());
	NmDistanceGradKernel<<<grid, threads>>>(batch_size,m,xyz2.data<float>(),n,xyz1.data<float>(),graddist2.data<float>(),idx2.data<int>(),gradxyz2.data<float>(),gradxyz1.data<float>());

	const cudaError_t err = cudaGetLastError();
	if (err == cudaSuccess) {
		return 1;
	}
	printf("error in nnd get grad: %s\n", cudaGetErrorString(err));
	//THError("aborting");
	return 0;
}

Another question: is there any good tutorial or book for learning how to write CUDA ops? Thanks a lot!

@LeopoldACC LeopoldACC changed the title Hi,@ ThibaultGROUEIX Thanks for your work!Due to KITTI Semantic datasets' dimension is 4D(xyz+brightness),I tried to imitate a 4d version of chamfer,can you help me to inspect whether my code is correct? Due to KITTI Semantic datasets' dimension is 4D(xyz+brightness),I tried to imitate a 4d version of chamfer,can you help me to inspect whether my code is correct? Sep 20, 2021
@ThibaultGROUEIX
Copy link
Owner

Looks good! Sorry, I don't know of any good teaching resource. You could also have used Chamfer5D or 6D and set a few dimensions to 0 in your tensors.
Best regards,

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants