Commit 33f8612: Rewrite crop cuda kernel
erictzeng committed Apr 19, 2017 (1 parent: eeebdab)
Showing 3 changed files with 69 additions and 80 deletions.
include/caffe/layers/crop_layer.hpp (4 additions, 2 deletions)
@@ -41,13 +41,15 @@ class CropLayer : public Layer<Dtype> {
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

-  vector<int> offsets;
+  Blob<int> offsets;
+  Blob<int> src_strides_;
+  Blob<int> dest_strides_;

private:
// Recursive copy function.
void crop_copy(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top,
-      const vector<int>& offsets,
+      const int* offsets,
vector<int> indices,
int cur_dim,
const Dtype* src_data,
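Why the type change: a plain vector<int> lives only in host memory, so the old recursive launcher had to fold the offsets into pointer arithmetic on the CPU. Blob<int> wraps SyncedMemory, which keeps matching host and device copies of the same array, so the new kernels can read the offsets and strides directly through a device pointer. A minimal sketch of that pattern, with a hypothetical helper name (stage_for_gpu is not part of the commit):

#include <vector>
#include "caffe/blob.hpp"

// Hypothetical helper: stage a small per-axis integer array for kernel use.
const int* stage_for_gpu(caffe::Blob<int>* blob, const std::vector<int>& values) {
  blob->Reshape(std::vector<int>(1, static_cast<int>(values.size())));
  int* host = blob->mutable_cpu_data();         // write on the host
  for (size_t i = 0; i < values.size(); ++i) {
    host[i] = values[i];
  }
  return blob->gpu_data();  // read on the device; the copy happens lazily here
}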
src/caffe/layers/crop_layer.cpp (16 additions, 5 deletions)
@@ -40,8 +40,10 @@ void CropLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
const int start_axis = bottom[0]->CanonicalAxisIndex(param.axis());

// Initialize offsets to 0 and the new shape to the current shape of the data.
-  offsets = vector<int>(input_dim, 0);
vector<int> new_shape(bottom[0]->shape());
+  vector<int> offsets_shape(1, input_dim);
+  offsets.Reshape(offsets_shape);
+  int* offset_data = offsets.mutable_cpu_data();

// Determine crop offsets and the new shape post-crop.
for (int i = 0; i < input_dim; ++i) {
@@ -63,15 +65,22 @@ void CropLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
<< "size " << bottom[1]->shape(i) << " and offset " << crop_offset;
}
new_shape[i] = new_size;
-    offsets[i] = crop_offset;
+    offset_data[i] = crop_offset;
}
top[0]->Reshape(new_shape);
+  // Compute strides
+  src_strides_.Reshape(offsets_shape);
+  dest_strides_.Reshape(offsets_shape);
+  for (int i = 0; i < input_dim; ++i) {
+    src_strides_.mutable_cpu_data()[i] = bottom[0]->count(i + 1, input_dim);
+    dest_strides_.mutable_cpu_data()[i] = top[0]->count(i + 1, input_dim);
+  }
}
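The strides filled in here are ordinary row-major strides: src_strides_[i] is the number of elements one step along axis i spans in the bottom blob, and dest_strides_[i] the same for the top, since Blob::count(i + 1, input_dim) is the product of the trailing dimensions. A standalone sketch of the same computation, checked against assumed example shapes:

#include <vector>

// Row-major strides as Reshape computes them: stride[i] = product of dims after i.
std::vector<int> row_major_strides(const std::vector<int>& shape) {
  std::vector<int> strides(shape.size(), 1);
  for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * shape[i + 1];
  }
  return strides;
}

// Assumed shapes: row_major_strides({2, 3, 4, 5}) == {60, 20, 5, 1}  (bottom/src)
//                 row_major_strides({2, 3, 2, 3}) == {18, 6, 3, 1}   (top/dest)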

template <typename Dtype>
void CropLayer<Dtype>::crop_copy(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top,
-    const vector<int>& offsets,
+    const int* offsets,
vector<int> indices,
int cur_dim,
const Dtype* src_data,
@@ -115,7 +124,8 @@ void CropLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
std::vector<int> indices(top[0]->num_axes(), 0);
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
-  crop_copy(bottom, top, offsets, indices, 0, bottom_data, top_data, true);
+  crop_copy(bottom, top, offsets.cpu_data(), indices, 0, bottom_data, top_data,
+      true);
}

template <typename Dtype>
@@ -127,7 +137,8 @@ void CropLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
if (propagate_down[0]) {
caffe_set(bottom[0]->count(), static_cast<Dtype>(0), bottom_diff);
std::vector<int> indices(top[0]->num_axes(), 0);
-    crop_copy(bottom, top, offsets, indices, 0, top_diff, bottom_diff, false);
+    crop_copy(bottom, top, offsets.cpu_data(), indices, 0, top_diff,
+        bottom_diff, false);
}
}

src/caffe/layers/crop_layer.cu (49 additions, 73 deletions)
@@ -4,103 +4,79 @@

namespace caffe {

-// Copy (one line per thread) from one array to another, with arbitrary
-// strides in the last two dimensions.
+__device__ int compute_uncropped_index(
+    int index,
+    const int ndims,
+    const int* src_strides,
+    const int* dest_strides,
+    const int* offsets) {
+  int dest_index = index;
+  int src_index = 0;
+  for (int i = 0; i < ndims; ++i) {
+    int coord = dest_index / dest_strides[i];
+    dest_index -= coord * dest_strides[i];
+    src_index += src_strides[i] * (coord + offsets[i]);
+  }
+  return src_index;
+}
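compute_uncropped_index peels per-axis coordinates off a flat top index using the destination strides, shifts each coordinate by its crop offset, and re-linearizes with the source strides. A host-side replica with a worked check under assumed shapes (bottom (1, 1, 4, 5) cropped to top (1, 1, 2, 3) at offsets (0, 0, 1, 1)):

#include <cassert>

// Host-side replica of compute_uncropped_index, for checking the arithmetic.
int uncropped_index(int index, int ndims, const int* src_strides,
                    const int* dest_strides, const int* offsets) {
  int dest_index = index;
  int src_index = 0;
  for (int i = 0; i < ndims; ++i) {
    int coord = dest_index / dest_strides[i];  // coordinate along axis i
    dest_index -= coord * dest_strides[i];     // remainder for the later axes
    src_index += src_strides[i] * (coord + offsets[i]);
  }
  return src_index;
}

int main() {
  const int src_strides[]  = {20, 20, 5, 1};   // bottom (1, 1, 4, 5)
  const int dest_strides[] = {6, 6, 3, 1};     // top (1, 1, 2, 3)
  const int offsets[]      = {0, 0, 1, 1};
  // Flat top index 4 is coords (0, 0, 1, 1); offsets shift it to (0, 0, 2, 2),
  // i.e. bottom index 2 * 5 + 2 = 12.
  assert(uncropped_index(4, 4, src_strides, dest_strides, offsets) == 12);
  return 0;
}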

template <typename Dtype>
-__global__ void copy_kernel(const int n, const int height, const int width,
-    const int src_inner_stride,
-    const int dest_inner_stride,
+__global__ void crop_kernel_forward(const int nthreads,
+    const int ndims,
+    const int* src_strides,
+    const int* dest_strides,
+    const int* offsets,
const Dtype* src, Dtype* dest) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int src_start = index * src_inner_stride;
-    int dest_start = index * dest_inner_stride;
-    for (int i = 0; i < width; ++i) {
-      dest[dest_start + i] = src[src_start + i];
-    }
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int src_index = compute_uncropped_index(
+        index, ndims, src_strides, dest_strides, offsets);
+    dest[index] = src[src_index];
}
}

template <typename Dtype>
-void CropLayer<Dtype>::crop_copy_gpu(const vector<Blob<Dtype>*>& bottom,
-             const vector<Blob<Dtype>*>& top,
-             const vector<int>& offsets,
-             vector<int> indices,
-             int cur_dim,
-             const Dtype* src_data,
-             Dtype* dest_data,
-             bool is_forward) {
-  if (cur_dim + 2 < top[0]->num_axes()) {
-    // We are not yet at the final dimension, call copy recursivley
-    for (int i = 0; i < top[0]->shape(cur_dim); ++i) {
-      indices[cur_dim] = i;
-      crop_copy_gpu(bottom, top, offsets, indices, cur_dim+1,
-                    src_data, dest_data, is_forward);
-    }
-  } else {
-    // We are at the last two dimensions, which are stored continuously in
-    // memory. With (N,C,H,W)
-    //      (0,1,2,3) cur_dim   -> H
-    //                cur_dim+1 -> W
-    const int lines = top[0]->shape(cur_dim);
-    const int height = top[0]->shape(cur_dim);
-    const int width = top[0]->shape(cur_dim+1);
-    std::vector<int> ind_off(cur_dim+2, 0);
-    for (int j = 0; j < cur_dim; ++j) {
-      ind_off[j] = indices[j] + offsets[j];
-    }
-    ind_off[cur_dim] = offsets[cur_dim];
-    ind_off[cur_dim+1] = offsets[cur_dim+1];
-    // Compute copy strides
-    const int src_inner_stride = bottom[0]->shape(cur_dim+1);
-    const int dest_inner_stride = top[0]->shape(cur_dim+1);
-
-    if (is_forward) {
-      const Dtype* bottom_data = bottom[0]->gpu_data() +
-          bottom[0]->offset(ind_off);
-      Dtype* top_data = top[0]->mutable_gpu_data() +
-          top[0]->offset(indices);
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      copy_kernel<<<CAFFE_GET_BLOCKS(lines), CAFFE_CUDA_NUM_THREADS>>>(
-          lines, height, width,
-          src_inner_stride,
-          dest_inner_stride,
-          bottom_data, top_data);
-
-    } else {
-      const Dtype* top_diff = top[0]->gpu_diff() +
-          top[0]->offset(indices);
-      Dtype* bottom_diff = bottom[0]->mutable_gpu_diff() +
-          bottom[0]->offset(ind_off);
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      copy_kernel<<<CAFFE_GET_BLOCKS(lines), CAFFE_CUDA_NUM_THREADS>>>(
-          lines, height, width,
-          dest_inner_stride,
-          src_inner_stride,
-          top_diff, bottom_diff);
-    }
+__global__ void crop_kernel_backward(const int nthreads,
+    const int ndims,
+    const int* src_strides,
+    const int* dest_strides,
+    const int* offsets,
+    Dtype* src, const Dtype* dest) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int src_index = compute_uncropped_index(
+        index, ndims, src_strides, dest_strides, offsets);
+    src[src_index] = dest[index];
}
}
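The two kernels are mirror images around the same index computation: the forward pass gathers (each thread reads one bottom element and writes one top element), while the backward pass scatters top_diff back into bottom_diff. Because cropping maps each top element to a distinct bottom element, no two backward threads ever write the same address, so no atomics are needed. A 1-D host-side illustration of the pair, with assumed sizes (crop a size-5 source to size 3 at offset 1):

#include <cassert>
#include <vector>

int main() {
  // Forward: gather dest[i] = src[i + offset].
  std::vector<float> src = {10, 11, 12, 13, 14};
  std::vector<float> dest(3, 0);
  for (int i = 0; i < 3; ++i) dest[i] = src[i + 1];
  // Backward: scatter src_diff[i + offset] = dest_diff[i]; each write lands on
  // a distinct element, and cropped-out positions keep their pre-set zeros.
  std::vector<float> src_diff(5, 0);
  std::vector<float> dest_diff = {1, 2, 3};
  for (int i = 0; i < 3; ++i) src_diff[i + 1] = dest_diff[i];
  assert(src_diff[0] == 0 && src_diff[2] == 2 && src_diff[4] == 0);
  return 0;
}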

template <typename Dtype>
void CropLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
-  std::vector<int> indices(top[0]->num_axes(), 0);
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* top_data = top[0]->mutable_gpu_data();
-  crop_copy_gpu(bottom, top, offsets, indices, 0, bottom_data, top_data, true);
+  int n = top[0]->count();
+  crop_kernel_forward<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(n,
+      bottom[0]->num_axes(),
+      src_strides_.gpu_data(),
+      dest_strides_.gpu_data(),
+      offsets.gpu_data(),
+      bottom_data, top_data);
}
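The launch now runs one thread per top element (n = top[0]->count()) rather than one thread per contiguous row, so the grid no longer depends on the crop geometry. CAFFE_GET_BLOCKS is Caffe's standard grid-sizing helper, a ceiling division of n by the block size; roughly, assuming the stock definition in caffe/util/device_alternate.hpp with CAFFE_CUDA_NUM_THREADS = 512:

// Ceiling division: enough blocks of threads_per_block threads to cover n.
inline int caffe_get_blocks(int n, int threads_per_block) {
  return (n + threads_per_block - 1) / threads_per_block;
}
// e.g. n = 1000 with 512-thread blocks -> 2 blocks, 1024 threads; the
// index < nthreads check in CUDA_KERNEL_LOOP leaves the surplus 24 idle.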

template <typename Dtype>
void CropLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
const Dtype* top_diff = top[0]->gpu_diff();
Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  int n = top[0]->count();

if (propagate_down[0]) {
caffe_gpu_set(bottom[0]->count(), static_cast<Dtype>(0), bottom_diff);
-    std::vector<int> indices(top[0]->num_axes(), 0);
-    crop_copy_gpu(bottom, top, offsets, indices, 0, top_diff, bottom_diff,
-        false);
+    crop_kernel_backward<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(n,
+        bottom[0]->num_axes(),
+        src_strides_.gpu_data(),
+        dest_strides_.gpu_data(),
+        offsets.gpu_data(),
+        bottom_diff, top_diff);
}
}

