<a href="https://colab.research.google.com/github/trefftzc/cis677/blob/main/Thrust_algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Thrust's algorithms

Based on
https://nvidia.github.io/cccl/thrust/api_docs/algorithms.html

Nine groups of algorithms:

1. Copying
2. Merging
3. Prefix sums
4. Reductions
5. Reordering
6. Searching
7. Set Operations
8. Sorting
9. Transformations

## 1. Copying

a. Gather

b. Scatter

c. swap_ranges

d. copy

e. copy_n

f. uninitialized_copy



1.a. Gather:

gather copies elements from a source array into a destination range according to a map. For each input iterator i in the range [map_first, map_last), the value input_first[*i] is assigned to *(result + (i - map_first)). RandomAccessIterator must permit random access.

In [None]:
%%writefile gather.cu
#include <thrust/gather.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>

int main() {
  // mark even indices with a 1; odd indices with a 0
  int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
  thrust::device_vector<int> d_values(values, values + 10);

  // gather all even indices into the first half of the range
  // and odd indices to the last half of the range
  int map[10]   = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9};
  thrust::device_vector<int> d_map(map, map + 10);

  thrust::device_vector<int> d_output(10);
  thrust::gather(d_map.begin(), d_map.end(),
               d_values.begin(),
               d_output.begin());
// d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}
  thrust::host_vector<int> h_output(10);
  thrust::copy(d_output.begin(), d_output.end(), h_output.begin());
  for(int value : h_output) {
    std::cout << value << " ";
  }
  std::cout << std::endl;
  return 0;
}

Overwriting gather.cu


In [None]:
!!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub gather.cu -o gather -arch sm_75


[]

In [None]:
!./gather

1 1 1 1 1 0 0 0 0 0 


1.b. scatter

scatter copies elements from a source range into an output array according to a map. For each iterator i in the range [first, last), the value *i is assigned to output[*(map + (i - first))]. The output iterator must permit random access. If the same index appears more than once in the range [map, map + (last - first)), the result is undefined.

In [None]:
%%writefile scatter.cu
#include <thrust/scatter.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>

int main() {
  // mark even indices with a 1; odd indices with a 0
  int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
  thrust::device_vector<int> d_values(values, values + 10);

  // scatter all even indices into the first half of the
  // range, and odd indices vice versa
  int map[10]   = {0, 5, 1, 6, 2, 7, 3, 8, 4, 9};
  thrust::device_vector<int> d_map(map, map + 10);

  thrust::device_vector<int> d_output(10);
  thrust::scatter(d_values.begin(), d_values.end(),
                d_map.begin(), d_output.begin());
  // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}
  thrust::host_vector<int> h_output(10);
  thrust::copy(d_output.begin(), d_output.end(), h_output.begin());
  for(int value : h_output) {
    std::cout << value << " ";
  }
  std::cout << std::endl;
  return 0;
}



Writing scatter.cu


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub scatter.cu -o scatter -arch sm_75

In [None]:
!./scatter

1 1 1 1 1 0 0 0 0 0 


1.c. swap_ranges

swap_ranges swaps each of the elements in the range [first1, last1) with the corresponding element in the range [first2, first2 + (last1 - first1)). That is, for each integer n such that 0 <= n < (last1 - first1), it swaps *(first1 + n) and *(first2 + n). The return value is first2 + (last1 - first1).

In [None]:
%%writefile swap_ranges.cu
#include <thrust/swap.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>

int main() {
 thrust::device_vector<int> v1(2), v2(2);
  v1[0] = 1;
  v1[1] = 2;
  v2[0] = 3;
  v2[1] = 4;

  thrust::swap_ranges(v1.begin(), v1.end(), v2.begin());
// v1[0] == 3, v1[1] == 4, v2[0] == 1, v2[1] == 2
  thrust::host_vector<int> h_v1(2);
  thrust::host_vector<int> h_v2(2);
  thrust::copy(v1.begin(), v1.end(), h_v1.begin());
  thrust::copy(v2.begin(), v2.end(), h_v2.begin());
  std::cout << "v1: ";
  for(int value : h_v1) {
    std::cout << value << " ";
  }
  std::cout << std::endl;
  std::cout << "v2: ";
  for(int value : h_v2) {
    std::cout << value << " ";
  }
  std::cout << std::endl;
  return 0;
}


Overwriting swap_ranges.cu


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub swap_ranges.cu -o swap_ranges -arch sm_75

In [None]:
!./swap_ranges

v1: 3 4 
v2: 1 2 


1.d. copy

copy copies elements from the range [first, last) to the range [result, result + (last - first)). That is, it performs the assignments *result = *first, *(result + 1) = *(first + 1), and so on. Generally, for every integer n from 0 to last - first, copy performs the assignment *(result + n) = *(first + n). Unlike std::copy, copy offers no guarantee on order of operation. As a result, calling copy with overlapping source and destination ranges has undefined behavior.

The return value is result + (last - first).

In [None]:
%%writefile copy.cu
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>

int main() {
  thrust::device_vector<int> vec0(10);
  thrust::device_vector<int> vec1(10);
  for(int i = 0; i < 10; ++i) {
    vec0[i] = i;
  }

  thrust::copy(vec0.begin(), vec0.end(),
             vec1.begin());

// vec1 is now a copy of vec0
  thrust::host_vector<int> h_v0(10);
  thrust::host_vector<int> h_v1(10);
  thrust::copy(vec0.begin(), vec0.end(), h_v0.begin());
  thrust::copy(vec1.begin(), vec1.end(), h_v1.begin());
  std::cout << "vec0: ";
  for(int value : h_v0) {
    std::cout << value << " ";
  }
  std::cout << std::endl;
  std::cout << "vec1: ";
  for(int value : h_v1) {
    std::cout << value << " ";
  }
  std::cout << std::endl;
  return 0;
}


Writing copy.cu


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub copy.cu -o copy -arch sm_75

In [None]:
!./copy

vec0: 0 1 2 3 4 5 6 7 8 9 
vec1: 0 1 2 3 4 5 6 7 8 9 


1.e. copy_n

copy_n copies elements from the range [first, first + n) to the range [result, result + n). That is, it performs the assignments *result = *first, *(result + 1) = *(first + 1), and so on. Generally, for every integer i with 0 <= i < n, copy_n performs the assignment *(result + i) = *(first + i). Unlike std::copy_n, copy_n offers no guarantee on order of operation. As a result, calling copy_n with overlapping source and destination ranges has undefined behavior.

The return value is result + n.

The algorithm’s execution is parallelized as determined by exec.

The following code snippet demonstrates how to use copy_n to copy the first n elements of one device_vector into another, using the thrust::device execution policy.

In [None]:
%%writefile copy_n.cu
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/execution_policy.h>
#include <iostream>

int main() {
  thrust::device_vector<int> vec0(10);
  thrust::device_vector<int> vec1(10);
  for(int i = 0; i < 10; ++i) {
    vec0[i] = i;
  }
  int n = 5;
  thrust::copy_n(thrust::device,vec0.begin(), n,
             vec1.begin());

// vec1 now contains the first 5 elements of vec0
  thrust::host_vector<int> h_v0(10);
  thrust::host_vector<int> h_v1(10);
  thrust::copy(vec0.begin(), vec0.end(), h_v0.begin());
  thrust::copy(vec1.begin(), vec1.end(), h_v1.begin());
  std::cout << "vec0: ";
  for(int value : h_v0) {
    std::cout << value << " ";
  }
  std::cout << std::endl;
  std::cout << "vec1: ";
  for(int value : h_v1) {
    std::cout << value << " ";
  }
  std::cout << std::endl;
  return 0;
}

Overwriting copy_n.cu


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub copy_n.cu -o copy_n -arch sm_75

In [None]:
!./copy_n

vec0: 0 1 2 3 4 5 6 7 8 9 
vec1: 0 1 2 3 4 0 0 0 0 0 


1.f. uninitialized_copy

In thrust, the function thrust::device_new allocates memory for an object and then creates an object at that location by calling a constructor. Occasionally, however, it is useful to separate those two operations. If each iterator in the range [result, result + (last - first)) points to uninitialized memory, then uninitialized_copy creates a copy of [first, last) in that range. That is, for each iterator i in the input, uninitialized_copy creates a copy of *i in the location pointed to by the corresponding iterator in the output range by ForwardIterator's value_type's copy constructor with *i as its argument.

The algorithm’s execution is parallelized as determined by exec.


In [None]:
%%writefile unitialized_copy.cu
#include <thrust/copy.h>
#include <thrust/device_free.h>
#include <thrust/device_malloc.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/uninitialized_copy.h>
#include <iostream>

// Thin wrapper around a single int. It has a user-provided constructor that is
// callable from both host and device code, and no default constructor.
struct Int
{
  int val;

  __host__ __device__
  Int(int x) : val{x} {}
};

// Number of Int objects copied by the demonstration.
const int N = 137;


// Demonstrates thrust::uninitialized_copy: copy-constructs N Int objects into
// storage obtained from thrust::device_malloc, then releases that storage.
// Produces no output.
int main() {
  Int val(46);
  thrust::device_vector<Int> input(N, val);

  // device_malloc only allocates; no Int is constructed in this storage yet.
  thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);
  thrust::uninitialized_copy(thrust::device, input.begin(), input.end(), array);

// Int x = array[i];
// x.val == 46 for all 0 <= i < N

  // Storage from device_malloc is not owned by any container, so it has to be
  // released explicitly; without this call the allocation is leaked.
  thrust::device_free(array);

  return 0;
}

Overwriting unitialized_copy.cu


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub unitialized_copy.cu -o unitialized_copy -arch sm_75

In [None]:
!./unitialized_copy

## 2. Merging

a. merge

b. merge_by_key

2.a. merge

merge combines two sorted ranges [first1, last1) and [first2, last2) into a single sorted range. That is, it copies from [first1, last1) and [first2, last2) into [result, result + (last1 - first1) + (last2 - first2)) such that the resulting range is in ascending order. merge is stable, meaning both that the relative order of elements within each input range is preserved, and that for equivalent elements in both input ranges the element from the first range precedes the element from the second. The return value is result + (last1 - first1) + (last2 - first2).

This version of merge compares elements using operator<.

In [None]:
%%writefile merge.cu
#include <thrust/merge.h>
#include <iostream>

using namespace std;

// Demonstrates thrust::merge on two sorted host arrays and prints the merged
// sequence on one line.
int main() {
  int A1[6] = {1, 3, 5, 7, 9, 11};
  int A2[7] = {1, 1, 2, 3, 5,  8, 13};

  int result[13];

  // merge returns a pointer one past the last element it wrote.
  int *result_end = thrust::merge(A1, A1 + 6, A2, A2 + 7, result);
  // result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13}

  for (int *cursor = result; cursor != result_end; ++cursor) {
    cout << *cursor << " ";
  }
  cout << endl;

  return 0;
}

Writing merge.cu


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub merge.cu -o merge -arch sm_75

In [None]:
!./merge

1 1 1 2 3 3 5 5 7 8 9 11 13 


2.b. merge_by_key

merge_by_key performs a key-value merge. That is, merge_by_key copies elements from [keys_first1, keys_last1) and [keys_first2, keys_last2) into a single range, [keys_result, keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that the resulting range is in ascending key order.

At the same time, merge_by_key copies elements from the two associated ranges [values_first1, values_first1 + (keys_last1 - keys_first1)) and [values_first2, values_first2 + (keys_last2 - keys_first2)) into a single range, [values_result, values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that the resulting range is in ascending order implied by each input element’s associated key.

merge_by_key is stable, meaning both that the relative order of elements within each input range is preserved, and that for equivalent elements in all input key ranges the element from the first range precedes the element from the second.

The return value is a pair of iterators: (keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) and (values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)).

This version of merge_by_key compares key elements using a function object comp.

The algorithm’s execution is parallelized using exec.

In [None]:
%%writefile merge_by_key.cu


#include <thrust/functional.h>
#include <thrust/execution_policy.h>
#include <iostream>

using namespace std;

int main() {
  int A_keys[6] = {11, 9, 7, 5, 3, 1};
  int A_vals[6] = { 0, 0, 0, 0, 0, 0};

  int B_keys[7] = {13, 8, 5, 3, 2, 1, 1};
  int B_vals[7] = { 1, 1, 1, 1, 1, 1, 1};

  int keys_result[13];
  int vals_result[13];

  thrust::pair<int*,int*> end =
    thrust::merge_by_key(thrust::host,
                       A_keys, A_keys + 6,
                       B_keys, B_keys + 7,
                       A_vals, B_vals,
                       keys_result, vals_result,
                       ::cuda::std::greater<int>());

// keys_result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1}
// vals_result = { 1,  0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1}

  for(int i = 0; i < end.first - keys_result; i++) {
    cout << keys_result[i] << " " << vals_result[i] << endl;
  }
  cout << endl;

}

Writing merge_by_key.cu


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub merge_by_key.cu -o merge_by_key -arch sm_75

In [None]:
!./merge_by_key

13 1
11 0
9 0
8 1
7 0
5 0
5 1
3 0
3 1
2 1
1 0
1 1
1 1



## 3. Prefix Sums

a. inclusive_scan

b. exclusive_scan


3.a. inclusive_scan

inclusive_scan computes an inclusive prefix sum operation. The term ‘inclusive’ means that each result includes the corresponding input operand in the partial sum. When the input and output sequences are the same, the scan is performed in-place.

inclusive_scan is similar to std::partial_sum in the STL. The primary difference between the two functions is that std::partial_sum guarantees a serial summation order, while inclusive_scan requires associativity of the binary operation to parallelize the prefix sum.

Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run.

The algorithm’s execution is parallelized as determined by exec.

In [None]:
%%writefile inclusive_scan.cu
#include <thrust/scan.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <iostream>
#include <bits/stdc++.h>

using namespace std;

// Demonstrates an in-place thrust::inclusive_scan that uses maximum as the
// binary operation, i.e. a running maximum, and prints the result.
int main() {
  int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};

  // Input and output ranges are the same array, so the scan is in place.
  thrust::inclusive_scan(thrust::host, data, data + 10, data,
                         thrust::maximum<int>());

  // data is now {-5, 0, 2, 2, 2, 4, 4, 4, 4, 8}

  for (int value : data) {
    cout << value << " ";
  }
  cout << endl;

  return 0;
}


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub inclusive_scan.cu -o inclusive_scan -arch sm_75

In [None]:
!./inclusive_scan


-5 0 2 2 2 4 4 4 4 8 


3.b. exclusive_scan

exclusive_scan computes an exclusive prefix sum operation. The term ‘exclusive’ means that each result does not include the corresponding input operand in the partial sum. More precisely, init is assigned to *result and the value binary_op(init, *first) is assigned to *(result + 1), and so on. This version of the function requires both an associative operator and an initial value init. When the input and output sequences are the same, the scan is performed in-place.

Results are not deterministic for pseudo-associative operators (e.g., addition of floating-point types). Results for pseudo-associative operators may vary from run to run.

The algorithm’s execution is parallelized as determined by exec.

In [None]:
%%writefile exclusive_scan.cu
#include <thrust/scan.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <iostream>
#include <bits/stdc++.h>

using namespace std;

// Demonstrates an in-place thrust::exclusive_scan that uses maximum as the
// binary operation and 1 as the initial value, and prints the result.
int main() {
  int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};

  const int initial_value = 1;

  // Input and output ranges are the same array, so the scan is in place.
  thrust::exclusive_scan(thrust::host, data, data + 10, data,
                         initial_value, thrust::maximum<int>());

  // data is now {1, 1, 1, 2, 2, 2, 4, 4, 4, 4 }

  for (int value : data) {
    cout << value << " ";
  }
  cout << endl;

  return 0;
}


Writing exclusive_scan.cu


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub exclusive_scan.cu -o exclusive_scan -arch sm_75

In [None]:
!./exclusive_scan

1 1 1 2 2 2 4 4 4 4 


## 4. Reductions

a. Comparisons

b. Counting

c. Extrema

d. Logical

e. Predicates

f. Transformed Reductions

g. thrust::reduce_by_key

h. thrust::reduce_into

4.a. Comparisons

equal returns true if the two ranges [first1, last1) and [first2, first2 + (last1 - first1)) are identical when compared element-by-element, and otherwise returns false.

This version of equal returns true if and only if for every iterator i in [first1, last1), binary_pred(*i, *(first2 + (i - first1))) is true.

The following code snippet demonstrates how to use equal to compare the elements in two ranges modulo 2.

In [None]:
%%writefile equal.cu
#include <thrust/equal.h>
#include <iostream>

using namespace std;

// Binary predicate that is true when x and y have the same parity, i.e. when
// they are congruent modulo 2. Callable from host and device code.
struct compare_modulo_two
{
  __host__ __device__
  bool operator()(int x, int y) const
  {
    // Compare "is even" flags instead of raw remainders: for a negative odd
    // value x % 2 is -1, so comparing remainders would treat -1 and 1 as
    // different even though both are odd.
    return (x % 2 == 0) == (y % 2 == 0);
  }
};

// Demonstrates thrust::equal with a custom binary predicate: compares x and y
// element-by-element modulo 2 and prints the boolean result (as 0 or 1).
int main() {
  constexpr int kCount = 6;
  int x[kCount] = {0, 2, 4, 6, 8, 10};
  int y[kCount] = {1, 3, 5, 7, 9, 11};

  // Span all kCount elements so that the last pair is compared as well.
  bool result = thrust::equal(x, x + kCount, y, compare_modulo_two());

  // result is false: every element of x is even and every element of y is odd.
  cout << result << endl;

  return 0;
}

Writing equal.cu


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub equal.cu -o equal -arch sm_75

In [None]:
!./equal

0


4.b. Counting

count finds the number of elements in [first,last) that are equal to value. More precisely, count returns the number of iterators i in [first, last) such that *i == value.

count_if finds the number of elements in [first,last) for which a predicate is true. More precisely, count_if returns the number of iterators i in [first, last) such that pred(*i) == true.

The algorithm’s execution is parallelized as determined by exec.

In [None]:
%%writefile count.cu
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <iostream>

using namespace std;

// Unary predicate that is true when x is odd. Callable from host and device
// code.
struct is_odd
{
  __host__ __device__
  bool operator()(int x) const
  {
    // Test against 0 rather than 1: for a negative odd value x % 2 is -1, so
    // "== 1" would wrongly report it as even.
    return x % 2 != 0;
  }
};

// Demonstrates thrust::count and thrust::count_if on a five-element device
// vector; prints each count on its own line.
int main() {
  // --- count ---
  // Start from {0, 1, 0, 1, 1}.
  const int flags[5] = {0, 1, 0, 1, 1};
  thrust::device_vector<int> vec(flags, flags + 5);

  // Number of elements equal to 1.
  const int ones = thrust::count(vec.begin(), vec.end(), 1);
  // ones == 3
  cout << ones << endl;

  // --- count_if ---
  // Overwrite the vector with 0, 1, 2, 3, 4 (a mix of even and odd values).
  for (int i = 0; i < 5; ++i) {
    vec[i] = i;
  }

  // Number of elements for which is_odd returns true.
  const int odds = thrust::count_if(thrust::device, vec.begin(), vec.end(), is_odd());
  // odds == 2
  cout << odds << endl;
  return 0;
}

Overwriting count.cu


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub count.cu -o count -arch sm_75

In [None]:
!./count

3
2


4.c. Extrema

minmax_element finds the smallest and largest elements in the range [first, last). It returns a pair of iterators (imin, imax) where imin is the same iterator returned by min_element and imax is the same iterator returned by max_element. This function is potentially more efficient than separate calls to min_element and max_element.

In [None]:
%%writefile minmax.cu
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <iostream>

using namespace std;

// Plain key/value record used as the element type of the demonstration.
struct key_value
{
  int key;
  int value;
};

// Strict-weak-ordering comparator: orders key_value records by key only and
// ignores the value field. Callable from host and device code.
struct compare_key_value
{
  // const-qualified so the stateless comparator can also be invoked through a
  // const object or const reference.
  __host__ __device__
  bool operator()(key_value lhs, key_value rhs) const
  {
    return lhs.key < rhs.key;
  }
};

int main() {
  key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} };

  thrust::pair<key_value*,key_value*> extrema = thrust::minmax_element(data, data + 4, compare_key_value());

  // extrema.first   == data + 1
  // *extrema.first  == {0,7}
  // extrema.second  == data + 3
  // *extrema.second == {6,1}
  cout << (*extrema.first).key << " " << (*extrema.first).value << endl;
  cout << (*extrema.second).key << " " << (*extrema.second).value << endl;

  return 0;
}

Overwriting minmax.cu


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub minmax.cu -o minmax -arch sm_75

In [None]:
!./minmax

0 7
6 1


In [None]:
%%writefile maxelement.cu
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <iostream>

using namespace std;

// Plain key/value record used as the element type of the demonstration.
struct key_value
{
  int key;
  int value;
};

// Strict-weak-ordering comparator: orders key_value records by key only and
// ignores the value field. Callable from host and device code.
struct compare_key_value
{
  // const-qualified so the stateless comparator can also be invoked through a
  // const object or const reference.
  __host__ __device__
  bool operator()(key_value lhs, key_value rhs) const
  {
    return lhs.key < rhs.key;
  }
};

int main() {
  key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} };

  key_value *largest = thrust::max_element(thrust::host, data, data + 4, compare_key_value());

  // largest == data + 3
  // *largest == {6,1}
  cout << (*largest).key << " " << (*largest).value << endl;

  return 0;
}

Overwriting maxelement.cu


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub maxelement.cu -o maxelement -arch sm_75

In [None]:
!./maxelement

6 1


In [None]:
%%writefile minelement.cu
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <iostream>

using namespace std;

// Plain key/value record used as the element type of the demonstration.
struct key_value
{
  int key;
  int value;
};

// Strict-weak-ordering comparator: orders key_value records by key only and
// ignores the value field. Callable from host and device code.
struct compare_key_value
{
  // const-qualified so the stateless comparator can also be invoked through a
  // const object or const reference.
  __host__ __device__
  bool operator()(key_value lhs, key_value rhs) const
  {
    return lhs.key < rhs.key;
  }
};

int main() {

  key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} };

  key_value *smallest = thrust::min_element(thrust::host, data, data + 4, compare_key_value());

// smallest == data + 1
// *smallest == {0,7}
  cout << (*smallest).key << " " << (*smallest).value << endl;

  return 0;
}

Writing minelement.cu


In [None]:
!nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub minelement.cu -o minelement -arch sm_75

In [None]:
!./minelement

0 7
