Skip to content

Commit

Permalink
Improve counter handling for CBRNGs (arrayfire#2122)
Browse files Browse the repository at this point in the history
* Improve counter handling for CBRNGs (CPU backend)

* Use the full 64 bits of the counter
* Separate counter and key/seed
* Check for carry when increasing counter (only Threefry)

* Improve counter handling for CBRNGs (OpenCL backend)

* Use the full 64 bits of the counter
* Separate counter and key/seed
* Check for carry when increasing counter
* Use unused counter values for second threefry round

* Improve counter handling for CBRNGs (CUDA backend)

* Use the full 64 bits of the counter
* Separate counter and key/seed
* Check for carry when increasing counter

* Add (disabled) Test for RNG period

This test repeatedly draws 2^20 random numbers and compares them to the
first set of numbers. This fails if a RNG has a period of n * 2^20 with
integer n <= 2^12, i.e. in particular 2^32.

* Add (disabled) test for RNG quality

This test repeatedly draws random numbers and generates a histogram from
them. It calculates the chi^2 statistic for the individual step as well
as for the accumulated random numbers. The test fails if two consecutive
statistics are too large or too small. Too large means that the random
numbers are not uniform enough. Too small means that they are "too
uniform", since some amount of random noise is expected.

Additional notes:
* One should also test randn, but that would increase the run-time even
  more.
* The failure conditions are fragile. In principle one should accept a
  larger range of chi^2 values together with more steps, possibly with
  different initial seeds. But this would increase run-time even more.
* As a consequence of the fragile conditions, false positives can occur.
  For example, MERSENNE with double on CPU failed early in some tests.

* Add program to test RNGs with PractRand
  • Loading branch information
rstub authored and syurkevi committed Jul 25, 2018
1 parent b1494c1 commit 12749f6
Show file tree
Hide file tree
Showing 7 changed files with 211 additions and 49 deletions.
33 changes: 22 additions & 11 deletions src/backend/cpu/kernel/random_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,10 @@ namespace kernel
{
uint hi = seed>>32;
uint lo = seed;
uint key[2] = {(uint)counter, hi};
uint ctr[4] = {(uint)counter, 0, 0, lo};
uint hic = counter>>32;
uint loc = counter;
uint key[2] = {lo, hi};
uint ctr[4] = {loc, hic, 0, 0};

int reset = (4*sizeof(uint))/sizeof(T);
for (int i = 0; i < (int)elements; i += reset) {
Expand All @@ -119,14 +121,17 @@ namespace kernel
{
uint hi = seed>>32;
uint lo = seed;
uint key[2] = {(uint)counter, hi};
uint ctr[2] = {(uint)counter, lo};
uint hic = counter>>32;
uint loc = counter;
uint key[2] = {lo, hi};
uint ctr[2] = {loc, hic};
uint val[2];

int reset = (2*sizeof(uint))/sizeof(T);
for (int i = 0; i < (int)elements; i += reset) {
threefry(key, ctr, val);
++ctr[0]; ++key[0];
++ctr[0];
ctr[1] += (ctr[0] == 0);
int lim = (reset < (int)(elements - i))? reset : (int)(elements - i);
for (int j = 0; j < lim; ++j) {
out[i + j] = transform<T>(val, j);
Expand Down Expand Up @@ -162,8 +167,10 @@ namespace kernel
{
uint hi = seed>>32;
uint lo = seed;
uint key[2] = {(uint)counter, hi};
uint ctr[4] = {(uint)counter, 0, 0, lo};
uint hic = counter>>32;
uint loc = counter;
uint key[2] = {lo, hi};
uint ctr[4] = {loc, hic, 0, 0};
T temp[(4*sizeof(uint))/sizeof(T)];

int reset = (4*sizeof(uint))/sizeof(T);
Expand All @@ -182,17 +189,21 @@ namespace kernel
{
uint hi = seed>>32;
uint lo = seed;
uint key[2] = {(uint)counter, hi};
uint ctr[2] = {(uint)counter, lo};
uint hic = counter>>32;
uint loc = counter;
uint key[2] = {lo, hi};
uint ctr[2] = {loc, hic};
uint val[4];
T temp[(4*sizeof(uint))/sizeof(T)];

int reset = (4*sizeof(uint))/sizeof(T);
for (int i = 0; i < (int)elements; i += reset) {
threefry(key, ctr, val);
++ctr[0]; ++key[0];
++ctr[0];
ctr[1] += (ctr[0] == 0);
threefry(key, ctr, val+2);
++ctr[0]; ++key[0];
++ctr[0];
ctr[1] += (ctr[0] == 0);
boxMullerTransform(val, temp);
int lim = (reset < (int)(elements - i))? reset : (int)(elements - i);
for (int j = 0; j < lim; ++j) {
Expand Down
74 changes: 45 additions & 29 deletions src/backend/cuda/kernel/random_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,11 +400,14 @@ namespace kernel
}

template <typename T>
__global__ void uniformPhilox(T *out, uint hi, uint lo, uint counter, uint elementsPerBlock, uint elements)
__global__ void uniformPhilox(T *out, uint hi, uint lo, uint hic, uint loc, uint elementsPerBlock, uint elements)
{
uint index = blockIdx.x*elementsPerBlock + threadIdx.x;
uint key[2] = {index+counter, hi};
uint ctr[4] = {index+counter, 0, 0, lo};
uint key[2] = {lo, hi};
uint ctr[4] = {loc, hic, 0, 0};
ctr[0] += index;
ctr[1] += (ctr[0] < loc);
ctr[2] += (ctr[1] < hic);
if (blockIdx.x != (gridDim.x - 1)) {
philox(key, ctr);
writeOut128Bytes(out, index, ctr[0], ctr[1], ctr[2], ctr[3]);
Expand All @@ -415,21 +418,24 @@ namespace kernel
}

template <typename T>
__global__ void uniformThreefry(T *out, uint hi, uint lo, uint counter, uint elementsPerBlock, uint elements)
__global__ void uniformThreefry(T *out, uint hi, uint lo, uint hic, uint loc, uint elementsPerBlock, uint elements)
{
uint index = blockIdx.x*elementsPerBlock + threadIdx.x;
uint key[2] = {index+counter, hi};
uint ctr[2] = {index+counter, lo};
uint key[2] = {lo, hi};
uint ctr[2] = {loc, hic};
ctr[0] += index;
ctr[1] += (ctr[0] < loc);
uint o[4];

threefry(key, ctr, o);
uint step = elementsPerBlock / 2;
ctr[0] += step;
ctr[1] += (ctr[0] < step);
threefry(key, ctr, o + 2);

if (blockIdx.x != (gridDim.x - 1)) {
threefry(key, ctr, o);
ctr[0] += elements;
threefry(key, ctr, o + 2);
writeOut128Bytes(out, index, o[0], o[1], o[2], o[3]);
} else {
threefry(key, ctr, o);
ctr[0] += elements;
threefry(key, ctr, o + 2);
partialWriteOut128Bytes(out, index, o[0], o[1], o[2], o[3], elements);
}
}
Expand Down Expand Up @@ -496,11 +502,14 @@ namespace kernel
}

template <typename T>
__global__ void normalPhilox(T *out, uint hi, uint lo, uint counter, uint elementsPerBlock, uint elements)
__global__ void normalPhilox(T *out, uint hi, uint lo, uint hic, uint loc, uint elementsPerBlock, uint elements)
{
uint index = blockIdx.x*elementsPerBlock + threadIdx.x;
uint key[2] = {index+counter, hi};
uint ctr[4] = {index+counter, 0, 0, lo};
uint key[2] = {lo, hi};
uint ctr[4] = {loc, hic, 0, 0};
ctr[0] += index;
ctr[1] += (ctr[0] < loc);
ctr[2] += (ctr[1] < hic);
if (blockIdx.x != (gridDim.x - 1)) {
philox(key, ctr);
boxMullerWriteOut128Bytes(out, index, ctr[0], ctr[1], ctr[2], ctr[3]);
Expand All @@ -511,21 +520,24 @@ namespace kernel
}

template <typename T>
__global__ void normalThreefry(T *out, uint hi, uint lo, uint counter, uint elementsPerBlock, uint elements)
__global__ void normalThreefry(T *out, uint hi, uint lo, uint hic, uint loc, uint elementsPerBlock, uint elements)
{
uint index = blockIdx.x*elementsPerBlock + threadIdx.x;
uint key[2] = {index+counter, hi};
uint ctr[2] = {index+counter, lo};
uint key[2] = {lo, hi};
uint ctr[2] = {loc, hic};
ctr[0] += index;
ctr[1] += (ctr[0] < loc);
uint o[4];
if (blockIdx.x != (gridDim.x - 1)) {
threefry(key, ctr, o);
ctr[0] += elements;
threefry(key, ctr, o + 2);

threefry(key, ctr, o);
uint step = elementsPerBlock / 2;
ctr[0] += step;
ctr[1] += (ctr[0] < step);
threefry(key, ctr, o + 2);

if (blockIdx.x != (gridDim.x - 1)) {
boxMullerWriteOut128Bytes(out, index, o[0], o[1], o[2], o[3]);
} else {
threefry(key, ctr, o);
ctr[0] += elements;
threefry(key, ctr, o + 2);
partialBoxMullerWriteOut128Bytes(out, index, o[0], o[1], o[2], o[3], elements);
}
}
Expand Down Expand Up @@ -636,11 +648,13 @@ namespace kernel
int blocks = divup(elements, elementsPerBlock);
uint hi = seed>>32;
uint lo = seed;
uint hic = counter>>32;
uint loc = counter;
switch (type) {
case AF_RANDOM_ENGINE_PHILOX_4X32_10 :
CUDA_LAUNCH(uniformPhilox, blocks, threads, out, hi, lo, counter, elementsPerBlock, elements); break;
CUDA_LAUNCH(uniformPhilox, blocks, threads, out, hi, lo, hic, loc, elementsPerBlock, elements); break;
case AF_RANDOM_ENGINE_THREEFRY_2X32_16 :
CUDA_LAUNCH(uniformThreefry, blocks, threads, out, hi, lo, counter, elementsPerBlock, elements); break;
CUDA_LAUNCH(uniformThreefry, blocks, threads, out, hi, lo, hic, loc, elementsPerBlock, elements); break;
default : AF_ERROR("Random Engine Type Not Supported", AF_ERR_NOT_SUPPORTED);
}
counter += elements;
Expand All @@ -654,11 +668,13 @@ namespace kernel
int blocks = divup(elements, elementsPerBlock);
uint hi = seed>>32;
uint lo = seed;
uint hic = counter>>32;
uint loc = counter;
switch (type) {
case AF_RANDOM_ENGINE_PHILOX_4X32_10 :
CUDA_LAUNCH(normalPhilox, blocks, threads, out, hi, lo, counter, elementsPerBlock, elements); break;
CUDA_LAUNCH(normalPhilox, blocks, threads, out, hi, lo, hic, loc, elementsPerBlock, elements); break;
case AF_RANDOM_ENGINE_THREEFRY_2X32_16 :
CUDA_LAUNCH(normalThreefry, blocks, threads, out, hi, lo, counter, elementsPerBlock, elements); break;
CUDA_LAUNCH(normalThreefry, blocks, threads, out, hi, lo, hic, loc, elementsPerBlock, elements); break;
default : AF_ERROR("Random Engine Type Not Supported", AF_ERR_NOT_SUPPORTED);
}
counter += elements;
Expand Down
6 changes: 4 additions & 2 deletions src/backend/opencl/kernel/random_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,15 +146,17 @@ namespace opencl

uint hi = seed>>32;
uint lo = seed;
uint hic = counter>>32;
uint loc = counter;

NDRange local(THREADS, 1);
NDRange global(THREADS * groups, 1);

if ((type == AF_RANDOM_ENGINE_PHILOX_4X32_10) || (type == AF_RANDOM_ENGINE_THREEFRY_2X32_16)) {
Kernel ker = get_random_engine_kernel<T>(type, kerIdx, elementsPerBlock);
auto randomEngineOp = KernelFunctor<cl::Buffer, uint, uint, uint, uint>(ker);
auto randomEngineOp = KernelFunctor<cl::Buffer, uint, uint, uint, uint, uint>(ker);
randomEngineOp(EnqueueArgs(getQueue(), global, local),
out, elements, counter, hi, lo);
out, elements, hic, loc, hi, lo);
}

counter += elements;
Expand Down
10 changes: 7 additions & 3 deletions src/backend/opencl/kernel/random_engine_philox.cl
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,18 @@ void philox(uint key[2], uint ctr[4])
}

__kernel void generate(__global T *output, unsigned elements,
unsigned counter, unsigned hi, unsigned lo)
unsigned hic, unsigned loc, unsigned hi, unsigned lo)
{
unsigned gid = get_group_id(0);
unsigned off = get_local_size(0);
unsigned index = gid * ELEMENTS_PER_BLOCK + get_local_id(0);

uint key[2] = {index+counter, hi};
uint ctr[4] = {index+counter, 0, 0, lo};
uint key[2] = {lo, hi};
uint ctr[4] = {loc, hic, 0, 0};
ctr[0] += index;
ctr[1] += (ctr[0] < loc);
ctr[2] += (ctr[1] < hic);

philox(key, ctr);

if (gid != get_num_groups(0) - 1) {
Expand Down
13 changes: 9 additions & 4 deletions src/backend/opencl/kernel/random_engine_threefry.cl
Original file line number Diff line number Diff line change
Expand Up @@ -117,18 +117,23 @@ inline void threefry(uint k[2], uint c[2], uint X[2])
}

__kernel void generate(__global T *output, unsigned elements,
unsigned counter, unsigned hi, unsigned lo)
unsigned hic, unsigned loc, unsigned hi, unsigned lo)
{
unsigned gid = get_group_id(0);
unsigned off = get_local_size(0);
unsigned index = gid * ELEMENTS_PER_BLOCK + get_local_id(0);

uint key[2] = {index+counter, hi};
uint ctr[2] = {index+counter, lo};
uint key[2] = {lo, hi};
uint ctr[2] = {loc, hic};
uint o[4];

ctr[0] += index;
ctr[1] += (ctr[0] < index);

threefry(key, ctr, o);
ctr[0] += elements;
uint step = ELEMENTS_PER_BLOCK / 2;
ctr[0] += step;
ctr[1] += (ctr[0] < step);
threefry(key, ctr, o+2);

if (gid != get_num_groups(0) - 1) {
Expand Down
97 changes: 97 additions & 0 deletions test/random.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -427,3 +427,100 @@ TYPED_TEST(RandomEngineSeed, mersenneSeedUniform)
{
testRandomEngineSeed<TypeParam>(AF_RANDOM_ENGINE_MERSENNE_GP11213);
}

template <typename T>
void testRandomEnginePeriod(randomEngineType type)
{
if (noDoubleTests<T>()) return;
af::dtype ty = (af::dtype)af::dtype_traits<T>::af_type;

uint elem = 1024*1024;
uint steps = 4*1024;
af::randomEngine r(type, 0);

af::array first = af::randu(elem, ty, r);

for (int i = 0; i < steps; ++i) {
af::array step = af::randu(elem, ty, r);
bool different = !af::allTrue<bool>(first == step);
ASSERT_TRUE(different);
}
}

TYPED_TEST(RandomEngine, DISABLED_philoxRandomEnginePeriod)
{
testRandomEnginePeriod<TypeParam>(AF_RANDOM_ENGINE_PHILOX_4X32_10);
}

TYPED_TEST(RandomEngine, DISABLED_threefryRandomEnginePeriod)
{
testRandomEnginePeriod<TypeParam>(AF_RANDOM_ENGINE_THREEFRY_2X32_16);
}

TYPED_TEST(RandomEngine, DISABLED_mersenneRandomEnginePeriod)
{
testRandomEnginePeriod<TypeParam>(AF_RANDOM_ENGINE_MERSENNE_GP11213);
}

template <typename T>
T chi2_statistic(array input, array expected) {
expected *= af::sum<T>(input) / af::sum<T>(expected);
array diff = input - expected;
return af::sum<T>((diff * diff) / expected);
}

template <typename T>
void testRandomEngineUniformChi2(randomEngineType type)
{
if (noDoubleTests<T>()) return;
af::dtype ty = (af::dtype)af::dtype_traits<T>::af_type;

int elem = 256*1024*1024;
int steps = 32;
int bins = 100;

array total_hist = af::constant(0.0, bins, ty);
array expected = af::constant(1.0/bins, bins, ty);

af::randomEngine r(type, 0);

// R> qchisq(c(5e-6, 1 - 5e-6), 99)
// [1] 48.68125 173.87456
T lower = 48.68125;
T upper = 173.87456;

bool prev_step = true;
bool prev_total = true;
for (int i = 0; i < steps; ++i) {
array step_hist = af::histogram(af::randu(elem, ty, r), bins, 0.0, 1.0);
T step_chi2 = chi2_statistic<T>(step_hist, expected);
if (!prev_step) {
EXPECT_GT(step_chi2, lower) << "at step: " << i;
EXPECT_LT(step_chi2, upper) << "at step: " << i;
}
prev_step = step_chi2 > lower && step_chi2 < upper;

total_hist += step_hist;
T total_chi2 = chi2_statistic<T>(total_hist, expected);
if (!prev_total) {
EXPECT_GT(total_chi2, lower) << "at step: " << i;
EXPECT_LT(total_chi2, upper) << "at step: " << i;
}
prev_total = total_chi2 > lower && total_chi2 < upper;
}
}

TYPED_TEST(RandomEngine, DISABLED_philoxRandomEngineUniformChi2)
{
testRandomEngineUniformChi2<TypeParam>(AF_RANDOM_ENGINE_PHILOX_4X32_10);
}

TYPED_TEST(RandomEngine, DISABLED_threefryRandomEngineUniformChi2)
{
testRandomEngineUniformChi2<TypeParam>(AF_RANDOM_ENGINE_THREEFRY_2X32_16);
}

TYPED_TEST(RandomEngine, DISABLED_mersenneRandomEngineUniformChi2)
{
testRandomEngineUniformChi2<TypeParam>(AF_RANDOM_ENGINE_MERSENNE_GP11213);
}
Loading

0 comments on commit 12749f6

Please sign in to comment.