From 12749f651d1e466abe04d3601f13b2b5549fa52c Mon Sep 17 00:00:00 2001 From: Ralf Stubner Date: Sat, 14 Apr 2018 01:00:15 +0200 Subject: [PATCH] Improve counter handling for CBRNGs (#2122) * Improve counter handling for CBRNGs (CPU backend) * Use the full 64 bits of the counter * Separate counter and key/seed * Check for carry when increasing counter (only Threefry) * Improve counter handling for CBRNGs (OpenCL backend) * Use the full 64 bits of the counter * Separate counter and key/seed * Check for carry when increasing counter * Use unused counter values for second threefry round * Improve counter handling for CBRNGs (CUDA backend) * Use the full 64 bits of the counter * Separate counter and key/seed * Check for carry when increasing counter * Add (disabled) Test for RNG period This test repeatedly draws 2^20 random numbers and compares them to the first set of numbers. This fails if a RNG has a period of n * 2^20 with integer n <= 2^12, i.e. in particular 2^32. * Add (disabled) test for RNG quality This test repeatedly draws random numbers and generates a histogram from them. It calculates the chi^2 statistic for the individual step as well as for the accumulated random numbers. The test fails if two consecutive statistics are too large or too small. Too large means that the random numbers are not uniform enough. Too small means that they are "too uniform", since some amount of random noise is expected. Additional notes: * One should also test randn, but that would increase the run-time even more. * The failure conditions are fragile. In principle one should accept a larger range of chi^2 values together with more steps, possibly with different initial seeds. But this would increase run-time even more. * As a consequence of the fragile conditions, false positives can occur. For example, MERSENNE with double on CPU failed early in some tests. * Add program to test RNGs with PractRand --- src/backend/cpu/kernel/random_engine.hpp | 33 ++++--- src/backend/cuda/kernel/random_engine.hpp | 74 ++++++++------ src/backend/opencl/kernel/random_engine.hpp | 6 +- .../opencl/kernel/random_engine_philox.cl | 10 +- .../opencl/kernel/random_engine_threefry.cl | 13 ++- test/random.cpp | 97 +++++++++++++++++++ test/random_practrand.cpp | 27 ++++++ 7 files changed, 211 insertions(+), 49 deletions(-) create mode 100644 test/random_practrand.cpp diff --git a/src/backend/cpu/kernel/random_engine.hpp b/src/backend/cpu/kernel/random_engine.hpp index 304eb8e24e..0e08e83e3a 100644 --- a/src/backend/cpu/kernel/random_engine.hpp +++ b/src/backend/cpu/kernel/random_engine.hpp @@ -101,8 +101,10 @@ namespace kernel { uint hi = seed>>32; uint lo = seed; - uint key[2] = {(uint)counter, hi}; - uint ctr[4] = {(uint)counter, 0, 0, lo}; + uint hic = counter>>32; + uint loc = counter; + uint key[2] = {lo, hi}; + uint ctr[4] = {loc, hic, 0, 0}; int reset = (4*sizeof(uint))/sizeof(T); for (int i = 0; i < (int)elements; i += reset) { @@ -119,14 +121,17 @@ namespace kernel { uint hi = seed>>32; uint lo = seed; - uint key[2] = {(uint)counter, hi}; - uint ctr[2] = {(uint)counter, lo}; + uint hic = counter>>32; + uint loc = counter; + uint key[2] = {lo, hi}; + uint ctr[2] = {loc, hic}; uint val[2]; int reset = (2*sizeof(uint))/sizeof(T); for (int i = 0; i < (int)elements; i += reset) { threefry(key, ctr, val); - ++ctr[0]; ++key[0]; + ++ctr[0]; + ctr[1] += (ctr[0] == 0); int lim = (reset < (int)(elements - i))? reset : (int)(elements - i); for (int j = 0; j < lim; ++j) { out[i + j] = transform(val, j); @@ -162,8 +167,10 @@ namespace kernel { uint hi = seed>>32; uint lo = seed; - uint key[2] = {(uint)counter, hi}; - uint ctr[4] = {(uint)counter, 0, 0, lo}; + uint hic = counter>>32; + uint loc = counter; + uint key[2] = {lo, hi}; + uint ctr[4] = {loc, hic, 0, 0}; T temp[(4*sizeof(uint))/sizeof(T)]; int reset = (4*sizeof(uint))/sizeof(T); @@ -182,17 +189,21 @@ namespace kernel { uint hi = seed>>32; uint lo = seed; - uint key[2] = {(uint)counter, hi}; - uint ctr[2] = {(uint)counter, lo}; + uint hic = counter>>32; + uint loc = counter; + uint key[2] = {lo, hi}; + uint ctr[2] = {loc, hic}; uint val[4]; T temp[(4*sizeof(uint))/sizeof(T)]; int reset = (4*sizeof(uint))/sizeof(T); for (int i = 0; i < (int)elements; i += reset) { threefry(key, ctr, val); - ++ctr[0]; ++key[0]; + ++ctr[0]; + ctr[1] += (ctr[0] == 0); threefry(key, ctr, val+2); - ++ctr[0]; ++key[0]; + ++ctr[0]; + ctr[1] += (ctr[0] == 0); boxMullerTransform(val, temp); int lim = (reset < (int)(elements - i))? reset : (int)(elements - i); for (int j = 0; j < lim; ++j) { diff --git a/src/backend/cuda/kernel/random_engine.hpp b/src/backend/cuda/kernel/random_engine.hpp index 6a6ce32560..9e01e948f3 100644 --- a/src/backend/cuda/kernel/random_engine.hpp +++ b/src/backend/cuda/kernel/random_engine.hpp @@ -400,11 +400,14 @@ namespace kernel } template - __global__ void uniformPhilox(T *out, uint hi, uint lo, uint counter, uint elementsPerBlock, uint elements) + __global__ void uniformPhilox(T *out, uint hi, uint lo, uint hic, uint loc, uint elementsPerBlock, uint elements) { uint index = blockIdx.x*elementsPerBlock + threadIdx.x; - uint key[2] = {index+counter, hi}; - uint ctr[4] = {index+counter, 0, 0, lo}; + uint key[2] = {lo, hi}; + uint ctr[4] = {loc, hic, 0, 0}; + ctr[0] += index; + ctr[1] += (ctr[0] < loc); + ctr[2] += (ctr[1] < hic); if (blockIdx.x != (gridDim.x - 1)) { philox(key, ctr); writeOut128Bytes(out, index, ctr[0], ctr[1], ctr[2], ctr[3]); @@ -415,21 +418,24 @@ namespace kernel } template - __global__ void uniformThreefry(T *out, uint hi, uint lo, uint counter, uint elementsPerBlock, uint elements) + __global__ void uniformThreefry(T *out, uint hi, uint lo, uint hic, uint loc, uint elementsPerBlock, uint elements) { uint index = blockIdx.x*elementsPerBlock + threadIdx.x; - uint key[2] = {index+counter, hi}; - uint ctr[2] = {index+counter, lo}; + uint key[2] = {lo, hi}; + uint ctr[2] = {loc, hic}; + ctr[0] += index; + ctr[1] += (ctr[0] < loc); uint o[4]; + + threefry(key, ctr, o); + uint step = elementsPerBlock / 2; + ctr[0] += step; + ctr[1] += (ctr[0] < step); + threefry(key, ctr, o + 2); + if (blockIdx.x != (gridDim.x - 1)) { - threefry(key, ctr, o); - ctr[0] += elements; - threefry(key, ctr, o + 2); writeOut128Bytes(out, index, o[0], o[1], o[2], o[3]); } else { - threefry(key, ctr, o); - ctr[0] += elements; - threefry(key, ctr, o + 2); partialWriteOut128Bytes(out, index, o[0], o[1], o[2], o[3], elements); } } @@ -496,11 +502,14 @@ namespace kernel } template - __global__ void normalPhilox(T *out, uint hi, uint lo, uint counter, uint elementsPerBlock, uint elements) + __global__ void normalPhilox(T *out, uint hi, uint lo, uint hic, uint loc, uint elementsPerBlock, uint elements) { uint index = blockIdx.x*elementsPerBlock + threadIdx.x; - uint key[2] = {index+counter, hi}; - uint ctr[4] = {index+counter, 0, 0, lo}; + uint key[2] = {lo, hi}; + uint ctr[4] = {loc, hic, 0, 0}; + ctr[0] += index; + ctr[1] += (ctr[0] < loc); + ctr[2] += (ctr[1] < hic); if (blockIdx.x != (gridDim.x - 1)) { philox(key, ctr); boxMullerWriteOut128Bytes(out, index, ctr[0], ctr[1], ctr[2], ctr[3]); @@ -511,21 +520,24 @@ namespace kernel } template - __global__ void normalThreefry(T *out, uint hi, uint lo, uint counter, uint elementsPerBlock, uint elements) + __global__ void normalThreefry(T *out, uint hi, uint lo, uint hic, uint loc, uint elementsPerBlock, uint elements) { uint index = blockIdx.x*elementsPerBlock + threadIdx.x; - uint key[2] = {index+counter, hi}; - uint ctr[2] = {index+counter, lo}; + uint key[2] = {lo, hi}; + uint ctr[2] = {loc, hic}; + ctr[0] += index; + ctr[1] += (ctr[0] < loc); uint o[4]; - if (blockIdx.x != (gridDim.x - 1)) { - threefry(key, ctr, o); - ctr[0] += elements; - threefry(key, ctr, o + 2); + + threefry(key, ctr, o); + uint step = elementsPerBlock / 2; + ctr[0] += step; + ctr[1] += (ctr[0] < step); + threefry(key, ctr, o + 2); + + if (blockIdx.x != (gridDim.x - 1)) { boxMullerWriteOut128Bytes(out, index, o[0], o[1], o[2], o[3]); } else { - threefry(key, ctr, o); - ctr[0] += elements; - threefry(key, ctr, o + 2); partialBoxMullerWriteOut128Bytes(out, index, o[0], o[1], o[2], o[3], elements); } } @@ -636,11 +648,13 @@ namespace kernel int blocks = divup(elements, elementsPerBlock); uint hi = seed>>32; uint lo = seed; + uint hic = counter>>32; + uint loc = counter; switch (type) { case AF_RANDOM_ENGINE_PHILOX_4X32_10 : - CUDA_LAUNCH(uniformPhilox, blocks, threads, out, hi, lo, counter, elementsPerBlock, elements); break; + CUDA_LAUNCH(uniformPhilox, blocks, threads, out, hi, lo, hic, loc, elementsPerBlock, elements); break; case AF_RANDOM_ENGINE_THREEFRY_2X32_16 : - CUDA_LAUNCH(uniformThreefry, blocks, threads, out, hi, lo, counter, elementsPerBlock, elements); break; + CUDA_LAUNCH(uniformThreefry, blocks, threads, out, hi, lo, hic, loc, elementsPerBlock, elements); break; default : AF_ERROR("Random Engine Type Not Supported", AF_ERR_NOT_SUPPORTED); } counter += elements; @@ -654,11 +668,13 @@ namespace kernel int blocks = divup(elements, elementsPerBlock); uint hi = seed>>32; uint lo = seed; + uint hic = counter>>32; + uint loc = counter; switch (type) { case AF_RANDOM_ENGINE_PHILOX_4X32_10 : - CUDA_LAUNCH(normalPhilox, blocks, threads, out, hi, lo, counter, elementsPerBlock, elements); break; + CUDA_LAUNCH(normalPhilox, blocks, threads, out, hi, lo, hic, loc, elementsPerBlock, elements); break; case AF_RANDOM_ENGINE_THREEFRY_2X32_16 : - CUDA_LAUNCH(normalThreefry, blocks, threads, out, hi, lo, counter, elementsPerBlock, elements); break; + CUDA_LAUNCH(normalThreefry, blocks, threads, out, hi, lo, hic, loc, elementsPerBlock, elements); break; default : AF_ERROR("Random Engine Type Not Supported", AF_ERR_NOT_SUPPORTED); } counter += elements; diff --git a/src/backend/opencl/kernel/random_engine.hpp b/src/backend/opencl/kernel/random_engine.hpp index 60e676f8fb..41d16a3fc2 100644 --- a/src/backend/opencl/kernel/random_engine.hpp +++ b/src/backend/opencl/kernel/random_engine.hpp @@ -146,15 +146,17 @@ namespace opencl uint hi = seed>>32; uint lo = seed; + uint hic = counter>>32; + uint loc = counter; NDRange local(THREADS, 1); NDRange global(THREADS * groups, 1); if ((type == AF_RANDOM_ENGINE_PHILOX_4X32_10) || (type == AF_RANDOM_ENGINE_THREEFRY_2X32_16)) { Kernel ker = get_random_engine_kernel(type, kerIdx, elementsPerBlock); - auto randomEngineOp = KernelFunctor(ker); + auto randomEngineOp = KernelFunctor(ker); randomEngineOp(EnqueueArgs(getQueue(), global, local), - out, elements, counter, hi, lo); + out, elements, hic, loc, hi, lo); } counter += elements; diff --git a/src/backend/opencl/kernel/random_engine_philox.cl b/src/backend/opencl/kernel/random_engine_philox.cl index d70232c4a7..7e67309eb3 100644 --- a/src/backend/opencl/kernel/random_engine_philox.cl +++ b/src/backend/opencl/kernel/random_engine_philox.cl @@ -93,14 +93,18 @@ void philox(uint key[2], uint ctr[4]) } __kernel void generate(__global T *output, unsigned elements, - unsigned counter, unsigned hi, unsigned lo) + unsigned hic, unsigned loc, unsigned hi, unsigned lo) { unsigned gid = get_group_id(0); unsigned off = get_local_size(0); unsigned index = gid * ELEMENTS_PER_BLOCK + get_local_id(0); - uint key[2] = {index+counter, hi}; - uint ctr[4] = {index+counter, 0, 0, lo}; + uint key[2] = {lo, hi}; + uint ctr[4] = {loc, hic, 0, 0}; + ctr[0] += index; + ctr[1] += (ctr[0] < loc); + ctr[2] += (ctr[1] < hic); + philox(key, ctr); if (gid != get_num_groups(0) - 1) { diff --git a/src/backend/opencl/kernel/random_engine_threefry.cl b/src/backend/opencl/kernel/random_engine_threefry.cl index ff458c1d79..1c48837869 100644 --- a/src/backend/opencl/kernel/random_engine_threefry.cl +++ b/src/backend/opencl/kernel/random_engine_threefry.cl @@ -117,18 +117,23 @@ inline void threefry(uint k[2], uint c[2], uint X[2]) } __kernel void generate(__global T *output, unsigned elements, - unsigned counter, unsigned hi, unsigned lo) + unsigned hic, unsigned loc, unsigned hi, unsigned lo) { unsigned gid = get_group_id(0); unsigned off = get_local_size(0); unsigned index = gid * ELEMENTS_PER_BLOCK + get_local_id(0); - uint key[2] = {index+counter, hi}; - uint ctr[2] = {index+counter, lo}; + uint key[2] = {lo, hi}; + uint ctr[2] = {loc, hic}; uint o[4]; + ctr[0] += index; + ctr[1] += (ctr[0] < index); + threefry(key, ctr, o); - ctr[0] += elements; + uint step = ELEMENTS_PER_BLOCK / 2; + ctr[0] += step; + ctr[1] += (ctr[0] < step); threefry(key, ctr, o+2); if (gid != get_num_groups(0) - 1) { diff --git a/test/random.cpp b/test/random.cpp index 226c6ab1bf..272d3f248d 100644 --- a/test/random.cpp +++ b/test/random.cpp @@ -427,3 +427,100 @@ TYPED_TEST(RandomEngineSeed, mersenneSeedUniform) { testRandomEngineSeed(AF_RANDOM_ENGINE_MERSENNE_GP11213); } + +template +void testRandomEnginePeriod(randomEngineType type) +{ + if (noDoubleTests()) return; + af::dtype ty = (af::dtype)af::dtype_traits::af_type; + + uint elem = 1024*1024; + uint steps = 4*1024; + af::randomEngine r(type, 0); + + af::array first = af::randu(elem, ty, r); + + for (int i = 0; i < steps; ++i) { + af::array step = af::randu(elem, ty, r); + bool different = !af::allTrue(first == step); + ASSERT_TRUE(different); + } +} + +TYPED_TEST(RandomEngine, DISABLED_philoxRandomEnginePeriod) +{ + testRandomEnginePeriod(AF_RANDOM_ENGINE_PHILOX_4X32_10); +} + +TYPED_TEST(RandomEngine, DISABLED_threefryRandomEnginePeriod) +{ + testRandomEnginePeriod(AF_RANDOM_ENGINE_THREEFRY_2X32_16); +} + +TYPED_TEST(RandomEngine, DISABLED_mersenneRandomEnginePeriod) +{ + testRandomEnginePeriod(AF_RANDOM_ENGINE_MERSENNE_GP11213); +} + +template +T chi2_statistic(array input, array expected) { + expected *= af::sum(input) / af::sum(expected); + array diff = input - expected; + return af::sum((diff * diff) / expected); +} + +template +void testRandomEngineUniformChi2(randomEngineType type) +{ + if (noDoubleTests()) return; + af::dtype ty = (af::dtype)af::dtype_traits::af_type; + + int elem = 256*1024*1024; + int steps = 32; + int bins = 100; + + array total_hist = af::constant(0.0, bins, ty); + array expected = af::constant(1.0/bins, bins, ty); + + af::randomEngine r(type, 0); + + // R> qchisq(c(5e-6, 1 - 5e-6), 99) + // [1] 48.68125 173.87456 + T lower = 48.68125; + T upper = 173.87456; + + bool prev_step = true; + bool prev_total = true; + for (int i = 0; i < steps; ++i) { + array step_hist = af::histogram(af::randu(elem, ty, r), bins, 0.0, 1.0); + T step_chi2 = chi2_statistic(step_hist, expected); + if (!prev_step) { + EXPECT_GT(step_chi2, lower) << "at step: " << i; + EXPECT_LT(step_chi2, upper) << "at step: " << i; + } + prev_step = step_chi2 > lower && step_chi2 < upper; + + total_hist += step_hist; + T total_chi2 = chi2_statistic(total_hist, expected); + if (!prev_total) { + EXPECT_GT(total_chi2, lower) << "at step: " << i; + EXPECT_LT(total_chi2, upper) << "at step: " << i; + } + prev_total = total_chi2 > lower && total_chi2 < upper; + } +} + +TYPED_TEST(RandomEngine, DISABLED_philoxRandomEngineUniformChi2) +{ + testRandomEngineUniformChi2(AF_RANDOM_ENGINE_PHILOX_4X32_10); +} + +TYPED_TEST(RandomEngine, DISABLED_threefryRandomEngineUniformChi2) +{ + testRandomEngineUniformChi2(AF_RANDOM_ENGINE_THREEFRY_2X32_16); +} + +TYPED_TEST(RandomEngine, DISABLED_mersenneRandomEngineUniformChi2) +{ + testRandomEngineUniformChi2(AF_RANDOM_ENGINE_MERSENNE_GP11213); +} diff --git a/test/random_practrand.cpp b/test/random_practrand.cpp new file mode 100644 index 0000000000..806c9a4693 --- /dev/null +++ b/test/random_practrand.cpp @@ -0,0 +1,27 @@ +// Generate random bits and send them to STDOUT. +// Suitable for testing with PractRand, c.f. http://pracrand.sourceforge.net/ +// and http://www.pcg-random.org/posts/how-to-test-with-practrand.html +// Commandline arguments: backend, device, rng_type +// Example: +// random_practrand 0 0 200 | RNG_test stdin32 +#include +#include +#include + +int main(int argc, char ** argv) { + int backend = argc > 1 ? atoi(argv[1]) : 0; + af::setBackend(static_cast(backend)); + int device = argc > 2 ? atoi(argv[2]) : 0; + af::setDevice(device); + int rng = argc > 3 ? atoi(argv[3]) : 100; + af::setDefaultRandomEngineType(static_cast(rng)); + + af::setSeed(0xfe47fe0cc078ec30ULL); + int samples = 1024 * 1024; + while (1) { + af::array values = af::randu(samples, u32); + uint32_t *pvalues = values.host(); + fwrite((void*) pvalues, samples * sizeof(*pvalues), 1, stdout); + free(pvalues); + } +}