diff --git a/src/backend/cpu/kernel/random_engine.hpp b/src/backend/cpu/kernel/random_engine.hpp index 304eb8e24e..0e08e83e3a 100644 --- a/src/backend/cpu/kernel/random_engine.hpp +++ b/src/backend/cpu/kernel/random_engine.hpp @@ -101,8 +101,10 @@ namespace kernel { uint hi = seed>>32; uint lo = seed; - uint key[2] = {(uint)counter, hi}; - uint ctr[4] = {(uint)counter, 0, 0, lo}; + uint hic = counter>>32; + uint loc = counter; + uint key[2] = {lo, hi}; + uint ctr[4] = {loc, hic, 0, 0}; int reset = (4*sizeof(uint))/sizeof(T); for (int i = 0; i < (int)elements; i += reset) { @@ -119,14 +121,17 @@ namespace kernel { uint hi = seed>>32; uint lo = seed; - uint key[2] = {(uint)counter, hi}; - uint ctr[2] = {(uint)counter, lo}; + uint hic = counter>>32; + uint loc = counter; + uint key[2] = {lo, hi}; + uint ctr[2] = {loc, hic}; uint val[2]; int reset = (2*sizeof(uint))/sizeof(T); for (int i = 0; i < (int)elements; i += reset) { threefry(key, ctr, val); - ++ctr[0]; ++key[0]; + ++ctr[0]; + ctr[1] += (ctr[0] == 0); int lim = (reset < (int)(elements - i))? reset : (int)(elements - i); for (int j = 0; j < lim; ++j) { out[i + j] = transform(val, j); @@ -162,8 +167,10 @@ namespace kernel { uint hi = seed>>32; uint lo = seed; - uint key[2] = {(uint)counter, hi}; - uint ctr[4] = {(uint)counter, 0, 0, lo}; + uint hic = counter>>32; + uint loc = counter; + uint key[2] = {lo, hi}; + uint ctr[4] = {loc, hic, 0, 0}; T temp[(4*sizeof(uint))/sizeof(T)]; int reset = (4*sizeof(uint))/sizeof(T); @@ -182,17 +189,21 @@ namespace kernel { uint hi = seed>>32; uint lo = seed; - uint key[2] = {(uint)counter, hi}; - uint ctr[2] = {(uint)counter, lo}; + uint hic = counter>>32; + uint loc = counter; + uint key[2] = {lo, hi}; + uint ctr[2] = {loc, hic}; uint val[4]; T temp[(4*sizeof(uint))/sizeof(T)]; int reset = (4*sizeof(uint))/sizeof(T); for (int i = 0; i < (int)elements; i += reset) { threefry(key, ctr, val); - ++ctr[0]; ++key[0]; + ++ctr[0]; + ctr[1] += (ctr[0] == 0); threefry(key, ctr, val+2); - ++ctr[0]; ++key[0]; + ++ctr[0]; + ctr[1] += (ctr[0] == 0); boxMullerTransform(val, temp); int lim = (reset < (int)(elements - i))? reset : (int)(elements - i); for (int j = 0; j < lim; ++j) { diff --git a/src/backend/cuda/kernel/random_engine.hpp b/src/backend/cuda/kernel/random_engine.hpp index 6a6ce32560..9e01e948f3 100644 --- a/src/backend/cuda/kernel/random_engine.hpp +++ b/src/backend/cuda/kernel/random_engine.hpp @@ -400,11 +400,14 @@ namespace kernel } template - __global__ void uniformPhilox(T *out, uint hi, uint lo, uint counter, uint elementsPerBlock, uint elements) + __global__ void uniformPhilox(T *out, uint hi, uint lo, uint hic, uint loc, uint elementsPerBlock, uint elements) { uint index = blockIdx.x*elementsPerBlock + threadIdx.x; - uint key[2] = {index+counter, hi}; - uint ctr[4] = {index+counter, 0, 0, lo}; + uint key[2] = {lo, hi}; + uint ctr[4] = {loc, hic, 0, 0}; + ctr[0] += index; + ctr[1] += (ctr[0] < loc); + ctr[2] += (ctr[1] < hic); if (blockIdx.x != (gridDim.x - 1)) { philox(key, ctr); writeOut128Bytes(out, index, ctr[0], ctr[1], ctr[2], ctr[3]); @@ -415,21 +418,24 @@ namespace kernel } template - __global__ void uniformThreefry(T *out, uint hi, uint lo, uint counter, uint elementsPerBlock, uint elements) + __global__ void uniformThreefry(T *out, uint hi, uint lo, uint hic, uint loc, uint elementsPerBlock, uint elements) { uint index = blockIdx.x*elementsPerBlock + threadIdx.x; - uint key[2] = {index+counter, hi}; - uint ctr[2] = {index+counter, lo}; + uint key[2] = {lo, hi}; + uint ctr[2] = {loc, hic}; + ctr[0] += index; + ctr[1] += (ctr[0] < loc); uint o[4]; + + threefry(key, ctr, o); + uint step = elementsPerBlock / 2; + ctr[0] += step; + ctr[1] += (ctr[0] < step); + threefry(key, ctr, o + 2); + if (blockIdx.x != (gridDim.x - 1)) { - threefry(key, ctr, o); - ctr[0] += elements; - threefry(key, ctr, o + 2); writeOut128Bytes(out, index, o[0], o[1], o[2], o[3]); } else { - threefry(key, ctr, o); - ctr[0] += elements; - threefry(key, ctr, o + 2); partialWriteOut128Bytes(out, index, o[0], o[1], o[2], o[3], elements); } } @@ -496,11 +502,14 @@ namespace kernel } template - __global__ void normalPhilox(T *out, uint hi, uint lo, uint counter, uint elementsPerBlock, uint elements) + __global__ void normalPhilox(T *out, uint hi, uint lo, uint hic, uint loc, uint elementsPerBlock, uint elements) { uint index = blockIdx.x*elementsPerBlock + threadIdx.x; - uint key[2] = {index+counter, hi}; - uint ctr[4] = {index+counter, 0, 0, lo}; + uint key[2] = {lo, hi}; + uint ctr[4] = {loc, hic, 0, 0}; + ctr[0] += index; + ctr[1] += (ctr[0] < loc); + ctr[2] += (ctr[1] < hic); if (blockIdx.x != (gridDim.x - 1)) { philox(key, ctr); boxMullerWriteOut128Bytes(out, index, ctr[0], ctr[1], ctr[2], ctr[3]); @@ -511,21 +520,24 @@ namespace kernel } template - __global__ void normalThreefry(T *out, uint hi, uint lo, uint counter, uint elementsPerBlock, uint elements) + __global__ void normalThreefry(T *out, uint hi, uint lo, uint hic, uint loc, uint elementsPerBlock, uint elements) { uint index = blockIdx.x*elementsPerBlock + threadIdx.x; - uint key[2] = {index+counter, hi}; - uint ctr[2] = {index+counter, lo}; + uint key[2] = {lo, hi}; + uint ctr[2] = {loc, hic}; + ctr[0] += index; + ctr[1] += (ctr[0] < loc); uint o[4]; - if (blockIdx.x != (gridDim.x - 1)) { - threefry(key, ctr, o); - ctr[0] += elements; - threefry(key, ctr, o + 2); + + threefry(key, ctr, o); + uint step = elementsPerBlock / 2; + ctr[0] += step; + ctr[1] += (ctr[0] < step); + threefry(key, ctr, o + 2); + + if (blockIdx.x != (gridDim.x - 1)) { boxMullerWriteOut128Bytes(out, index, o[0], o[1], o[2], o[3]); } else { - threefry(key, ctr, o); - ctr[0] += elements; - threefry(key, ctr, o + 2); partialBoxMullerWriteOut128Bytes(out, index, o[0], o[1], o[2], o[3], elements); } } @@ -636,11 +648,13 @@ namespace kernel int blocks = divup(elements, elementsPerBlock); uint hi = seed>>32; uint lo = seed; + uint hic = counter>>32; + uint loc = counter; switch (type) { case AF_RANDOM_ENGINE_PHILOX_4X32_10 : - CUDA_LAUNCH(uniformPhilox, blocks, threads, out, hi, lo, counter, elementsPerBlock, elements); break; + CUDA_LAUNCH(uniformPhilox, blocks, threads, out, hi, lo, hic, loc, elementsPerBlock, elements); break; case AF_RANDOM_ENGINE_THREEFRY_2X32_16 : - CUDA_LAUNCH(uniformThreefry, blocks, threads, out, hi, lo, counter, elementsPerBlock, elements); break; + CUDA_LAUNCH(uniformThreefry, blocks, threads, out, hi, lo, hic, loc, elementsPerBlock, elements); break; default : AF_ERROR("Random Engine Type Not Supported", AF_ERR_NOT_SUPPORTED); } counter += elements; @@ -654,11 +668,13 @@ namespace kernel int blocks = divup(elements, elementsPerBlock); uint hi = seed>>32; uint lo = seed; + uint hic = counter>>32; + uint loc = counter; switch (type) { case AF_RANDOM_ENGINE_PHILOX_4X32_10 : - CUDA_LAUNCH(normalPhilox, blocks, threads, out, hi, lo, counter, elementsPerBlock, elements); break; + CUDA_LAUNCH(normalPhilox, blocks, threads, out, hi, lo, hic, loc, elementsPerBlock, elements); break; case AF_RANDOM_ENGINE_THREEFRY_2X32_16 : - CUDA_LAUNCH(normalThreefry, blocks, threads, out, hi, lo, counter, elementsPerBlock, elements); break; + CUDA_LAUNCH(normalThreefry, blocks, threads, out, hi, lo, hic, loc, elementsPerBlock, elements); break; default : AF_ERROR("Random Engine Type Not Supported", AF_ERR_NOT_SUPPORTED); } counter += elements; diff --git a/src/backend/opencl/kernel/random_engine.hpp b/src/backend/opencl/kernel/random_engine.hpp index 60e676f8fb..41d16a3fc2 100644 --- a/src/backend/opencl/kernel/random_engine.hpp +++ b/src/backend/opencl/kernel/random_engine.hpp @@ -146,15 +146,17 @@ namespace opencl uint hi = seed>>32; uint lo = seed; + uint hic = counter>>32; + uint loc = counter; NDRange local(THREADS, 1); NDRange global(THREADS * groups, 1); if ((type == AF_RANDOM_ENGINE_PHILOX_4X32_10) || (type == AF_RANDOM_ENGINE_THREEFRY_2X32_16)) { Kernel ker = get_random_engine_kernel(type, kerIdx, elementsPerBlock); - auto randomEngineOp = KernelFunctor(ker); + auto randomEngineOp = KernelFunctor(ker); randomEngineOp(EnqueueArgs(getQueue(), global, local), - out, elements, counter, hi, lo); + out, elements, hic, loc, hi, lo); } counter += elements; diff --git a/src/backend/opencl/kernel/random_engine_philox.cl b/src/backend/opencl/kernel/random_engine_philox.cl index d70232c4a7..7e67309eb3 100644 --- a/src/backend/opencl/kernel/random_engine_philox.cl +++ b/src/backend/opencl/kernel/random_engine_philox.cl @@ -93,14 +93,18 @@ void philox(uint key[2], uint ctr[4]) } __kernel void generate(__global T *output, unsigned elements, - unsigned counter, unsigned hi, unsigned lo) + unsigned hic, unsigned loc, unsigned hi, unsigned lo) { unsigned gid = get_group_id(0); unsigned off = get_local_size(0); unsigned index = gid * ELEMENTS_PER_BLOCK + get_local_id(0); - uint key[2] = {index+counter, hi}; - uint ctr[4] = {index+counter, 0, 0, lo}; + uint key[2] = {lo, hi}; + uint ctr[4] = {loc, hic, 0, 0}; + ctr[0] += index; + ctr[1] += (ctr[0] < loc); + ctr[2] += (ctr[1] < hic); + philox(key, ctr); if (gid != get_num_groups(0) - 1) { diff --git a/src/backend/opencl/kernel/random_engine_threefry.cl b/src/backend/opencl/kernel/random_engine_threefry.cl index ff458c1d79..1c48837869 100644 --- a/src/backend/opencl/kernel/random_engine_threefry.cl +++ b/src/backend/opencl/kernel/random_engine_threefry.cl @@ -117,18 +117,23 @@ inline void threefry(uint k[2], uint c[2], uint X[2]) } __kernel void generate(__global T *output, unsigned elements, - unsigned counter, unsigned hi, unsigned lo) + unsigned hic, unsigned loc, unsigned hi, unsigned lo) { unsigned gid = get_group_id(0); unsigned off = get_local_size(0); unsigned index = gid * ELEMENTS_PER_BLOCK + get_local_id(0); - uint key[2] = {index+counter, hi}; - uint ctr[2] = {index+counter, lo}; + uint key[2] = {lo, hi}; + uint ctr[2] = {loc, hic}; uint o[4]; + ctr[0] += index; + ctr[1] += (ctr[0] < index); + threefry(key, ctr, o); - ctr[0] += elements; + uint step = ELEMENTS_PER_BLOCK / 2; + ctr[0] += step; + ctr[1] += (ctr[0] < step); threefry(key, ctr, o+2); if (gid != get_num_groups(0) - 1) { diff --git a/test/random.cpp b/test/random.cpp index 226c6ab1bf..272d3f248d 100644 --- a/test/random.cpp +++ b/test/random.cpp @@ -427,3 +427,100 @@ TYPED_TEST(RandomEngineSeed, mersenneSeedUniform) { testRandomEngineSeed(AF_RANDOM_ENGINE_MERSENNE_GP11213); } + +template +void testRandomEnginePeriod(randomEngineType type) +{ + if (noDoubleTests()) return; + af::dtype ty = (af::dtype)af::dtype_traits::af_type; + + uint elem = 1024*1024; + uint steps = 4*1024; + af::randomEngine r(type, 0); + + af::array first = af::randu(elem, ty, r); + + for (int i = 0; i < steps; ++i) { + af::array step = af::randu(elem, ty, r); + bool different = !af::allTrue(first == step); + ASSERT_TRUE(different); + } +} + +TYPED_TEST(RandomEngine, DISABLED_philoxRandomEnginePeriod) +{ + testRandomEnginePeriod(AF_RANDOM_ENGINE_PHILOX_4X32_10); +} + +TYPED_TEST(RandomEngine, DISABLED_threefryRandomEnginePeriod) +{ + testRandomEnginePeriod(AF_RANDOM_ENGINE_THREEFRY_2X32_16); +} + +TYPED_TEST(RandomEngine, DISABLED_mersenneRandomEnginePeriod) +{ + testRandomEnginePeriod(AF_RANDOM_ENGINE_MERSENNE_GP11213); +} + +template +T chi2_statistic(array input, array expected) { + expected *= af::sum(input) / af::sum(expected); + array diff = input - expected; + return af::sum((diff * diff) / expected); +} + +template +void testRandomEngineUniformChi2(randomEngineType type) +{ + if (noDoubleTests()) return; + af::dtype ty = (af::dtype)af::dtype_traits::af_type; + + int elem = 256*1024*1024; + int steps = 32; + int bins = 100; + + array total_hist = af::constant(0.0, bins, ty); + array expected = af::constant(1.0/bins, bins, ty); + + af::randomEngine r(type, 0); + + // R> qchisq(c(5e-6, 1 - 5e-6), 99) + // [1] 48.68125 173.87456 + T lower = 48.68125; + T upper = 173.87456; + + bool prev_step = true; + bool prev_total = true; + for (int i = 0; i < steps; ++i) { + array step_hist = af::histogram(af::randu(elem, ty, r), bins, 0.0, 1.0); + T step_chi2 = chi2_statistic(step_hist, expected); + if (!prev_step) { + EXPECT_GT(step_chi2, lower) << "at step: " << i; + EXPECT_LT(step_chi2, upper) << "at step: " << i; + } + prev_step = step_chi2 > lower && step_chi2 < upper; + + total_hist += step_hist; + T total_chi2 = chi2_statistic(total_hist, expected); + if (!prev_total) { + EXPECT_GT(total_chi2, lower) << "at step: " << i; + EXPECT_LT(total_chi2, upper) << "at step: " << i; + } + prev_total = total_chi2 > lower && total_chi2 < upper; + } +} + +TYPED_TEST(RandomEngine, DISABLED_philoxRandomEngineUniformChi2) +{ + testRandomEngineUniformChi2(AF_RANDOM_ENGINE_PHILOX_4X32_10); +} + +TYPED_TEST(RandomEngine, DISABLED_threefryRandomEngineUniformChi2) +{ + testRandomEngineUniformChi2(AF_RANDOM_ENGINE_THREEFRY_2X32_16); +} + +TYPED_TEST(RandomEngine, DISABLED_mersenneRandomEngineUniformChi2) +{ + testRandomEngineUniformChi2(AF_RANDOM_ENGINE_MERSENNE_GP11213); +} diff --git a/test/random_practrand.cpp b/test/random_practrand.cpp new file mode 100644 index 0000000000..806c9a4693 --- /dev/null +++ b/test/random_practrand.cpp @@ -0,0 +1,27 @@ +// Generate random bits and send them to STDOUT. +// Suitable for testing with PractRand, c.f. http://pracrand.sourceforge.net/ +// and http://www.pcg-random.org/posts/how-to-test-with-practrand.html +// Commandline arguments: backend, device, rng_type +// Example: +// random_practrand 0 0 200 | RNG_test stdin32 +#include +#include +#include + +int main(int argc, char ** argv) { + int backend = argc > 1 ? atoi(argv[1]) : 0; + af::setBackend(static_cast(backend)); + int device = argc > 2 ? atoi(argv[2]) : 0; + af::setDevice(device); + int rng = argc > 3 ? atoi(argv[3]) : 100; + af::setDefaultRandomEngineType(static_cast(rng)); + + af::setSeed(0xfe47fe0cc078ec30ULL); + int samples = 1024 * 1024; + while (1) { + af::array values = af::randu(samples, u32); + uint32_t *pvalues = values.host(); + fwrite((void*) pvalues, samples * sizeof(*pvalues), 1, stdout); + free(pvalues); + } +}