diff --git a/sources/accelerators/bvh.cc b/sources/accelerators/bvh.cc index 0df5cfe..413a46a 100644 --- a/sources/accelerators/bvh.cc +++ b/sources/accelerators/bvh.cc @@ -39,7 +39,7 @@ static const int orderTable[] = { 0x40123, 0x40132, 0x41023, 0x41032, 0x42301, 0x43201, 0x42310, 0x43210, // ++|++ }; -int test_AABB(const __m128 bboxes[2][3], const __m128 org[3], const __m128 idir[3], const int sign[3], __m128 tmin, __m128 tmax) { +int test_AABB(const __m128 bboxes[2][3], const __m128 org[3], const __m128 idir[3], const int sign[3], __m128 tmin, __m128 tmax) noexcept { tmin = _mm_max_ps(tmin, _mm_mul_ps(_mm_sub_ps(bboxes[sign[0]][0], org[0]), idir[0])); tmax = _mm_min_ps(tmax, _mm_mul_ps(_mm_sub_ps(bboxes[1 - sign[0]][0], org[0]), idir[0])); tmin = _mm_max_ps(tmin, _mm_mul_ps(_mm_sub_ps(bboxes[sign[1]][1], org[1]), idir[1])); @@ -134,6 +134,7 @@ BVHAccel::BVHAccel(const std::vector> &prims, } BVHAccel::~BVHAccel() { + release(); } Bounds3d BVHAccel::worldBound() const { @@ -144,7 +145,7 @@ void BVHAccel::construct() { if (primitives_.empty()) return; std::vector primitiveInfo(primitives_.size()); - for (int i = 0; i < primitives_.size(); i++) { + for (int i = 0; i < (int)primitives_.size(); i++) { primitiveInfo[i] = { i, primitives_[i]->worldBound() }; } @@ -241,10 +242,10 @@ BVHNode* BVHAccel::constructRec(std::vector& buildData, } void BVHAccel::release() { - for (int i = 0; i < simdNodes_.size(); i++) { - align_free(simdNodes_[i]); + for (auto &node : simdNodes_) { + align_free(node); } - + root_ = nullptr; simdNodes_.clear(); } @@ -253,8 +254,7 @@ void BVHAccel::collapse2QBVH(BVHNode* node) { BVHNode *lc = node->left; BVHNode *rc = node->right; - SIMDBVHNode* n = - static_cast(align_alloc(sizeof(SIMDBVHNode), 16)); + SIMDBVHNode* n = (SIMDBVHNode *)align_alloc(sizeof(SIMDBVHNode), 16); Assertion(n != nullptr, "allocation failed !!"); simdNodes_.push_back(n); @@ -337,7 +337,6 @@ bool BVHAccel::intersectBVH(Ray &ray, SurfaceInteraction *isect) const{ nodeStack.push(root_); bool hit = false; - int cnt = 0; while (!nodeStack.empty()) { BVHNode* node = nodeStack.top(); nodeStack.pop(); @@ -367,7 +366,6 @@ bool BVHAccel::intersectBVH(Ray& ray) const { std::stack nodeStack; nodeStack.push(root_); - bool hit = false; while (!nodeStack.empty()) { BVHNode* node = nodeStack.top(); nodeStack.pop(); diff --git a/sources/core/memory.cc b/sources/core/memory.cc index 9f99478..f92b604 100644 --- a/sources/core/memory.cc +++ b/sources/core/memory.cc @@ -2,6 +2,7 @@ #include "memory.h" #include +#include #include namespace spica { @@ -22,12 +23,13 @@ MemoryArena::MemoryArena(MemoryArena&& arena) noexcept MemoryArena::~MemoryArena() { align_free(currentBlock_); - for (auto& block : usedBlocks_) align_free(block.second); + for (auto& block : usedBlocks_) align_free(block.second); for (auto& block : availableBlocks_) align_free(block.second); } void* MemoryArena::allocBytes(size_t nBytes) { - nBytes = ((nBytes + 0xf) & (~0xf)); + const int align = alignof(std::max_align_t); + nBytes = ((nBytes + align - 1) & ~(align - 1)); if (currentBlockPos_ + nBytes > currentAllocSize_) { if (currentBlock_) { usedBlocks_.emplace_back(currentAllocSize_, currentBlock_); @@ -39,7 +41,7 @@ void* MemoryArena::allocBytes(size_t nBytes) { iter != availableBlocks_.end(); ++iter) { if (iter->first >= nBytes) { currentAllocSize_ = iter->first; - currentBlock_ = iter->second; + currentBlock_ = iter->second; availableBlocks_.erase(iter); break; } @@ -47,7 +49,7 @@ void* MemoryArena::allocBytes(size_t nBytes) { if (!currentBlock_) { currentAllocSize_ = std::max(nBytes, blockSize_); - currentBlock_ = (unsigned char*)align_alloc(currentAllocSize_, 64); + currentBlock_ = (uint8_t*)align_alloc(currentAllocSize_, 64); } currentBlockPos_ = 0; } @@ -63,8 +65,8 @@ void MemoryArena::reset() { size_t MemoryArena::totalAllocated() const { size_t total = currentAllocSize_; - for (auto& block : usedBlocks_) total += block.first; - for (auto& block : availableBlocks_) total += block.first; + for (const auto &block : usedBlocks_) total += block.first; + for (const auto &block : availableBlocks_) total += block.first; return total; } diff --git a/sources/core/memory.h b/sources/core/memory.h index 4ab7841..57cc6fd 100644 --- a/sources/core/memory.h +++ b/sources/core/memory.h @@ -43,7 +43,7 @@ class SPICA_EXPORTS MEM_ALIGN(64) MemoryArena : private Uncopyable { using Elem = typename std::remove_extent::type; Elem *ret = (Elem*)allocBytes(sizeof(Elem) * size); for (size_t i = 0; i < size; i++) { - new (ret + i) Elem(); + new (&ret[i]) Elem(); } return ret; } @@ -60,7 +60,7 @@ class SPICA_EXPORTS MEM_ALIGN(64) MemoryArena : private Uncopyable { const size_t blockSize_; size_t currentBlockPos_ = 0; size_t currentAllocSize_ = 0; - unsigned char* currentBlock_ = nullptr; + uint8_t* currentBlock_ = nullptr; std::list> usedBlocks_, availableBlocks_; }; // class MemoryArena diff --git a/sources/core/parallel.cc b/sources/core/parallel.cc index 9fb5ac9..0fa01a6 100644 --- a/sources/core/parallel.cc +++ b/sources/core/parallel.cc @@ -8,8 +8,6 @@ #include #include -static int numUserThreads = std::thread::hardware_concurrency(); - namespace spica { inline uint64_t doubleToBits(double v) { @@ -45,127 +43,179 @@ void AtomicDouble::add(double v) { } while (!bits.compare_exchange_weak(oldBits, newBits)); } +void Barrier::wait() { + std::unique_lock lock(mutex); + if (--count == 0) { + cv.notify_all(); + } else { + cv.wait(lock, [this] { return count == 0; }); + } +} + } // namespace spica -class WorkerTask; +// --------------------------------------------------------------------------------------------------------------------- +// parallel_for definition +// --------------------------------------------------------------------------------------------------------------------- + +static std::vector threads; +static bool shutdownThreads = false; +class ParallelForLoop; +static ParallelForLoop *workList = nullptr; +static std::mutex workListMutex; + +static int threadCount = std::thread::hardware_concurrency(); static thread_local int threadID; -static std::mutex workerListMutex; static std::condition_variable condval; -static WorkerTask* workerTask; -class WorkerTask { +class ParallelForLoop { public: - WorkerTask(const std::function& f, int csize, int tasks) - : func{ f } - , chunkSize{ csize } - , nTasks { tasks } { + ParallelForLoop(std::function func, int64_t maxIndex, int chunkSize) + : func{ std::move(func) } + , maxIndex{ maxIndex } + , chunkSize{ chunkSize } { } bool finished() const { - return currentIndex >= nTasks && activeWorkers == 0; + return nextIndex >= maxIndex && activeWorkers == 0; } - const std::function& func; +public: + std::function func; + const int64_t maxIndex; const int chunkSize; - const int nTasks; - bool isWorking = false; - int currentIndex = 0; + + int64_t nextIndex = 0; int activeWorkers = 0; + ParallelForLoop *next = nullptr; }; -class WorkerTaskManager { -public: - WorkerTaskManager(const std::function& f, - int chunkSize, int nTasks) { - if (workerTask == nullptr) { - workerTask = new WorkerTask(f, chunkSize, nTasks); - } - } - ~WorkerTaskManager() { - delete workerTask; - workerTask = nullptr; - } -}; +static std::condition_variable workListCondition; -static void workerThreadFunc(int threadIndex) { +static void workerThreadFunc(int threadIndex, std::shared_ptr barrier) { + // Initialize worker threadID = threadIndex; - std::unique_lock lock(workerListMutex); - while (!workerTask->finished()) { - if (!workerTask->isWorking) { - condval.wait(lock); + barrier->wait(); + barrier.reset(); + + std::unique_lock lock(workListMutex); + while (!shutdownThreads) { + if (!workList) { + // Sleep thread while no task found. + workListCondition.wait(lock); } else { - int indexStart = workerTask->currentIndex; - int indexEnd = std::min(workerTask->nTasks, indexStart + workerTask->chunkSize); - workerTask->currentIndex = indexEnd; - if (workerTask->currentIndex == workerTask->nTasks) { - workerTask->isWorking = false; + // Get task and evaluate a function + ParallelForLoop &loop = *workList; + + int64_t indexStart = loop.nextIndex; + int64_t indexEnd = std::min(indexStart + loop.chunkSize, loop.maxIndex); + + // Update loop state + loop.nextIndex = indexEnd; + if (loop.nextIndex == loop.maxIndex) { + workList = loop.next; } - workerTask->activeWorkers++; - lock.unlock(); + loop.activeWorkers++; - for (int i = indexStart; i < indexEnd; i++) { - workerTask->func(i); + // Run loop + lock.unlock(); + for (int64_t index = indexStart; index < indexEnd; index++) { + if (loop.func) { + loop.func(index); + } } lock.lock(); - workerTask->activeWorkers--; - if (workerTask->finished()) { - printf("OK!!"); - condval.notify_all(); + // Finish working for a function + loop.activeWorkers--; + if (loop.finished()) { + workListCondition.notify_all(); } } } } -void parallel_for(int start, int end, const std::function& func, - ParallelSchedule schedule) { - const int nTasks = (end - start); - const int nThreads = numSystemThreads(); - const int chunkSize = schedule == ParallelSchedule::Dynamic ? 1 : (nTasks + nThreads - 1) / nThreads; - WorkerTaskManager manager(func, chunkSize, nTasks); +class ParallelManager { +public: + ParallelManager() { + // Initialize threads + threadID = 0; + const int nThreads = numSystemThreads(); + auto barrier = std::make_shared(nThreads); + for (int i = 0; i < nThreads - 1; i++) { + threads.emplace_back(workerThreadFunc, i + 1, barrier); + } + barrier->wait(); + } - std::vector threads; - threadID = 0; - for (int i = 0; i < nThreads - 1; i++) { - threads.emplace_back(workerThreadFunc, i + 1); + ~ParallelManager() { + // Clean up threads + if (threads.empty()) return; + + { + std::lock_guard lock(workListMutex); + shutdownThreads = true; + workListCondition.notify_all(); + } + + for (auto &thread: threads) { + thread.join(); + } + threads.erase(threads.begin(), threads.end()); + shutdownThreads = false; } +}; - workerListMutex.lock(); - workerTask->isWorking = true; - workerListMutex.unlock(); +static std::unique_ptr manager = nullptr; - { - std::unique_lock lock(workerListMutex); - condval.notify_all(); +void parallel_for(int64_t start, int64_t end, const std::function& func, ParallelSchedule schedule) { + if (!manager) { + manager = std::make_unique(); + } - while (!workerTask->finished()) { - int indexStart = workerTask->currentIndex; - int indexEnd = std::min(workerTask->nTasks, indexStart + workerTask->chunkSize); - workerTask->currentIndex = indexEnd; - if (workerTask->currentIndex == workerTask->nTasks) { - workerTask->isWorking = false; - } - workerTask->activeWorkers++; - lock.unlock(); + // Size parameters + const int64_t count = end - start; + const int nThreads = numSystemThreads(); + const int chunkSize = schedule == ParallelSchedule::Dynamic ? 1 : (count + nThreads - 1) / nThreads; + + // Create and enqueue "ParallelForLoop" object. + ParallelForLoop loop(std::move(func), count, chunkSize); + workListMutex.lock(); + loop.next = workList; + workList = &loop; + workListMutex.unlock(); + + // Notify worker threads of work to be done. + std::unique_lock lock(workListMutex); + workListCondition.notify_all(); + + // Make each worker thread active from a main thread + while (!loop.finished()) { + int64_t indexStart = loop.nextIndex; + int64_t indexEnd = std::min(indexStart + loop.chunkSize, loop.maxIndex); + + loop.nextIndex = indexEnd; + if (loop.nextIndex == loop.maxIndex) { + workList = loop.next; + } + loop.activeWorkers++; - for (int i = indexStart; i < indexEnd; i++) { - workerTask->func(i); + lock.unlock(); + for (int64_t index = indexStart; index < indexEnd; index++) { + if (loop.func) { + loop.func(index); } - lock.lock(); - workerTask->activeWorkers--; } - condval.notify_all(); - } + lock.lock(); - for (auto& t : threads) { - t.join(); + loop.activeWorkers--; } } int numSystemThreads() { - return std::max(1, numUserThreads); + return std::max(1, threadCount); } int getThreadID() { @@ -174,8 +224,8 @@ int getThreadID() { void setNumThreads(uint32_t n) { if (n == 0) { - numUserThreads = std::thread::hardware_concurrency(); + threadCount = std::thread::hardware_concurrency(); } else { - numUserThreads = std::min(n, std::thread::hardware_concurrency()); + threadCount = std::max(1u, std::min(n, std::thread::hardware_concurrency())); } } \ No newline at end of file diff --git a/sources/core/parallel.h b/sources/core/parallel.h index 286f73c..1ac27ee 100644 --- a/sources/core/parallel.h +++ b/sources/core/parallel.h @@ -7,7 +7,9 @@ #include #include +#include #include +#include #include "core/common.h" @@ -24,6 +26,24 @@ class SPICA_EXPORTS AtomicDouble { std::atomic bits; }; +class SPICA_EXPORTS Barrier { +public: + Barrier(int count) + : count{ count } { + Assertion(count >= 0, "# of threads you use should be positive!"); + } + virtual ~Barrier() { + Assertion(count == 0, "Not all the treads finish their task properly!"); + } + + void wait(); + +private: + std::mutex mutex; + std::condition_variable cv; + int count; +}; + } // namespace spica enum class ParallelSchedule { @@ -31,7 +51,7 @@ enum class ParallelSchedule { Dynamic = 0x02 }; -SPICA_EXPORTS void parallel_for(int start, int end, const std::function& func, +SPICA_EXPORTS void parallel_for(int64_t start, int64_t end, const std::function &func, ParallelSchedule schedule = ParallelSchedule::Dynamic); SPICA_EXPORTS int numSystemThreads(); diff --git a/sources/integrators/vcmups/vcmups.cc b/sources/integrators/vcmups/vcmups.cc index 736e6fd..b148f90 100644 --- a/sources/integrators/vcmups/vcmups.cc +++ b/sources/integrators/vcmups/vcmups.cc @@ -267,7 +267,7 @@ VCMUPSIntegrator::VCMUPSIntegrator(const std::shared_ptr &sampler, doub VCMUPSIntegrator::VCMUPSIntegrator(RenderParams ¶ms) : VCMUPSIntegrator{ std::static_pointer_cast(params.getObject("sampler")), - params.getDouble("lookupRadiusRatio", 0.8) } { + params.getDouble("alpha", 0.8) } { } void VCMUPSIntegrator::initialize(const std::shared_ptr &camera, @@ -324,7 +324,7 @@ void VCMUPSIntegrator::render(const std::shared_ptr &camera, // Storage for path samples std::vector lightPathLengths(numPixels); - std::vector> lightPaths(numPixels); + std::vector lightPaths(numPixels); // Sample camera and light paths MsgInfo("Sampling paths"); @@ -335,9 +335,9 @@ void VCMUPSIntegrator::render(const std::shared_ptr &camera, const auto &sampler = samplers[threadID]; sampler->startPixel(); - lightPaths[pid] = std::make_unique(maxBounces + 1); + lightPaths[pid] = arenas[threadID].allocate(maxBounces + 1); lightPathLengths[pid] = calcLightSubpath(scene, *sampler, arenas[threadID], - maxBounces + 1, lightDist, lightPaths[pid].get()); + maxBounces + 1, lightDist, lightPaths[pid]); proc++; if (proc % 1000 == 0 || proc == numPixels) { @@ -385,7 +385,7 @@ void VCMUPSIntegrator::render(const std::shared_ptr &camera, const int nLight = lightPathLengths[pid]; const Point2d randFilm = sampler->get2D(); - Vertex *cameraPath = (Vertex *)subArenas[threadID].allocate(maxBounces + 2); + Vertex *cameraPath = subArenas[threadID].allocate(maxBounces + 2); const int nCamera = calcCameraSubpath(scene, *sampler, subArenas[threadID], maxBounces + 2, *camera, Point2i(x, y), randFilm, cameraPath); @@ -399,8 +399,8 @@ void VCMUPSIntegrator::render(const std::shared_ptr &camera, double misWeight = 0.0; const int lookupSize = params.getInt("lookupSize", 32); - const double lookupRadius = params.getDouble("globalLookupRadius", 0.125) * lookupRadiusScale_; - Spectrum Lpath = connectVCM(scene, lightPaths[pid].get(), cameraPath, + const double lookupRadius = params.getDouble("lookupRadius", 0.125) * lookupRadiusScale_; + Spectrum Lpath = connectVCM(scene, lightPaths[pid], cameraPath, lid, cid, nLight, nCamera, photonMaps, lookupSize, lookupRadius, numPixels, lightDist, *camera, *sampler, &pFilm, &misWeight);