diff --git a/sources/accelerators/bvh.cc b/sources/accelerators/bvh.cc
index 0df5cfe..413a46a 100644
--- a/sources/accelerators/bvh.cc
+++ b/sources/accelerators/bvh.cc
@@ -39,7 +39,7 @@ static const int orderTable[] = {
     0x40123, 0x40132, 0x41023, 0x41032, 0x42301, 0x43201, 0x42310, 0x43210,  // ++|++
 };
 
-int test_AABB(const __m128 bboxes[2][3], const __m128 org[3], const __m128 idir[3], const int sign[3], __m128 tmin, __m128 tmax) {
+int test_AABB(const __m128 bboxes[2][3], const __m128 org[3], const __m128 idir[3], const int sign[3], __m128 tmin, __m128 tmax) noexcept {
     tmin = _mm_max_ps(tmin, _mm_mul_ps(_mm_sub_ps(bboxes[sign[0]][0], org[0]), idir[0]));
     tmax = _mm_min_ps(tmax, _mm_mul_ps(_mm_sub_ps(bboxes[1 - sign[0]][0], org[0]), idir[0]));
     tmin = _mm_max_ps(tmin, _mm_mul_ps(_mm_sub_ps(bboxes[sign[1]][1], org[1]), idir[1]));
@@ -134,6 +134,7 @@ BVHAccel::BVHAccel(const std::vector<std::shared_ptr<Primitive>> &prims,
 }
 
 BVHAccel::~BVHAccel() {
+    release();
 }
 
 Bounds3d BVHAccel::worldBound() const {
@@ -144,7 +145,7 @@ void BVHAccel::construct() {
     if (primitives_.empty()) return;
         
     std::vector<BVHPrimitiveInfo> primitiveInfo(primitives_.size());
-    for (int i = 0; i < primitives_.size(); i++) {
+    for (int i = 0; i < (int)primitives_.size(); i++) {
         primitiveInfo[i] = { i, primitives_[i]->worldBound() };
     }
 
@@ -241,10 +242,10 @@ BVHNode* BVHAccel::constructRec(std::vector<BVHPrimitiveInfo>& buildData,
 }
 
 void BVHAccel::release() {
-    for (int i = 0; i < simdNodes_.size(); i++) {
-        align_free(simdNodes_[i]);
+    for (auto &node : simdNodes_) {
+        align_free(node);
     }
-    
+
     root_ = nullptr;
     simdNodes_.clear();
 }
@@ -253,8 +254,7 @@ void BVHAccel::collapse2QBVH(BVHNode* node) {
     BVHNode *lc = node->left;
     BVHNode *rc = node->right;
     
-    SIMDBVHNode* n =
-    static_cast<SIMDBVHNode*>(align_alloc(sizeof(SIMDBVHNode), 16));
+    SIMDBVHNode* n = (SIMDBVHNode *)align_alloc(sizeof(SIMDBVHNode), 16);
     Assertion(n != nullptr, "allocation failed !!");
     
     simdNodes_.push_back(n);
@@ -337,7 +337,6 @@ bool BVHAccel::intersectBVH(Ray &ray, SurfaceInteraction *isect) const{
     nodeStack.push(root_);
 
     bool hit = false;
-    int cnt = 0;
     while (!nodeStack.empty()) {
         BVHNode* node = nodeStack.top();
         nodeStack.pop();
@@ -367,7 +366,6 @@ bool BVHAccel::intersectBVH(Ray& ray) const {
     std::stack<BVHNode*> nodeStack;
     nodeStack.push(root_);
 
-    bool hit = false;
     while (!nodeStack.empty()) {
         BVHNode* node = nodeStack.top();
         nodeStack.pop();
diff --git a/sources/core/memory.cc b/sources/core/memory.cc
index 9f99478..f92b604 100644
--- a/sources/core/memory.cc
+++ b/sources/core/memory.cc
@@ -2,6 +2,7 @@
 #include "memory.h"
 
 #include <cstdlib>
+#include <cstddef>
 #include <algorithm>
 
 namespace spica {
@@ -22,12 +23,13 @@ MemoryArena::MemoryArena(MemoryArena&& arena) noexcept
 
 MemoryArena::~MemoryArena() {
     align_free(currentBlock_);
-    for (auto& block : usedBlocks_)      align_free(block.second);
+    for (auto& block : usedBlocks_) align_free(block.second);
     for (auto& block : availableBlocks_) align_free(block.second);
 }
 
 void* MemoryArena::allocBytes(size_t nBytes) {
-    nBytes = ((nBytes + 0xf) & (~0xf));
+    const int align = alignof(std::max_align_t);
+    nBytes = ((nBytes + align - 1) & ~(align - 1));
     if (currentBlockPos_ + nBytes > currentAllocSize_) {
         if (currentBlock_) {
             usedBlocks_.emplace_back(currentAllocSize_, currentBlock_);
@@ -39,7 +41,7 @@ void* MemoryArena::allocBytes(size_t nBytes) {
             iter != availableBlocks_.end(); ++iter) {
             if (iter->first >= nBytes) {
                 currentAllocSize_ = iter->first;
-                currentBlock_     = iter->second;
+                currentBlock_ = iter->second;
                 availableBlocks_.erase(iter);
                 break;
             }
@@ -47,7 +49,7 @@ void* MemoryArena::allocBytes(size_t nBytes) {
 
         if (!currentBlock_) {
             currentAllocSize_ = std::max(nBytes, blockSize_);
-            currentBlock_     = (unsigned char*)align_alloc(currentAllocSize_, 64);
+            currentBlock_ = (uint8_t*)align_alloc(currentAllocSize_, 64);
         }
         currentBlockPos_ = 0;
     }
@@ -63,8 +65,8 @@ void MemoryArena::reset() {
 
 size_t MemoryArena::totalAllocated() const {
     size_t total = currentAllocSize_;
-    for (auto& block : usedBlocks_)      total += block.first;
-    for (auto& block : availableBlocks_) total += block.first;
+    for (const auto &block : usedBlocks_)      total += block.first;
+    for (const auto &block : availableBlocks_) total += block.first;
     return total;
 }
 
diff --git a/sources/core/memory.h b/sources/core/memory.h
index 4ab7841..57cc6fd 100644
--- a/sources/core/memory.h
+++ b/sources/core/memory.h
@@ -43,7 +43,7 @@ class SPICA_EXPORTS MEM_ALIGN(64) MemoryArena : private Uncopyable {
         using Elem = typename std::remove_extent<T>::type;
         Elem *ret = (Elem*)allocBytes(sizeof(Elem) * size);
         for (size_t i = 0; i < size; i++) {
-            new (ret + i) Elem();
+            new (&ret[i]) Elem();
         }
         return ret;
     }
@@ -60,7 +60,7 @@ class SPICA_EXPORTS MEM_ALIGN(64) MemoryArena : private Uncopyable {
     const size_t blockSize_;
     size_t currentBlockPos_ = 0;
     size_t currentAllocSize_ = 0;
-    unsigned char* currentBlock_ = nullptr;
+    uint8_t* currentBlock_ = nullptr;
     std::list<std::pair<size_t, unsigned char*>> usedBlocks_, availableBlocks_;
 };  // class MemoryArena
 
diff --git a/sources/core/parallel.cc b/sources/core/parallel.cc
index 9fb5ac9..0fa01a6 100644
--- a/sources/core/parallel.cc
+++ b/sources/core/parallel.cc
@@ -8,8 +8,6 @@
 #include <mutex>
 #include <condition_variable>
 
-static int numUserThreads = std::thread::hardware_concurrency();
-
 namespace spica {
 
 inline uint64_t doubleToBits(double v) {
@@ -45,127 +43,179 @@ void AtomicDouble::add(double v) {
     } while (!bits.compare_exchange_weak(oldBits, newBits));    
 }
 
+void Barrier::wait() {
+    std::unique_lock<std::mutex> lock(mutex);
+    if (--count == 0) {
+        cv.notify_all();
+    } else {
+        cv.wait(lock, [this] { return count == 0; });
+    }
+}
+
 }  // namespace spica
 
-class WorkerTask;
+// ---------------------------------------------------------------------------------------------------------------------
+// parallel_for definition
+// ---------------------------------------------------------------------------------------------------------------------
+
+static std::vector<std::thread> threads;
+static bool shutdownThreads = false;
+class ParallelForLoop;
+static ParallelForLoop *workList = nullptr;
+static std::mutex workListMutex;
+
+static int threadCount = std::thread::hardware_concurrency();
 
 static thread_local int threadID;
-static std::mutex workerListMutex;
 static std::condition_variable condval;
-static WorkerTask* workerTask;
 
-class WorkerTask {
+class ParallelForLoop {
 public:
-    WorkerTask(const std::function<void(int)>& f, int csize, int tasks)
-        : func{ f }
-        , chunkSize{ csize }
-        , nTasks { tasks } {
+    ParallelForLoop(std::function<void(int64_t)> func, int64_t maxIndex, int chunkSize)
+        : func{ std::move(func) }
+        , maxIndex{ maxIndex }
+        , chunkSize{ chunkSize } {
     }
 
     bool finished() const {
-        return currentIndex >= nTasks && activeWorkers == 0;
+        return nextIndex >= maxIndex && activeWorkers == 0;
     }
 
-    const std::function<void(int)>& func;
+public:
+    std::function<void(int64_t)> func;
+    const int64_t maxIndex;
     const int chunkSize;
-    const int nTasks;
-    bool isWorking = false;
-    int currentIndex = 0;
+
+    int64_t nextIndex = 0;
     int activeWorkers = 0;
+    ParallelForLoop *next = nullptr;
 };
 
-class WorkerTaskManager {
-public:
-    WorkerTaskManager(const std::function<void(int)>& f,
-                      int chunkSize, int nTasks) {
-        if (workerTask == nullptr) {
-            workerTask = new WorkerTask(f, chunkSize, nTasks);
-        }
-    }
 
-    ~WorkerTaskManager() {
-        delete workerTask;
-        workerTask = nullptr;
-    }
-};
+static std::condition_variable workListCondition;
 
-static void workerThreadFunc(int threadIndex) {
+static void workerThreadFunc(int threadIndex, std::shared_ptr<spica::Barrier> barrier) {
+    // Initialize worker
     threadID = threadIndex;
-    std::unique_lock<std::mutex> lock(workerListMutex);
-    while (!workerTask->finished()) {
-        if (!workerTask->isWorking) {
-            condval.wait(lock);
+    barrier->wait();
+    barrier.reset();
+
+    std::unique_lock<std::mutex> lock(workListMutex);
+    while (!shutdownThreads) {
+        if (!workList) {
+            // Sleep thread while no task found.
+            workListCondition.wait(lock);
         } else {
-            int indexStart = workerTask->currentIndex;
-            int indexEnd   = std::min(workerTask->nTasks, indexStart + workerTask->chunkSize);
-            workerTask->currentIndex = indexEnd;
-            if (workerTask->currentIndex == workerTask->nTasks) {
-                workerTask->isWorking = false;
+            // Get task and evaluate a function
+            ParallelForLoop &loop = *workList;
+
+            int64_t indexStart = loop.nextIndex;
+            int64_t indexEnd = std::min(indexStart + loop.chunkSize, loop.maxIndex);
+
+            // Update loop state
+            loop.nextIndex = indexEnd;
+            if (loop.nextIndex == loop.maxIndex) {
+                workList = loop.next;
             }
-            workerTask->activeWorkers++;
-            lock.unlock();
+            loop.activeWorkers++;
 
-            for (int i = indexStart; i < indexEnd; i++) {
-                workerTask->func(i);
+            // Run loop
+            lock.unlock();
+            for (int64_t index = indexStart; index < indexEnd; index++) {
+                if (loop.func) {
+                    loop.func(index);
+                }
             }
             lock.lock();
 
-            workerTask->activeWorkers--;
-            if (workerTask->finished()) {
-                printf("OK!!");
-                condval.notify_all();
+            // Finish working for a function
+            loop.activeWorkers--;
+            if (loop.finished()) {
+                workListCondition.notify_all();
             }
         }
     }
 }
 
-void parallel_for(int start, int end, const std::function<void(int)>& func,
-                  ParallelSchedule schedule) {
-    const int nTasks = (end - start);
-    const int nThreads = numSystemThreads();
-    const int chunkSize = schedule == ParallelSchedule::Dynamic ? 1 : (nTasks + nThreads - 1) / nThreads;
-    WorkerTaskManager manager(func, chunkSize, nTasks);    
+class ParallelManager {
+public:
+    ParallelManager() {
+        // Initialize threads
+        threadID = 0;
+        const int nThreads = numSystemThreads();
+        auto barrier = std::make_shared<spica::Barrier>(nThreads);
+        for (int i = 0; i < nThreads - 1; i++) {
+            threads.emplace_back(workerThreadFunc, i + 1, barrier);
+        }
+        barrier->wait();
+    }
 
-    std::vector<std::thread> threads;
-    threadID = 0;
-    for (int i = 0; i < nThreads - 1; i++) {
-        threads.emplace_back(workerThreadFunc, i + 1);
+    ~ParallelManager() {
+        // Clean up threads
+        if (threads.empty()) return;
+
+        {
+            std::lock_guard<std::mutex> lock(workListMutex);
+            shutdownThreads = true;
+            workListCondition.notify_all();
+        }
+
+        for (auto &thread: threads) {
+            thread.join();
+        }
+        threads.erase(threads.begin(), threads.end());
+        shutdownThreads = false;
     }
+};
 
-    workerListMutex.lock();
-    workerTask->isWorking = true;
-    workerListMutex.unlock();
+static std::unique_ptr<ParallelManager> manager = nullptr;
 
-    {
-        std::unique_lock<std::mutex> lock(workerListMutex);
-        condval.notify_all();
+void parallel_for(int64_t start, int64_t end, const std::function<void(int)>& func, ParallelSchedule schedule) {
+    if (!manager) {
+        manager = std::make_unique<ParallelManager>();
+    }
 
-        while (!workerTask->finished()) {
-            int indexStart = workerTask->currentIndex;
-            int indexEnd   = std::min(workerTask->nTasks, indexStart + workerTask->chunkSize);
-            workerTask->currentIndex = indexEnd;
-            if (workerTask->currentIndex == workerTask->nTasks) {
-                workerTask->isWorking = false;
-            }
-            workerTask->activeWorkers++;
-            lock.unlock();
+    // Size parameters
+    const int64_t count = end - start;
+    const int nThreads = numSystemThreads();
+    const int chunkSize = schedule == ParallelSchedule::Dynamic ? 1 : (count + nThreads - 1) / nThreads;
+
+    // Create and enqueue "ParallelForLoop" object.
+    ParallelForLoop loop(std::move(func), count, chunkSize);
+    workListMutex.lock();
+    loop.next = workList;
+    workList = &loop;
+    workListMutex.unlock();
+
+    // Notify worker threads of work to be done.
+    std::unique_lock<std::mutex> lock(workListMutex);
+    workListCondition.notify_all();
+
+    // Make each worker thread active from a main thread
+    while (!loop.finished()) {
+        int64_t indexStart = loop.nextIndex;
+        int64_t indexEnd   = std::min(indexStart + loop.chunkSize, loop.maxIndex);
+
+        loop.nextIndex = indexEnd;
+        if (loop.nextIndex == loop.maxIndex) {
+            workList = loop.next;
+        }
+        loop.activeWorkers++;
 
-            for (int i = indexStart; i < indexEnd; i++) {
-                workerTask->func(i);
+        lock.unlock();
+        for (int64_t index = indexStart; index < indexEnd; index++) {
+            if (loop.func) {
+                loop.func(index);
             }
-            lock.lock();
-            workerTask->activeWorkers--;
         }
-        condval.notify_all();
-    }
+        lock.lock();
 
-    for (auto& t : threads) {
-        t.join();
+        loop.activeWorkers--;
     }
 }
 
 int numSystemThreads() {
-    return std::max(1, numUserThreads);
+    return std::max(1, threadCount);
 }
 
 int getThreadID() {
@@ -174,8 +224,8 @@ int getThreadID() {
 
 void setNumThreads(uint32_t n) {
     if (n == 0) {
-        numUserThreads = std::thread::hardware_concurrency();
+        threadCount = std::thread::hardware_concurrency();
     } else {
-        numUserThreads = std::min(n, std::thread::hardware_concurrency()); 
+        threadCount = std::max(1u, std::min(n, std::thread::hardware_concurrency()));
     }
 }
\ No newline at end of file
diff --git a/sources/core/parallel.h b/sources/core/parallel.h
index 286f73c..1ac27ee 100644
--- a/sources/core/parallel.h
+++ b/sources/core/parallel.h
@@ -7,7 +7,9 @@
 
 #include <iostream>
 #include <atomic>
+#include <mutex>
 #include <functional>
+#include <condition_variable>
 
 #include "core/common.h"
 
@@ -24,6 +26,24 @@ class SPICA_EXPORTS AtomicDouble {
     std::atomic<uint64_t> bits;
 };
 
+class SPICA_EXPORTS Barrier {
+public:
+    Barrier(int count)
+        : count{ count } {
+        Assertion(count >= 0, "# of threads you use should be positive!");
+    }
+    virtual ~Barrier() {
+        Assertion(count == 0, "Not all the treads finish their task properly!");
+    }
+
+    void wait();
+
+private:
+    std::mutex mutex;
+    std::condition_variable cv;
+    int count;
+};
+
 }  // namespace spica
 
 enum class ParallelSchedule {
@@ -31,7 +51,7 @@ enum class ParallelSchedule {
     Dynamic = 0x02
 };
 
-SPICA_EXPORTS void parallel_for(int start, int end, const std::function<void(int)>& func,
+SPICA_EXPORTS void parallel_for(int64_t start, int64_t end, const std::function<void(int)> &func,
                                 ParallelSchedule schedule = ParallelSchedule::Dynamic);
 
 SPICA_EXPORTS int numSystemThreads();
diff --git a/sources/integrators/vcmups/vcmups.cc b/sources/integrators/vcmups/vcmups.cc
index 736e6fd..b148f90 100644
--- a/sources/integrators/vcmups/vcmups.cc
+++ b/sources/integrators/vcmups/vcmups.cc
@@ -267,7 +267,7 @@ VCMUPSIntegrator::VCMUPSIntegrator(const std::shared_ptr<Sampler> &sampler, doub
 
 VCMUPSIntegrator::VCMUPSIntegrator(RenderParams &params)
     : VCMUPSIntegrator{ std::static_pointer_cast<Sampler>(params.getObject("sampler")),
-                        params.getDouble("lookupRadiusRatio", 0.8) } {
+                        params.getDouble("alpha", 0.8) } {
 }
 
 void VCMUPSIntegrator::initialize(const std::shared_ptr<const Camera> &camera,
@@ -324,7 +324,7 @@ void VCMUPSIntegrator::render(const std::shared_ptr<const Camera> &camera,
 
         // Storage for path samples
         std::vector<int> lightPathLengths(numPixels);
-        std::vector<std::unique_ptr<Vertex[]>> lightPaths(numPixels);
+        std::vector<Vertex*> lightPaths(numPixels);
 
         // Sample camera and light paths
         MsgInfo("Sampling paths");
@@ -335,9 +335,9 @@ void VCMUPSIntegrator::render(const std::shared_ptr<const Camera> &camera,
             const auto &sampler = samplers[threadID];
             sampler->startPixel();
 
-            lightPaths[pid] = std::make_unique<Vertex[]>(maxBounces + 1);
+            lightPaths[pid] = arenas[threadID].allocate<Vertex[]>(maxBounces + 1);
             lightPathLengths[pid] = calcLightSubpath(scene, *sampler, arenas[threadID],
-                                                     maxBounces + 1, lightDist, lightPaths[pid].get());
+                                                     maxBounces + 1, lightDist, lightPaths[pid]);
 
             proc++;
             if (proc % 1000 == 0 || proc == numPixels) {
@@ -385,7 +385,7 @@ void VCMUPSIntegrator::render(const std::shared_ptr<const Camera> &camera,
             const int nLight = lightPathLengths[pid];
             const Point2d randFilm = sampler->get2D();
 
-            Vertex *cameraPath = (Vertex *)subArenas[threadID].allocate<Vertex[]>(maxBounces + 2);
+            Vertex *cameraPath = subArenas[threadID].allocate<Vertex[]>(maxBounces + 2);
             const int nCamera = calcCameraSubpath(scene, *sampler, subArenas[threadID], maxBounces + 2,
                                                   *camera, Point2i(x, y), randFilm, cameraPath);
 
@@ -399,8 +399,8 @@ void VCMUPSIntegrator::render(const std::shared_ptr<const Camera> &camera,
                     double misWeight = 0.0;
 
                     const int lookupSize = params.getInt("lookupSize", 32);
-                    const double lookupRadius = params.getDouble("globalLookupRadius", 0.125) * lookupRadiusScale_;
-                    Spectrum Lpath = connectVCM(scene, lightPaths[pid].get(), cameraPath,
+                    const double lookupRadius = params.getDouble("lookupRadius", 0.125) * lookupRadiusScale_;
+                    Spectrum Lpath = connectVCM(scene, lightPaths[pid], cameraPath,
                                                 lid, cid, nLight, nCamera, photonMaps,
                                                 lookupSize, lookupRadius, numPixels,
                                                 lightDist, *camera, *sampler, &pFilm, &misWeight);