diff --git a/dmlc-core b/dmlc-core
index f35f14f30835..78b78be34ac2 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit f35f14f30835af238257b979cc1fac3e41ff3291
+Subproject commit 78b78be34ac27d30f2193f3d51848c62887669c4
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index f9dc84cc1fab..0f535b250319 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -282,11 +282,7 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
   std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
 
   API_BEGIN();
-  int nthread;
-  #pragma omp parallel
-  {
-    nthread = omp_get_num_threads();
-  }
+  const int nthread = omp_get_max_threads();
   data::SimpleCSRSource& mat = *source;
   common::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_);
   builder.InitBudget(0, nthread);
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index ae78e3864b15..c7c0b3d1b855 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -83,13 +83,8 @@ void SimpleDMatrix::MakeOneBatch(const std::vector<bool>& enabled,
   // clear rowset
   buffered_rowset_.clear();
   // bit map
-  int nthread;
+  const int nthread = omp_get_max_threads();
   std::vector<bool> bmap;
-  #pragma omp parallel
-  {
-    nthread = omp_get_num_threads();
-  }
-
   pcol->Clear();
   common::ParallelGroupBuilder<SparseBatch::Entry>
       builder(&pcol->offset, &pcol->data);
@@ -204,15 +199,7 @@ void SimpleDMatrix::MakeColPage(const RowBatch& batch,
                                 size_t buffer_begin,
                                 const std::vector<bool>& enabled,
                                 SparsePage* pcol) {
-  int nthread;
-  #pragma omp parallel
-  {
-    nthread = omp_get_num_threads();
-    int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1);
-    if (nthread > max_nthread) {
-      nthread = max_nthread;
-    }
-  }
+  const int nthread = std::min(omp_get_max_threads(), std::max(omp_get_num_procs() / 2 - 2, 1));
   pcol->Clear();
   common::ParallelGroupBuilder<SparseBatch::Entry>
       builder(&pcol->offset, &pcol->data);
diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc
index 74a85e9caa10..61fde3e50e10 100644
--- a/src/data/sparse_page_dmatrix.cc
+++ b/src/data/sparse_page_dmatrix.cc
@@ -169,12 +169,7 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
       SparsePage *pcol) {
     pcol->Clear();
     pcol->min_index = buffered_rowset_[begin];
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-      nthread = std::max(nthread, std::max(omp_get_num_procs() / 2 - 1, 1));
-    }
+    const int nthread = std::max(omp_get_max_threads(), std::max(omp_get_num_procs() / 2 - 1, 1));
     common::ParallelGroupBuilder<SparseBatch::Entry>
     builder(&pcol->offset, &pcol->data);
     builder.InitBudget(info.num_col, nthread);
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 3a8767249459..18d1cea34af3 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -301,11 +301,7 @@ class GBTree : public GradientBooster {
   void PredictLeaf(DMatrix* p_fmat,
                    std::vector<bst_float>* out_preds,
                    unsigned ntree_limit) override {
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
+    const int nthread = omp_get_max_threads();
     InitThreadTemp(nthread);
     this->PredPath(p_fmat, out_preds, ntree_limit);
   }
@@ -365,11 +361,7 @@ class GBTree : public GradientBooster {
       unsigned tree_begin,
       unsigned tree_end) {
     const MetaInfo& info = p_fmat->info();
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
+    const int nthread = omp_get_max_threads();
     CHECK_EQ(num_group, mparam.num_output_group);
     InitThreadTemp(nthread);
     std::vector<bst_float> &preds = *out_preds;
diff --git a/src/tree/updater_basemaker-inl.h b/src/tree/updater_basemaker-inl.h
index 6d043b46795e..f70f63ad09ee 100644
--- a/src/tree/updater_basemaker-inl.h
+++ b/src/tree/updater_basemaker-inl.h
@@ -118,15 +118,6 @@ class BaseMaker: public TreeUpdater {
     }
     return n.cdefault();
   }
-  /*! \brief get number of omp thread in current context */
-  inline static int get_nthread() {
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
-    return nthread;
-  }
   //  ------class member helpers---------
   /*! \brief initialize temp data structure */
   inline void InitData(const std::vector<bst_gpair> &gpair,
@@ -350,7 +341,7 @@ class BaseMaker: public TreeUpdater {
                            std::vector<TStats> *p_node_stats) {
     std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
     const MetaInfo &info = fmat.info();
-    thread_temp.resize(this->get_nthread());
+    thread_temp.resize(omp_get_max_threads());
     p_node_stats->resize(tree.param.num_nodes);
     #pragma omp parallel
     {
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index 2d63d9d74d44..5c41267022dd 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -81,7 +81,7 @@ class ColMaker: public TreeUpdater {
   struct Builder {
    public:
     // constructor
-    explicit Builder(const TrainParam& param) : param(param) {}
+    explicit Builder(const TrainParam& param) : param(param), nthread(omp_get_max_threads()) {}
     // update one tree, growing
     virtual void Update(const std::vector<bst_gpair>& gpair,
                         DMatrix* p_fmat,
@@ -166,10 +166,6 @@ class ColMaker: public TreeUpdater {
       }
       {
         // setup temp space for each thread
-        #pragma omp parallel
-        {
-          this->nthread = omp_get_num_threads();
-        }
         // reserve a small space
         stemp.clear();
         stemp.resize(this->nthread, std::vector<ThreadEntry>());
@@ -277,8 +273,7 @@ class ColMaker: public TreeUpdater {
         for (size_t j = 0; j < qexpand.size(); ++j) {
           temp[qexpand[j]].stats.Clear();
         }
-        nthread = omp_get_num_threads();
-        bst_uint step = (col.length + nthread - 1) / nthread;
+        bst_uint step = (col.length + this->nthread - 1) / this->nthread;
         bst_uint end = std::min(col.length, step * (tid + 1));
         for (bst_uint i = tid * step; i < end; ++i) {
           const bst_uint ridx = col[i].index;
@@ -298,7 +293,7 @@ class ColMaker: public TreeUpdater {
       for (bst_omp_uint j = 0; j < nnode; ++j) {
         const int nid = qexpand[j];
         TStats sum(param), tmp(param), c(param);
-        for (int tid = 0; tid < nthread; ++tid) {
+        for (int tid = 0; tid < this->nthread; ++tid) {
           tmp = stemp[tid][nid].stats;
           stemp[tid][nid].stats = sum;
           sum.Add(tmp);
@@ -306,7 +301,7 @@ class ColMaker: public TreeUpdater {
             std::swap(stemp[tid - 1][nid].last_fvalue, stemp[tid][nid].first_fvalue);
           }
         }
-        for (int tid = 0; tid < nthread; ++tid) {
+        for (int tid = 0; tid < this->nthread; ++tid) {
           stemp[tid][nid].stats_extra = sum;
           ThreadEntry &e = stemp[tid][nid];
           bst_float fsplit;
@@ -341,7 +336,7 @@ class ColMaker: public TreeUpdater {
         }
         if (need_backward) {
           tmp = sum;
-          ThreadEntry &e = stemp[nthread-1][nid];
+          ThreadEntry &e = stemp[this->nthread-1][nid];
           c.SetSubstract(snode[nid].stats, tmp);
           if (c.sum_hess >= param.min_child_weight &&
               tmp.sum_hess >= param.min_child_weight) {
@@ -357,8 +352,7 @@ class ColMaker: public TreeUpdater {
         TStats c(param), cright(param);
         const int tid = omp_get_thread_num();
         std::vector<ThreadEntry> &temp = stemp[tid];
-        nthread = static_cast<bst_uint>(omp_get_num_threads());
-        bst_uint step = (col.length + nthread - 1) / nthread;
+        bst_uint step = (col.length + this->nthread - 1) / this->nthread;
         bst_uint end = std::min(col.length, step * (tid + 1));
         for (bst_uint i = tid * step; i < end; ++i) {
           const bst_uint ridx = col[i].index;
@@ -599,7 +593,7 @@ class ColMaker: public TreeUpdater {
       #endif
       int poption = param.parallel_option;
       if (poption == 2) {
-        poption = static_cast<int>(nsize) * 2 < nthread ? 1 : 0;
+        poption = static_cast<int>(nsize) * 2 < this->nthread ? 1 : 0;
       }
       if (poption == 0) {
         #pragma omp parallel for schedule(dynamic, batch_size)
@@ -760,7 +754,7 @@ class ColMaker: public TreeUpdater {
     //  --data fields--
     const TrainParam& param;
     // number of omp thread used during training
-    int nthread;
+    const int nthread;
     // Per feature: shuffle index of each feature index
     std::vector<bst_uint> feat_index;
     // Instance Data: current node position in the tree of each instance
diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc
index 8ff5b994e221..c8fe5c7a3092 100644
--- a/src/tree/updater_histmaker.cc
+++ b/src/tree/updater_histmaker.cc
@@ -336,7 +336,7 @@ class CQHistMaker: public HistMaker<TStats> {
     auto lazy_get_hist = [&]()
 #endif
     {
-      thread_hist.resize(this->get_nthread());
+      thread_hist.resize(omp_get_max_threads());
       // start accumulating statistics
       dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(fset);
       iter->BeforeFirst();
@@ -410,7 +410,7 @@ class CQHistMaker: public HistMaker<TStats> {
     }
     {
       // get smmary
-      thread_sketch.resize(this->get_nthread());
+      thread_sketch.resize(omp_get_max_threads());
 
       // TWOPASS: use the real set + split set in the column iteration.
       this->SetDefaultPostion(p_fmat, tree);
@@ -695,7 +695,7 @@ class GlobalProposalHistMaker: public CQHistMaker<TStats> {
     this->wspace.Init(this->param, 1);
     // to gain speedup in recovery
     {
-      this->thread_hist.resize(this->get_nthread());
+      this->thread_hist.resize(omp_get_max_threads());
 
       // TWOPASS: use the real set + split set in the column iteration.
       this->SetDefaultPostion(p_fmat, tree);
@@ -756,7 +756,7 @@ class QuantileHistMaker: public HistMaker<TStats> {
                           const RegTree &tree) override {
     const MetaInfo &info = p_fmat->info();
     // initialize the data structure
-    int nthread = BaseMaker::get_nthread();
+    const int nthread = omp_get_max_threads();
     sketchs.resize(this->qexpand.size() * tree.param.num_feature);
     for (size_t i = 0; i < sketchs.size(); ++i) {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc
index 280cea35c078..fb4e72cafa37 100644
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -34,11 +34,7 @@ class TreeRefresher: public TreeUpdater {
     std::vector<std::vector<TStats> > stemp;
     std::vector<RegTree::FVec> fvec_temp;
     // setup temp space for each thread
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
+    const int nthread = omp_get_max_threads();
     fvec_temp.resize(nthread, RegTree::FVec());
     stemp.resize(nthread, std::vector<TStats>());
     #pragma omp parallel
diff --git a/src/tree/updater_skmaker.cc b/src/tree/updater_skmaker.cc
index 4a49b7b3a264..c2320a0ef072 100644
--- a/src/tree/updater_skmaker.cc
+++ b/src/tree/updater_skmaker.cc
@@ -141,7 +141,7 @@ class SketchMaker: public BaseMaker {
     for (size_t i = 0; i < sketchs.size(); ++i) {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
     }
-    thread_sketch.resize(this->get_nthread());
+    thread_sketch.resize(omp_get_max_threads());
     // number of rows in
     const size_t nrows = p_fmat->buffered_rowset().size();
     // start accumulating statistics