[ML] Improve regression and classification QoR for small data sets (elastic#1960)

This makes two changes to deal better with small data sets, prompted by a failure in our QA suite that resulted from elastic#1941.
In particular:
1. For small data sets, rare classes could be missing from our validation set altogether.
2. For small data sets, we can lose a lot of accuracy by over-restricting the number of features we use.

Problem 1 is a result of the stratified sampling we perform. If a class is rare and the data set is small, we could
choose never to sample it for the validation set because it constitutes fewer than one example per fold. In this case
the fraction of each class changes significantly in the remaining unsampled set with each fold we sample, but we
compute the desired class counts only once up front, based on the classes' overall frequencies. We simply need to
recompute the desired count per class from the frequencies in the remainder, inside the loop which samples each new fold.
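
To make the fix concrete, here is a minimal self-contained sketch of the scheme (the names here are assumed for illustration; the actual implementation works with CStratifiedSampler and packed bit vector row masks, and uses weighted sampling to hit the desired counts exactly):

#include <algorithm>
#include <cstddef>
#include <iterator>
#include <map>
#include <random>
#include <vector>

using TSizeVec = std::vector<std::size_t>;

// Assign sorted row indices to `folds` test folds. The desired count for each
// class is recomputed from the rows which remain unsampled before drawing
// every fold, rather than once up front from the overall class frequencies.
std::vector<TSizeVec> stratifiedTestFolds(TSizeVec remaining, // sorted indices
                                          const std::vector<int>& labels,
                                          std::size_t folds,
                                          std::mt19937& rng) {
    std::vector<TSizeVec> result(folds);
    for (std::size_t fold = 0; fold < folds; ++fold) {
        std::size_t foldSize{remaining.size() / (folds - fold)};
        // Group the unsampled remainder by class.
        std::map<int, TSizeVec> byClass;
        for (auto row : remaining) {
            byClass[labels[row]].push_back(row);
        }
        TSizeVec sampled;
        for (auto& [label, members] : byClass) {
            // The desired count for this class in this fold comes from its
            // frequency in the *remainder*: as the remainder shrinks, a rare
            // class's fraction grows until it rounds up to one example.
            double fraction{static_cast<double>(members.size()) /
                            static_cast<double>(remaining.size())};
            auto count = std::min(static_cast<std::size_t>(
                                      fraction * static_cast<double>(foldSize) + 0.5),
                                  members.size());
            std::shuffle(members.begin(), members.end(), rng);
            sampled.insert(sampled.end(), members.begin(), members.begin() + count);
        }
        // Move this fold's rows out of the remainder.
        std::sort(sampled.begin(), sampled.end());
        TSizeVec rest;
        std::set_difference(remaining.begin(), remaining.end(), sampled.begin(),
                            sampled.end(), std::back_inserter(rest));
        remaining = std::move(rest);
        result[fold] = std::move(sampled);
    }
    return result;
}

Rounding means individual fold sizes can drift by a row or two in this sketch; the point is only that the desired counts track the remainder, which is what the new unit test below exercises.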

Problem 2 requires that, for small data sets, we allow ourselves to use more features than are implied by our default
constraint of having n examples per feature. Since we automatically remove nuisance features based on their MICe with
the target, we typically don't suffer a loss in QoR from allowing ourselves to select extra features. Furthermore,
runtime is never problematic for small data sets. For the multi-class classification problem which surfaced this
issue, accuracy increases from around 0.2 to 0.9 as a result of this change.
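
A rough worked example of the relaxed constraint (the numbers are illustrative, not the library's defaults):

#include <algorithm>
#include <cstddef>

// Mirrors the new CBoostedTreeImpl::rowsPerFeature added by this commit.
std::size_t relaxedRowsPerFeature(std::size_t rowsPerFeatureSetting, std::size_t numberRows) {
    return std::max(std::min(rowsPerFeatureSetting, numberRows / 20), std::size_t{1});
}

// With a rows-per-feature setting of 50 and 200 training rows this returns
// min(50, 200 / 20) = 10, so the cap on the number of selectable features
// rises from 200 / 50 = 4 to 200 / 10 = 20.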
tveasey committed Aug 16, 2021
1 parent 08ef296 commit 7fea7c7
Showing 8 changed files with 76 additions and 24 deletions.
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -45,6 +45,8 @@

* Speed up training of regression and classification models on very large data sets.
(See {ml-pull}1941[#1941].)
* Improve regression and classification training accuracy for small data sets.
(See {ml-pull}1960[#1960].)

== {es} version 7.14.0

5 changes: 4 additions & 1 deletion include/maths/CBoostedTreeImpl.h
@@ -272,6 +272,9 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! Estimate test losses for the \p missing folds.
TMeanVarAccumulatorVec estimateMissingTestLosses(const TSizeVec& missing) const;

//! Get the minimum number of rows we require per feature.
std::size_t rowsPerFeature(std::size_t numberRows) const;

//! Get the number of features including category encoding.
std::size_t numberFeatures() const;

@@ -386,7 +389,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
TOptionalDouble m_EtaOverride;
TOptionalDouble m_EtaGrowthRatePerTreeOverride;
TOptionalSize m_NumberFoldsOverride;
TOptionalSize m_TrainFractionPerFoldOverride;
TOptionalDouble m_TrainFractionPerFoldOverride;
TOptionalSize m_MaximumNumberTreesOverride;
TOptionalDouble m_FeatureBagFractionOverride;
TOptionalStrDoublePrVec m_ClassificationWeightsOverride;
4 changes: 2 additions & 2 deletions lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc
@@ -493,7 +493,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTraining) {
<< "ms");

BOOST_TEST_REQUIRE(core::CProgramCounters::counter(
counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 4500000);
counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 6300000);
BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1910000);
BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) > 0);
BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) <= duration);
@@ -686,7 +686,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeClassifierTraining) {
<< "ms");

BOOST_TEST_REQUIRE(core::CProgramCounters::counter(
counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 4500000);
counter_t::E_DFTPMEstimatedPeakMemoryUsage) < 6300000);
BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < 1910000);
BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) > 0);
BOOST_TEST_REQUIRE(core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) <= duration);
4 changes: 3 additions & 1 deletion lib/maths/CBoostedTreeFactory.cc
@@ -399,12 +399,14 @@ void CBoostedTreeFactory::selectFeaturesAndEncodeCategories(const core::CDataFra
TSizeVec regressors(frame.numberColumns() - this->numberExtraColumnsForTrain());
std::iota(regressors.begin(), regressors.end(), 0);
regressors.erase(regressors.begin() + m_TreeImpl->m_DependentVariable);
std::size_t numberTrainingRows{
static_cast<std::size_t>(m_TreeImpl->allTrainingRowsMask().manhattan())};
LOG_TRACE(<< "candidate regressors = " << core::CContainerPrinter::print(regressors));

m_TreeImpl->m_Encoder = std::make_unique<CDataFrameCategoryEncoder>(
CMakeDataFrameCategoryEncoder{m_TreeImpl->m_NumberThreads, frame,
m_TreeImpl->m_DependentVariable}
.minimumRowsPerFeature(m_TreeImpl->m_RowsPerFeature)
.minimumRowsPerFeature(m_TreeImpl->rowsPerFeature(numberTrainingRows))
.minimumFrequencyToOneHotEncode(m_MinimumFrequencyToOneHotEncode)
.rowMask(m_TreeImpl->allTrainingRowsMask())
.columnMask(std::move(regressors))
12 changes: 11 additions & 1 deletion lib/maths/CBoostedTreeImpl.cc
@@ -335,7 +335,8 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows,
// A binary tree with n + 1 leaves has 2n + 1 nodes in total.
std::size_t maximumNumberLeaves{this->maximumTreeSize(numberRows) + 1};
std::size_t maximumNumberNodes{2 * maximumNumberLeaves - 1};
std::size_t maximumNumberFeatures{std::min(numberColumns - 1, numberRows / m_RowsPerFeature)};
std::size_t maximumNumberFeatures{
std::min(numberColumns - 1, numberRows / this->rowsPerFeature(numberRows))};
std::size_t forestMemoryUsage{
m_MaximumNumberTrees *
(sizeof(TNodeVec) + maximumNumberNodes * CBoostedTreeNode::estimateMemoryUsage(
@@ -1107,6 +1108,15 @@ CBoostedTreeImpl::estimateMissingTestLosses(const TSizeVec& missing) const {
return predictedTestLosses;
}

std::size_t CBoostedTreeImpl::rowsPerFeature(std::size_t numberRows) const {
// For small data sets (fewer than 1k examples) we allow ourselves to use
// more features than implied by m_RowsPerFeature. Since we remove nuisance
// features which carry little information about the target this is fine
// from an accuracy perspective. From a runtime perspective we always train
// fast for such small data sets.
return std::max(std::min(m_RowsPerFeature, numberRows / 20), std::size_t{1});
}

std::size_t CBoostedTreeImpl::numberFeatures() const {
return m_Encoder->numberEncodedColumns();
}
32 changes: 16 additions & 16 deletions lib/maths/CDataFrameUtils.cc
@@ -140,15 +140,17 @@ classifierStratifiedCrossValidationRowSampler(std::size_t numberThreads,

TDoubleVec categoryFrequencies{CDataFrameUtils::categoryFrequencies(
numberThreads, frame, rowMask, {targetColumn})[targetColumn]};
LOG_TRACE(<< "category frequencies = "
<< core::CContainerPrinter::print(categoryFrequencies));

TSizeVec categoryCounts;
CSampling::weightedSample(desiredCount, categoryFrequencies, categoryCounts);
LOG_TRACE(<< "desired category counts per test fold = "
<< core::CContainerPrinter::print(categoryCounts));

auto sampler = std::make_unique<CStratifiedSampler>(categoryCounts.size());
for (std::size_t i = 0; i < categoryCounts.size(); ++i) {
sampler->addSampler(categoryCounts[i], rng);
for (auto categoryCount : categoryCounts) {
sampler->addSampler(categoryCount, rng);
}
sampler->samplerSelector([targetColumn](const TRowRef& row) mutable {
return static_cast<std::size_t>(row[targetColumn]);
@@ -523,43 +525,36 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads,
<< ", sample size = " << sampleSize);

TDoubleVec frequencies;

auto makeSampler = [&](std::size_t size) {
auto makeSampler = [&](std::size_t size, const core::CPackedBitVector& rowMask) {
TStratifiedSamplerUPtr result;
if (size > 0) {
if (frame.columnIsCategorical()[targetColumn]) {
std::tie(result, frequencies) = classifierStratifiedCrossValidationRowSampler(
numberThreads, frame, targetColumn, rng, size, allTrainingRowsMask);
numberThreads, frame, targetColumn, rng, size, rowMask);
} else {
result = regressionStratifiedCrossValiationRowSampler(
numberThreads, frame, targetColumn, rng, size,
numberBuckets, allTrainingRowsMask);
numberThreads, frame, targetColumn, rng, size, numberBuckets, rowMask);
}
}
return result;
};

auto excessSampler = makeSampler(excessSampleSize);
auto sampler = makeSampler(sampleSize);
if (sampler == nullptr) {
HANDLE_FATAL(<< "Internal error: failed to create train/test splits.");
return {TPackedBitVectorVec{}, TPackedBitVectorVec{}, TDoubleVec{}};
}
auto excessSampler = makeSampler(excessSampleSize, allTrainingRowsMask);

LOG_TRACE(<< "number training rows = " << allTrainingRowsMask.manhattan());

TPackedBitVectorVec testingRowMasks(numberFolds);

TSizeVec rowIndices;
auto sample = [&](const TStratifiedSamplerUPtr& sampler_,
const core::CPackedBitVector& candidateTestingRowsMask) {
const core::CPackedBitVector& rowMask) {
frame.readRows(1, 0, frame.numberRows(),
[&](const TRowItr& beginRows, const TRowItr& endRows) {
for (auto row = beginRows; row != endRows; ++row) {
sampler_->sample(*row);
}
},
&candidateTestingRowsMask);
&rowMask);
sampler_->finishSampling(rng, rowIndices);
std::sort(rowIndices.begin(), rowIndices.end());
LOG_TRACE(<< "# row indices = " << rowIndices.size());
@@ -569,7 +564,7 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads,
result.extend(false, row - result.size());
result.extend(true);
}
result.extend(false, allTrainingRowsMask.size() - result.size());
result.extend(false, rowMask.size() - result.size());
return result;
};

@@ -579,6 +574,11 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads,
testingRowMask = std::move(candidateTestingRowsMask);
candidateTestingRowsMask = core::CPackedBitVector{testingRowMask.size(), false};
} else {
auto sampler = makeSampler(sampleSize, candidateTestingRowsMask);
if (sampler == nullptr) {
HANDLE_FATAL(<< "Internal error: failed to create train/test splits.");
return {TPackedBitVectorVec{}, TPackedBitVectorVec{}, TDoubleVec{}};
}
testingRowMask = sample(sampler, candidateTestingRowsMask);
candidateTestingRowsMask ^= testingRowMask;
}
6 changes: 3 additions & 3 deletions lib/maths/unittest/CBoostedTreeTest.cc
@@ -500,13 +500,13 @@ BOOST_AUTO_TEST_CASE(testPiecewiseConstant) {
0.0, modelBias[i][0],
8.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
// Good R^2...
BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.93);
BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.91);

meanModelRSquared.add(modelRSquared[i][0]);
}

LOG_DEBUG(<< "mean R^2 = " << maths::CBasicStatistics::mean(meanModelRSquared));
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanModelRSquared) > 0.95);
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanModelRSquared) > 0.94);
}

BOOST_AUTO_TEST_CASE(testLinear) {
@@ -627,7 +627,7 @@ BOOST_AUTO_TEST_CASE(testNonLinear) {
meanModelRSquared.add(modelRSquared[i][0]);
}
LOG_DEBUG(<< "mean R^2 = " << maths::CBasicStatistics::mean(meanModelRSquared));
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanModelRSquared) > 0.98);
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(meanModelRSquared) > 0.97);
}

BOOST_AUTO_TEST_CASE(testHuber) {
35 changes: 35 additions & 0 deletions lib/maths/unittest/CDataFrameUtilsTest.cc
@@ -649,6 +649,41 @@ BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasks) {
}
}

BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasksRareCategories) {

// Here we test a case where the desired sample size for a specific class
// is zero. In this case we should reassess the class frequencies for
// the unsampled set and still get 5 splits with all classes represented
// in at least one fold.

std::size_t numberFolds{5};
std::size_t numberBins{10};
TDoubleVec categories{0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3,
3, 3, 3, 3, 3, 4, 5, 5, 6, 6, 6, 6};

auto frame = core::makeMainStorageDataFrame(1).first;
frame->categoricalColumns(TBoolVec{true});
for (auto category : categories) {
frame->writeRow([&](core::CDataFrame::TFloatVecItr column,
std::int32_t&) { *column = category; });
}
frame->finishWritingRows();

maths::CPRNG::CXorOShiro128Plus rng;
maths::CDataFrameUtils::TPackedBitVectorVec testingRowMasks;
std::tie(std::ignore, testingRowMasks, std::ignore) =
maths::CDataFrameUtils::stratifiedCrossValidationRowMasks(
1, *frame, 0, rng, numberFolds, 1.0 - 1.0 / static_cast<double>(numberFolds),
numberBins, core::CPackedBitVector{categories.size(), true});

core::CPackedBitVector allTestingRowsMask(categories.size(), false);
for (const auto& testingRowMask : testingRowMasks) {
allTestingRowsMask ^= testingRowMask;
BOOST_REQUIRE_EQUAL(5.0, testingRowMask.manhattan());
}
BOOST_REQUIRE_EQUAL(25.0, allTestingRowsMask.manhattan());
}

BOOST_AUTO_TEST_CASE(testMicWithColumn) {

// Test we get the exact MICe value when the number of rows is less than