[ML] Reduce false positives for the periodic component test for anomaly detection (elastic#1177)
tveasey committed May 4, 2020
1 parent b5a2850 commit 0bc6a9d
Showing 6 changed files with 139 additions and 96 deletions.
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -56,6 +56,8 @@
operations. (See {ml-pull}1142[#1142].)
* Fix spurious anomalies for count and sum functions after no data are received for long
periods of time. (See {ml-pull}1158[#1158].)
* Improve false positive rates from periodicity test for time series anomaly detection.
(See {ml-pull}1177[#1177].)
* Break progress reporting of data frame analyses into multiple phases. (See {ml-pull}1179[#1179].)

== {es} version 7.8.0
4 changes: 2 additions & 2 deletions include/maths/CPeriodicityHypothesisTests.h
@@ -441,7 +441,7 @@ class MATHS_EXPORT CPeriodicityHypothesisTests final {
STestStats& stats,
double& R,
double& meanRepeats,
double& pVariance,
double& truthVariance,
const TSizeVec& segmentation = TSizeVec{}) const;

//! Run the component amplitude test on the alternative hypothesis.
@@ -452,7 +452,7 @@ class MATHS_EXPORT CPeriodicityHypothesisTests final {
double v,
double R,
double meanRepeats,
double pVariance,
double truthVariance,
STestStats& stats) const;

private:
167 changes: 100 additions & 67 deletions lib/maths/CPeriodicityHypothesisTests.cc
@@ -69,6 +69,44 @@ const std::size_t MINIMUM_REPEATS_TO_TEST_AMPLITUDE{4};
//! A high priority for components we want to take precedence.
double HIGH_PRIORITY{2.0};

//! \brief Fuzzy logical expression with multiplicative AND.
//!
//! DESCRIPTION:
//! This isn't strictly a fuzzy logical expression since we don't ensure
//! that the range of truth values is [0,1]. In fact, we arrange for TRUE
//! to correspond to value > 1. We roll in an implicit threshold: each
//! condition is scaled so that its value exceeds 1 exactly when its
//! underlying logistic exceeds 0.5, so if every individual condition (just)
//! holds then the expression (just) maps to true.
class CFuzzyExpression {
public:
explicit CFuzzyExpression(double value = 0.0) : m_Value{value} {}

operator bool() const { return m_Value > 1.0; }
bool operator<(const CFuzzyExpression& rhs) const {
return m_Value < rhs.m_Value;
}

double truthValue() const { return m_Value; }

friend CFuzzyExpression operator&&(const CFuzzyExpression& lhs,
const CFuzzyExpression& rhs) {
return CFuzzyExpression{lhs.m_Value * rhs.m_Value};
}

private:
double m_Value;
};

//! Fuzzy check if \p value is greater than \p threshold.
CFuzzyExpression softGreaterThan(double value, double threshold, double margin) {
return CFuzzyExpression{2.0 * CTools::logisticFunction(value, margin, threshold, +1.0)};
}

//! Fuzzy check if \p value is less than \p threshold.
CFuzzyExpression softLessThan(double value, double threshold, double margin) {
return CFuzzyExpression{2.0 * CTools::logisticFunction(value, margin, threshold, -1.0)};
}
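
//! Illustrative sketch, not part of this change: how the soft conditions
//! above combine. The class comment implies the logistic is 0.5 exactly at
//! the threshold, so each condition contributes a factor of 2 * 0.5 = 1.0
//! there; the fuzzy AND multiplies the factors and only converts to true
//! when the product exceeds 1. The function name and numbers are hypothetical.
bool exampleFuzzyCombination() {
    // Both conditions sit exactly on their thresholds: the product is
    // 1.0 * 1.0 = 1.0, which is not > 1.0, so the expression is (just) false.
    bool atBoundary = softGreaterThan(1.0, 1.0, 0.1) && softLessThan(0.0, 0.0, 0.3);

    // Both conditions are comfortably satisfied: each factor is close to 2.0
    // and the product (roughly 3.8) converts to true.
    bool wellInside = softGreaterThan(1.5, 1.0, 0.1) && softLessThan(-1.0, 0.0, 0.3);

    // A strongly satisfied condition can compensate for one that is slightly
    // violated (product roughly 1.8), which is the point of a soft conjunction.
    bool tradeOff = softGreaterThan(1.5, 1.0, 0.1) && softLessThan(0.05, 0.0, 0.3);

    return !atBoundary && wellInside && tradeOff;
}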

//! \brief Accumulates the minimum amplitude.
class CMinAmplitude {
public:
@@ -1231,26 +1269,27 @@ CPeriodicityHypothesisTests::best(const TNestedHypothesesVec& hypotheses) const
TMinAccumulator vmin;
TMinAccumulator DFmin;
for (const auto& summary : summaries) {
vmin.add(varianceAtPercentile(summary.s_V, summary.s_DF,
50.0 + CONFIDENCE_INTERVAL / 2.0) /
summary.s_VarianceThreshold);
double v{varianceAtPercentile(summary.s_V, summary.s_DF,
50.0 + CONFIDENCE_INTERVAL / 2.0)};
vmin.add(v == summary.s_VarianceThreshold ? 1.0 : v / summary.s_VarianceThreshold);
DFmin.add(summary.s_DF);
}

TMinAccumulator pmin;
TMinAccumulator minMinusTruth;
for (const auto& summary : summaries) {
double v{varianceAtPercentile(summary.s_V, summary.s_DF,
50.0 - CONFIDENCE_INTERVAL / 2.0) /
summary.s_VarianceThreshold / vmin[0]};
double R{summary.s_R / summary.s_AutocorrelationThreshold};
double DF{summary.s_DF / DFmin[0]};
double p{CTools::logisticFunction(v, 0.2, 1.0, -1.0) *
CTools::logisticFunction(R, 0.2, 1.0, +1.0) *
CTools::logisticFunction(DF, 0.2, 1.0, +1.0) *
CTools::logisticFunction(summary.s_TrendSegments, 0.3, 0.0, -1.0) *
CTools::logisticFunction(summary.s_ScaleSegments, 0.3, 0.0, -1.0)};
LOG_TRACE(<< "p = " << p);
if (pmin.add(-p)) {
50.0 - CONFIDENCE_INTERVAL / 2.0)};
v = v == summary.s_VarianceThreshold * vmin[0]
? 1.0
: v / summary.s_VarianceThreshold / vmin[0];
double truth{(softLessThan(v, 1.0, 0.2) &&
softGreaterThan(summary.s_R, summary.s_AutocorrelationThreshold, 0.1) &&
softGreaterThan(summary.s_DF / DFmin[0], 1.0, 0.2) &&
softLessThan(summary.s_TrendSegments, 0.0, 0.3) &&
softLessThan(summary.s_ScaleSegments, 0.0, 0.3))
.truthValue()};
LOG_TRACE(<< "truth(hypothesis) = " << truth);
if (minMinusTruth.add(-truth)) {
result = summary.s_H;
}
}
@@ -1718,11 +1757,11 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec& windows,

double R;
double meanRepeats;
double pVariance;
double truthVariance;
return this->testVariance(window, values, period_, df1, v1, stats, R,
meanRepeats, pVariance) ||
meanRepeats, truthVariance) ||
this->testAmplitude(window, values, period_, b, v, R, meanRepeats,
pVariance, stats);
truthVariance, stats);
}

bool CPeriodicityHypothesisTests::testPeriodWithScaling(const TTimeTimePr2Vec& windows,
@@ -1855,9 +1894,9 @@ bool CPeriodicityHypothesisTests::testPeriodWithScaling(const TTimeTimePr2Vec& w

double R;
double meanRepeats;
double pVariance;
double truthVariance;
return this->testVariance({{0, length(windows)}}, values, period_, df1, v1,
stats, R, meanRepeats, pVariance, segmentation);
stats, R, meanRepeats, truthVariance, segmentation);
}

bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition,
@@ -2062,7 +2101,7 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition
return CBasicStatistics::mean(result);
};

double p{0.0};
CFuzzyExpression correlationCondition;
double R{-1.0};

TFloatMeanAccumulatorVec partitionValues;
@@ -2084,27 +2123,23 @@
}

double meanRepeats{calculateMeanRepeats(window, period_)};
double relativeMeanRepeats{meanRepeats / MINIMUM_REPEATS_TO_TEST_VARIANCE};
LOG_TRACE(<< " relative mean repeats = " << relativeMeanRepeats);

p = std::max(
p, CTools::logisticFunction(RW / stats.s_AutocorrelationThreshold, 0.15, 1.0) *
CTools::logisticFunction(relativeMeanRepeats, 0.25, 1.0));
double meanRepeatsPerSegment{meanRepeats / std::max(stats.s_TrendSegments, 1.0) /
MINIMUM_REPEATS_TO_TEST_VARIANCE};
LOG_TRACE(<< " mean repeats per segment = " << meanRepeatsPerSegment);

correlationCondition =
std::max(correlationCondition,
softGreaterThan(R, stats.s_AutocorrelationThreshold, 0.1) &&
softGreaterThan(meanRepeatsPerSegment, 1.0, 0.2));
R = std::max(R, RW);
}

double relativeLogSignificance{
double logSignificance{
CTools::fastLog(CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double meanRepeats{calculateMeanRepeats({{0, windowLength}}, repeat)};
double segmentsPerRepeat{(stats.s_TrendSegments - 1.0) / meanRepeats};
p *= CTools::logisticFunction(relativeLogSignificance, 0.1, 1.0) *
(vt > v1 ? CTools::logisticFunction(vt / v1, 1.0, 1.0, +1.0)
: CTools::logisticFunction(v1 / vt, 0.1, 1.0, -1.0)) *
CTools::logisticFunction(segmentsPerRepeat, 0.3, 0.0, -1.0) / 0.03125;
LOG_TRACE(<< " p(partition) = " << p);

if (p >= 1.0) {

if (correlationCondition && softGreaterThan(logSignificance, 1.0, 0.1) &&
(vt > v1 ? softGreaterThan(vt / v1, 1.0, 1.0) : softLessThan(v1 / vt, 1.0, 0.1))) {
stats.s_StartOfPartition = startOfPartition;
stats.s_R0 = R;
return true;
@@ -2121,7 +2156,7 @@ bool CPeriodicityHypothesisTests::testVariance(const TTimeTimePr2Vec& window,
STestStats& stats,
double& R,
double& meanRepeats,
double& pVariance,
double& truthVariance,
const TSizeVec& segmentation) const {
std::size_t period{static_cast<std::size_t>(period_ / m_BucketLength)};

@@ -2145,7 +2180,6 @@ bool CPeriodicityHypothesisTests::testVariance(const TTimeTimePr2Vec& window,
result.add(calculateRepeats(window, period_, m_BucketLength, buckets));
return CBasicStatistics::mean(result);
}();
LOG_TRACE(<< " mean repeats = " << meanRepeats);

// We're trading off:
// 1) The significance of the variance reduction,
@@ -2159,22 +2193,24 @@
// is equal to the threshold, the variance reduction is equal to the
// threshold and we've observed three periods on average.
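
// Illustrative note on the scaling below; the 0.001 significance level is an
// assumed example, not a value taken from this code. logSignificance divides
// the log of the F-test p-value by LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE,
// so it is exactly 1.0 at the significance threshold and grows as the p-value
// shrinks: for a threshold of 0.001 and p = 1e-4 it is log(1e-4) / log(1e-3)
// ~= 1.33, and softGreaterThan(1.33, 1.0, 0.1) ~= 2 / (1 + exp(-3.3)) ~= 1.93.
// meanRepeatsPerSegment is likewise divided by the segment count and by
// MINIMUM_REPEATS_TO_TEST_VARIANCE, so for it too a value of 1.0 marks the
// decision boundary.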

double relativeLogSignificance{
double logSignificance{
CTools::fastLog(CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double relativeMeanRepeats{meanRepeats / MINIMUM_REPEATS_TO_TEST_VARIANCE};
double segmentsPerRepeat{(stats.s_TrendSegments +
std::max(static_cast<double>(segmentation.size()), 1.0) - 2.0) /
meanRepeats};
pVariance = CTools::logisticFunction(relativeLogSignificance, 0.1, 1.0) *
CTools::logisticFunction(R / stats.s_AutocorrelationThreshold, 0.15, 1.0) *
(vt > v1 ? CTools::logisticFunction(vt / v1, 1.0, 1.0, +1.0)
: CTools::logisticFunction(v1 / vt, 0.1, 1.0, -1.0)) *
CTools::logisticFunction(relativeMeanRepeats, 0.25, 1.0) *
CTools::logisticFunction(segmentsPerRepeat, 0.3, 0.0, -1.0) / 0.03125;
LOG_TRACE(<< " p(variance) = " << pVariance);

if (pVariance >= 1.0) {
double meanRepeatsPerSegment{
meanRepeats /
std::max(stats.s_TrendSegments + static_cast<double>(segmentation.size()), 1.0) /
MINIMUM_REPEATS_TO_TEST_VARIANCE};
LOG_TRACE(<< " mean repeats per segment = " << meanRepeatsPerSegment);

auto condition = softGreaterThan(logSignificance, 1.0, 0.1) &&
softGreaterThan(R, stats.s_AutocorrelationThreshold, 0.1) &&
(vt > v1 ? softGreaterThan(vt / v1, 1.0, 1.0)
: softLessThan(v1 / vt, 0.1, 1.0)) &&
softGreaterThan(meanRepeatsPerSegment, 1.0, 0.2);
truthVariance = condition.truthValue();
LOG_TRACE(<< " truth(variance) = " << truthVariance);

if (condition) {
stats.s_R0 = R;
stats.s_Segmentation = segmentation;
return true;
@@ -2189,7 +2225,7 @@ bool CPeriodicityHypothesisTests::testAmplitude(const TTimeTimePr2Vec& window,
double v,
double R,
double meanRepeats,
double pVariance,
double truthVariance,
STestStats& stats) const {
core_t::TTime windowLength{length(window)};
std::size_t period{static_cast<std::size_t>(period_ / m_BucketLength)};
@@ -2226,19 +2262,16 @@ bool CPeriodicityHypothesisTests::testAmplitude(const TTimeTimePr2Vec& window,

// Trade off the test significance and the mean number of repeats
// we've observed.
double relativeLogSignificance{CTools::fastLog(CTools::oneMinusPowOneMinusX(F1, b)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double relativeMeanRepeats{
meanRepeats / static_cast<double>(MINIMUM_REPEATS_TO_TEST_AMPLITUDE)};
double minusLogPVariance{-CTools::fastLog(pVariance)};
double segmentsPerRepeat{(stats.s_TrendSegments - 1.0) / meanRepeats};
double pAmplitude{CTools::logisticFunction(relativeLogSignificance, 0.2, 1.0) *
CTools::logisticFunction(relativeMeanRepeats, 0.5, 1.0) *
CTools::logisticFunction(minusLogPVariance, 2.0, 0.0, -1.0) *
CTools::logisticFunction(segmentsPerRepeat, 0.3, 0.0, -1.0) / 0.0625};
LOG_TRACE(<< " p(amplitude) = " << pAmplitude);

if (pAmplitude >= 1.0) {
double logSignificance{CTools::fastLog(CTools::oneMinusPowOneMinusX(F1, b)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double meanRepeatsPerSegment{meanRepeats / std::max(stats.s_TrendSegments, 1.0) /
static_cast<double>(MINIMUM_REPEATS_TO_TEST_AMPLITUDE)};
double minusLogTruthVariance{-CTools::fastLog(truthVariance)};
LOG_TRACE(<< " mean repeats per segment = " << meanRepeatsPerSegment);
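// Illustrative note: softLessThan(minusLogTruthVariance, 0.0, 2.0) below
// couples this test to the variance test. If the variance test only just
// failed then truthVariance is close to 1, minusLogTruthVariance is close to
// 0 and this factor is close to 1.0; if it failed badly then truthVariance is
// small, the factor tends to 0 and the amplitude test is effectively vetoed.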

if (softLessThan(minusLogTruthVariance, 0.0, 2.0) &&
softGreaterThan(logSignificance, 1.0, 0.2) &&
softGreaterThan(meanRepeatsPerSegment, 1.0, 0.2)) {
stats.s_R0 = R;
return true;
}
18 changes: 12 additions & 6 deletions lib/maths/CTimeSeriesModel.cc
@@ -1551,8 +1551,9 @@ CUnivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& samples,
}
}

// Time order is not reliable, for example if the data are polled
// or for count feature, the times of all samples will be the same.
// Time order is not a total order, for example if the data are polled
// the times of all samples will be the same. So break ties using the
// sample value.
TSizeVec timeorder(samples.size());
std::iota(timeorder.begin(), timeorder.end(), 0);
std::stable_sort(timeorder.begin(), timeorder.end(),
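// Illustrative sketch of the tie-breaking order described above, with
// hypothetical names; this is not the comparator actually passed to
// std::stable_sort here:
//
//     std::stable_sort(order.begin(), order.end(),
//                      [&](std::size_t lhs, std::size_t rhs) {
//                          return std::make_pair(times[lhs], values[lhs]) <
//                                 std::make_pair(times[rhs], values[rhs]);
//                      });
//
// i.e. compare (time, value) lexicographically so that equal times fall back
// to the sample value and the resulting order is total.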
@@ -1656,6 +1657,7 @@ void CUnivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMeanAc
// We can't properly handle periodicity in the variance of the rate if
// using a Poisson process so remove it from model detection if we detect
// seasonality.
double numberSamples{m_ResidualModel->numberSamples()};
m_ResidualModel->removeModels(
maths::CPrior::CModelFilter().remove(maths::CPrior::E_Poisson));
m_ResidualModel->setToNonInformative(0.0, m_ResidualModel->decayRate());
@@ -1665,7 +1667,8 @@
[](double weight, const TFloatMeanAccumulator& sample) {
return weight + CBasicStatistics::count(sample);
})};
double weightScale{10.0 * std::max(this->params().learnRate(), 1.0) / Z};
double weightScale{
std::min(10.0 * std::max(this->params().learnRate(), 1.0), numberSamples) / Z};
maths_t::TDoubleWeightsAry1Vec weights(1);
for (const auto& residual : residuals) {
double weight(CBasicStatistics::count(residual));
@@ -2862,8 +2865,9 @@ CMultivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& sample
}
}

// Time order is not reliable, for example if the data are polled
// or for count feature, the times of all samples will be the same.
// Time order is not a total order, for example if the data are polled
// the times of all samples will be the same. So break ties using the
// sample value.
TSizeVec timeorder(samples.size());
std::iota(timeorder.begin(), timeorder.end(), 0);
std::stable_sort(timeorder.begin(), timeorder.end(),
@@ -2965,6 +2969,7 @@ void CMultivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMean
// re-weight so that the total sample weight corresponds to the sample
// weight the model receives from a fixed (shortish) time interval.

double numberSamples{m_ResidualModel->numberSamples()};
m_ResidualModel->setToNonInformative(0.0, m_ResidualModel->decayRate());

if (residuals.size() > 0) {
@@ -2988,7 +2993,8 @@
}

double Z{std::accumulate(weights.begin(), weights.end(), 0.0)};
double weightScale{10.0 * std::max(this->params().learnRate(), 1.0) / Z};
double weightScale{
std::min(10.0 * std::max(this->params().learnRate(), 1.0), numberSamples) / Z};
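// Illustrative note with assumed example numbers: weightScale * Z is the
// total weight handed to the re-initialised residual model, and the new
// std::min caps it at numberSamples, roughly the (decayed) number of samples
// the previous residual model had seen. For example, with learnRate = 1,
// numberSamples = 4 and Z = 20 the scale is min(10, 4) / 20 = 0.2, so a model
// re-initialised after seeing only a few samples no longer receives the
// weight of ten full samples.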
maths_t::TDouble10VecWeightsAry1Vec weight(1);
for (std::size_t i = 0; i < samples.size(); ++i) {
if (weights[i] > 0.0) {
