Skip to content

Commit

Permalink
Remove support for models
Browse files Browse the repository at this point in the history
Models were fundamentally broken in query evaluation, and no one
noticed—which is usually a sign that a feature can be removed without
much user friction. We already have plans to bring the feature back
through more powerful expressions in TQL.
  • Loading branch information
dominiklohmann committed Oct 7, 2023
1 parent 3e8dc97 commit f03b023
Show file tree
Hide file tree
Showing 16 changed files with 22 additions and 334 deletions.
5 changes: 5 additions & 0 deletions changelog/next/changes/3552--disable-dense-indexes.md
Expand Up @@ -2,3 +2,8 @@ Tenzir no longer builds dense indexes for imported events. Dense indexes
improved query performance at the cost of a higher memory usage. However, over
time the performance improvement became smaller due to other improvements in the
underlying storage engine.

Tenzir no longer supports models in taxonomies. Since Tenzir v4.0 they were only
supported in the deprecated `tenzir-ctl export` and `tenzir-ctl count` commands.
We plan to bring the functionality back in the future with more powerful
expressions in TQL.
2 changes: 1 addition & 1 deletion libtenzir/builtins/operators/where.cpp
Expand Up @@ -67,7 +67,7 @@ class where_operator final

auto initialize(const type& schema, operator_control_plane& ctrl) const
-> caf::expected<state_type> override {
auto ts = taxonomies{.concepts = ctrl.concepts(), .models = {}};
auto ts = taxonomies{.concepts = ctrl.concepts()};
auto resolved_expr = resolve(ts, expr_.inner, schema);
if (not resolved_expr) {
diagnostic::warning("{}", resolved_expr.error())
Expand Down
59 changes: 3 additions & 56 deletions libtenzir/include/tenzir/taxonomies.hpp
Expand Up @@ -62,54 +62,17 @@ using concepts_map = detail::stable_map<std::string, concept_>;
/// to a `concepts_map`.
extern const type concepts_data_schema;

/// The definition of a model.
struct model {
/// The description of the model.
std::string description;

/// The ordered concepts and models that the model is composed of.
/// If an entry is another model, its concepts must also be represented for
/// a schema to be considered.
std::vector<std::string> definition;

friend bool operator==(const model& lhs, const model& rhs);

template <class Inspector>
friend auto inspect(Inspector& f, model& m) {
return f.object(m).pretty_name("model").fields(
f.field("description", m.description),
f.field("definition", m.definition));
}

inline static const record_type& schema() noexcept {
static const auto result = record_type{
{"description", string_type{}},
{"definition", list_type{string_type{}}},
};
return result;
}
};

/// Maps model names to their definitions.
using models_map = detail::stable_map<std::string, model>;

/// Describes the schema of a tenzir::list of models for automatic conversion to
/// a `models_map`.
extern const type models_data_schema;

/// A taxonomy is a combination of concepts and models. Tenzir stores all
/// configured taxonomies in memory together, hence the plural naming.
struct taxonomies {
concepts_map concepts;
models_map models;

friend bool operator==(const taxonomies& lhs, const taxonomies& rhs);

template <class Inspector>
friend auto inspect(Inspector& f, taxonomies& t) {
return f.object(t)
.pretty_name("taxonomies")
.fields(f.field("concepts", t.concepts), f.field("models", t.models));
.fields(f.field("concepts", t.concepts));
}
};

Expand All @@ -123,8 +86,8 @@ std::vector<std::string>
resolve_concepts(const concepts_map& concepts,
std::vector<std::string> fields_or_concepts);

/// Substitutes concept and model identifiers in field extractors with
/// replacement expressions containing only concrete field names.
/// Substitutes concept identifiers in field extractors with replacement
/// expressions containing only concrete field names.
/// @param t The set of taxonomies to apply.
/// @param e The original expression.
/// @param schema An optional schema to restrict taxonomy resolution by.
Expand All @@ -136,22 +99,6 @@ resolve(const taxonomies& t, const expression& e, const type& schema = {});

namespace fmt {

/// Formatter that renders a `tenzir::model` as
/// `model {description: <description>, definition: [<a>, <b>, ...]}`.
template <>
struct formatter<tenzir::model> {
  /// Takes no format specification: hands back the start of the parse range
  /// without consuming anything.
  template <class ParseContext>
  constexpr auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
    return ctx.begin();
  }

  /// Writes the description followed by the comma-separated definition
  /// entries to the context's output iterator.
  template <class FormatContext>
  auto format(const tenzir::model& value, FormatContext& ctx)
    -> decltype(ctx.out()) {
    auto out = ctx.out();
    return fmt::format_to(out, "model {{description: {}, definition: [{}]}}",
                          value.description,
                          fmt::join(value.definition, ", "));
  }
};

template <>
struct formatter<tenzir::concept_> {
template <class ParseContext>
Expand Down
7 changes: 0 additions & 7 deletions libtenzir/src/catalog.cpp
Expand Up @@ -625,14 +625,7 @@ catalog(catalog_actor::stateful_pointer<catalog_state> self,
self->quit(std::move(err));
return catalog_actor::behavior_type::make_empty_behavior();
}
auto taxonomies = load_taxonomies(self->system().config());
if (!taxonomies) {
self->quit(std::move(taxonomies.error()));
return catalog_actor::behavior_type::make_empty_behavior();
}
self->state.taxonomies.concepts = modules::concepts();
// TODO: Taxonomy models are to be removed soon.
self->state.taxonomies.models = std::move(taxonomies->models);
// Load loaded schema types from the singleton.
// TODO: Move to the load handler and re-parse the files.
TENZIR_DIAGNOSTIC_PUSH
Expand Down
13 changes: 1 addition & 12 deletions libtenzir/src/module.cpp
Expand Up @@ -238,7 +238,6 @@ auto load_taxonomies(const caf::actor_system_config& cfg)
std::error_code err{};
auto dirs = get_module_dirs(cfg);
concepts_map concepts;
models_map models;
for (const auto& dir : dirs) {
TENZIR_DEBUG("loading taxonomies from {}", dir);
const auto dir_exists = std::filesystem::exists(dir, err);
Expand All @@ -258,19 +257,9 @@ auto load_taxonomies(const caf::actor_system_config& cfg)
for (auto& [name, definition] : concepts)
TENZIR_DEBUG("extracted concept {} with {} fields", name,
definition.fields.size());
if (auto err = convert(yaml, models, models_data_schema))
return caf::make_error(ec::parse_error,
"failed to extract models from file",
file.string(), err.context());
for (auto& [name, definition] : models) {
TENZIR_DEBUG("extracted model {} with {} fields", name,
definition.definition.size());
TENZIR_TRACE("uses model mapping {} -> {}", name,
definition.definition);
}
}
}
return tenzir::taxonomies{std::move(concepts), std::move(models)};
return tenzir::taxonomies{std::move(concepts)};
}

} // namespace tenzir
150 changes: 3 additions & 147 deletions libtenzir/src/taxonomies.cpp
Expand Up @@ -57,19 +57,8 @@ const type concepts_data_schema = type{map_type{
},
}};

/// Two models compare equal when their definitions are equal. The
/// description does not take part in the comparison.
bool operator==(const model& lhs, const model& rhs) {
  const auto& left_definition = lhs.definition;
  const auto& right_definition = rhs.definition;
  return left_definition == right_definition;
}

/// Schema used to convert data into a `models_map`: a map from a string key
/// (tagged with the attribute `key=model.name`) to a record with a single
/// `model` field laid out as `model::schema()`.
const type models_data_schema{map_type{
  type{string_type{}, {{"key", "model.name"}}},
  record_type{
    {"model", model::schema()},
  },
}};

bool operator==(const taxonomies& lhs, const taxonomies& rhs) {
return lhs.concepts == rhs.concepts && lhs.models == rhs.models;
return lhs.concepts == rhs.concepts;
}

std::vector<std::string>
Expand Down Expand Up @@ -170,142 +159,9 @@ resolve(const taxonomies& ts, const expression& e, const type& schema) {
return expression{d};
}
};
// Resolves a predicate whose field name may refer to a model.
//
// If `data` is not a record, or `field_name` is not a key of `ts.models`,
// the call is forwarded unchanged to `resolve_concepts`. Otherwise the model
// is expanded into one sub-expression per leaf of its definition tree, and
// the pieces are combined into the returned expression. An error is returned
// when a record without field names does not provide exactly one value per
// leaf of the model.
//
// NOTE(review): an empty record yields `expression{caf::none}` before the
// model lookup takes place, i.e., also when `field_name` is not a model.
auto resolve_models
= [&](const std::string& field_name, relational_operator op,
const tenzir::data& data,
auto make_predicate) -> caf::expected<expression> {
auto r = caf::get_if<record>(&data);
if (!r)
// Models can only be compared to records, so if the data side is
// not a record, we move to the concept substitution phase directly.
return resolve_concepts(field_name, op, data, make_predicate);
if (r->empty())
return expression{caf::none};
auto it = ts.models.find(field_name);
if (it == ts.models.end())
return resolve_concepts(field_name, op, data, make_predicate);
// We have a model predicate.
// ==========================
// The model definition forms a tree that contains models as non-leaf
// nodes and concepts as leaves. For model substitution we need to iterate
// over the leaves in the order of definition, which is left to right.
// The levels stack is used to keep track of the current position at
// each level of the tree.
auto level_1 = std::pair{it->second.definition.begin(),
it->second.definition.end()};
auto levels = std::stack{std::vector{std::move(level_1)}};
// Pushes nested model definitions onto the stack for as long as the entry
// under the cursor of the topmost level is itself a model.
// NOTE(review): the cursor is dereferenced without comparing it against the
// end of its range, so an empty definition is dereferenced here before the
// assertion in `next_leaf` can fire, and the initial call below is not
// followed by an assertion at all. There is also no guard against models
// that reference each other cyclically. Presumably both cases are rejected
// when the models are loaded; confirm.
auto descend = [&] {
for (auto child_component = ts.models.find(*levels.top().first);
child_component != ts.models.end();
child_component = ts.models.find(*levels.top().first)) {
auto& child_def = child_component->second.definition;
levels.emplace(child_def.begin(), child_def.end());
}
};
// Move the cursor to the leftmost leaf in the tree.
descend();
// Advances the cursor to the next leaf in definition order. The stack is
// empty afterwards once all leaves have been visited.
auto next_leaf = [&] {
// Update the levels stack: advance the cursor of the topmost level and pop
// every level that is exhausted by doing so.
while (!levels.empty() && ++levels.top().first == levels.top().second)
levels.pop();
if (!levels.empty()) {
descend();
// Empty models ought to be rejected at load time.
TENZIR_ASSERT(levels.top().first != levels.top().second);
}
};
// The conjunction for all model concepts that are restricted by a value
// in `r`.
conjunction restricted;
// The conjunction for all model concepts that aren't specified in `r`.
conjunction unrestricted;
// The operator with any negation stripped; the negation is re-applied to the
// combined restricted expression at the end.
auto abs_op = is_negated(op) ? negate(op) : op;
// Adds a constraint to `unrestricted` for the leaf under the cursor: every
// field the concept resolves to is compared with `not_equal` against a
// default-constructed `tenzir::data` (presumably null, i.e., "the field is
// set" -- confirm). The operator and data passed to `resolve_concepts` are
// ignored by the predicate factory.
auto insert_meta_field_predicate = [&] {
auto make_meta_field_predicate =
[&]([[maybe_unused]] relational_operator op, const tenzir::data&) {
return [](std::string item) {
return predicate{field_extractor{std::move(item)},
relational_operator::not_equal,
tenzir::data{}};
};
};
unrestricted.emplace_back(
resolve_concepts(*levels.top().first, relational_operator::equal,
caf::none, make_meta_field_predicate));
};
// `r` is known to be non-empty at this point, so `begin()` can be
// dereferenced. A non-empty first key selects the named form of the record;
// otherwise the values are matched to the leaves by position.
auto named = !r->begin()->first.empty();
if (named) {
// TODO: Nested records of the form
// <src_endpoint: <1.2.3.4, _>, process_filename: "svchost.exe">
// are currently not supported.
for (; !levels.empty(); next_leaf()) {
// TODO: Use `ends_with` for better ergonomics.
// TODO: Remove matched entries and check mismatched concepts.
auto concept_field = r->find(*levels.top().first);
if (concept_field == r->end())
insert_meta_field_predicate();
else
restricted.emplace_back(
resolve_concepts(*levels.top().first, abs_op,
concept_field->second, make_predicate));
}
} else {
auto value_iterator = r->begin();
for (; !levels.empty(); next_leaf(), ++value_iterator) {
if (value_iterator == r->end())
// The provided record is shorter than the matched concept.
// TODO: This error could be rendered in a way that makes it
// clear how the mismatch happened. For example:
// src_ip, src_port, dst_ip, dst_port, proto
// < _, _, 1.2.3.4, _>
// ^~~~~
// not enough fields provided
return caf::make_error(ec::invalid_query, *r,
"doesn't match the model:", it->first);
if (caf::holds_alternative<caf::none_t>(value_iterator->second))
insert_meta_field_predicate();
else
restricted.emplace_back(
resolve_concepts(*levels.top().first, abs_op,
value_iterator->second, make_predicate));
}
if (value_iterator != r->end()) {
// The provided record is longer than the matched concept.
// TODO: This error could be rendered in a way that makes it
// clear how the mismatch happened. For example:
// src_ip, src_port, dst_ip, dst_port, proto
// < _, _, 1.2.3.4, _, _, "tcp">
// ^~~~~
// too many fields
// provided
return caf::make_error(ec::invalid_query, *r,
"doesn't match the model:", it->first);
}
}
// Combine the two conjunctions. Without any restricted concept the
// unrestricted conjunction is the result. Otherwise the restricted part
// (a single expression or a conjunction) is negated if `op` is negated and
// then appended to the unrestricted conjunction, if there is one.
expression expr;
switch (restricted.size()) {
case 0: {
return unrestricted;
}
case 1: {
expr = restricted[0];
break;
}
default: {
expr = expression{std::move(restricted)};
break;
}
}
if (is_negated(op))
expr = negation{std::move(expr)};
if (unrestricted.empty())
return expr;
unrestricted.push_back(expr);
return unrestricted;
};
if (auto data = caf::get_if<tenzir::data>(&pred.rhs)) {
if (auto fe = caf::get_if<field_extractor>(&pred.lhs)) {
return resolve_models(
return resolve_concepts(
fe->field, pred.op, *data,
[&](relational_operator op, const tenzir::data& o) {
return [&, op](const std::string& item) {
Expand All @@ -316,7 +172,7 @@ resolve(const taxonomies& ts, const expression& e, const type& schema) {
}
if (auto data = caf::get_if<tenzir::data>(&pred.lhs)) {
if (auto fe = caf::get_if<field_extractor>(&pred.rhs)) {
return resolve_models(
return resolve_concepts(
fe->field, pred.op, *data,
[&](relational_operator op, const tenzir::data& o) {
return [&, op](const std::string& item) {
Expand Down
9 changes: 0 additions & 9 deletions schema/taxonomy/tenzir/network.yaml
Expand Up @@ -96,12 +96,3 @@
concepts:
- net.outer_vlan
- net.inner_vlan

- model:
name: net.connection
definition:
- net.src.ip
- net.src.port
- net.dst.ip
- net.dst.port
- net.proto
1 change: 0 additions & 1 deletion tenzir/integration/reference/taxonomy-queries/step_03.ref

This file was deleted.

1 change: 0 additions & 1 deletion tenzir/integration/reference/taxonomy-queries/step_04.ref

This file was deleted.

1 change: 0 additions & 1 deletion tenzir/integration/reference/taxonomy-queries/step_05.ref

This file was deleted.

1 change: 0 additions & 1 deletion tenzir/integration/reference/taxonomy-queries/step_06.ref

This file was deleted.

1 change: 0 additions & 1 deletion tenzir/integration/reference/taxonomy-queries/step_07.ref

This file was deleted.

1 change: 0 additions & 1 deletion tenzir/integration/reference/taxonomy-queries/step_08.ref

This file was deleted.

8 changes: 0 additions & 8 deletions tenzir/integration/tests.yaml
Expand Up @@ -329,14 +329,6 @@ tests:
- command: import -b suricata
input: data/pcap/suricata/eve.json.gz
- command: count "net.src.ip == 192.168.168.100"
- command: count "net.connection == <192.168.168.100, _, 72.247.178.18, _, _>"
# We omit the whitespace after the colon on purpose, otherwise pyyaml
# thinks this is a key-value pair.
- command: count 'net.connection == <net.src.ip:192.168.168.100, net.dst.port:80>'
- command: count 'net.connection != <net.src.ip:192.168.168.100, net.dst.port:80>'
- command: count "net.connection == <_, _, _, _, _>"
- command: count "net.connection == <_, _, _, 80, _>"
- command: count "net.connection != <_, _, _, 80, _>"

Arrow Full Data Model:
tags: [export, arrow]
Expand Down

0 comments on commit f03b023

Please sign in to comment.