diff --git a/parser.hpp b/parser.hpp index b631a89..c98c9dc 100644 --- a/parser.hpp +++ b/parser.hpp @@ -1,18 +1,29 @@ #ifndef _PARSERTOY_HPP_ #define _PARSERTOY_HPP_ /***************************************************************************** - General-purpose, extensible recursive descent parser for small tasks + Programmable, extensible recursive descent parser for small tasks - NOTE: + In a sense, this is just a naive, inefficient reimplementation of some + common regex functionality -- using regexes... And, for better or worse, + it's also like a sort of embryonic LISP. (I only noticed that after + it has kinda become one all by itself.) - - In a sense, this is just a naive/uninformed reimplementation of some basic - regex functionality -- using regexes... + (I'm not even sure it still qualifies as a rec. desc. parser. + I don't know much about parsing at all, actually.) - But, the main feature would be actually building an AST, and supporting - user hooks etc. for matching constructs, just haven't got 'round to it yet. + The main feature would be actually building an AST, and supporting user + hooks for matching constructs etc., I just haven't got 'round to it yet. At least this is a "structured regex engine" (or "programmable regexes" - is also echoing in my mind); i.e. *regular* regular expressions are not - for structured text, while this can effortlessly handle nested constructs. + is also echoing in my mind); i.e. _regular_ regular expressions really + dislike structured text, while this can handle nested (recursive) + constructs kinda effortlessly. + + And the regex syntax sucks, too, for anything non-trivial. It's like + eating thistle, while hugging a hedgehog. Consider this to be a blanket + around regexes, and also an exoskeleton, to not only proxy, but extend + their abilities. + + NOTE: - It copies the source text, so that it can be kept a little more clean & robust (e.g. 
for threading), instead of trying to be copyless (and @@ -160,14 +171,11 @@ using namespace std::regex_constants; //!! refine (filter) #include // function, reference_wrapper, ... #include // move -#include - using std::optional; -//!!??#include // for uniformly AST nodes +//!!#include //!!Replace that horrid manual union in RULE! (#22) #include #include #include -//#include -// using std::initializer_list; + //--------------------------------------------------------------------------- namespace Parsing { @@ -216,17 +224,25 @@ namespace Parsing { CONST _SAVE = OPCODE('('); // Save matched text to unnamed capture results CONST _SAVE_AS = OPCODE('['); // Save matched text to named capture results // - Its 1st arg must be an ATOM (USER_LITERAL) for the name + CONST _DEF = OPCODE(':'); // Define a named rule (expects 2 arguments: "Name", {Rule}) + // - To trigger Rule later, use _USE... + CONST _USE = OPCODE('`'); // Invoke named rule (expects 1 argument: "Name") +//!!?? CONST _SELF = OPCODE('@'); // Invoke the enclosing DEF-target rule, or NIL if none + // Operator functions... struct RULE; class Parser; - using OPERATION = std::function; + using CONST_OPERATOR = std::function; +//!! using OPERATOR = std::function; // Operator lookup table... - using OP_MAP = std::unordered_map; - extern OP_MAP OPERATORS; //!! See also NAMED_PATTERNS, why not CONST (or at least static) - // Will be populated later, as: - // OPERATORS[RULE::opcode] = [](Parser&, input_pos, rule&) { ... return match-length or 0; } + using CONSTOP_MAP = std::unordered_map; +//!! using OP_MAP = std::unordered_map; + extern CONSTOP_MAP CONST_OPERATORS; +//!! extern OP_MAP OPERATORS; + // These will be populated later, as e.g.: + // CONST_OPERATORS[SOME_OPCODE] = [](Parser&, size_t pos, const RULE&, OUT size_t& len) -> bool { ... set len, return true on match; } // Can be freely extended by users (respecting the opcode list above). 
@@ -278,7 +294,7 @@ struct RULE //!!Well, making it const didn't help with the mysterious double copy at "COPYLESS" creation!... //!! -> TC "PROD: directly from ATOM-RULE" - const ATOM atom; //!! Should be extended later to support optional name + regex pairs! (See d_name, currently!) + const ATOM atom; //!! Should be extended later to support optional name + regex pairs! (See d_memo, currently!) //!! Or, actually, better to have patterns as non-atomic types instead (finally)? //!! Also: extend to support precompiled regexes! @@ -295,8 +311,11 @@ struct RULE #endif }; - string d_name; // Symbolic name, if any (e.g. for named patterns, opcodes) for diagnostics, - // or (a placeholder to) the "uniform string representation" of a rule + mutable string name; // Optional user-assigned symbolic name + + mutable string d_memo; // Diagnostic note (e.g. NAMED_PATTERNS key, opcode) + //!!string d_as_str // (placeholder to) "uniform string representation" of a rule + //!! (could even evolve to sg. useful for #9) //----------------------------------------------------------- @@ -333,7 +352,7 @@ struct RULE // Also, this should stop the bizarra "vector too long" range // misinterpretation errors (with arrays of 2 items), too, as a bonus! - RULE(OPCODE opcode): type(OP), opcode(opcode), d_name({(char)opcode}) { + RULE(OPCODE opcode): type(OP), opcode(opcode) { DBG("RULE::OPCODE-ctor creating [{}] as: {} ('{}')...", (void*)this, opcode, (char)opcode); } @@ -429,7 +448,10 @@ DBG("~RULE destructing [{}] (type: {})...", (void*)this, _type_cstr()); _destruct(); } + private: +//!!??friend class Parser; //!!?? WTF does this make no difference?! See _lookup()! + //----------------------------------------------------------- // Construction/destruction/copy/move helpers... 
@@ -445,19 +467,29 @@ DBG("- Setting up empty rule..."); _prod.emplace_back(_NIL); #endif type = PROD; - d_name = "EMPTY"; + d_memo = ""; } void _init_atom(auto&& s); void _relink_parents() { if (!is_prod()) return; - for(auto& r : prod()) { + for (auto& r : prod()) { r._parent = this; r._relink_parents(); } } + public: + const RULE* _lookup(const string& n) const { + if (name == n) return this; + if (is_prod()) for(auto& r : prod()) { + if (auto res = r._lookup(n); res) return res; + } + return nullptr; + } + +private: void _destruct() { //DBG("RULE::destruc (type: {})...", _type_cstr()); //DUMP(); assert (type != _DESTROYED_); @@ -476,7 +508,8 @@ DBG("- Setting up empty rule..."); assert(other.type != _DESTROYED_); assert(other.type != _MOVED_FROM_); type = other.type; - d_name = other.d_name; + name = other.name; + d_memo = other.d_memo; if (is_atom()) new (const_cast(&atom)) ATOM(other.atom); #ifdef COPYLESS_GRAMMAR else if (type == PROD) new (&_prod) decltype(_prod)(other._prod); //! Copying ref_wrap will only bind other's! @@ -499,13 +532,14 @@ DBG("RULE::_copy (type == {}) done.", _type_cstr()); assert(tmp.type != _DESTROYED_); assert(tmp.type != _MOVED_FROM_); type = tmp.type; - d_name = tmp.d_name; + std::swap(name, tmp.name); + std::swap(d_memo, tmp.d_memo); if (is_atom()) new (const_cast(&atom)) ATOM(std::move(tmp.atom)); #ifdef COPYLESS_GRAMMAR else if (type == PROD) new (&_prod) decltype(_prod)(std::move(tmp._prod)); //!!?? Will this do what I hope? //!! I don't think so!... #else - else if (type == PROD) new (const_cast(&_prod)) PRODUCTION(std::move(tmp.prod())); //! Can't use is_prod() here: it's false if empty()! + else if (type == PROD) new (const_cast(&_prod)) PRODUCTION(std::move(tmp._prod)); //! Can't use is_prod() here: it's false if empty()! #endif else opcode = std::move(tmp.opcode); // just a number... 
tmp.type = _MOVED_FROM_; @@ -542,18 +576,19 @@ DBG("RULE::_move (type == {}) done.", _type_cstr()); auto _p [[maybe_unused]] = [&](auto x, auto... args) {cerr << x << endl; }; if (!level) p("/------------------------------------------------------------------\\"); - if (d_name.empty()) p_(format("[{} :{}] {} (type #{}):", (void*)this, (void*)_parent, _type_cstr(), (int)type)); - else p_(format("[{} :{}] {} (type #{}) '{}':", (void*)this, (void*)_parent, _type_cstr(), (int)type, d_name)); + if (name.empty()) p_(format("[{} :{}] {} (type #{}):", (void*)this, (void*)_parent, _type_cstr(), (int)type)); + else p_(format("[{} :{}] {} (type #{}) '{}':", (void*)this, (void*)_parent, _type_cstr(), (int)type, name)); if (type == _DESTROYED_) p(" !!! INVALID (DESTROYED) OBJECT !!!"); if (type == _MOVED_FROM_) p(" !!! INVALID (MOVED-FROM) OBJECT !!!"); - if (is_atom()) { _p(format(" \"{}\"", atom)); + if (is_atom()) { _p_(format(" \"{}\"", atom)); } else if (type == PROD) { //! Can't use is_prod() here: it's false if empty()! _p(""); p("{"); // New line for the { for (auto& r : prod()) { r._dump(level + 1); } - p("}"); - } else if (type == OP) { _p(format(" opcode = {} ('{}')", opcode, char(opcode))); + p_("}"); + } else if (type == OP) { _p_(format(" opcode = {} ('{}')", opcode, char(opcode))); } else if (type == _DESTROYED_) { p("!!! _DESTROYED_ !!!"); } else p("*** UNKNOWN/INVALID RULE TYPE! ***"); + _p(d_memo.empty() ? "" : format(" // {} ", d_memo)); if (!level) p("\\------------------------------------------------------------------/\n"); } #ifndef NDEBUG @@ -583,7 +618,7 @@ class Parser // Results of capture ops.; valid only after a successful parse(): STRING_MAP named_captures; //!! My initial guess is that SSO makes it pretty much useless to keep string_views here. - std::map unnamed_captures; // Ordered map! Now we only have to make sure that its order kinda makes sense! :) + std::map unnamed_captures; // Ordered map! 
Now we only have to make sure that its order kinda makes sense! :) // Diagnostics: int loopguard; @@ -591,12 +626,12 @@ class Parser int rules_tried; int terminals_tried; - CONST DEFAULT_RECURSION_LIMIT = 500; + CONST RECURSION_LIMIT = 300; // Hopefully this'd be hit before a stack overflow... (500 was too high for me) void _reset_counters() { - loopguard = DEFAULT_RECURSION_LIMIT; - depth_reached = DEFAULT_RECURSION_LIMIT; + loopguard = RECURSION_LIMIT; + depth_reached = RECURSION_LIMIT; rules_tried = 0; terminals_tried = 0; } @@ -623,9 +658,13 @@ class Parser text_length = txt.length(); } - +/*!!??WTF cannot access -- it's set as friend! :-o + const RULE* _lookup(const string& name) const { + return syntax._lookup(name); + } +??!!*/ //------------------------------------------------------------------- - Parser(const RULE& syntax, int maxnest = DEFAULT_RECURSION_LIMIT): + Parser(const RULE& syntax, int maxnest = RECURSION_LIMIT): // Sync with _reset*()! syntax(syntax), loopguard(maxnest), @@ -654,6 +693,9 @@ class Parser return match(0, syntax, matched_length); } + //!!bool run(/*runtime_context { const string& input, OUT string output}*/) + bool run() { return parse(""); } + //!! Move these to a `results` (or directly to `captures` or `saves`) objects //!! instead, for more (versatile) queries like unnamed_captures() or saves.count() //!! etc. to begin with! @@ -667,38 +709,55 @@ class Parser // If matches, returns the length of the matched input, otherwise 0. //------------------------------------------------------------------- { -DBG("Parser::match()"); +DBG("match({}, {} [{}]: '{}')... // loopguard: {}", pos, + rule._type_cstr(), (void*)&rule, + rule.type == RULE::USER_LITERAL ? rule.atom : + rule.is_opcode() ? 
string(1, (char)rule.opcode) : "", + loopguard); + --loopguard; if (depth_reached > loopguard) depth_reached = loopguard; if (!loopguard) { - ERROR("Infinite loop (in 'match()')?!\n"); + ERROR("Recursion level {} is too deep (in match())!", RECURSION_LIMIT); } - OPERATION f; + CONST_OPERATOR f; //++rules_tried; //!! #_of_tried_matches, FFS if (rule.is_atom()) // "curated regex", literal (or user regex, if still supported...) { - assert(!OPERATORS.empty()); - f = OPERATORS[_ATOM]; + assert(!CONST_OPERATORS.empty()); + assert(CONST_OPERATORS.find(_ATOM) != CONST_OPERATORS.end()); + f = CONST_OPERATORS[_ATOM]; //!! Should be dispatched further across the various atom types, instead: //!!f = atom_handler(rule); } else if (rule.is_prod()) { f = prod_handler(rule); // First item is the op. of the rule, or else _SEQ is implied: + // Throws via ERROR() if not found! } - else if (rule.is_opcode()) + else if (rule.is_opcode()) //!! But _SELF and other nullaries!... :-/ -> #26 { - ERROR("Invalid grammar at rule '{}': OPCODE outside of PRODUCTION", rule.d_name); + ERROR("Invalid grammar at rule {}: OPCODE '{}' outside of PRODUCTION", (void*)&rule, rule.opcode); } else { - ERROR("Invalid grammar at rule '{}'", rule.d_name); + ERROR("Invalid grammar at rule {}: '{}'", (void*)&rule, rule.d_memo); } +#ifdef NDEBUG + len = 0; //! This doesn't help (it's even bad, for false sense of sec., and + //! easily masking bugs with that benign-looking 0 in the output), + //! as 0 is a valid output, which should still be ignored -- as any + //! others! -- if match() returned false! + //! OK, disabling it in debug builds for better diagnostics, but + //! enabling in release mode for some cushioning!... +#else + len = (unsigned)-666; // And indeed, this *did* crash! :) (e.g. #29) +#endif auto res = f(*this, pos, rule, len); //! Remember: `len` is OUT! 
++loopguard; @@ -707,11 +766,11 @@ DBG("Parser::match()"); } private: - const OPERATION& prod_handler(const RULE& rule) const + const CONST_OPERATOR& prod_handler(const RULE& rule) const { -DBG("OPERATORS in prod_handler: {}", (void*)&OPERATORS); // Remnant from hunting as accidental shadow copies of it... -//DBG("OPERATORS.size in prod_handler: {}", OPERATORS.size()); - assert(!OPERATORS.empty()); +//DBG("CONST_OPERATORS in prod_handler: {}", (void*)&CONST_OPERATORS); // Remnant from hunting as accidental shadow copies of it... +//DBG("CONST_OPERATORS.size in prod_handler: {}", CONST_OPERATORS.size()); + assert(!CONST_OPERATORS.empty()); assert(rule.type == RULE::PROD); //! Shouldn't be asking any other types (not even an opcode-type RULE object directly) assert(!rule.prod().empty()); @@ -725,8 +784,8 @@ DBG("OPERATORS in prod_handler: {}", (void*)&OPERATORS); // Remnant from hunting opcode = rule.prod()[0].opcode; } - if (auto it = OPERATORS.find(opcode); it != OPERATORS.end()) { - decltype(OPERATORS.cbegin()) cit = it; //!!?? better one-liner for iter -> const-iter? + if (auto it = CONST_OPERATORS.find(opcode); it != CONST_OPERATORS.end()) { + decltype(CONST_OPERATORS.cbegin()) cit = it; //!!?? better one-liner for iter -> const-iter? 
return cit->second; } else { ERROR("Unimplemented opcode: {} ('{}')", opcode, (char)opcode); @@ -766,8 +825,6 @@ DBG("RULE::_init_atom from: \"{}\"...", s); if (s.empty()) { _init_as_nil(); return; } - d_name = s; // Save it as name for diagnostics (even though it's the same as it's value for literals) - auto set_type_and_adjust_regex = [&](string_view pattern, decltype(type) TYPE_if_literal, decltype(type) TYPE_if_regex) { // If "/.../" then it's a regex, so unwrap & mark it as such: @@ -784,8 +841,8 @@ DBG("RULE::_init_atom from: \"{}\"...", s); { auto pattern = set_type_and_adjust_regex(it->second, CURATED_LITERAL, CURATED_REGEX); new (const_cast(&atom)) ATOM(pattern); // Replace the atom name with the actual pattern (that's what that lame `second` is) - -DBG("RULE initialized as named pattern '{}' ('{}') (type: {})", d_name, atom, _type_cstr()); + d_memo = s; // Save the pattern name for diagnostics +DBG("RULE initialized as named pattern '{}' ('{}') (type: {})", d_memo, atom, _type_cstr()); } else { auto pattern = set_type_and_adjust_regex(s, USER_LITERAL, USER_REGEX); new (const_cast(&atom)) ATOM(pattern); @@ -805,11 +862,11 @@ DBG("RULE initialized as string literal '{}' (type: {}).", atom, _type_cstr()); /*!! -OPERATION RULE::op(OPCODE code) const +CONST_OPERATOR RULE::op(OPCODE code) const { assert(type == PROD); //! Never asking the opcode directly! :) - auto it = OPERATORS.find(code); - return it != OPERATORS.end() ? *it : false; //!!?? OPERATORS[NIL] // -- but that can't be (!op)'ed... :-/ + auto it = CONST_OPERATORS.find(code); + return it != CONST_OPERATORS.end() ? *it : false; //!!?? CONST_OPERATORS[NIL] // -- but that can't be (!op)'ed... :-/ } !!*/ @@ -832,7 +889,8 @@ OPERATION RULE::op(OPCODE code) const namespace Parsing { PATTERN_MAP NAMED_PATTERNS = {}; - OP_MAP OPERATORS = {}; + CONSTOP_MAP CONST_OPERATORS = {}; +//!! 
OP_MAP OPERATORS = {}; void init() { @@ -879,23 +937,23 @@ void init() // Initialize the operation map //------------------------------------------------------------------- - assert(OPERATORS.empty()); + assert(CONST_OPERATORS.empty()); //------------------------------------------------------------------- - OPERATORS[_NIL] = [](Parser&, size_t, const RULE&, OUT size_t&) -> bool + CONST_OPERATORS[_NIL] = [](Parser&, size_t, const RULE&, OUT size_t&) -> bool { DBG("NIL: no op. (returning false)"); return false; }; //------------------------------------------------------------------- - OPERATORS[_T] = [](Parser&, size_t, const RULE&, OUT size_t&) -> bool + CONST_OPERATORS[_T] = [](Parser&, size_t, const RULE&, OUT size_t&) -> bool { DBG("T: 'true' op. (returning true)"); return true; }; //------------------------------------------------------------------- - OPERATORS[_ATOM] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool + CONST_OPERATORS[_ATOM] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { assert(rule.is_atom()); static_assert(std::is_same::value); @@ -979,15 +1037,17 @@ DBG("REGEX \"{}\": ---NOT--- MATCHED '{}'!", atom, p.text.substr(pos)); //! Case-insensitivity=true will fail for UNICODE chars! //! So we just go case-sensitive. All the $ATOMs are like that anyway, and //! the user still has control to change it, but won't over a failing match... - if (p.text_length - pos >= len //! needed to silence a PHP warning... 
+//DBG("LITERAL: source: [{}], pos: {}, len: {}", string_view(p.text), pos, len); + if (p.text_length - pos >= len && string_view(p.text).substr(pos, len) == atom) { +DBG("LITERAL \"{}\": MATCHED '{}'!", atom, string_view(p.text).substr(pos, len)); return true; } else return false; } }; //------------------------------------------------------------------- - OPERATORS[_SEQ] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool + CONST_OPERATORS[_SEQ] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { assert(rule.is_prod()); assert(rule.prod().size() >= 2); @@ -1004,14 +1064,17 @@ DBG("REGEX \"{}\": ---NOT--- MATCHED '{}'!", atom, p.text.substr(pos)); }; //------------------------------------------------------------------- - OPERATORS[_SEQ_IMPLIED] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool + CONST_OPERATORS[_SEQ_IMPLIED] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { +DBG("_SEQ_IMPLIED: Processing rule [{}]", (void*)&rule); assert(rule.is_prod()); +//!! 
-> #25 assert(rule.prod().size() > 1); assert(rule.prod().size() >= 1); len = 0; for (auto r = rule.prod().cbegin(); !(r == rule.prod().cend()); ++r) { +//DBG("_SEQ_IMPLIED [{}]: next rule: [{}], pos: {}", (void*)&rule, (void*)&(*r), pos); size_t len_add; if (!p.match(pos + len, *r, len_add)) return false; else len += len_add; @@ -1020,23 +1083,25 @@ DBG("REGEX \"{}\": ---NOT--- MATCHED '{}'!", atom, p.text.substr(pos)); }; //------------------------------------------------------------------- - OPERATORS[_OR] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool + CONST_OPERATORS[_OR] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { assert(rule.prod().size() >= 3); for (auto r = rule.prod().cbegin() + 1; r != rule.prod().cend(); ++r) { - if (p.match(pos, *r, len)) { +if (r->is_opcode()) DBG("_OR: found opcode '{}'", (char)r->opcode); +else DBG("_OR: checking (non-operator) rule [{}]...", (void*)&(*r)); + + if (size_t tmplen; p.match(pos, *r, tmplen)) { + len = tmplen; return true; - } else { - continue; } } return false; }; //------------------------------------------------------------------- - OPERATORS[_OPT] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool + CONST_OPERATORS[_OPT] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { assert(rule.prod().size() == 2); @@ -1047,7 +1112,7 @@ DBG("REGEX \"{}\": ---NOT--- MATCHED '{}'!", atom, p.text.substr(pos)); }; //------------------------------------------------------------------- - OPERATORS[_ANY] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool + CONST_OPERATORS[_ANY] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { assert(rule.prod().size() == 2); @@ -1070,7 +1135,7 @@ DBG("REGEX \"{}\": ---NOT--- MATCHED '{}'!", atom, p.text.substr(pos)); }; //------------------------------------------------------------------- - OPERATORS[_MANY] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& 
len) -> bool + CONST_OPERATORS[_MANY] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { assert(rule.prod().size() == 2); @@ -1095,19 +1160,20 @@ DBG("REGEX \"{}\": ---NOT--- MATCHED '{}'!", atom, p.text.substr(pos)); }; //------------------------------------------------------------------- - OPERATORS[_NOT] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool + CONST_OPERATORS[_NOT] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { assert(rule.prod().size() == 2); - if (p.match(pos, rule.prod()[1], len)) { + if (size_t tmplen; p.match(pos, rule.prod()[1], tmplen)) { return false; } else { + len = 0; // _NOT succeeds with zero width: tmplen is the OUT value of a *failed* match, which must not be read (debug builds pre-seed it with (unsigned)-666) return true; } }; //--------------------------------------------------------------------------- - OPERATORS[_SAVE] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool + CONST_OPERATORS[_SAVE] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { assert(rule.prod().size() >= 2); @@ -1124,7 +1190,7 @@ DBG("\n\n SNAPSHOT: [{}]\n\n", snapshot); return false; }; - OPERATORS[_SAVE_AS] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool + CONST_OPERATORS[_SAVE_AS] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { assert(rule.prod().size() >= 3); assert(rule.prod()[1].is_atom()); @@ -1144,9 +1210,49 @@ DBG("\n\n SNAPSHOT[{}]: \"{}\"\n\n", name, snapshot); return false; }; + CONST_OPERATORS[_DEF] = [](Parser& p, [[maybe_unused]] size_t pos, const RULE& rule, OUT [[maybe_unused]] size_t& len) -> bool { + assert(rule.prod().size() == 3); + assert(rule.prod()[1].is_atom()); + auto name = rule.prod()[1].atom; + auto target_rule = rule.prod().begin() + 2; + target_rule->name = name; +DBG("_DEF: '{}' -> [{}], lookup: {}", name, (void*)&(*target_rule), (void*)p.syntax._lookup(name)); + len = 0; + return true; + }; + + CONST_OPERATORS[_USE] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { + assert(rule.prod().size() 
== 2); + assert(rule.prod()[1].is_atom()); + auto name = rule.prod()[1].atom; + + auto target_rule = p.syntax._lookup(name); + + if (!target_rule) { + ERROR("_USE: '{}' was not found!", name); + return false; + } +DBG("_USE: trying rule [{}] at pos {}...", (void*)target_rule, pos); + return p.match(pos, *target_rule, len); + }; + +/*!! + CONST_OPERATORS[_SELF] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { + assert(rule.prod().size() == 1); + + auto target_rule = ... + +DBG("_SELF: recursing..."); + if (target_rule) { + return p.match(pos, *target_rule, len); + } else { + return false; + } + }; +!!*/ assert(!NAMED_PATTERNS.empty()); - assert(!OPERATORS.empty()); + assert(!CONST_OPERATORS.empty()); initialized = true; DBG("+++ Static init done. +++"); } // init() diff --git a/test/OP_DEF_USE_SELF.cpp b/test/OP_DEF_USE_SELF.cpp new file mode 100644 index 0000000..f42a37d --- /dev/null +++ b/test/OP_DEF_USE_SELF.cpp @@ -0,0 +1,175 @@ +#include "../parser.hpp" + +//--------------------------------------------------------------------------- +// TEST CASES +//--------------------------------------------------------------------------- +#include "./fw/doctest-setup.hpp" + +// Global env. for the test cases... 
+using namespace Parsing; + + +CASE("def basic") { + Parser p(_{_DEF, "crap", _{_T, _{"subrule"}}}); + p.syntax.DUMP(); + p.run(); + p.syntax.DUMP(); +} + +CASE("def in a seq") { + Parser p(_{ + _{_DEF, "crap", _{_T, _{"subrule"}}}, + "anchor" + }); + p.syntax.DUMP(); + p.run(); + p.syntax.DUMP(); +} + +CASE("def atom alias") { + Parser p(_{ + _{_DEF, "begin", "{"}, + _{_DEF, "end", "}"}, + + _{_USE, "begin"}, "_WHITESPACES", _{_USE, "end"} + }); + p.syntax.DUMP(); + CHECK(p.parse("{ }")); + CHECK(!p.parse("}{")); +// p.syntax.DUMP(); +} + +CASE("call") { + Parser p(_{ + _{_DEF, "crap", "_ID"}, + "<", + _{_USE, "crap"}, + ">" + }); + p.syntax.DUMP(); + ____ + p.parse("x"); + ____ + p.syntax.DUMP(); + ____ + CHECK(p.parse("")); + ____ +} + +/* +CASE(": nested") { + RULE code = _{_MANY, _{_OR, "_ID", "=", "_DIGITS", ";", "_WHITESPACES"} }; + RULE block = _{_DEF, "BLOCK", _{"<", code, ">"}}; + RULE block_out = _{"<", _{_SAVE_AS, "outer", + _{ _{_OPT, code}, _{_OPT, block_in}, _{_OPT, code} }, + }, ">"}; + + Parser p(block); p.syntax.DUMP(); + + CHECK(p.parse(" block>")); + CHECK(p["inner"] == "inner"); + CHECK(p["outer"] == "outer block"); + + CHECK(p.parse(" block>")); + CHECK(p["inner"] == " x = 1; y = 22; "); //! Mind the spaces... + CHECK(p["inner"] != "x = 1; y = 22;" ); //! Mind the spaces... 
+ CHECK(p["outer"] == "outer < x = 1; y = 22; > block"); + +} +*/ + +/* +CASE("recursive basic") { + RULE block = _{_{_OR, "x", _{_SELF}}}, ">"}}; + Parser p(block); p.syntax.DUMP(); + +// CHECK(p.parse("x = 1")); +// CHECK(p.parse("")); + CHECK(p.parse(">")); +// CHECK(!p.parse("")); +} +*/ + +CASE("recursion: nested blocks") { + RULE code = _{_MANY, _{_OR, "_ID", "=", "_DIGITS", ";", "_WHITESPACES"} }; + RULE def_block = _{_DEF, "block", _{"<", _{_ANY, _{_OR, code, _{_USE, "block"}}}, ">"}}; + Parser p(_{def_block, _{_USE, "block"}}); p.syntax.DUMP(); + + CHECK(!p.parse("x = 1")); // -> CASE "nested mixed code & blocks" + CHECK(p.parse("")); + CHECK(p.parse(">")); + CHECK(!p.parse("")); // missing close tag +} +CASE("recursion: bad nesting, should fail!") { + RULE code = _{_MANY, _{_OR, "_ID", "=", "_DIGITS", ";", "_WHITESPACES"} }; + RULE def_block = _{_DEF, "block", _{"<", _{_ANY, _{_OR, code, _{_USE, "block"}}}, ">"}}; + Parser p(_{def_block, _{_USE, "block"}}); p.syntax.DUMP(); + + CHECK(!p.parse("< ")); // missing close tag +} +CASE("recursion: nested mixed code & blocks") { + RULE code = _{_MANY, _{_OR, "_ID", "=", "_DIGITS", ";", "_WHITESPACES"} }; + RULE def_block = _{_DEF, "block", _{"<", _{_ANY, _{_OR, code, _{_USE, "block"}}}, ">"}}; + Parser p(_{def_block, + _{_OR, code, _{_USE, "block"}} + }); p.syntax.DUMP(); + + CHECK(p.parse("x = 1")); + CHECK(p.parse("")); + CHECK(p.parse(">")); + CHECK(!p.parse("")); // missing close tag + CHECK(p.parse("< x = 1; y = 2; or > >")); +} + + +//=========================================================================== +int main(int argc, char** argv) +//=========================================================================== +{ + doctest::Context TEST; + TEST.applyCommandLine(argc, argv); + + try { + Parsing::init(); +/*!! +//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! CONST VIOLATION! 
+ //!!OPERATORS[_NAME] = [](Parser& p, size_t pos, RULE& rule, OUT size_t& len) -> bool { + CONST_OPERATORS[_DEF] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { + assert(rule.prod().size() == 3); + assert(rule.prod()[1].is_atom()); + auto name = rule.prod()[1].atom; +//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! CONST VIOLATION HERE! :-/ + auto target_rule = const_cast(rule).prod().begin() + 2; + target_rule->name = name; +// auto& r = const_cast(rule); +// r.prod().erase(r.prod().begin(), r.prod().begin() + 2); +DBG("_DEF: '{}', this: {}, parent: {}, lookup: {}", name, (const void*)&rule, (void*)rule._parent, (void*)p.syntax._lookup(name)); + return true; + }; + + CONST_OPERATORS[_USE] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { + assert(rule.prod().size() == 2); + assert(rule.prod()[1].is_atom()); + auto name = rule.prod()[1].atom; + auto target_rule = p.syntax._lookup(name); + if (!target_rule) { +DBG("_USE: '{}' was not found!", name); + return false; + } +DBG("_USE: trying rule [{}] at pos {}...", (void*)target_rule, pos); + return p.match(pos, *target_rule, len); + }; +!!*/ + TEST.run(); + + } catch(std::runtime_error& x) { + cerr << x.what() << "\n"; + exit(-1); + } catch(std::exception& x) { + cerr << "- C++ runtime error: " << x.what() << "\n"; + exit(-2); + } catch(...) { + cerr << "- UNKNOWN ERROR(S)!...\n"; + exit(-9); + } +} diff --git a/test/OP_NAME_AND_CALL.cpp b/test/OP_NAME_AND_CALL.cpp deleted file mode 100644 index 72a4263..0000000 --- a/test/OP_NAME_AND_CALL.cpp +++ /dev/null @@ -1,69 +0,0 @@ -#include "../parser.hpp" - -//--------------------------------------------------------------------------- -// TEST CASES -//--------------------------------------------------------------------------- -#include "./fw/doctest-setup.hpp" - -// Global env. for the test cases... 
-using namespace Parsing; - -CASE("CAPTURE: nested") { - RULE code = _{_MANY, _{_OR, "_ID", "=", "_DIGITS", ";", "_WHITESPACES"} }; - RULE block_in = _{"<", _{_SAVE_AS, "inner", code}, ">"}; - RULE block_out = _{"<", _{_SAVE_AS, "outer", - _{ _{_OPT, code}, _{_OPT, block_in}, _{_OPT, code} }, - }, ">"}; - - Parser p(block_out); p.syntax.DUMP(); - - CHECK(p.parse(" block>")); - CHECK(p["inner"] == "inner"); - CHECK(p["outer"] == "outer block"); - - CHECK(p.parse(" block>")); - CHECK(p["inner"] == " x = 1; y = 22; "); //! Mind the spaces... - CHECK(p["inner"] != "x = 1; y = 22;" ); //! Mind the spaces... - CHECK(p["outer"] == "outer < x = 1; y = 22; > block"); -} - - - -//=========================================================================== -int main(int argc, char** argv) -//=========================================================================== -{ - doctest::Context TEST; - TEST.applyCommandLine(argc, argv); - - try { - Parsing::init(); - - auto _NAME = OPCODE(':'); - - OPERATORS[_NAME] = [](Parser& p, size_t pos, const RULE& rule, OUT size_t& len) -> bool { -/* // Shift off the CAPTURE prefix... - //!! ...which, alas, currently means full cloning... :-/ - RULE target_rule(PROD(rule.prod().cbegin() + 1, rule.prod().cend())); - - if (p.match(pos, target_rule, len)) { - cerr << "\n\n SNAPSHOT: [" << string_view(p.text).substr(pos, len) << "]" << "\n\n"; - return true; - } -*/ return false; - }; - - - TEST.run(); - - } catch(std::runtime_error& x) { - cerr << x.what() << "\n"; - exit(-1); - } catch(std::exception& x) { - cerr << "- C++ runtime error: " << x.what() << "\n"; - exit(-2); - } catch(...) { - cerr << "- UNKNOWN ERROR(S)!...\n"; - exit(-9); - } -} diff --git a/test/main.cpp b/test/mechanics.cpp similarity index 80% rename from test/main.cpp rename to test/mechanics.cpp index e436704..8cd91a8 100644 --- a/test/main.cpp +++ b/test/mechanics.cpp @@ -9,14 +9,39 @@ // Global env. for the test cases... 
using namespace Parsing; -CASE("RULE: empty from empty str") { +CASE("diagnostic memo") { + RULE r = _{"_IDCHAR", "_BACKSLASH"}; // named patterns + r.DUMP(); +} + +/* This won't compile: +CASE("RULE: {}") { + //RULE r{}; // "no default ctor" -- FFS, C++, this is an empty aggreg.! + r.DUMP(); +} +*/ +CASE("RULE: {NIL}") { + RULE r{_NIL}; + r.DUMP(); + CHECK((r.is_opcode() && !r.is_prod())); + CHECK(r.type == RULE::OP); + CHECK(r.opcode == _NIL); +} +CASE("RULE: ctor-op= NIL - internal-only use case, but should work") { + RULE r = _NIL; + CHECK((r.is_opcode() && !r.is_prod())); + CHECK(r.type == RULE::OP); + CHECK(r.opcode == _NIL); + ____ +} +CASE("RULE: ctor-op= empty str") { RULE r = ""; //!! This happens to work without a prior init(), but not future-proof! //!! Also, it won't itself call init! r.DUMP(); CHECK(!r.prod().empty()); // Should've been converted to PROD{NIL}, so not empty ____ } -CASE("RULE: empty from empty PROD 1") { +CASE("RULE: ctor-op= empty PROD") { //RULE rule = {}; //! This won't compile, _{} needed! RULE r = _{}; //!! This happens to work without a prior init(), but not future-proof! //!! Also, it won't itself call init! @@ -24,23 +49,14 @@ CASE("RULE: empty from empty PROD 1") { CHECK(!r.prod().empty()); // Should've been converted to PROD{NIL}, so not empty ____ } -CASE("RULE: empty from empty PROD 2") { +CASE("RULE: from empty PROD {}") { PROD prod{}; RULE r{prod}; r.DUMP(); CHECK(!r.prod().empty()); // Should've been converted to PROD{NIL}, so not empty ____ } - - -CASE("RULE: op = NIL - internal-only use case, but works") { - RULE r{_NIL}; - CHECK((r.is_opcode() && !r.is_prod())); - CHECK(r.type == RULE::OP); - CHECK(r.opcode == _NIL); - ____ -} -CASE("RULE: op = T") { +CASE("RULE: {T}") { RULE r{_T}; CHECK((r.is_opcode() && !r.is_prod())); CHECK(r.type == RULE::OP); @@ -115,6 +131,8 @@ CASE("PROD: move auto-created RULE from std::string literal") { !!*/ CASE("PROD: move explicit temporary RULE(NIL)") { PROD prod{RULE(_NIL)}; + //!! 
This would be even worse, preventing copy elision (MSVC -Wall warned!): + //!! PROD prod{std::move(RULE(_NIL))}; ____ CHECK(!"Why not moved?"); } @@ -129,7 +147,25 @@ CASE("RULE: move from temp prod.") { RULE r{_{_NIL}}; r.DUMP(); ____ - CHECK(!"Why not moved?"); + MESSAGE("Was it still moved? (1 copy is ok for the vector)"); +} + +CASE("RULE: move from temp RULE") { + ____ + RULE rfrom{_NIL}; + RULE rto{std::move(rfrom)}; + rto.DUMP(); + ____ + MESSAGE("Was it still moved? (1 copy is ok for the vector)"); +} + + +//--------------------------------------------------------------------------- +CASE("relink parents after external vector copy") { + RULE r = {_{_OPT, _{_T}}}; + r.DUMP(); + RULE q{r}; + q.DUMP(); } @@ -288,6 +324,20 @@ CASE("regex curated single chars") { CHECK(p.parse("\t \"'/\\")); } +CASE("infinite recursion detected") { + try { + Parser p(_{ + _{_DEF, "self", _{_USE, "self"}}, + _{_USE, "self"} + }); p.syntax.DUMP(); + p.parse("!"); + CHECK(false); + } catch (std::runtime_error& x) { + cerr << x.what() << endl; + CHECK("OK, got an error, but verify in the logs it's really the loop-guard!"); + } +} + //=========================================================================== int main(int argc, char** argv) diff --git a/test/parsing.cpp b/test/parsing.cpp index 7b50311..676bcf9 100644 --- a/test/parsing.cpp +++ b/test/parsing.cpp @@ -147,7 +147,6 @@ CASE("regex curated single chars") { }); CHECK(p.parse("\t \"'/\\")); } - CASE("regex curated ID") { Parser p(_{"_ID"}); CHECK(!p.parse("")); @@ -160,7 +159,8 @@ CASE("regex curated ID") { CHECK(p.parse("a1_")); } -CASE("_OR") { + +CASE("_OR: more than 2") { Parser p(_{_OR, "_ID", "=", "_DIGITS", ";" }); p.syntax.DUMP(); @@ -171,6 +171,64 @@ CASE("_OR") { CHECK(p.parse(";")); CHECK(!p.parse("!")); } +CASE("_OR: mixed with PROD arg") { + Parser p(_{ + _{_OR, "_DIGITS", _{"a", "b"}}, + ";" // ; just for anchoring + }); + p.syntax.DUMP(); + + CHECK(p.parse("1;")); + CHECK(p.parse("ab;")); + 
CHECK(!p.parse(";")); + CHECK(!p.parse("1 ab;")); + CHECK(!p.parse("1 a b;")); +} +CASE("_OR: wtf original") { + RULE x = _{"x"}; + Parser good{_{_OR, x, '$', "++++++++SENTINEL++++++++"}}; + //good.syntax.DUMP(); + CHECK(good.parse("x -> This is correct actually, nothing to see here.")); + + Parser bad{_{_{_OR, x, '$', "++++++++SENTINEL++++++++"}, "DUMMY ANCHOR"}}; + //bad.syntax.DUMP(); + CHECK(!bad.parse("x -> But not with an anchor.")); +} +CASE("_OR: wtf opcode error") { + try { + RULE code = _{_MANY, _{_OR, "_ID", "=", "_DIGITS", ";", "_WHITESPACES"} }; + // '$' is a nonexistent opcode, jus to trigger an error: + RULE bad = _{_{_{_OR, code, '$', "++++++++SENTINEL++++++++"}}}; + // "$" is just an ordinary literal, no problems: + RULE good = _{_{_{_OR, code, "$", "++++++++SENTINEL++++++++"}}}; +// Parser p(good); p.syntax.DUMP(); + Parser p(bad); p.syntax.DUMP(); + p.parse("...check the logs, look for the SENTINEL in the OR loop!"); + } catch (std::runtime_error&) { + CHECK("OK, got the exception."); + } +} +CASE("_NOT") { + Parser p(_{_NOT, "a"}); + p.syntax.DUMP(); + CHECK(!p.parse("a")); + CHECK(p.parse("b")); + CHECK(p.parse(" ")); + CHECK(p.parse("")); +} +CASE("_NOT: PROD arg") { + Parser p(_{_NOT, _{"a", "b"}}); + p.syntax.DUMP(); + CHECK(!p.parse("ab")); + CHECK(!p.parse("abc")); + CHECK(p.parse("a")); + CHECK(p.parse("b")); + CHECK(p.parse("aab")); + CHECK(p.parse("ba")); + CHECK(p.parse(" ")); + CHECK(p.parse("")); +} + CASE("set int - ANY") { RULE r = _{