Skip to content

Commit

Permalink
optimize parser
Browse files Browse the repository at this point in the history
According to valgrind, the tokenizer::next_token function is called
several million times during a typical unit test. This commit
changes the parser to allocate the tokenizer as a member variable
of the class (on the stack) rather than on the heap using new,
thus it eliminates millions of calls to new.
  • Loading branch information
cbeck88 committed Jun 13, 2014
1 parent 63e3395 commit d2fe8d1
Showing 1 changed file with 48 additions and 50 deletions.
98 changes: 48 additions & 50 deletions src/serialization/parser.cpp
Expand Up @@ -70,7 +70,7 @@ class parser
void error(const std::string& message, const std::string& pos_format = "");

config& cfg_;
tokenizer *tok_;
tokenizer tok_;
abstract_validator *validator_;

struct element {
Expand All @@ -90,27 +90,25 @@ class parser

parser::parser(config &cfg, std::istream &in, abstract_validator * validator)
:cfg_(cfg),
tok_(new tokenizer(in)),
tok_(in),
validator_(validator),
elements()
{
}


parser::~parser()
{
delete tok_;
}
{}

void parser::operator()()
{
cfg_.clear();
elements.push(element(&cfg_, ""));

do {
tok_->next_token();
tok_.next_token();

switch(tok_->current_token().type) {
switch(tok_.current_token().type) {
case token::LF:
continue;
case '[':
Expand All @@ -120,13 +118,13 @@ void parser::operator()()
parse_variable();
break;
default:
if (static_cast<unsigned char>(tok_->current_token().value[0]) == 0xEF &&
static_cast<unsigned char>(tok_->next_token().value[0]) == 0xBB &&
static_cast<unsigned char>(tok_->next_token().value[0]) == 0xBF)
if (static_cast<unsigned char>(tok_.current_token().value[0]) == 0xEF &&
static_cast<unsigned char>(tok_.next_token().value[0]) == 0xBB &&
static_cast<unsigned char>(tok_.next_token().value[0]) == 0xBF)
{
utils::string_map i18n_symbols;
std::stringstream ss;
ss << tok_->get_start_line() << " " << tok_->get_file();
ss << tok_.get_start_line() << " " << tok_.get_file();
ERR_CF << lineno_string(i18n_symbols,
ss.str(),
"Skipping over a utf8 BOM at $pos")
Expand All @@ -139,7 +137,7 @@ void parser::operator()()
break;
}
loadscreen::increment_progress();
} while (tok_->current_token().type != token::END);
} while (tok_.current_token().type != token::END);

// The main element should be there. If it is not, this is a parser error.
assert(!elements.empty());
Expand All @@ -157,53 +155,53 @@ void parser::operator()()

void parser::parse_element()
{
tok_->next_token();
tok_.next_token();
std::string elname;
config* current_element = NULL;
switch(tok_->current_token().type) {
switch(tok_.current_token().type) {
case token::STRING: // [element]
elname = tok_->current_token().value;
if (tok_->next_token().type != ']')
elname = tok_.current_token().value;
if (tok_.next_token().type != ']')
error(_("Unterminated [element] tag"));
// Add the element
current_element = &(elements.top().cfg->add_child(elname));
elements.push(element(current_element, elname, tok_->get_start_line(), tok_->get_file()));
elements.push(element(current_element, elname, tok_.get_start_line(), tok_.get_file()));
if (validator_){
validator_->open_tag(elname,tok_->get_start_line(),
tok_->get_file());
validator_->open_tag(elname,tok_.get_start_line(),
tok_.get_file());
}
break;

case '+': // [+element]
if (tok_->next_token().type != token::STRING)
if (tok_.next_token().type != token::STRING)
error(_("Invalid tag name"));
elname = tok_->current_token().value;
if (tok_->next_token().type != ']')
elname = tok_.current_token().value;
if (tok_.next_token().type != ']')
error(_("Unterminated [+element] tag"));

// Find the last child of the current element whose name is
// element
if (config &c = elements.top().cfg->child(elname, -1)) {
current_element = &c;
if (validator_){
validator_->open_tag(elname,tok_->get_start_line(),
tok_->get_file(),true);
validator_->open_tag(elname,tok_.get_start_line(),
tok_.get_file(),true);
}
} else {
current_element = &elements.top().cfg->add_child(elname);
if (validator_){
validator_->open_tag(elname,tok_->get_start_line(),
tok_->get_file());
validator_->open_tag(elname,tok_.get_start_line(),
tok_.get_file());
}
}
elements.push(element(current_element, elname, tok_->get_start_line(), tok_->get_file()));
elements.push(element(current_element, elname, tok_.get_start_line(), tok_.get_file()));
break;

case '/': // [/element]
if(tok_->next_token().type != token::STRING)
if(tok_.next_token().type != token::STRING)
error(_("Invalid closing tag name"));
elname = tok_->current_token().value;
if(tok_->next_token().type != ']')
elname = tok_.current_token().value;
if(tok_.next_token().type != ']')
error(_("Unterminated closing tag"));
if(elements.size() <= 1)
error(_("Unexpected closing tag"));
Expand Down Expand Up @@ -235,12 +233,12 @@ void parser::parse_variable()
std::vector<std::string> variables;
variables.push_back("");

while (tok_->current_token().type != '=') {
switch(tok_->current_token().type) {
while (tok_.current_token().type != '=') {
switch(tok_.current_token().type) {
case token::STRING:
if(!variables.back().empty())
variables.back() += ' ';
variables.back() += tok_->current_token().value;
variables.back() += tok_.current_token().value;
break;
case ',':
if(variables.back().empty()) {
Expand All @@ -253,7 +251,7 @@ void parser::parse_variable()
error(_("Unexpected characters after variable name (expected , or =)"));
break;
}
tok_->next_token();
tok_.next_token();
}
if(variables.back().empty())
error(_("Empty variable name"));
Expand All @@ -264,10 +262,10 @@ void parser::parse_variable()

bool ignore_next_newlines = false, previous_string = false;
while(1) {
tok_->next_token();
tok_.next_token();
assert(curvar != variables.end());

switch (tok_->current_token().type) {
switch (tok_.current_token().type) {
case ',':
if ((curvar+1) != variables.end()) {
if (buffer.translatable())
Expand All @@ -276,8 +274,8 @@ void parser::parse_variable()
cfg[*curvar] = buffer.value();
if(validator_){
validator_->validate_key (cfg,*curvar,buffer.value(),
tok_->get_start_line (),
tok_->get_file ());
tok_.get_start_line (),
tok_.get_file ());
}
buffer = t_string_base();
++curvar;
Expand All @@ -286,17 +284,17 @@ void parser::parse_variable()
}
break;
case '_':
tok_->next_token();
switch (tok_->current_token().type) {
tok_.next_token();
switch (tok_.current_token().type) {
case token::UNTERMINATED_QSTRING:
error(_("Unterminated quoted string"));
break;
case token::QSTRING:
buffer += t_string_base(tok_->current_token().value, tok_->textdomain());
buffer += t_string_base(tok_.current_token().value, tok_.textdomain());
break;
default:
buffer += "_";
buffer += tok_->current_token().value;
buffer += tok_.current_token().value;
break;
case token::END:
case token::LF:
Expand All @@ -311,10 +309,10 @@ void parser::parse_variable()
if (previous_string) buffer += " ";
//nobreak
default:
buffer += tok_->current_token().value;
buffer += tok_.current_token().value;
break;
case token::QSTRING:
buffer += tok_->current_token().value;
buffer += tok_.current_token().value;
break;
case token::UNTERMINATED_QSTRING:
error(_("Unterminated quoted string"));
Expand All @@ -326,7 +324,7 @@ void parser::parse_variable()
goto finish;
}

previous_string = tok_->current_token().type == token::STRING;
previous_string = tok_.current_token().type == token::STRING;
ignore_next_newlines = false;
}

Expand All @@ -337,8 +335,8 @@ void parser::parse_variable()
cfg[*curvar] = buffer.value();
if(validator_){
validator_->validate_key (cfg,*curvar,buffer.value(),
tok_->get_start_line (),
tok_->get_file ());
tok_.get_start_line (),
tok_.get_file ());
}
while (++curvar != variables.end()) {
cfg[*curvar] = "";
Expand Down Expand Up @@ -382,11 +380,11 @@ void parser::error(const std::string& error_type, const std::string& pos_format)
i18n_symbols["error"] = error_type;

std::stringstream ss;
ss << tok_->get_start_line() << " " << tok_->get_file();
ss << tok_.get_start_line() << " " << tok_.get_file();

#ifdef DEBUG
i18n_symbols["value"] = tok_->current_token().value;
i18n_symbols["previous_value"] = tok_->previous_token().value;
i18n_symbols["value"] = tok_.current_token().value;
i18n_symbols["previous_value"] = tok_.previous_token().value;

const std::string& tok_state =
_("Value: '$value' Previous: '$previous_value'");
Expand Down

1 comment on commit d2fe8d1

@cbeck88
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: The commit message is unfortunately misleading --
the reasons it is an improvement is

1.) Eliminates pointer dereference step tok_-> at nearly every line of parser operator()() -- as I understand C++ compilers cannot optimize out instructions like this.
2.) Therefore simplify inlining of tokenizer functions.
3.) Favor stack allocation over heap allocation when possible, for speed.

The tokenizer is not reallocated every time parser operator() is called, so this commit does not in fact save millions of calls to new() as described.

callgrind reports that for a unit test "wesnoth -u characterize_pathfinding_reach_1", parser::operator()() and tokenizer::next_token() take 20% of the (estimated) time respectively, and that after this change they only take 15%.

Please sign in to comment.