#include "parsegen_language.hpp"
#include <cstdlib>
#include <iostream>
#include <set>
#include <sstream>
#include "parsegen_build_parser.hpp"
#include "parsegen_regex.hpp"
#include "parsegen_std_vector.hpp"
#include "parsegen_string.hpp"
#include "parsegen_error.hpp"
namespace parsegen {
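// Converts the string-based language description into an integer-based grammar:
// terminals get the ids [0, nterminals) in token order, and each distinct
// production left-hand side becomes a nonterminal with an id >= nterminals.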
grammar_ptr build_grammar(language const& language) {
  std::map<std::string, int> symbol_map;
  int nterminals = 0;
  for (auto& token : language.tokens) {
    symbol_map[token.name] = nterminals++;
  }
  int nsymbols = nterminals;
  for (auto& production : language.productions) {
    if (production.lhs.empty()) {
      std::cerr << "ERROR: production "
                << (&production - language.productions.data())
                << " has empty left hand side\n";
      abort();
    }
    if (symbol_map.count(production.lhs)) continue;
    symbol_map[production.lhs] = nsymbols++;
  }
  grammar out;
  out.nsymbols = nsymbols;
  out.nterminals = nterminals;
  for (auto& lang_prod : language.productions) {
    grammar::production gprod;
    assert(symbol_map.count(lang_prod.lhs));
    gprod.lhs = symbol_map[lang_prod.lhs];
    for (auto& lang_symb : lang_prod.rhs) {
      if (!symbol_map.count(lang_symb)) {
        std::stringstream ss;
        ss << "RHS entry \"" << lang_symb
           << "\" is neither a nonterminal (LHS of a production) nor a "
              "token!\n";
        throw std::invalid_argument(ss.str());
      }
      gprod.rhs.push_back(symbol_map[lang_symb]);
    }
    out.productions.emplace_back(std::move(gprod));
  }
  out.symbol_names = make_vector<std::string>(nsymbols);
  for (auto& pair : symbol_map) {
    at(out.symbol_names, pair.second) = pair.first;
  }
  add_end_terminal(out);
  add_accept_production(out);
  for (auto const& name : language.ignored_tokens) {
    auto const it = symbol_map.find(name);
    if (it == symbol_map.end()) {
      throw std::runtime_error("ignored token " + name + " does not exist");
    }
    out.ignored_terminals.push_back(it->second);
  }
  return std::make_shared<grammar>(std::move(out));
}
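// Pretty-prints a language: one "token <name> regex '<regex>'" line per token,
// followed by the productions grouped by left-hand side in a BNF-like
// "lhs ::= rhs | rhs" layout, with a literal "|" symbol printed as '|'.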
std::ostream& operator<<(std::ostream& os, language const& lang) {
  for (auto& token : lang.tokens) {
    os << "token " << token.name << " regex " << single_quote(token.regex) << "\n";
  }
  std::set<std::string> nonterminal_set;
  std::vector<std::string> nonterminal_list;
  for (auto& prod : lang.productions) {
    if (!nonterminal_set.count(prod.lhs)) {
      nonterminal_set.insert(prod.lhs);
      nonterminal_list.push_back(prod.lhs);
    }
  }
  for (auto& nonterminal : nonterminal_list) {
    std::stringstream ss;
    ss << nonterminal << " ::=";
    auto lead = ss.str();
    os << lead;
    for (auto& c : lead) c = ' ';
    bool first = true;
    for (auto& prod : lang.productions) {
      if (prod.lhs != nonterminal) continue;
      if (first)
        first = false;
      else
        os << " |\n" << lead;
      for (auto& symb : prod.rhs) {
        if (symb == "|")
          os << " '|'";
        else
          os << " " << symb;
      }
    }
    os << "\n";
  }
  os << "\n";
  return os;
}
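// Builds one DFA per token (tagged with that token's index) and unites them
// into a single lexer automaton, which is then determinized and simplified.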
finite_automaton build_lexer(language const& language) {
  finite_automaton lexer;
  for (int i = 0; i < isize(language.tokens); ++i) {
    auto& name = at(language.tokens, i).name;
    auto& regex = at(language.tokens, i).regex;
    if (name.empty()) {
      std::cerr << "ERROR: token " << i << " has empty name\n";
      abort();
    }
    if (regex.empty()) {
      std::cerr << "ERROR: token " << i << " has empty regex\n";
      abort();
    }
    if (i == 0) {
      lexer = regex::build_dfa(name, regex, i);
    } else {
      lexer = finite_automaton::unite(lexer, regex::build_dfa(name, regex, i));
    }
  }
  lexer = finite_automaton::simplify(finite_automaton::make_deterministic(lexer));
  return lexer;
}
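// Scans the token list for the special INDENT, DEDENT, and NEWLINE tokens,
// rejecting duplicates, missing tokens in an indentation-sensitive language,
// and any ordering in which NEWLINE does not precede INDENT and DEDENT.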
static indentation build_indent_info(language const& language) {
  indentation out;
  out.is_sensitive = false;
  out.indent_token = -1;
  out.dedent_token = -1;
  out.newline_token = -1;
  for (int tok_i = 0; tok_i < isize(language.tokens); ++tok_i) {
    auto& token = at(language.tokens, tok_i);
    if (token.name == "INDENT") {
      if (out.indent_token != -1) {
        throw std::invalid_argument("The language has two or more INDENT tokens\n");
      }
      out.indent_token = tok_i;
      out.is_sensitive = true;
    } else if (token.name == "DEDENT") {
      if (out.dedent_token != -1) {
        throw std::invalid_argument("The language has two or more DEDENT tokens\n");
      }
      out.dedent_token = tok_i;
    } else if (token.name == "NEWLINE") {
      if (out.newline_token != -1) {
        throw std::invalid_argument("The language has two or more NEWLINE tokens\n");
      }
      out.newline_token = tok_i;
    }
  }
  if (out.is_sensitive && out.indent_token == -1) {
    throw std::invalid_argument(
        "This indentation-sensitive language has no INDENT token\n");
  }
  if (out.is_sensitive && out.dedent_token == -1) {
    throw std::invalid_argument(
        "This indentation-sensitive language has no DEDENT token\n");
  }
  if (out.is_sensitive && out.newline_token == -1) {
    throw std::invalid_argument(
        "This indentation-sensitive language has no NEWLINE token\n");
  }
  if (out.indent_token < out.newline_token ||
      out.dedent_token < out.newline_token) {
    throw std::invalid_argument(
        "NEWLINE needs to come before all other indent tokens\n");
  }
  return out;
}
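// Top-level entry point: combines the lexer, the indentation info, and the
// LALR(1) parser built from the grammar into one set of parser tables.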
parser_tables_ptr build_parser_tables(language const& language) {
  auto lexer = build_lexer(language);
  auto indent_info = build_indent_info(language);
  auto grammar = build_grammar(language);
  auto parser = accept_parser(build_lalr1_parser(grammar));
  return parser_tables_ptr(new parser_tables({parser, lexer, indent_info}));
}
} // namespace parsegen
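
// A minimal usage sketch, kept as a comment so it does not affect this
// translation unit. It assumes the members used in this file (tokens with
// .name/.regex, productions with .lhs/.rhs, ignored_tokens of token names)
// are containers supporting push_back with brace initialization in that
// member order; see parsegen_language.hpp for the authoritative types.
//
//   parsegen::language lang;
//   lang.tokens.push_back({"integer", "[0-9]+"});
//   lang.tokens.push_back({"plus", "\\+"});
//   lang.tokens.push_back({"space", "[ \t]+"});
//   lang.ignored_tokens.push_back("space");
//   lang.productions.push_back({"sum", {"integer", "plus", "integer"}});
//   auto tables = parsegen::build_parser_tables(lang);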