/
grammarebnf.ebnf
503 lines (424 loc) · 10.6 KB
/
grammarebnf.ebnf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
/**
# DParserGen Syntax
The description for the grammar is written as documentation comments
in dparsergen/generator/grammarebnf.ebnf. It is compiled into a
markdown document in docs/syntax.md.
*/
/**
A grammar file contains a list of one or more declarations.
Every declaration is a symbol declaration, a match declaration, an
import or an option declaration.
*/
EBNF
= Declaration+
;
/// ditto
Declaration
= <SymbolDeclaration
| <MatchDeclaration
| <Import
| <OptionDeclaration
;
/**
Declares a symbol, which describes a part of the language.
Every symbol has a name after the optional type. Optional parameters
can be used to declare patterns like comma separated lists only once.
The parameters are written as a comma separated list of identifiers
in parentheses after the name. A parameter can be followed by "...".
Annotations starting with @ are used for supplying some options to
the parser and lexer generator, but can also be used by the parse
tree creator. The expression defines the sublanguage for this symbol.
The grammar also accepts a declaration without "=" and an expression.
There are three types of symbols: Tokens and fragments are explicitly
declared with the keywords "token" and "fragment". Nonterminals just
use the identifier without a type before it. The type of symbols with
parameters and no explicit type is detected automatically from context.
Tokens are sequences of characters, which are detected by the
lexer. Fragments are only used inside tokens or other fragments.
Nonterminals can use other nonterminals and tokens.
*/
SymbolDeclaration
= DeclarationType? Identifier MacroParametersPart? Annotation* ";"
| DeclarationType? Identifier MacroParametersPart? Annotation* "=" Expression ";"
;
/// ditto
DeclarationType
= "fragment"
| "token"
;
/// ditto
MacroParametersPart
= "(" MacroParameters? ")"
;
/// ditto
MacroParameters @array
= MacroParameter
| MacroParameters "," MacroParameter
;
/// ditto
MacroParameter
= Identifier
| Identifier "..."
;
/++
Sets a global option for this grammar. An option is written as the
keyword "option", the name of the option, "=" and the value.
Currently, the following options are defined and all need an integer
as value:
* startTokenID: Sets the first ID for tokens. Defaults to 0.
* startNonterminalID: Sets the first ID for nonterminals. Defaults to 0.
* startProductionID: Sets the first ID for productions. Defaults to 0.
+/
OptionDeclaration
// The prefix "^" drops the keyword and the punctuation from the created tree.
= ^"option" Identifier ^"=" IntegerLiteral ^";"
;
/**
Imports symbols from a different grammar file.
The declaration consists of the keyword "import", a string literal
and ";". The string literal contains the path of the other grammar
file relative to the current grammar file. The imported file is parsed
separately, and all of its symbols will be available.
*/
Import
= "import" StringLiteral ";"
;
/**
Hints that two tokens normally come in pairs, like parentheses. This
can be used by long lookahead to e.g. skip tokens in parentheses and
make a decision based on tokens after them. Both tokens are given as
symbols after the keyword "match".
*/
MatchDeclaration
= "match" Symbol Symbol ";"
;
/**
Annotations can be added to symbols or expressions. Some annotations
are used by the parser or lexer generator as options. The user can
also use annotations for custom meta data, which can be used by
the tree creator or at runtime.
An annotation starts with "@" followed by a name. Optional parameters
can be added in parentheses. The parameters are a sequence of
literals, identifiers and some punctuation characters, which can
also contain nested parentheses.
*/
Annotation
= "@" Identifier AnnotationParams?
;
/// ditto
AnnotationParams
= "(" AnnotationParamsPart* ")"
;
/// ditto
AnnotationParamsPart
= StringLiteral | Identifier | CharacterSetLiteral | IntegerLiteral
| "(" AnnotationParamsPart* ")"
| "=" | ":" | ";" | "," | "{" | "}" | "?" | "!" | "<" | ">" | "*" | ">>" | "<<" | "-"
;
/**
Disallows a symbol at this position. Inside tokens, it can disallow
characters. Inside nonterminals, it can disallow a token. At the
end of a production it disallows the symbol after the production.
It is written as "!" followed by the symbol. The keyword "anytoken"
can be used instead of a symbol.
*/
NegativeLookahead
= "!" Symbol
| "!" "anytoken"
;
/**
Defines a sublanguage. An expression is an alternation. The prefix "<"
unwraps it, so no separate tree is created for `Expression`.
$TRANSITIVE_UNWRAP_TABLE(Expression, TokenMinus, PostfixExpression)
*/
Expression
= <Alternation
;
/**
Defines a sublanguage, which allows all texts of the combined
sublanguages. The alternatives are separated by "|".
*/
Alternation
= <Concatenation
| Alternation "|" Concatenation
;
/**
Defines a sublanguage, where multiple parts appear in a sequence.
It allows optional annotations at the end, which can be annotations
starting with @ or negative lookaheads.
Empty productions should use the annotation @empty.
*/
Concatenation
= <TokenMinus
| TokenMinus TokenMinus+ @regArray ProductionAnnotation*
| TokenMinus @regArray ProductionAnnotation+
| @regArray ProductionAnnotation+
;
/// ditto
ProductionAnnotation @directUnwrap
= <Annotation
| <NegativeLookahead
;
/**
Defines a sublanguage for tokens, which allows every text in the left
sublanguage, but not in the right sublanguage. The right sublanguage
is written after "-". Only allowed inside the definition of tokens.
*/
TokenMinus
= <AnnotatedExpression
| TokenMinus "-" AnnotatedExpression
;
/**
Adds different annotations to an expression.
Annotations and negative lookaheads can be written before the
expression.
The optional name can be used by the tree creator. It is written as
an identifier followed by ":".
The prefix "<" can be used to unwrap nonterminals for simple
definitions like "A = <B;", so no tree will be created for "A".
The prefix "^" drops this part in the created tree.
*/
AnnotatedExpression
= @regArray ExpressionAnnotation* ExpressionName? ExpressionPrefix* PostfixExpression
;
/// ditto
ExpressionAnnotation @directUnwrap
= <Annotation
| <NegativeLookahead
;
/// ditto
ExpressionName
= Identifier ":"
;
/// ditto
ExpressionPrefix
= "<"
| "^"
;
/**
An atom expression, which can be followed by the postfix operators
"?", "*" and "+". See `Optional`, `Repetition` and `RepetitionPlus`.
*/
PostfixExpression
= <Optional
| <Repetition
| <RepetitionPlus
| <AtomExpression
;
/**
Makes an expression optional. It is written as the expression
followed by "?".
Internally, X? is replaced with a new nonterminal, and new
grammar rules are added, similar to the following:
```
XOpt = @empty | <X;
```
*/
Optional
= PostfixExpression "?"
;
/**
Allows to repeat an expression zero or more times.
Internally, X* is replaced with a new nonterminal, and new
grammar rules are added, similar to the following:
```
XStar @array = @empty | XPlus;
XPlus @array = X | XPlus X;
```
*/
Repetition
= PostfixExpression "*"
;
/**
Allows to repeat an expression one or more times.
Internally, X+ is replaced with a new nonterminal, and new
grammar rules are added, similar to the following:
```
XPlus @array = X | XPlus X;
```
*/
RepetitionPlus
= PostfixExpression "+"
;
/**
The basic parts of an expression: a symbol, an expression in braces,
a sub token, an unpacked variadic list or a tuple.
*/
AtomExpression
= <Symbol
| <ParenExpression
/* | <PosLookaheadExpression
| <NegLookaheadExpression*/
| <SubToken
| <UnpackVariadicList
| <Tuple
;
/**
A symbol is referenced by name, written as a token literal or created
by using a macro.
*/
Symbol
= <Name
| <Token
| <MacroInstance
;
/**
References a symbol by name, which could be a token or nonterminal.
The name is written as an identifier.
*/
Name
= Identifier
;
/**
Literal for a token. It is either a string literal or a character
set literal.
*/
Token
= StringLiteral
| CharacterSetLiteral
;
/**
Unpacks a list of variadic parameters, so a macro is instantiated
with them directly. It is written as an identifier followed by "...".
*/
UnpackVariadicList
= Identifier "..."
;
/**
Uses the token on the left side with the condition, that it matches
the right side. Both sides are separated by ">>". The right side can
be a symbol or an expression in braces.
*/
SubToken
= Symbol ">>" Symbol
| Symbol ">>" ParenExpression
;
/**
Uses a macro. The name of the macro is followed by parentheses, which
contain an optional comma separated list of expressions.
*/
MacroInstance
= Identifier "(" ExpressionList? ")"
;
/**
Uses the inner expression without change. The inner expression is
surrounded by "{" and "}".
*/
ParenExpression
= "{" Expression "}"
;
/*PosLookaheadExpression
= "{" "?" "=" ExpressionList "}"
;
NegLookaheadExpression
= "{" "?" "!" ExpressionList "}"
;*/
/**
Comma separated list of one or more expressions.
*/
ExpressionList @array
= Expression
| ExpressionList "," Expression
;
/**
Allows to create a tuple of expressions, which works like variadic
arguments. The opening "t(" is a single string literal in the grammar,
so "t" and "(" have to be written without whitespace between them.
*/
Tuple
= "t(" ExpressionList? ")"
;
/**
An identifier starts with a letter from a-z or A-Z or an underscore.
It is followed by any number of these characters and digits.
*/
token Identifier @lowPrio
= [a-zA-Z_] [a-zA-Z0-9_]*
;
/**
`StringLiteral` specifies a sequence of characters, which can be
directly used as a token or for defining other tokens.
It is enclosed in double quotes. Double quotes, backslashes and line
breaks can not be used directly inside the literal. An `EscapeSequence`
can be used instead.
*/
token StringLiteral
= "\"" StringPart* "\""
;
/// ditto
fragment StringPart
= [^\"\\\r\n]
| EscapeSequence
;
/**
`CharacterSetLiteral` specifies a set of characters, which can be directly
used as a token or for defining other tokens. The syntax is inspired
by bracket expressions inside regular expressions.
The set is defined by a list of characters and ranges. A range
consists of two characters separated by a '-'. The range contains
all characters from the start character to the end character,
including both.
Using '^' directly after the opening bracket inverts the set.
The sequence [^] means any valid character.
`EscapeSequence`s can be used for characters, which would have a special
meaning inside the character set. The characters '\', ']', and '-'
always have a special meaning and need to be escaped. The character
'^' is only special at the beginning. '[' is also reserved.
The characters are always case-sensitive and do not depend on the
locale for ordering.
*/
token CharacterSetLiteral
= "[" "^"? CharacterSetPart* "]"
;
/// ditto
fragment CharacterSetPart
= CharacterSetPart2
| CharacterSetPart2 "-" CharacterSetPart2
;
/// ditto
fragment CharacterSetPart2
= [^\[\]\\\-]
| EscapeSequence
;
/**
Used in `StringLiteral` and `CharacterSetLiteral`.
The escape sequences \0, \a, \b, \f, \n, \r, \t and \v represent
special characters like in D.
The escape sequences \x, \u and \U are followed by a hexadecimal
number, which is turned into a Unicode character. The number has
exactly two digits for \x, four digits for \u and eight digits for \U.
It needs to be a valid Unicode character of the used size. For \x
only ASCII characters are valid, because other characters need more
UTF-8 bytes. For \u and \U Unicode surrogates are not allowed.
All other escape sequences (`\\`, `\"`, `\'`, `\[`, `\]` and `\-`)
represent the character following the backslash.
*/
fragment EscapeSequence
= "\\\\"
| "\\\""
| "\\\'"
| "\\0"
| "\\a"
| "\\b"
| "\\f"
| "\\n"
| "\\r"
| "\\t"
| "\\v"
| "\\["
| "\\]"
| "\\-"
| "\\x" Hex Hex
| "\\u" Hex Hex Hex Hex
| "\\U" Hex Hex Hex Hex Hex Hex Hex Hex
;
/// ditto
fragment Hex
= [0-9A-Fa-f]
;
/**
A non-negative decimal integer. Leading zeros are not allowed, so
the only literal starting with "0" is "0" itself.
*/
token IntegerLiteral
= [1-9] [0-9]* | "0"
;
/**
Whitespace consists of spaces, tabs and line breaks. It is ignored.
Sometimes whitespace is necessary to separate tokens.
*/
token Space @ignoreToken
= [ \n\r\t]+
;
/**
A line comment starts with "//" and includes all characters up to the
end of the current line. The line break is not part of the comment.
Line comments starting with "///" can be used as documentation
comments for symbols.
*/
token LineComment @ignoreToken
= "//" [^\n\r]*
;
/++
A block comment starts with "/\*" and ends with the next occurrence of
"\*/". It cannot be nested. Block comments starting with "/\*\*" can be
used as documentation comments for symbols.
+/
token BlockComment @ignoreToken
= "/*" BlockCommentPart* "*"* "*/"
;
/// ditto
fragment BlockCommentPart
= [^*]
| "*"+ [^*/]
;
/**
A nested block comment starts with "/+" and can be nested. It ends with
the next not nested occurrence of "+/". Nested block comments starting
with "/++" can be used as documentation comments for symbols.
Nesting is matched by using the token `NestingBlockComment` inside its
own fragment `NestingBlockCommentPart`.
*/
token NestingBlockComment @ignoreToken
= "/+" NestingBlockCommentPart* "+"* "+/" @recursiveLexer
;
/// ditto
fragment NestingBlockCommentPart
= [^+/]
| "+"+ [^+/]
| "/"+ [^+/]
| "/"* NestingBlockComment
;