-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathparser.rs
2313 lines (2084 loc) · 79.5 KB
/
parser.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright (c) 2020 Google LLC All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#![deny(missing_docs)]
use {
crate::{content::*, error::*},
lazy_static::lazy_static,
regex::{CaptureLocations, Match, Regex},
std::cell::RefCell,
std::rc::Rc,
};
/// Anchors `regex` to the start of the input by prefixing it with "^".
///
/// Every tokenizer regular expression in this module consumes from the front of the remaining
/// characters in the input buffer. Building each `Regex` through this helper makes that
/// anchoring explicit at every declaration site.
fn from_start(regex: &str) -> String {
    format!("^{}", regex)
}
/// Wraps a regex pattern in "^" and "$" so it matches a string *only* when the pattern matches
/// the string in its entirety.
fn exact_match(regex: &str) -> String {
    format!("^{}$", regex)
}
lazy_static! {
// Token patterns used by the parser.
//
// Each `*_PATTERN` string below that is joined into `NEXT_TOKEN` contributes exactly one
// capturing group (all inner groups are non-capturing `(?:...)`). The `usize` constant declared
// right after a pattern is the 1-based index of that capturing group within `NEXT_TOKEN`, which
// is how the parse loop asks "which kind of token matched?". The order of the patterns in the
// `NEXT_TOKEN` join list must therefore stay in sync with these indices.
/// Any unicode whitespace except newline.
static ref WHITESPACE_PATTERN: &'static str = r#"([\s&&[^\n]]+)"#;
static ref WHITESPACE: usize = 1;
/// Match a newline (except newlines in multiline strings and block comments).
static ref NEWLINE_PATTERN: &'static str = r#"(\n)"#;
static ref NEWLINE: usize = 2;
/// Match two slashes before capturing the line comment. Additional slashes and leading spaces
/// are considered part of the content, so they will be accurately restored by the formatter.
static ref LINE_COMMENT_SLASHES_PATTERN: &'static str = r#"(//)"#;
static ref LINE_COMMENT_SLASHES: usize = 3;
/// Match the start of a block comment.
static ref OPEN_BLOCK_COMMENT_PATTERN: &'static str = r#"(/\*)"#;
static ref OPEN_BLOCK_COMMENT: usize = 4;
/// Any non-string primitive (Number, Boolean, 'null').
static ref NON_STRING_PRIMITIVE_PATTERN: &'static str =
r#"((?x) # ignore whitespace and allow '#' comments
# Capture null, true, or false (lowercase only, as in the ECMAScript keywords).
# End with a word boundary ('\b' marker) to ensure the pattern does not match if
# it is followed by a word ('\w') character; for example, 'nullify' is a valid
# identifier (depending on the context) and must not match the 'null' value.
(?:(?:null|true|false)\b)|
# Capture all number formats. Every variant is allowed an optional '-' or '+' prefix.
(?:[-+]?(?:
# All of the following variants end in a word character. Use '\b' to prevent
# matching numbers immediately followed by another word character, for example,
# 'NaNo', 'Infinity_', or '0xadef1234ghi'.
(?:(?:
NaN|
Infinity|
# hexadecimal notation
(?:0[xX][0-9a-fA-F]+)|
# decimal exponent notation
(?:(?:0|(?:[1-9][0-9]*))?\.[0-9]+[eE][+-]?[0-9]+)|
# integer exponent notation with optional trailing decimal point
(?:(?:0|(?:[1-9][0-9]*))\.?[eE][+-]?[0-9]+)|
# decimal notation
(?:(?:0|(?:[1-9][0-9]*))?\.[0-9]+)
)\b)|
# Capture integers, with an optional trailing decimal point.
# If the value ends in a digit (no trailing decimal point), apply `\b` to prevent
# matching integers immediatly followed by a word character (for example, 1200PDT).
# But if the integer has a trailing decimal, the '\b' does not apply. (Since '.' is
# not itself a '\w' word character, the '\b' would have the opposite affect,
# matching only if the next character is a word character, unless there is no next
# character.)
(?:
(?:0|(?:[1-9][0-9]*))(?:\.|\b)
)
))
)"#;
static ref NON_STRING_PRIMITIVE: usize = 5;
/// Property name without quotes.
static ref UNQUOTED_PROPERTY_NAME_PATTERN: &'static str = r#"[\$\w&&[^\d]][\$\w]*"#;
// Anchored at both ends: used to test whether an entire (already unquoted) string qualifies as
// an unquoted property name.
static ref UNQUOTED_PROPERTY_NAME_REGEX: Regex =
Regex::new(&exact_match(&UNQUOTED_PROPERTY_NAME_PATTERN)).unwrap();
// The unquoted property name (the only capturing group) followed by optional non-newline
// whitespace and the ':' separator, which are matched but not captured.
static ref UNQUOTED_PROPERTY_NAME_AND_COLON_PATTERN_STRING: String =
r#"(?:("#.to_owned() + *UNQUOTED_PROPERTY_NAME_PATTERN + r#")[\s&&[^\n]]*:)"#;
static ref UNQUOTED_PROPERTY_NAME_AND_COLON_PATTERN: &'static str =
&UNQUOTED_PROPERTY_NAME_AND_COLON_PATTERN_STRING;
static ref UNQUOTED_PROPERTY_NAME_AND_COLON: usize = 6;
/// Initial quote for a single or double quote string.
static ref OPEN_QUOTE_PATTERN: &'static str = r#"(["'])"#;
static ref OPEN_QUOTE: usize = 7;
/// An opening or closing curly brace or square brace.
static ref BRACE_PATTERN: &'static str = r#"([{}\[\]])"#;
static ref BRACE: usize = 8;
/// Match a comma, separating object properties and array items
static ref COMMA_PATTERN: &'static str = r#"(,)"#;
static ref COMMA: usize = 9;
/// Capture any of the above tokens. These regular expressions are designed for an exclusive
/// match, so only one of the tokens should match a valid JSON 5 document fragment, when
/// applied.
static ref NEXT_TOKEN: Regex = Regex::new(
&from_start(&(r#"(?:"#.to_owned()
+ &[*WHITESPACE_PATTERN,
*NEWLINE_PATTERN,
*LINE_COMMENT_SLASHES_PATTERN,
*OPEN_BLOCK_COMMENT_PATTERN,
*NON_STRING_PRIMITIVE_PATTERN,
*UNQUOTED_PROPERTY_NAME_AND_COLON_PATTERN,
*OPEN_QUOTE_PATTERN,
*BRACE_PATTERN,
*COMMA_PATTERN].join("|")
+ r#")"#))
).unwrap();
/// Capture the contents of a line comment.
static ref LINE_COMMENT: Regex = Regex::new(
&from_start(r#"([^\n]*)"#)
).unwrap();
/// Capture the contents of a block comment.
static ref BLOCK_COMMENT: Regex = Regex::new(
&from_start(r#"((?:.|\n)*?)\*/"#)
).unwrap();
/// Capture the string, without quotes.
static ref SINGLE_QUOTED: Regex = Regex::new(
&from_start(r#"((?:(?:\\\\)|(?:\\')|(?:\\\n)|(?:[^'\n]))*)(?:')"#)
).unwrap();
/// Capture the string, without quotes.
static ref DOUBLE_QUOTED: Regex = Regex::new(
&from_start(r#"((?:(?:\\\\)|(?:\\")|(?:\\\n)|(?:[^"\n]))*)(?:")"#)
).unwrap();
/// Quoted property names are captured using the same regex as quoted string primitives, and
/// unlike `UNQUOTED_PROPERTY_NAME_AND_COLON`, the property name separator (colon with optional
/// whitespace) is not automatically consumed. Use this regex to consume the separator after
/// encountering a quoted string in the property name position.
static ref COLON: Regex = Regex::new(
&from_start(r#"([\s&&[^\n]]*:)"#)
).unwrap();
}
/// Returns true if `strval` may be written as a property name without quotes: it must match
/// `UNQUOTED_PROPERTY_NAME_REGEX` in full and must not be one of the literals `true`, `false`,
/// or `null`.
fn matches_unquoted_property_name(strval: &str) -> bool {
    const KEYWORDS: &[&str] = &["true", "false", "null"];
    if KEYWORDS.iter().any(|keyword| *keyword == strval) {
        return false;
    }
    UNQUOTED_PROPERTY_NAME_REGEX.is_match(strval)
}
/// Pairs a static `Regex` with reusable capture-location storage, and remembers the text of the
/// most recent successful match so capture groups can be read back after the input has advanced.
struct Capturer {
    /// The regular expression applied by `capture()`.
    regex: &'static Regex,
    /// A copy of the input from its start through the end of the last match, or `None` if the
    /// last call to `capture()` did not match.
    overall_match: Option<String>,
    /// Capture group offsets filled in by the last call to `capture()`.
    locations: CaptureLocations,
}

impl Capturer {
    /// Creates a `Capturer` for `regex` with no match recorded yet.
    fn new(regex: &'static Regex) -> Self {
        let locations = regex.capture_locations();
        Self { regex, overall_match: None, locations }
    }

    /// Applies the regex to `text`. On success, the text up to the end of the match is saved for
    /// later calls to `overall_match()` and `captured()`; on failure the saved text is cleared.
    /// Returns the overall match, if any.
    fn capture<'a>(&mut self, text: &'a str) -> Option<Match<'a>> {
        let found = self.regex.captures_read(&mut self.locations, text);
        self.overall_match = found.as_ref().map(|matched| text[0..matched.end()].to_string());
        found
    }

    /// Returns the text saved by the last successful `capture()`, if any.
    fn overall_match(&self) -> Option<&str> {
        self.overall_match.as_deref()
    }

    /// Returns the text of capture group `i` from the last successful `capture()`, or `None` if
    /// there was no match or the group did not participate in it.
    fn captured(&self, i: usize) -> Option<&str> {
        let matched_text = self.overall_match.as_ref()?;
        let (start, end) = self.locations.get(i)?;
        Some(&matched_text[start..end])
    }
}
/// Everything needed to print the source line on which a parser error was caught (or a
/// contextually-relevant portion of it), together with an indicator line underneath that points
/// at the first character of the error and highlights one or more characters from there.
struct ParserErrorContext {
    /// The error line to be printed with a parser error.
    line: String,

    /// Zero-based index of the first character of the error within `line`.
    indicator_start: usize,

    /// How many characters to highlight, counting the character at `indicator_start`
    /// (always at least 1 when built through `new()`).
    indicator_len: usize,
}

impl ParserErrorContext {
    /// Builds a context for `line`.
    ///
    /// # Panics
    /// Panics if `indicator_len` is zero.
    fn new(line: String, indicator_start: usize, indicator_len: usize) -> Self {
        assert!(indicator_len >= 1);
        Self { line, indicator_start, indicator_len }
    }

    /// The line of text to print.
    fn line(&self) -> &str {
        self.line.as_str()
    }

    /// The indicator to print under `line()`: `indicator_start` spaces, a caret (`^`), then one
    /// tilde (`~`) for every highlighted character after the first.
    fn indicator(&self) -> String {
        let padding = " ".repeat(self.indicator_start);
        let tail = "~".repeat(self.indicator_len.saturating_sub(1));
        format!("{}^{}", padding, tail)
    }
}
/// Parser state for a single JSON5 document: the unconsumed input, the current and upcoming
/// source positions (used when reporting errors), and the stack of containers being filled.
pub(crate) struct Parser<'parser> {
/// The remaining text in the input buffer since the last capture.
remaining: &'parser str,
/// The input filename, if any.
filename: &'parser Option<String>,
/// The text of the current line being parsed.
current_line: &'parser str,
/// The text of the next line to be parsed after parsing the last capture.
next_line: &'parser str,
/// The current line number (from 1) while parsing the input buffer.
line_number: usize,
/// The current column number (from 1) while parsing the input buffer.
column_number: usize,
/// The line number of the next token to be parsed.
next_line_number: usize,
/// The column number of the next token to be parsed.
next_column_number: usize,
/// The top of the stack is the current Object or Array whose content is being parsed.
/// The next item in the stack is the Object or Array that contains the current one,
/// and so on.
scope_stack: Vec<Rc<RefCell<Value>>>,
/// To avoid accidentally overflowing the program stack, limit the number of
/// nested scopes and generate an error if it is exceeded.
nesting_limit: usize,
/// Captures a colon token when expected.
colon_capturer: Capturer,
}
impl<'parser> Parser<'parser> {
/// The default limit of nested scopes when parsing a JSON5 document. A new `Parser` starts
/// with this limit; it can be changed with `set_nesting_limit()`.
pub const DEFAULT_NESTING_LIMIT: usize = 1000;
/// Creates a parser with an empty input buffer, positioned at line 1, column 1, with no open
/// scopes and the default nesting limit. `filename`, if any, is attached to reported locations.
pub fn new(filename: &'parser Option<String>) -> Self {
    let empty: &'parser str = "";
    Self {
        remaining: empty,
        filename,
        current_line: empty,
        next_line: empty,
        line_number: 1,
        column_number: 1,
        next_line_number: 1,
        next_column_number: 1,
        scope_stack: Vec::new(),
        nesting_limit: Self::DEFAULT_NESTING_LIMIT,
        colon_capturer: Capturer::new(&COLON),
    }
}
/// To avoid accidentally overflowing the program stack, there is a mutable
/// limit on the number of nested scopes allowed. If this limit is exceeded
/// while parsing a document, a parser error is generated.
///
/// # Arguments
/// * `new_limit` - the maximum number of nested scopes to allow from now on
pub fn set_nesting_limit(&mut self, new_limit: usize) {
self.nesting_limit = new_limit;
}
/// Returns a new shared handle to the innermost container (the top of the scope stack).
///
/// # Panics
/// Panics if the scope stack is empty.
fn current_scope(&self) -> Rc<RefCell<Value>> {
    assert!(!self.scope_stack.is_empty());
    let innermost = self.scope_stack.last().unwrap();
    Rc::clone(innermost)
}
fn with_container<F, T>(&self, f: F) -> Result<T, Error>
where
F: FnOnce(&mut dyn Container) -> Result<T, Error>,
{
match &mut *self.current_scope().borrow_mut() {
Value::Array { val, .. } => f(val),
Value::Object { val, .. } => f(val),
unexpected => Err(Error::internal(
self.location(),
format!(
"Current scope should be an Array or Object, but scope was {:?}",
unexpected
),
)),
}
}
/// Runs `f` against the current scope, which must be an Array. If the current scope is not an
/// Array, returns a parse error suggesting mismatched braces.
fn with_array<F, T>(&self, f: F) -> Result<T, Error>
where
    F: FnOnce(&mut Array) -> Result<T, Error>,
{
    let scope = self.current_scope();
    let mut scope_value = scope.borrow_mut();
    match &mut *scope_value {
        Value::Array { val, .. } => f(val),
        other => {
            let message = format!(
                "Invalid Array token found while parsing an {:?} (mismatched braces?)",
                other
            );
            Err(self.error(message))
        }
    }
}
/// Runs `f` against the current scope, which must be an Object. If the current scope is not an
/// Object, returns a parse error suggesting mismatched braces.
fn with_object<F, T>(&self, f: F) -> Result<T, Error>
where
    F: FnOnce(&mut Object) -> Result<T, Error>,
{
    let scope = self.current_scope();
    let mut scope_value = scope.borrow_mut();
    match &mut *scope_value {
        Value::Object { val, .. } => f(val),
        other => {
            let message = format!(
                "Invalid Object token found while parsing an {:?} (mismatched braces?)",
                other
            );
            Err(self.error(message))
        }
    }
}
/// Returns true if the current scope is an Array.
fn is_in_array(&self) -> bool {
    self.current_scope().borrow().is_array()
}
/// Returns true if the current scope is not an Array (and is therefore treated as an Object).
fn is_in_object(&self) -> bool {
    let in_array = self.is_in_array();
    !in_array
}
/// Adds `value` to the current container. If the value is itself an Object or Array, it becomes
/// the new current scope; exceeding the nesting limit at that point is reported as a parse
/// error.
fn add_value(&mut self, value: Value) -> Result<(), Error> {
    let opens_scope = value.is_object() || value.is_array();
    let shared_value = Rc::new(RefCell::new(value));
    self.with_container(|container| container.add_value(Rc::clone(&shared_value), self))?;
    if !opens_scope {
        return Ok(());
    }
    self.scope_stack.push(shared_value);
    if self.scope_stack.len() <= self.nesting_limit {
        return Ok(());
    }
    Err(self.error(format!(
        "The given JSON5 document exceeds the parser's nesting limit of {}",
        self.nesting_limit
    )))
}
/// Tells the current container that a newline token was parsed.
fn on_newline(&mut self) -> Result<(), Error> {
    self.with_container(|current| current.on_newline())
}
/// Adds a standalone line comment to the current container, or adds an end-of-line comment
/// to the current container's current value.
///
/// # Arguments
/// * `captured`: the line comment content (including leading spaces)
/// * `pending_new_line_comment_block` - If true and the comment is not an
/// end-of-line comment, the container should insert a line_comment_break before inserting
/// the next line comment. This should only be true if this standalone line comment was
/// preceded by one or more standalone line comments and one or more blank lines.
///
/// # Returns
/// true if the line comment is standalone, that is, not an end-of-line comment
fn add_line_comment(
&self,
captured: Option<&str>,
pending_new_line_comment_block: bool,
) -> Result<bool, Error> {
match captured {
Some(content) => {
let content = content.trim_end();
self.with_container(|container| {
container.add_line_comment(
content,
self.column_number,
pending_new_line_comment_block,
)
})
}
None => Err(Error::internal(
self.location(),
"Line comment regex should support empty line comment",
)),
}
}
/// Adds a block comment to the current container.
///
/// If every non-blank line after the first is indented at least as far as the expected
/// alignment column, that indent is stripped and the comment is flagged `align: true` so it can
/// be re-indented when formatting; blank lines become empty strings. Otherwise the lines are
/// stored verbatim with `align: false`.
///
/// # Arguments
/// * `captured` - the text between "/*" and "*/"; `None` (no closing "*/" was found) is
///   reported as a parse error.
fn add_block_comment(&self, captured: Option<&str>) -> Result<(), Error> {
    let content = match captured {
        Some(text) => text,
        None => return Err(self.error("Block comment started without closing \"*/\"")),
    };

    // `indent_count` subtracts 2 characters for the "/*" prefix on the first line of the block
    // comment, and 2 spaces on subsequent lines, assuming the line content is meant to be
    // vertically aligned.
    let indent_count = self.column_number - 3;
    let indent = " ".repeat(indent_count);

    let has_underindented_line = content
        .lines()
        .skip(1)
        .any(|line| !line.trim().is_empty() && !line.starts_with(&indent));

    let lines: Vec<String> = if has_underindented_line {
        content.lines().map(str::to_owned).collect()
    } else {
        // All block comment lines are indented at least beyond the "/*", so strip the indent
        // here; it is restored when formatting.
        content
            .lines()
            .enumerate()
            .map(|(index, line)| {
                let kept = if index == 0 {
                    line
                } else if line.trim().is_empty() {
                    ""
                } else {
                    &line[indent_count..]
                };
                kept.to_string()
            })
            .collect()
    };

    let align = !has_underindented_line;
    self.with_container(|container| container.add_block_comment(Comment::Block { lines, align }))
}
/// Takes the comments the current container is holding as pending, so they can be attached to
/// the value about to be created.
fn take_pending_comments(&mut self) -> Result<Vec<Comment>, Error> {
    self.with_container(|current| {
        let pending = current.take_pending_comments();
        Ok(pending)
    })
}
/// Records that the given property name was parsed. Once its value is also parsed, the
/// property will be added to the current `Object`.
///
/// # Arguments
/// * `name` - the property name, possibly quoted
fn set_pending_property(&self, name: &str) -> Result<(), Error> {
    let owned_name = name.to_string();
    self.with_object(|object| object.set_pending_property(owned_name, self))
}
/// Adds a primitive string value or a quoted property name, depending on the current context.
///
/// A quoted string found inside an Object that has no pending property is a property name; it
/// must be followed by the ":" separator, which is consumed here. Property names that meet the
/// requirements for unquoted property names have their unnecessary quotes removed; otherwise
/// the original quotes are retained, since the content of the string may depend on the type of
/// quote. For instance:
///
/// ```json
/// 'JSON string "with double quotes" wrapped in single quotes'
/// ```
///
/// As long as the single quotes are restored as-is (and not replaced with double-quotes) the
/// formatter can restore the original representation of the string without additional (and
/// perhaps less-readable) escaping of internal quotes.
///
/// In any other context the string is added, with its original quotes, as a primitive value.
///
/// # Arguments
/// * `quote` - the quote character that opened the string
/// * `captured` - the string content without quotes; `None` is reported as an unclosed string
fn add_quoted_string(&mut self, quote: &str, captured: Option<&str>) -> Result<(), Error> {
    let unquoted = match captured {
        Some(text) => text,
        None => return Err(self.error("Unclosed string")),
    };

    let is_property_name =
        self.is_in_object() && !self.with_object(|object| object.has_pending_property())?;

    if !is_property_name {
        let comments = self.take_pending_comments()?;
        let quoted = format!("{}{}{}", quote, &unquoted, quote);
        return self.add_value(Value::new_primitive(quoted, comments));
    }

    let separator = self.colon_capturer.capture(self.remaining);
    if !self.consume_if_matched(separator) {
        return Err(self.error("Property name separator (:) missing"));
    }

    if matches_unquoted_property_name(unquoted) {
        self.set_pending_property(unquoted)
    } else {
        let quoted_name = format!("{}{}{}", quote, &unquoted, quote);
        self.set_pending_property(&quoted_name)
    }
}
/// Adds a non-string primitive (as captured by the tokenizer) to the current container,
/// attaching any pending comments to it.
fn add_non_string_primitive(&mut self, non_string_primitive: &str) -> Result<(), Error> {
    let comments = self.take_pending_comments()?;
    let primitive = Value::new_primitive(non_string_primitive.to_string(), comments);
    self.add_value(primitive)
}
fn on_brace(&mut self, brace: &str) -> Result<(), Error> {
match brace {
"{" => self.open_object(),
"}" => self.close_object(),
"[" => self.open_array(),
"]" => self.close_array(),
unexpected => Err(Error::internal(
self.location(),
format!("regex returned unexpected brace string: {}", unexpected),
)),
}
}
/// Starts a new Object in the current container, attaching any pending comments to it.
fn open_object(&mut self) -> Result<(), Error> {
    let comments = self.take_pending_comments()?;
    let object = Value::new_object(comments);
    self.add_value(object)
}
/// Leaves the current scope by popping it from the scope stack. If that empties the stack, the
/// closing brace had no matching opening brace, which is reported as a parse error.
fn exit_scope(&mut self) -> Result<(), Error> {
    self.scope_stack.pop();
    if !self.scope_stack.is_empty() {
        return Ok(());
    }
    Err(self.error("Closing brace without a matching opening brace"))
}
/// Closes the current Object and returns to the enclosing scope. Fails if the current scope is
/// not an Object.
fn close_object(&mut self) -> Result<(), Error> {
    self.with_object(|current| current.close(self))?;
    self.exit_scope()
}
/// Starts a new Array in the current container, attaching any pending comments to it.
fn open_array(&mut self) -> Result<(), Error> {
    let comments = self.take_pending_comments()?;
    let array = Value::new_array(comments);
    self.add_value(array)
}
/// Closes the current Array and returns to the enclosing scope. Fails if the current scope is
/// not an Array.
fn close_array(&mut self) -> Result<(), Error> {
    self.with_array(|current| current.close(self))?;
    self.exit_scope()
}
/// Tells the current container that the value being parsed has ended; the parse loop calls
/// this when it captures a comma.
fn end_value(&self) -> Result<(), Error> {
    self.with_container(|current| current.end_value(self))
}
/// Returns the parser's current location: the input filename (if any) with the current line
/// and column numbers.
pub fn location(&self) -> Option<Location> {
    let filename = self.filename.clone();
    let here = Location::new(filename, self.line_number, self.column_number);
    Some(here)
}
pub fn error(&self, err: impl std::fmt::Display) -> Error {
const MAX_ERROR_LINE_LEN: usize = 200;
const MIN_CONTEXT_LEN: usize = 10;
const ELLIPSIS: &str = "\u{2026}";
let error_context = self.get_error_context(MAX_ERROR_LINE_LEN, MIN_CONTEXT_LEN, ELLIPSIS);
Error::parse(
self.location(),
format!("{}:\n{}\n{}", err, error_context.line(), error_context.indicator()),
)
}
/// Moves the "current" position up to the start of the next token and, if `matched` is a
/// match, removes the matched text from the remaining input and advances the "next" position
/// past it.
///
/// The current line/column are updated whether or not there was a match, so an error raised
/// for an unmatched token points at the right place.
///
/// # Returns
/// true if `matched` was `Some` (and was consumed), false otherwise
fn consume_if_matched(&mut self, matched: Option<Match<'_>>) -> bool {
    self.column_number = self.next_column_number;
    if self.next_line_number > self.line_number {
        self.line_number = self.next_line_number;
        self.current_line = self.next_line;
    }

    let token = match matched {
        Some(token) => token,
        None => return false,
    };

    let from_token_start = &self.remaining[token.start()..];
    self.remaining = &self.remaining[token.end()..];

    // If the token contains newlines, advance `next_line` and the next line/column numbers, for
    // printing the location of the next syntax element (in error messages, for example). The
    // line iterator is only created if a newline is actually encountered.
    let mut lines_after_first = None;
    for ch in token.as_str().chars() {
        if ch != '\n' {
            self.next_column_number += 1;
            continue;
        }
        let upcoming_lines =
            lines_after_first.get_or_insert_with(|| from_token_start.lines().skip(1));
        self.next_line = upcoming_lines.next().unwrap_or(self.current_line);
        self.next_line_number += 1;
        self.next_column_number = 1;
    }
    true
}
/// Applies `capturer` to the remaining input and consumes the match, if any. Returns true if
/// the capturer matched.
fn capture(&mut self, capturer: &mut Capturer) -> bool {
    let matched = capturer.capture(self.remaining);
    self.consume_if_matched(matched)
}
/// Applies `capturer` to the remaining input and consumes the match. Returns the text of the
/// capturer's first capture group, or `None` if the capturer did not match.
fn consume<'a>(&mut self, capturer: &'a mut Capturer) -> Option<&'a str> {
    if !self.capture(capturer) {
        return None;
    }
    capturer.captured(1)
}
/// Parse the given document string as a JSON5 document containing Array
/// elements (with implicit outer braces). Document locations (used in, for
/// example, error messages) are 1-based and start at line 1, column 1.
pub fn parse(&mut self, buffer: &'parser str) -> Result<Array, Error> {
    const FIRST_LINE: usize = 1;
    const FIRST_COLUMN: usize = 1;
    self.parse_from_location(buffer, FIRST_LINE, FIRST_COLUMN)
}
/// Parse the given document string as a JSON5 document containing Array
/// elements (with implicit outer braces), and use the given 1-based line
/// and column numbers when referring to document locations.
///
/// # Arguments
/// * `buffer` - the document text to parse
/// * `starting_line_number` - the line number to report for the first line of `buffer`
/// * `starting_column_number` - the column number to report for the first character of `buffer`
///
/// # Returns
/// The implicit outer Array holding the document's content, or the first error encountered.
///
/// # Panics
/// Panics if `starting_line_number` is zero.
pub fn parse_from_location(
    &mut self,
    buffer: &'parser str,
    starting_line_number: usize,
    starting_column_number: usize,
) -> Result<Array, Error> {
    self.remaining = buffer;
    // `current_line` is what error messages print as "the line containing the error", so it
    // must hold only the first line of the buffer, not the whole document. (Lines after the
    // first are already tracked one at a time as newlines are consumed.)
    self.current_line = buffer.lines().next().unwrap_or("");

    assert!(starting_line_number > 0, "document line numbers are 1-based");
    self.next_line_number = starting_line_number;
    self.next_column_number = starting_column_number;

    // The "current" position starts just before the "next" position; the first consumed token
    // moves it forward onto the starting line and column.
    self.next_line = self.current_line;
    self.line_number = self.next_line_number - 1;
    self.column_number = self.next_column_number - 1;

    // The document's implicit outer Array is the root scope.
    self.scope_stack = vec![Rc::new(RefCell::new(Value::new_array(vec![])))];

    let mut next_token = Capturer::new(&NEXT_TOKEN);
    let mut single_quoted = Capturer::new(&SINGLE_QUOTED);
    let mut double_quoted = Capturer::new(&DOUBLE_QUOTED);
    let mut line_comment = Capturer::new(&LINE_COMMENT);
    let mut block_comment = Capturer::new(&BLOCK_COMMENT);

    // Blocks of adjacent line comments should be kept together as a "line comment block", but
    // adjacent line comment blocks separated by one or more newlines must be maintained as
    // separate blocks.
    //
    // These booleans, along with `reset_line_comment_break_check`, update state information as
    // line comments and blank lines are parsed.
    let mut just_captured_line_comment = false;
    let mut pending_blank_line = false;
    let mut pending_new_line_comment_block = false;

    while !self.remaining.is_empty() {
        // See comment above regarding "line comment blocks". Any token other than whitespace,
        // a newline, or a line comment resets the line comment block tracking.
        let mut reset_line_comment_break_check = true;

        let token_result = if self.capture(&mut next_token) {
            // Since this has to be done as an if-else-if-... check the most common
            // occurrences first.
            if next_token.captured(*WHITESPACE).is_some() {
                reset_line_comment_break_check = false;
                Ok(()) // ignore all whitespace not in a string or comment
            } else if next_token.captured(*NEWLINE).is_some() {
                reset_line_comment_break_check = false;
                if just_captured_line_comment {
                    if pending_blank_line {
                        pending_new_line_comment_block = true;
                        pending_blank_line = false;
                    } else if !pending_new_line_comment_block {
                        pending_blank_line = true;
                    }
                }
                self.on_newline()
            } else if next_token.captured(*COMMA).is_some() {
                self.end_value()
            } else if let Some(brace) = next_token.captured(*BRACE) {
                self.on_brace(brace)
            } else if let Some(non_string_primitive) =
                next_token.captured(*NON_STRING_PRIMITIVE)
            {
                self.add_non_string_primitive(non_string_primitive)
            } else if let Some(quote) = next_token.captured(*OPEN_QUOTE) {
                // The opening quote selects which regex consumes the rest of the string.
                let quoted_string = if quote == "'" {
                    self.consume(&mut single_quoted)
                } else {
                    self.consume(&mut double_quoted)
                };
                self.add_quoted_string(quote, quoted_string)
            } else if let Some(unquoted_property_name) =
                next_token.captured(*UNQUOTED_PROPERTY_NAME_AND_COLON)
            {
                self.set_pending_property(unquoted_property_name)
            } else if let Some(_line_comment_start) = next_token.captured(*LINE_COMMENT_SLASHES)
            {
                reset_line_comment_break_check = false;
                pending_blank_line = false;
                let comment_text = self.consume(&mut line_comment);
                if self.add_line_comment(comment_text, pending_new_line_comment_block)? {
                    // standalone line comment
                    just_captured_line_comment = true;
                    pending_new_line_comment_block = false;
                } // else it was an end-of-line comment
                Ok(())
            } else if let Some(_block_comment_start) = next_token.captured(*OPEN_BLOCK_COMMENT)
            {
                let comment_text = self.consume(&mut block_comment);
                self.add_block_comment(comment_text)
            } else {
                Err(Error::internal(
                    self.location(),
                    format!(
                        "NEXT_TOKEN matched an unexpected capture group: {}",
                        next_token.overall_match().unwrap_or("")
                    ),
                ))
            }
        } else {
            Err(self.error("Unexpected token"))
        };
        token_result?;

        if reset_line_comment_break_check {
            just_captured_line_comment = false;
            pending_blank_line = false;
            pending_new_line_comment_block = false;
        }
    }

    self.remaining = "";
    self.close_document()?;

    // Only the root scope should be left, and nothing else should still hold a reference to it.
    match Rc::try_unwrap(self.scope_stack.pop().unwrap())
        .map_err(|_| Error::internal(None, "Rc<> for document array could not be unwrapped."))?
        .into_inner()
    {
        Value::Array { val, .. } => Ok(val),
        unexpected => Err(Error::internal(
            self.location(),
            format!("Final scope should be an Array, but scope was {:?}", unexpected),
        )),
    }
}
/// Checks, at the end of the input, that only the document's root scope is still open. Any
/// other number of open scopes is reported as mismatched braces.
fn close_document(&mut self) -> Result<(), Error> {
    if self.scope_stack.len() != 1 {
        return Err(self.error("Mismatched braces in the document"));
    }
    Ok(())
}
/// Returns the current line and an indicator line for it: spaces, followed by a caret (`^`)
/// that points at the current column, followed by tildes (`~`) for the rest of the error
/// token.
///
/// The highlighted length is the distance from the current column to the next column when
/// both are on the same line (clipped to the end of the line, and never less than 1);
/// otherwise a single character is highlighted.
///
/// If the line is longer than `max_error_line_len` characters, the line is trimmed and the
/// indicator positions are adjusted.
fn get_error_context(
    &self,
    max_error_line_len: usize,
    min_context_len: usize,
    ellipsis: &str,
) -> ParserErrorContext {
    let line_char_count = self.current_line.chars().count();

    // `indicator_start` is a 0-based char position
    let indicator_start = (self.column_number - 1).min(line_char_count);

    let indicator_len = if self.line_number != self.next_line_number {
        1
    } else {
        let token_len = self.next_column_number - self.column_number;
        let chars_to_line_end = line_char_count - indicator_start;
        token_len.min(chars_to_line_end).max(1)
    };

    if line_char_count > max_error_line_len {
        return trim_error_line_and_indicator(
            self.current_line,
            indicator_start,
            indicator_len,
            line_char_count,
            max_error_line_len,
            min_context_len,
            ellipsis,
        );
    }
    ParserErrorContext::new(self.current_line.to_owned(), indicator_start, indicator_len)
}
}
/// A half-open range expressed in character (not byte) positions of some string.
struct CharRange {
    range: std::ops::Range<usize>,
}

impl CharRange {
    /// Wraps a range of character positions.
    fn new(range: std::ops::Range<usize>) -> Self {
        Self { range }
    }

    /// Converts the character range into the equivalent byte range of `from_string`.
    ///
    /// A position equal to the number of characters in the string maps to the string's byte
    /// length. Returns `None` if either end of the range lies beyond that.
    fn into_byte_range(self, from_string: &str) -> Option<std::ops::Range<usize>> {
        // Byte offset at which the char with the given index starts; the index one past the
        // last char maps to the end of the string.
        let byte_offset_of = |char_index: usize| -> Option<usize> {
            from_string
                .char_indices()
                .map(|(byte_pos, _char)| byte_pos)
                .chain(std::iter::once(from_string.len()))
                .nth(char_index)
        };
        let start_byte = byte_offset_of(self.range.start)?;
        let end_byte = byte_offset_of(self.range.end)?;
        Some(start_byte..end_byte)
    }
}
/// Shortens an over-long error line to roughly `max_error_line_len` characters, keeping the
/// highlighted error visible, and returns the shortened line with the indicator position
/// adjusted to match.
///
/// One of three windows is chosen:
/// * the error (plus its minimum right context) fits within the head of the line: keep the
///   head and append `ellipsis`;
/// * the error (plus its minimum left context) fits within the tail of the line: prepend
///   `ellipsis` and keep the tail;
/// * otherwise: keep a window around the error, with `ellipsis` on each truncated side.
///
/// All positions and lengths are in characters, not bytes.
///
/// # Arguments
/// * `error_line` - the full line of text containing the error
/// * `indicator_start` - 0-based position of the first character of the error
/// * `indicator_len` - number of characters to highlight (capped at `max_error_line_len`)
/// * `error_line_len` - number of characters in `error_line`
/// * `max_error_line_len` - target maximum length of the returned line
/// * `min_context_len` - minimum number of characters to show on either side of the error
///   start, where available
/// * `ellipsis` - the marker inserted where text was removed
///
/// # Panics
/// Panics if `max_error_line_len` is not greater than the ellipsis length, if the line is not
/// actually longer than `max_error_line_len`, or if the indicator lies outside the line.
fn trim_error_line_and_indicator(
    error_line: &str,
    indicator_start: usize,
    mut indicator_len: usize,
    error_line_len: usize,
    max_error_line_len: usize,
    min_context_len: usize,
    ellipsis: &str,
) -> ParserErrorContext {
    let ellipsis_len = ellipsis.chars().count();

    assert!(max_error_line_len > ellipsis_len);
    assert!(max_error_line_len < error_line_len);
    assert!(
        indicator_start <= error_line_len,
        "Error because indicator_start={} > error_line_len={}\n{}",
        indicator_start,
        error_line_len,
        error_line
    );
    assert!(
        indicator_len == 1 || (indicator_start + indicator_len) <= error_line_len,
        "Error because indicator_start={}, indicator_len={}, error_line_len={}\n{}",
        indicator_start,
        indicator_len,
        error_line_len,
        error_line
    );

    indicator_len = std::cmp::min(indicator_len, max_error_line_len);

    // Case 1: the error and its right-hand context end early enough that the head of the line
    // can be shown unmodified, with only the tail replaced by an ellipsis.
    let min_right_context_len = std::cmp::max(min_context_len, indicator_len);
    let context_end =
        std::cmp::min(indicator_start + min_right_context_len, error_line_len - ellipsis_len);
    if context_end < max_error_line_len - ellipsis_len {
        let slice_bytes = CharRange::new(0..(max_error_line_len - ellipsis_len))
            .into_byte_range(error_line)
            .expect("char indices should map to String bytes");
        return ParserErrorContext::new(
            error_line[slice_bytes].to_string() + ellipsis,
            indicator_start,
            indicator_len,
        );
    }

    // Case 2: the error and its left-hand context start late enough that the tail of the line
    // can be shown unmodified, with only the head replaced by an ellipsis.
    let context_start = indicator_start - std::cmp::min(indicator_start, min_context_len);
    if error_line_len - context_start < max_error_line_len - ellipsis_len {
        let start_char = error_line_len - (max_error_line_len - ellipsis_len);
        let slice_bytes = CharRange::new(start_char..error_line_len)
            .into_byte_range(error_line)
            .expect("char indices should map to String bytes");
        return ParserErrorContext::new(
            ellipsis.to_owned() + &error_line[slice_bytes],
            (indicator_start + ellipsis_len) - start_char,
            indicator_len,
        );
    }

    // Case 3: show a window around the error. The characters left over after the indicator
    // and two ellipses are split between a right margin (at most half) and a left margin.
    let margin_chars =
        max_error_line_len - std::cmp::min(max_error_line_len, (ellipsis_len * 2) + indicator_len);
    let right_margin = std::cmp::min(
        error_line_len - std::cmp::min(error_line_len, indicator_start + indicator_len),
        margin_chars / 2,
    );
    let left_margin = margin_chars - right_margin;

    // The left margin can be wider than the text available before the error (when
    // `min_context_len` is large relative to `max_error_line_len`); in that case the window
    // simply starts at the beginning of the line instead of underflowing.
    let mut start_char = indicator_start.saturating_sub(left_margin);
    let mut end_char =
        std::cmp::min(indicator_start + indicator_len + right_margin, error_line_len);

    let mut start_ellipsis = ellipsis;
    let mut end_ellipsis = ellipsis;
    if start_char == 0 {
        // Nothing was cut from the start; spend the unused ellipsis on more right-hand text.
        start_ellipsis = "";
        end_char += ellipsis_len;
    } else if end_char == error_line_len {
        // Nothing was cut from the end; spend the unused ellipsis on more left-hand text.
        end_ellipsis = "";
        start_char -= ellipsis_len;
    }

    let slice_bytes = CharRange::new(start_char..end_char)
        .into_byte_range(error_line)
        .expect("char indices should map to String bytes");

    // The indicator is shifted right by whatever was actually prepended to the window, which is
    // nothing when the window starts at the beginning of the line.
    let start_ellipsis_len = start_ellipsis.chars().count();
    ParserErrorContext::new(
        start_ellipsis.to_owned() + &error_line[slice_bytes] + end_ellipsis,
        (indicator_start + start_ellipsis_len) - start_char,
        indicator_len,
    )
}
#[cfg(test)]
mod tests {
use {super::*, crate::test_error, proptest::prelude::*};
fn gen_error_line_test(
error_line: &str,
pattern: &str,
max_error_line_len: usize,
min_context_len: usize,
ellipsis: &str,
expected_errorline: &str,
expected_indicator: &str,
) -> Result<(), String> {
let some_newline = pattern.find('\n');
let pattern_line1 =
if let Some(newline) = some_newline { &pattern[0..newline] } else { pattern };
assert!(pattern_line1.len() > 0);
let indicator_start = error_line.find(pattern_line1).expect("pattern not found in line");
let end = indicator_start + pattern.len();
let indicator_len = end - indicator_start;
let error_context = if error_line.chars().count() <= max_error_line_len {
ParserErrorContext::new(error_line.to_owned(), indicator_start, indicator_len)
} else {
trim_error_line_and_indicator(
error_line,
indicator_start,
indicator_len,
error_line.chars().count(),
max_error_line_len,
min_context_len,
ellipsis,
)
};
let actual_errorline = error_context.line();
let actual_indicator = error_context.indicator();
let mut errors = String::new();
if expected_errorline != actual_errorline {
println!(
r#"
expected_errorline: >>>{}<<< (charlen={})
actual_errorline: >>>{}<<< (charlen={} of {}, min context len={})"#,
expected_errorline,
expected_errorline.chars().count(),
actual_errorline,
actual_errorline.chars().count(),
max_error_line_len,
min_context_len,
);
errors.push_str("actual errorline does not match expected");
} else if expected_indicator != actual_indicator {
println!(
r#"
{}"#,
actual_errorline,
);
}
if expected_indicator != actual_indicator {
if !errors.is_empty() {
errors.push_str(" and ");
}
println!(
r#"