Restructure the parser around the concept of token gluing. (#4081)
DelSkayn committed Jun 12, 2024
1 parent 2184e80 commit 3539eac
Showing 45 changed files with 3,105 additions and 2,753 deletions.
3 changes: 2 additions & 1 deletion core/src/sql/regex.rs
@@ -95,7 +95,8 @@ impl Debug for Regex {

impl Display for Regex {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
write!(f, "/{}/", &self.0)
let t = self.0.to_string().replace('/', "\\/");
write!(f, "/{}/", &t)
}
}

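The change above escapes any `/` inside the pattern before wrapping it in `/.../`, so the printed form can be re-lexed as a single regex token. A minimal standalone sketch of the same idea, using a hypothetical helper rather than the crate's Display impl:

// Hypothetical helper mirroring the Display change above (not SurrealDB code).
fn format_regex_literal(pattern: &str) -> String {
    // Escape the delimiter so the output survives a round trip through the lexer.
    let escaped = pattern.replace('/', "\\/");
    format!("/{}/", escaped)
}

fn main() {
    // A pattern containing the delimiter still prints as one /.../ literal.
    assert_eq!(format_regex_literal("a/b"), "/a\\/b/");
}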
1 change: 1 addition & 0 deletions core/src/sql/uuid.rs
@@ -88,6 +88,7 @@ impl Uuid {

impl Display for Uuid {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
write!(f, "u")?;
Display::fmt(&quote_str(&self.0.to_string()), f)
}
}
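With the added `u` prefix, a UUID now prints as `u"..."`, matching the `u"` and `u'` tokens the lexer emits further down in this diff. A small sketch of the formatting idea, with a stand-in type instead of the crate's Uuid and plain quoting instead of `quote_str`:

use std::fmt::{self, Display, Formatter};

// Hypothetical stand-in for the crate's Uuid wrapper (not SurrealDB code).
struct Uuid(String);

impl Display for Uuid {
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        // Prefix with `u` and quote the value, mirroring the change above.
        write!(f, "u\"{}\"", self.0)
    }
}

fn main() {
    let id = Uuid("00000000-0000-0000-0000-000000000000".into());
    assert_eq!(id.to_string(), "u\"00000000-0000-0000-0000-000000000000\"");
}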
31 changes: 31 additions & 0 deletions core/src/syn/common.rs
@@ -49,6 +49,21 @@ impl Location {

pub fn of_offset(source: &str, offset: usize) -> Self {
assert!(offset <= source.len(), "tried to find location of substring in unrelated string");

if offset == source.len() {
// Eof character

let (last_line, column) = LineIterator::new(source)
.enumerate()
.last()
.map(|(idx, (l, _))| (idx, l.len()))
.unwrap_or((0, 0));
return Self {
line: last_line + 1,
column: column + 1,
};
}

// Bytes of input prior to line being iterated.
let mut bytes_prior = 0;
for (line_idx, (line, seperator_len)) in LineIterator::new(source).enumerate() {
@@ -109,6 +124,22 @@ impl Location {
let offset = span.offset as usize;
let end = offset + span.len as usize;

if span.len == 0 && source.len() == span.offset as usize {
// EOF span
let (last_line, column) = LineIterator::new(source)
.enumerate()
.last()
.map(|(idx, (l, _))| (idx, l.len()))
.unwrap_or((0, 0));
return Self {
line: last_line + 1,
column,
}..Self {
line: last_line + 1,
column: column + 1,
};
}

// Bytes of input prior to line being iterated.
let mut bytes_prior = 0;
let mut iterator = LineIterator::new(source).enumerate().peekable();
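Both hunks above add an early return for the end-of-file position: when the offset (or a zero-length span) sits exactly at `source.len()`, the location is reported one column past the end of the last line instead of falling through the line loop. A self-contained sketch of that logic, assuming plain `\n` separators only (the crate's `LineIterator` also tracks the separator length):

// Standalone sketch of the EOF-location handling above (not SurrealDB code).
fn location_of_offset(source: &str, offset: usize) -> (usize, usize) {
    assert!(offset <= source.len(), "offset outside of source");
    if offset == source.len() {
        // EOF: report the position just past the end of the last line.
        let (last_line_idx, last_len) = source
            .lines()
            .enumerate()
            .last()
            .map(|(idx, line)| (idx, line.len()))
            .unwrap_or((0, 0));
        return (last_line_idx + 1, last_len + 1); // 1-based line and column
    }
    // Otherwise walk the lines until the one containing `offset`.
    let mut bytes_prior = 0;
    for (idx, line) in source.lines().enumerate() {
        let line_end = bytes_prior + line.len() + 1; // assumes '\n' separators
        if offset < line_end {
            return (idx + 1, offset - bytes_prior + 1);
        }
        bytes_prior = line_end;
    }
    unreachable!("offset checked against source.len() above")
}

fn main() {
    assert_eq!(location_of_offset("a\nbc", 4), (2, 3)); // EOF just past "bc"
}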
216 changes: 157 additions & 59 deletions core/src/syn/lexer/byte.rs
@@ -3,7 +3,7 @@ use crate::syn::{
unicode::{byte, chars},
Error, Lexer,
},
token::{t, Token, TokenKind},
token::{t, DatetimeChars, Token, TokenKind},
};

impl<'a> Lexer<'a> {
@@ -41,8 +41,6 @@ impl<'a> Lexer<'a> {
_ => {}
}
}
self.set_whitespace_span(self.current_span());
self.skip_offset();
}

/// Eats a multi line comment and returns an error if `*/` would be missing.
@@ -57,8 +55,6 @@
};
if b'/' == byte {
self.reader.next();
self.set_whitespace_span(self.current_span());
self.skip_offset();
return Ok(());
}
}
@@ -100,53 +96,35 @@ impl<'a> Lexer<'a> {
_ => break,
}
}
self.set_whitespace_span(self.current_span());
self.skip_offset();
}

// re-lexes a `/` token to a regex token.
pub fn relex_regex(&mut self, token: Token) -> Token {
debug_assert_eq!(token.kind, t!("/"));
debug_assert_eq!(token.span.offset + 1, self.last_offset);
debug_assert_eq!(token.span.len, 1);
debug_assert_eq!(self.scratch, "");

self.last_offset = token.span.offset;
loop {
match self.reader.next() {
Some(b'\\') => {
if let Some(b'/') = self.reader.peek() {
self.reader.next();
self.scratch.push('/')
} else {
self.scratch.push('\\')
}
}
Some(b'/') => break,
Some(x) => {
if x.is_ascii() {
self.scratch.push(x as char);
} else {
match self.reader.complete_char(x) {
Ok(x) => {
self.scratch.push(x);
}
Err(e) => return self.invalid_token(e.into()),
if !x.is_ascii() {
if let Err(e) = self.reader.complete_char(x) {
return self.invalid_token(e.into());
}
}
}
None => return self.invalid_token(Error::UnexpectedEof),
}
}

match self.scratch.parse() {
Ok(x) => {
self.scratch.clear();
self.regex = Some(x);
self.finish_token(TokenKind::Regex)
}
Err(e) => self.invalid_token(Error::Regex(e)),
}
self.finish_token(TokenKind::Regex)
}

/// Lex the next token, starting from the given byte.
@@ -163,7 +141,7 @@ impl<'a> Lexer<'a> {
b'@' => t!("@"),
byte::CR | byte::FF | byte::LF | byte::SP | byte::VT | byte::TAB => {
self.eat_whitespace();
return self.next_token_inner();
TokenKind::WhiteSpace
}
b'|' => match self.reader.peek() {
Some(b'|') => {
@@ -262,7 +240,7 @@ impl<'a> Lexer<'a> {
Some(b'-') => {
self.reader.next();
self.eat_single_line_comment();
return self.next_token_inner();
TokenKind::WhiteSpace
}
Some(b'=') => {
self.reader.next();
@@ -294,12 +272,12 @@ impl<'a> Lexer<'a> {
if let Err(e) = self.eat_multi_line_comment() {
return self.invalid_token(e);
}
return self.next_token_inner();
TokenKind::WhiteSpace
}
Some(b'/') => {
self.reader.next();
self.eat_single_line_comment();
return self.next_token_inner();
TokenKind::WhiteSpace
}
_ => t!("/"),
},
@@ -340,54 +318,174 @@ impl<'a> Lexer<'a> {
}
b'#' => {
self.eat_single_line_comment();
return self.next_token_inner();
TokenKind::WhiteSpace
}
b'`' => return self.lex_surrounded_ident(true),
b'"' => return self.lex_strand(true),
b'\'' => return self.lex_strand(false),
b'd' => {
match self.reader.peek() {
Some(b'"') => {
self.reader.next();
return self.lex_datetime(true);
}
Some(b'\'') => {
self.reader.next();
return self.lex_datetime(false);
b'"' => t!("\""),
b'\'' => t!("'"),
b'd' => match self.reader.peek() {
Some(b'"') => {
self.reader.next();
t!("d\"")
}
Some(b'\'') => {
self.reader.next();
t!("d'")
}
Some(b'e') => {
self.reader.next();

let Some(b'c') = self.reader.peek() else {
self.scratch.push('d');
return self.lex_ident_from_next_byte(b'e');
};

self.reader.next();

if self.reader.peek().map(|x| x.is_ascii_alphanumeric()).unwrap_or(false) {
self.scratch.push('d');
self.scratch.push('e');
return self.lex_ident_from_next_byte(b'c');
}
_ => {}

t!("dec")
}
return self.lex_ident_from_next_byte(b'd');
}
b'u' => {
match self.reader.peek() {
Some(b'"') => {
self.reader.next();
return self.lex_uuid(true);
Some(x) if !x.is_ascii_alphabetic() => {
t!("d")
}
None => {
t!("d")
}
_ => {
return self.lex_ident_from_next_byte(b'd');
}
},
b'f' => match self.reader.peek() {
Some(x) if !x.is_ascii_alphanumeric() => {
t!("f")
}
None => t!("f"),
_ => {
return self.lex_ident_from_next_byte(b'f');
}
},
b'n' => match self.reader.peek() {
Some(b's') => {
self.reader.next();
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
self.scratch.push('n');
return self.lex_ident_from_next_byte(b's');
}
Some(b'\'') => {
self.reader.next();
return self.lex_uuid(false);
t!("ns")
}
_ => {
return self.lex_ident_from_next_byte(b'n');
}
},
b'm' => match self.reader.peek() {
Some(b's') => {
self.reader.next();
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
self.scratch.push('m');
return self.lex_ident_from_next_byte(b's');
}
_ => {}
t!("ms")
}
Some(x) if !x.is_ascii_alphabetic() => {
t!("m")
}
None => {
t!("m")
}
_ => {
return self.lex_ident_from_next_byte(b'm');
}
},
b's' => {
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
return self.lex_ident_from_next_byte(b's');
} else {
t!("s")
}
}
b'h' => {
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
return self.lex_ident_from_next_byte(b'h');
} else {
t!("h")
}
}
b'w' => {
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
return self.lex_ident_from_next_byte(b'w');
} else {
t!("w")
}
return self.lex_ident_from_next_byte(b'u');
}
b'y' => {
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
return self.lex_ident_from_next_byte(b'y');
} else {
t!("y")
}
}
b'u' => match self.reader.peek() {
Some(b'"') => {
self.reader.next();
t!("u\"")
}
Some(b'\'') => {
self.reader.next();
t!("u'")
}
Some(b's') => {
self.reader.next();
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
self.scratch.push('u');
return self.lex_ident_from_next_byte(b's');
}
t!("us")
}
_ => {
return self.lex_ident_from_next_byte(b'u');
}
},
b'r' => match self.reader.peek() {
Some(b'\"') => {
Some(b'"') => {
self.reader.next();
t!("r\"")
}
Some(b'\'') => {
self.reader.next();
t!("r'")
}
_ => return self.lex_ident_from_next_byte(byte),
_ => {
return self.lex_ident_from_next_byte(b'r');
}
},
b'Z' => match self.reader.peek() {
Some(x) if x.is_ascii_alphabetic() => {
return self.lex_ident_from_next_byte(b'Z');
}
_ => TokenKind::DatetimeChars(DatetimeChars::Z),
},
b'T' => match self.reader.peek() {
Some(x) if x.is_ascii_alphabetic() => {
return self.lex_ident_from_next_byte(b'T');
}
_ => TokenKind::DatetimeChars(DatetimeChars::T),
},
b'e' => {
return self.lex_exponent(b'e');
}
b'E' => {
return self.lex_exponent(b'E');
}
b'0'..=b'9' => return self.lex_digits(),
b'a'..=b'z' | b'A'..=b'Z' | b'_' => {
return self.lex_ident_from_next_byte(byte);
}
b'0'..=b'9' => return self.lex_number(byte),
//b'0'..=b'9' => return self.lex_number(byte),
x => return self.invalid_token(Error::UnexpectedCharacter(x as char)),
};

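The overall shape of the change in this file: instead of the lexer recursing past whitespace and assembling compound literals (datetimes, UUIDs, durations) itself, it now emits fine-grained tokens (`WhiteSpace`, `d"`, `u'`, `ns`, `ms`, `DatetimeChars`, digit runs, and so on) and the parser glues adjacent tokens back together when a compound value is expected. A toy sketch of that gluing step for durations, using made-up token types rather than the crate's:

// Toy illustration of "token gluing" (hypothetical token type, not SurrealDB's).
enum Tok {
    Digits(u64), // a digit run, e.g. `10`
    Ms,          // the `ms` duration suffix token
    WhiteSpace,  // whitespace is now a real token instead of being skipped
}

// Glue a digit token and an immediately following suffix token into one duration.
// Any whitespace token in between breaks the glue.
fn glue_duration(tokens: &[Tok]) -> Option<std::time::Duration> {
    match tokens {
        [Tok::Digits(n), Tok::Ms, ..] => Some(std::time::Duration::from_millis(*n)),
        _ => None,
    }
}

fn main() {
    assert!(glue_duration(&[Tok::Digits(10), Tok::Ms]).is_some());
    // `10 ms` does not glue: the whitespace token sits between the parts.
    assert!(glue_duration(&[Tok::Digits(10), Tok::WhiteSpace, Tok::Ms]).is_none());
}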