
Commit c4e9edb

fix: workaround for dollar quoted strings
1 parent f3d3aa7 commit c4e9edb

File tree

3 files changed: +55 -13 lines

crates/parser/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -10,4 +10,5 @@ cstree = { version = "0.12.0", features = ["derive"] }
 pg_query = "0.7"
 logos = "0.13.0"
 serde_json = "1.0"
+regex = "1.9.1"
 serde = { version = "1.0", features = ["derive"] }

crates/parser/src/parser.rs

Lines changed: 6 additions & 6 deletions
@@ -15,7 +15,7 @@ pub struct Parser {
     errors: Vec<SyntaxError>,
     stmts: Vec<RawStmt>,
     checkpoint: Option<i32>,
-    is_parsing_erronous_node: bool,
+    is_parsing_flat_node: bool,
 }

 #[derive(Debug)]
@@ -35,7 +35,7 @@ impl Parser {
             errors: Vec::new(),
             stmts: Vec::new(),
             checkpoint: None,
-            is_parsing_erronous_node: false,
+            is_parsing_flat_node: false,
         }
     }

@@ -46,12 +46,12 @@ impl Parser {
         }
     }

-    pub fn set_checkpoint(&mut self, is_parsing_erronous_node: bool) {
+    pub fn set_checkpoint(&mut self, is_parsing_flat_node: bool) {
         assert!(self.checkpoint.is_none());
         assert!(self.token_buffer.is_empty());
         println!("set_checkpoint at {}", self.curr_depth);
         self.checkpoint = Some(self.curr_depth);
-        self.is_parsing_erronous_node = is_parsing_erronous_node;
+        self.is_parsing_flat_node = is_parsing_flat_node;
     }

     pub fn close_checkpoint(&mut self) {
@@ -60,7 +60,7 @@ impl Parser {
             self.close_until_depth(self.checkpoint.unwrap());
         }
         self.checkpoint = None;
-        self.is_parsing_erronous_node = false;
+        self.is_parsing_flat_node = false;
     }

     pub fn start_node(&mut self, kind: SyntaxKind) {
@@ -101,7 +101,7 @@ impl Parser {
     ///
     /// if `is_parsing_erronous_node` is true, applies token immediately
     pub fn token(&mut self, kind: SyntaxKind, text: &str) {
-        if self.is_parsing_erronous_node {
+        if self.is_parsing_flat_node {
             self.inner.token(kind, text);
             return;
         }
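
Note (not part of the commit): a self-contained toy model sketching the behaviour the renamed flag controls, based only on what is visible in the hunks above. `set_checkpoint(true)` marks a flat node, so `token()` writes straight through instead of going into the token buffer; `close_checkpoint()` resets the flag. The `ToyParser` type and its `applied`/`buffered` fields are invented here for illustration and are not the crate's real `Parser`.

#[derive(Default, Debug)]
struct ToyParser {
    applied: Vec<String>,  // stands in for tokens written via self.inner.token(..)
    buffered: Vec<String>, // stands in for the real token_buffer
    is_parsing_flat_node: bool,
}

impl ToyParser {
    fn set_checkpoint(&mut self, is_parsing_flat_node: bool) {
        self.is_parsing_flat_node = is_parsing_flat_node;
    }

    fn token(&mut self, text: &str) {
        if self.is_parsing_flat_node {
            // flat node: apply the token immediately, as in the diff above
            self.applied.push(text.to_string());
        } else {
            self.buffered.push(text.to_string());
        }
    }

    fn close_checkpoint(&mut self) {
        // toy behaviour only: flush anything buffered, then reset the flag
        self.applied.append(&mut self.buffered);
        self.is_parsing_flat_node = false;
    }
}

fn main() {
    let mut p = ToyParser::default();
    p.set_checkpoint(true); // flat node: no buffering
    p.token("select");
    assert_eq!(p.applied.len(), 1);
    p.close_checkpoint();
}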

crates/parser/src/statement.rs

Lines changed: 48 additions & 7 deletions
@@ -5,11 +5,18 @@
 /// being used. all words are put into the "Word" type and will be defined in more detail by the results of pg_query.rs
 use cstree::text::{TextRange, TextSize};
 use logos::Logos;
+use regex::Regex;

 use crate::{
     parser::Parser, pg_query_utils::get_position_for_pg_query_node, syntax_kind::SyntaxKind,
 };

+#[derive(Logos, Debug, PartialEq)]
+pub enum Test {
+    #[regex("'([^']+)'|\\$(\\w)?\\$.*\\$(\\w)?\\$")]
+    Sconst,
+}
+
 #[derive(Logos, Debug, PartialEq)]
 pub enum StatementToken {
     // copied from protobuf::Token. can be generated later
@@ -52,7 +59,8 @@ pub enum StatementToken {
     #[token("^")]
     Ascii94,
     // comments, whitespaces and keywords
-    #[regex("'([^']+)'")]
+    // FIXME: nested and named dollar quoted strings do not work yet
+    #[regex("'([^']+)'|\\$(\\w)?\\$.*\\$(\\w)?\\$")]
     Sconst,
     #[regex("(\\w+)"gm)]
     Word,
@@ -143,20 +151,36 @@ impl Parser {
         // parse root node if no syntax errors
         if pg_query_nodes.peek().is_some() {
             let (node, depth, _) = pg_query_nodes.next().unwrap();
-            // TODO: if root node is a create or alter function stmt, parse the function body
-            // separately
             self.stmt(node.to_enum(), range);
             self.start_node_at(SyntaxKind::from_pg_query_node(&node), Some(depth));
-            self.set_checkpoint(false);
+            // if there is only one node, there are no children, and we do not need to buffer the
+            // tokens. this happens for example with create or alter function statements.
+            self.set_checkpoint(pg_query_nodes.peek().is_none());
         } else {
             // fallback to generic node as root
             self.start_node_at(SyntaxKind::Stmt, None);
             self.set_checkpoint(true);
         }

+        // FIXME: the lexer, for some reason, does not parse dollar quoted strings,
+        // so we check if the error is one
         while let Some(token) = lexer.next() {
-            match token {
-                Ok(token) => {
+            let t: Option<StatementToken> = match token {
+                Ok(token) => Some(token),
+                Err(_) => {
+                    if Regex::new("'([^']+)'|\\$(\\w)?\\$.*\\$(\\w)?\\$")
+                        .unwrap()
+                        .is_match_at(lexer.slice(), 0)
+                    {
+                        Some(StatementToken::Sconst)
+                    } else {
+                        None
+                    }
+                }
+            };
+
+            match t {
+                Some(token) => {
                     let span = lexer.span();

                     // consume pg_query nodes until there is none, or the node is outside of the current text span
@@ -179,6 +203,8 @@ impl Parser {
                     ) || span
                         .contains(&usize::try_from(next_pg_query_token.unwrap().end).unwrap()))
                     {
+                        // TODO: if within function declaration and current token is Sconst, it's
+                        // the function body. it should be passed into parse_source_file.
                         self.token(
                             SyntaxKind::from_pg_query_token(&pg_query_tokens.next().unwrap()),
                             lexer.slice(),
@@ -188,7 +214,7 @@ impl Parser {
                         self.token(token.syntax_kind(), lexer.slice());
                     }
                 }
-                Err(_) => panic!("Unknown SourceFileToken: {:?}", lexer.span()),
+                None => panic!("Unknown StatementToken: {:?}", lexer.slice()),
             }
         }

@@ -272,4 +298,19 @@ mod tests {

         assert_eq!(parsed.cst.text(), input);
     }
+
+    #[test]
+    fn test_create_sql_function() {
+        let input = "CREATE FUNCTION dup(in int, out f1 int, out f2 text)
+            AS $$ SELECT $1, CAST($1 AS text) || ' is text' $$
+            LANGUAGE SQL;";
+
+        let mut parser = Parser::new();
+        parser.parse_statement(input, None);
+        let parsed = parser.finish();
+
+        dbg!(&parsed.cst);
+
+        assert_eq!(parsed.cst.text(), input);
+    }
 }
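
Note (not part of the commit): a minimal standalone sketch of what the workaround pattern '([^']+)'|\$(\w)?\$.*\$(\w)?\$ does and does not accept, assuming only the regex crate added in Cargo.toml above. It shows why the FIXME about named dollar quoted strings is still open: the optional (\w)? covers at most a single tag character.

use regex::Regex;

fn main() {
    // same pattern as the Sconst workaround above
    let sconst = Regex::new("'([^']+)'|\\$(\\w)?\\$.*\\$(\\w)?\\$").unwrap();

    // anonymous dollar quoting is matched
    assert!(sconst.is_match("$$ SELECT 1, CAST(1 AS text) $$"));
    // a single-character tag is matched thanks to the optional (\w)?
    assert!(sconst.is_match("$x$ SELECT 1 $x$"));
    // multi-character (named) tags are not matched yet, hence the FIXME
    assert!(!sconst.is_match("$body$ SELECT 1 $body$"));
    // ordinary string constants still hit the first alternative
    assert!(sconst.is_match("' is text'"));
}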
