From 0c591dea23a81fb3716b7abe4f72eb7e66626968 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sat, 22 Feb 2025 17:22:19 +0100 Subject: [PATCH 1/8] fix: emit scan errors --- Cargo.lock | 56 +------------ crates/pglt_lexer/Cargo.toml | 2 +- crates/pglt_lexer/src/lib.rs | 23 +++--- crates/pglt_lexer_codegen/src/lib.rs | 1 - crates/pglt_lexer_codegen/src/syntax_kind.rs | 2 +- crates/pglt_statement_splitter/Cargo.toml | 10 ++- .../src/diagnostics.rs | 80 +++++++++++++++++++ crates/pglt_statement_splitter/src/lib.rs | 28 +++++-- crates/pglt_statement_splitter/src/parser.rs | 14 ++-- .../src/syntax_error.rs | 32 -------- 10 files changed, 132 insertions(+), 116 deletions(-) create mode 100644 crates/pglt_statement_splitter/src/diagnostics.rs delete mode 100644 crates/pglt_statement_splitter/src/syntax_error.rs diff --git a/Cargo.lock b/Cargo.lock index 1f3e9680d..2e6457f7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -859,32 +859,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "cstree" -version = "0.12.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d609e3b8b73dbace666e8a06351fd9062e1ec025e74b27952a932ccb8ec3a25" -dependencies = [ - "cstree_derive", - "fxhash", - "indexmap 2.7.0", - "parking_lot", - "sptr", - "text-size", - "triomphe", -] - -[[package]] -name = "cstree_derive" -version = "0.12.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84d8f6eaf2917e8bf0173045fe7824c0809e21ef09dc721108da4ee67ce7494b" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.90", -] - [[package]] name = "dashmap" version = "5.5.3" @@ -1305,15 +1279,6 @@ dependencies = [ "slab", ] -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - [[package]] name = "generic-array" version = "0.14.7" @@ -2450,9 +2415,9 @@ dependencies = [ name = "pglt_lexer" version = "0.0.0" dependencies = [ - "cstree", "insta", "pg_query", + "pglt_diagnostics", "pglt_lexer_codegen", "regex", "text-size", @@ -2563,8 +2528,10 @@ name = "pglt_statement_splitter" version = "0.0.0" dependencies = [ "ntest", - "pg_query", + "pglt_diagnostics", "pglt_lexer", + "pglt_query_ext", + "regex", "text-size", ] @@ -3481,12 +3448,6 @@ dependencies = [ "der", ] -[[package]] -name = "sptr" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b9b39299b249ad65f3b7e96443bad61c02ca5cd3589f46cb6d610a0fd6c0d6a" - [[package]] name = "sqlformat" version = "0.2.6" @@ -4193,15 +4154,6 @@ dependencies = [ "tree-sitter", ] -[[package]] -name = "triomphe" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef8f7726da4807b58ea5c96fdc122f80702030edc33b35aff9190a51148ccc85" -dependencies = [ - "stable_deref_trait", -] - [[package]] name = "trybuild" version = "1.0.101" diff --git a/crates/pglt_lexer/Cargo.toml b/crates/pglt_lexer/Cargo.toml index efa02bd1b..48d4e0310 100644 --- a/crates/pglt_lexer/Cargo.toml +++ b/crates/pglt_lexer/Cargo.toml @@ -15,9 +15,9 @@ version = "0.0.0" regex = "1.9.1" pg_query.workspace = true +pglt_diagnostics.workspace = true pglt_lexer_codegen.workspace = true -cstree = { version = "0.12.0", features = ["derive"] } text-size.workspace = true [dev-dependencies] diff --git a/crates/pglt_lexer/src/lib.rs b/crates/pglt_lexer/src/lib.rs index 82edebfe6..a1179769a 100644 --- a/crates/pglt_lexer/src/lib.rs +++ b/crates/pglt_lexer/src/lib.rs @@ -107,15 +107,14 @@ fn whitespace_tokens(input: &str) -> VecDeque { /// Turn a string of potentially valid sql code into a list of tokens, including their range in the source text. /// /// The implementation is primarily using libpg_querys `scan` method, and fills in the gaps with tokens that are not parsed by the library, e.g. whitespace. -pub fn lex(text: &str) -> Vec { +pub fn lex(text: &str) -> pg_query::Result> { let mut whitespace_tokens = whitespace_tokens(text); // tokens from pg_query.rs - let mut pglt_query_tokens = match pg_query::scan(text) { - Ok(scanned) => VecDeque::from(scanned.tokens), - // this _should_ never fail - _ => panic!("pg_query::scan failed"), - }; + let mut pglt_query_tokens = pg_query::scan(text)? + .tokens + .into_iter() + .collect::>(); // merge the two token lists let mut tokens: Vec = Vec::new(); @@ -173,7 +172,7 @@ pub fn lex(text: &str) -> Vec { ); } - tokens + Ok(tokens) } #[cfg(test)] @@ -183,28 +182,28 @@ mod tests { #[test] fn test_special_chars() { let input = "insert into c (name, full_name) values ('Å', 1);"; - let tokens = lex(input); + let tokens = lex(input).unwrap(); assert!(!tokens.is_empty()); } #[test] fn test_tab_tokens() { let input = "select\t1"; - let tokens = lex(input); + let tokens = lex(input).unwrap(); assert_eq!(tokens[1].kind, SyntaxKind::Tab); } #[test] fn test_newline_tokens() { let input = "select\n1"; - let tokens = lex(input); + let tokens = lex(input).unwrap(); assert_eq!(tokens[1].kind, SyntaxKind::Newline); } #[test] fn test_whitespace_tokens() { let input = "select 1"; - let tokens = lex(input); + let tokens = lex(input).unwrap(); assert_eq!(tokens[1].kind, SyntaxKind::Whitespace); } @@ -212,7 +211,7 @@ mod tests { fn test_lexer() { let input = "select 1; \n -- some comment \n select 2\t"; - let tokens = lex(input); + let tokens = lex(input).unwrap(); let mut tokens_iter = tokens.iter(); let token = tokens_iter.next().unwrap(); diff --git a/crates/pglt_lexer_codegen/src/lib.rs b/crates/pglt_lexer_codegen/src/lib.rs index 99a35aa5d..fa28586f6 100644 --- a/crates/pglt_lexer_codegen/src/lib.rs +++ b/crates/pglt_lexer_codegen/src/lib.rs @@ -13,7 +13,6 @@ pub fn lexer_codegen(_item: proc_macro::TokenStream) -> proc_macro::TokenStream quote! { use pg_query::{protobuf, protobuf::ScanToken, protobuf::Token, NodeEnum, NodeRef}; - use cstree::Syntax; #syntax_kind } diff --git a/crates/pglt_lexer_codegen/src/syntax_kind.rs b/crates/pglt_lexer_codegen/src/syntax_kind.rs index 4bb28d123..b9d7c7ad7 100644 --- a/crates/pglt_lexer_codegen/src/syntax_kind.rs +++ b/crates/pglt_lexer_codegen/src/syntax_kind.rs @@ -26,7 +26,7 @@ pub fn syntax_kind_mod(proto_file: &ProtoFile) -> proc_macro2::TokenStream { /// An u32 enum of all valid syntax elements (nodes and tokens) of the postgres /// sql dialect, and a few custom ones that are not parsed by pg_query.rs, such /// as `Whitespace`. - #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Syntax)] + #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] #[repr(u32)] pub enum SyntaxKind { #(#unique_enum_variants),*, diff --git a/crates/pglt_statement_splitter/Cargo.toml b/crates/pglt_statement_splitter/Cargo.toml index d5243f762..3148d4f69 100644 --- a/crates/pglt_statement_splitter/Cargo.toml +++ b/crates/pglt_statement_splitter/Cargo.toml @@ -12,9 +12,11 @@ version = "0.0.0" [dependencies] -pglt_lexer.workspace = true -text-size.workspace = true +pglt_diagnostics = { workspace = true } +pglt_lexer.workspace = true +pglt_query_ext.workspace = true +regex.workspace = true +text-size.workspace = true [dev-dependencies] -ntest = "0.9.3" -pg_query.workspace = true +ntest = "0.9.3" diff --git a/crates/pglt_statement_splitter/src/diagnostics.rs b/crates/pglt_statement_splitter/src/diagnostics.rs new file mode 100644 index 000000000..61b54a58c --- /dev/null +++ b/crates/pglt_statement_splitter/src/diagnostics.rs @@ -0,0 +1,80 @@ +use pglt_diagnostics::{Diagnostic, MessageAndDescription}; +use text_size::TextRange; + +/// A specialized diagnostic for the statement splitter parser. +/// +/// Parser diagnostics are always **errors**. +#[derive(Clone, Debug, Diagnostic, PartialEq)] +#[diagnostic(category = "syntax", severity = Error)] +pub struct ParseDiagnostic { + /// The location where the error is occurred + #[location(span)] + span: Option, + #[message] + #[description] + pub message: MessageAndDescription, + // if true, the error is fatal and the parsing should stop + pub is_fatal: bool, +} + +impl ParseDiagnostic { + pub fn new(message: impl Into, range: TextRange) -> Self { + Self { + span: Some(range), + message: MessageAndDescription::from(message.into()), + is_fatal: false, + } + } + + pub fn from_pg_query_err(err: pglt_query_ext::Error, input: &str) -> Vec { + let err_msg = err.to_string(); + let re = regex::Regex::new(r#"at or near "(.*?)""#).unwrap(); + let mut diagnostics = Vec::new(); + + for captures in re.captures_iter(&err_msg) { + if let Some(matched) = captures.get(1) { + let search_term = matched.as_str(); + for (idx, _) in input.match_indices(search_term) { + let from = idx; + let to = from + search_term.len(); + diagnostics.push(ParseDiagnostic { + span: Some(TextRange::new( + from.try_into().unwrap(), + to.try_into().unwrap(), + )), + message: MessageAndDescription::from(err_msg.clone()), + is_fatal: true, + }); + } + } + } + + if diagnostics.is_empty() { + diagnostics.push(ParseDiagnostic { + span: None, + message: MessageAndDescription::from(err_msg), + is_fatal: true, + }); + } + + diagnostics + } +} + +#[cfg(test)] +mod tests { + use pglt_lexer::lex; + + use super::*; + + #[test] + fn failing_lexer() { + let input = + "select 1443ddwwd33djwdkjw13331333333333; select 1443ddwwd33djwdkjw13331333333333;"; + let err = lex(input).unwrap_err(); + + let diagnostics = ParseDiagnostic::from_pg_query_err(err, input); + assert_eq!(diagnostics.len(), 2); + assert!(diagnostics.iter().all(|d| d.is_fatal)); + } +} diff --git a/crates/pglt_statement_splitter/src/lib.rs b/crates/pglt_statement_splitter/src/lib.rs index 6f78a26c0..33bd6ec0a 100644 --- a/crates/pglt_statement_splitter/src/lib.rs +++ b/crates/pglt_statement_splitter/src/lib.rs @@ -1,13 +1,24 @@ //! Postgres Statement Splitter //! //! This crate provides a function to split a SQL source string into individual statements. +pub mod diagnostics; mod parser; -mod syntax_error; +use diagnostics::ParseDiagnostic; use parser::{source, Parse, Parser}; pub fn split(sql: &str) -> Parse { - let mut parser = Parser::new(sql); + let tokens = match pglt_lexer::lex(sql) { + Ok(tokens) => tokens, + Err(e) => { + return Parse { + ranges: Vec::new(), + errors: ParseDiagnostic::from_pg_query_err(e, sql), + }; + } + }; + + let mut parser = Parser::new(tokens); source(&mut parser); @@ -16,9 +27,9 @@ pub fn split(sql: &str) -> Parse { #[cfg(test)] mod tests { + use diagnostics::ParseDiagnostic; use ntest::timeout; use pglt_lexer::SyntaxKind; - use syntax_error::SyntaxError; use text_size::TextRange; use super::*; @@ -64,7 +75,7 @@ mod tests { self } - fn expect_errors(&self, expected: Vec) -> &Self { + fn expect_errors(&self, expected: Vec) -> &Self { assert_eq!( self.parse.errors.len(), expected.len(), @@ -82,6 +93,13 @@ mod tests { } } + #[test] + fn failing_lexer() { + let input = "select 1443ddwwd33djwdkjw13331333333333"; + let res = split(input); + assert!(res.errors.iter().any(|d| d.is_fatal)); + } + #[test] #[timeout(1000)] fn basic() { @@ -114,7 +132,7 @@ mod tests { fn insert_expect_error() { Tester::from("\ninsert select 1\n\nselect 3") .expect_statements(vec!["insert select 1", "select 3"]) - .expect_errors(vec![SyntaxError::new( + .expect_errors(vec![ParseDiagnostic::new( format!("Expected {:?}", SyntaxKind::Into), TextRange::new(8.into(), 14.into()), )]); diff --git a/crates/pglt_statement_splitter/src/parser.rs b/crates/pglt_statement_splitter/src/parser.rs index 8868ee38d..d6751c468 100644 --- a/crates/pglt_statement_splitter/src/parser.rs +++ b/crates/pglt_statement_splitter/src/parser.rs @@ -5,10 +5,10 @@ mod dml; pub use common::source; -use pglt_lexer::{lex, SyntaxKind, Token, WHITESPACE_TOKENS}; +use pglt_lexer::{SyntaxKind, Token, WHITESPACE_TOKENS}; use text_size::{TextRange, TextSize}; -use crate::syntax_error::SyntaxError; +use crate::diagnostics::ParseDiagnostic; /// Main parser that exposes the `cstree` api, and collects errors and statements /// It is modelled after a Pratt Parser. For a gentle introduction to Pratt Parsing, see https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html @@ -16,7 +16,7 @@ pub struct Parser { /// The ranges of the statements ranges: Vec<(usize, usize)>, /// The syntax errors accumulated during parsing - errors: Vec, + errors: Vec, /// The start of the current statement, if any current_stmt_start: Option, /// The tokens to parse @@ -33,13 +33,11 @@ pub struct Parse { /// The ranges of the errors pub ranges: Vec, /// The syntax errors accumulated during parsing - pub errors: Vec, + pub errors: Vec, } impl Parser { - pub fn new(sql: &str) -> Self { - let tokens = lex(sql); - + pub fn new(tokens: Vec) -> Self { let eof_token = Token::eof(usize::from( tokens .last() @@ -178,7 +176,7 @@ impl Parser { return; } - self.errors.push(SyntaxError::new( + self.errors.push(ParseDiagnostic::new( format!("Expected {:#?}", kind), self.peek().span, )); diff --git a/crates/pglt_statement_splitter/src/syntax_error.rs b/crates/pglt_statement_splitter/src/syntax_error.rs deleted file mode 100644 index 1e5c36501..000000000 --- a/crates/pglt_statement_splitter/src/syntax_error.rs +++ /dev/null @@ -1,32 +0,0 @@ -use std::fmt; - -use text_size::{TextRange, TextSize}; - -/// Represents the result of unsuccessful tokenization, parsing, -/// or tree validation. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct SyntaxError(String, TextRange); - -impl SyntaxError { - pub fn new(message: impl Into, range: TextRange) -> Self { - Self(message.into(), range) - } - pub fn new_at_offset(message: impl Into, offset: TextSize) -> Self { - Self(message.into(), TextRange::empty(offset)) - } - - pub fn range(&self) -> TextRange { - self.1 - } - - pub fn with_range(mut self, range: TextRange) -> Self { - self.1 = range; - self - } -} - -impl fmt::Display for SyntaxError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.0.fmt(f) - } -} From 4d23301845dda6b846fd69ffe4372b3481b4061b Mon Sep 17 00:00:00 2001 From: psteinroe Date: Mon, 24 Feb 2025 09:36:29 +0100 Subject: [PATCH 2/8] fix: impl change handling and delete base_db crate --- Cargo.lock | 10 - crates/pglt_base_db/Cargo.toml | 26 - crates/pglt_base_db/src/change.rs | 554 ------------------ crates/pglt_base_db/src/document.rs | 186 ------ crates/pglt_base_db/src/lib.rs | 20 - crates/pglt_lexer/src/diagnostics.rs | 63 ++ crates/pglt_lexer/src/lib.rs | 12 +- .../src/diagnostics.rs | 59 +- crates/pglt_statement_splitter/src/lib.rs | 24 +- crates/pglt_statement_splitter/src/parser.rs | 8 +- .../src/workspace/server/change.rs | 271 ++++++++- .../src/workspace/server/document.rs | 65 +- 12 files changed, 382 insertions(+), 916 deletions(-) delete mode 100644 crates/pglt_base_db/Cargo.toml delete mode 100644 crates/pglt_base_db/src/change.rs delete mode 100644 crates/pglt_base_db/src/document.rs delete mode 100644 crates/pglt_base_db/src/lib.rs create mode 100644 crates/pglt_lexer/src/diagnostics.rs diff --git a/Cargo.lock b/Cargo.lock index 2e6457f7b..7191a639d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2242,16 +2242,6 @@ dependencies = [ "termcolor", ] -[[package]] -name = "pglt_base_db" -version = "0.0.0" -dependencies = [ - "line_index", - "pglt_fs", - "pglt_statement_splitter", - "text-size", -] - [[package]] name = "pglt_cli" version = "0.0.0" diff --git a/crates/pglt_base_db/Cargo.toml b/crates/pglt_base_db/Cargo.toml deleted file mode 100644 index e3a1651c2..000000000 --- a/crates/pglt_base_db/Cargo.toml +++ /dev/null @@ -1,26 +0,0 @@ -[package] -authors.workspace = true -categories.workspace = true -description = "" -edition.workspace = true -homepage.workspace = true -keywords.workspace = true -license.workspace = true -name = "pglt_base_db" -repository.workspace = true -version = "0.0.0" - - -[dependencies] -line_index.workspace = true -text-size.workspace = true - -pglt_fs.workspace = true -pglt_statement_splitter.workspace = true - -[dev-dependencies] - -[lib] -doctest = false - -[features] diff --git a/crates/pglt_base_db/src/change.rs b/crates/pglt_base_db/src/change.rs deleted file mode 100644 index 689aa263c..000000000 --- a/crates/pglt_base_db/src/change.rs +++ /dev/null @@ -1,554 +0,0 @@ -use std::ops::Sub; - -use line_index::LineIndex; -use text_size::{TextLen, TextRange, TextSize}; - -use crate::document::{Document, StatementRef}; - -#[derive(Debug)] -pub enum StatementChange { - Added(StatementRef), - Deleted(StatementRef), - Modified(ChangedStatement), -} - -impl StatementChange { - pub fn statement(&self) -> &StatementRef { - match self { - StatementChange::Added(stmt) => stmt, - StatementChange::Deleted(stmt) => stmt, - StatementChange::Modified(stmt) => &stmt.statement, - } - } -} - -#[derive(Debug)] -pub struct ChangedStatement { - /// The now "old" statement ref - pub statement: StatementRef, - /// The range in which the text changed - pub range: TextRange, - /// The new text - pub text: String, -} - -impl ChangedStatement { - pub fn new_statement(&self) -> StatementRef { - StatementRef { - idx: self.statement.idx, - text: apply_text_change(&self.statement.text, Some(self.range), &self.text), - document_url: self.statement.document_url.clone(), - } - } -} - -fn apply_text_change(text: &str, range: Option, change_text: &str) -> String { - if range.is_none() { - return change_text.to_string(); - } - - let range = range.unwrap(); - let start = usize::from(range.start()); - let end = usize::from(range.end()); - - let mut new_text = String::new(); - new_text.push_str(&text[..start]); - new_text.push_str(change_text); - new_text.push_str(&text[end..]); - - new_text -} - -#[derive(Debug)] -pub struct DocumentChange { - pub version: i32, - pub changes: Vec, - - changed_statements: Vec, - - applied: bool, -} - -#[derive(Debug, Clone)] -pub struct Change { - /// The range of the file that changed. If `None`, the whole file changed. - pub range: Option, - pub text: String, -} - -impl Change { - pub fn diff_size(&self) -> TextSize { - match self.range { - Some(range) => { - let range_length: usize = range.len().into(); - let text_length = self.text.chars().count(); - let diff = (text_length as i64 - range_length as i64).abs(); - TextSize::from(u32::try_from(diff).unwrap()) - } - None => TextSize::from(u32::try_from(self.text.chars().count()).unwrap()), - } - } - - pub fn is_addition(&self) -> bool { - self.range.is_some() && self.text.len() > self.range.unwrap().len().into() - } - - pub fn is_deletion(&self) -> bool { - self.range.is_some() && self.text.len() < self.range.unwrap().len().into() - } - - pub fn apply_to_text(&self, text: &str) -> String { - if self.range.is_none() { - return self.text.clone(); - } - - let range = self.range.unwrap(); - let start = usize::from(range.start()); - let end = usize::from(range.end()); - - let mut new_text = String::new(); - new_text.push_str(&text[..start]); - new_text.push_str(&self.text); - new_text.push_str(&text[end..]); - - new_text - } - - pub fn apply(&self, doc: &mut Document) -> Vec { - let mut changed_statements: Vec = Vec::new(); - - if self.range.is_none() { - // whole file changed - changed_statements.extend( - doc.drain_statements() - .into_iter() - .map(StatementChange::Deleted), - ); - // TODO also use errors returned by extract sql statement ranges - doc.statement_ranges = pglt_statement_splitter::split(&self.text).ranges.to_vec(); - doc.text = self.text.clone(); - doc.line_index = LineIndex::new(&doc.text); - - changed_statements.extend( - doc.statement_refs() - .iter() - .map(|stmt| StatementChange::Added(stmt.to_owned())), - ); - } else if let Some(changed_stmt_pos) = doc - .statement_ranges - .iter() - .position(|r| r.contains_range(self.range.unwrap())) - { - // change within a single statement - doc.statement_ranges - .iter_mut() - .enumerate() - .skip_while(|(_, r)| self.range.unwrap().end() > r.end()) - .for_each(|(pos, r)| { - if pos == changed_stmt_pos { - // only this ones ref is different, the rest do not have any text - // changes - changed_statements.push(StatementChange::Modified(ChangedStatement { - statement: StatementRef { - idx: pos, - text: doc.text[*r].to_string(), - document_url: doc.url.clone(), - }, - // change must be relative to statement - range: self.range.unwrap().sub(r.start()), - text: self.text.clone(), - })); - - // if addition, expand the range - // if deletion, shrink the range - if self.is_addition() { - *r = TextRange::new(r.start(), r.end() + self.diff_size()); - } else if self.is_deletion() { - *r = TextRange::new(r.start(), r.end() - self.diff_size()); - } - } else if self.is_addition() { - *r += self.diff_size(); - } else if self.is_deletion() { - *r -= self.diff_size(); - } - }); - - doc.text = self.apply_to_text(&doc.text); - doc.line_index = LineIndex::new(&doc.text); - } else { - // change across stmts - - let mut min = self.range.unwrap().start(); - let mut max = self.range.unwrap().end(); - - for (idx, r) in doc - .statement_ranges - .iter() - .enumerate() - .skip_while(|(_, r)| { - // skip until first changed stmt - self.range.unwrap().start() > r.end() - }) - .take_while(|(_, r)| { - // take until after last changed stmt - self.range.unwrap().end() >= r.end() - }) - { - changed_statements.push(StatementChange::Deleted(StatementRef { - idx, - text: doc.text[*r].to_string(), - document_url: doc.url.clone(), - })); - - if r.start() < min { - min = r.start(); - } - if r.end() > max { - max = r.end(); - } - } - - doc.text = self.apply_to_text(&doc.text); - doc.line_index = LineIndex::new(&doc.text); - - if doc.text.text_len() < max { - max = doc.text.text_len(); - } - - // get text from min(first_stmt_start, change_start) to max(last_stmt_end, change_end) - let extracted_text = doc - .text - .as_str() - .get(usize::from(min)..usize::from(max)) - .unwrap(); - - doc.statement_ranges.drain( - changed_statements - .iter() - .min_by_key(|c| c.statement().idx) - .unwrap() - .statement() - .idx - ..changed_statements - .iter() - .max_by_key(|c| c.statement().idx) - .unwrap() - .statement() - .idx - + 1, - ); - - for range in pglt_statement_splitter::split(extracted_text).ranges { - match doc - .statement_ranges - .binary_search_by(|r| r.start().cmp(&range.start())) - { - Ok(_) => {} - Err(pos) => { - doc.statement_ranges.insert(pos, range); - changed_statements.push(StatementChange::Added(StatementRef { - idx: pos, - text: extracted_text[range].to_string(), - document_url: doc.url.clone(), - })); - } - } - } - } - - changed_statements - } -} - -impl DocumentChange { - pub fn new(version: i32, changes: Vec) -> DocumentChange { - DocumentChange { - version, - changes, - changed_statements: Vec::new(), - applied: false, - } - } - - pub fn apply(&mut self, doc: &mut Document) { - assert!(!self.applied, "DocumentChange already applied"); - // TODO: optimize this by searching for the last change without a range and start applying - // from there - self.changed_statements - .extend(self.changes.iter().flat_map(|c| c.apply(doc))); - - doc.version = self.version; - self.applied = true; - } - - pub fn collect_statement_changes(&mut self) -> Vec { - assert!(self.applied, "DocumentChange not yet applied"); - self.changed_statements.drain(..).collect() - } -} - -#[cfg(test)] -mod tests { - use text_size::{TextRange, TextSize}; - - use crate::{change::Change, document::StatementRef, Document, DocumentChange}; - use pglt_fs::PgLTPath; - - #[test] - fn test_document_apply_changes() { - let input = "select id from users;\nselect * from contacts;"; - - let mut d = Document::new(PgLTPath::new("test.sql"), Some(input.to_string())); - - assert_eq!(d.statement_ranges.len(), 2); - - let mut change = DocumentChange::new( - 1, - vec![Change { - text: ",test from users\nselect 1;".to_string(), - range: Some(TextRange::new(9.into(), 45.into())), - }], - ); - - change.apply(&mut d); - let changed = change.collect_statement_changes(); - - assert_eq!(changed.len(), 4); - assert_eq!( - changed[0].statement().to_owned(), - StatementRef { - document_url: PgLTPath::new("test.sql"), - text: "select id from users;".to_string(), - idx: 0 - } - ); - assert_eq!( - changed[1].statement().to_owned(), - StatementRef { - document_url: PgLTPath::new("test.sql"), - text: "select * from contacts;".to_string(), - idx: 1 - } - ); - - assert_eq!("select id,test from users\nselect 1;", d.text); - assert_eq!(d.statement_ranges.len(), 2); - - for r in &pglt_statement_splitter::split(&d.text).ranges { - assert!( - d.statement_ranges.iter().any(|x| r == x), - "should have stmt with range {:#?}", - r - ); - } - - assert_eq!(d.statement_ranges[0], TextRange::new(0.into(), 25.into())); - assert_eq!(d.statement_ranges[1], TextRange::new(26.into(), 35.into())); - } - - #[test] - fn test_document_apply_changes_at_end_of_statement() { - let input = "select id from\nselect * from contacts;"; - - let mut d = Document::new(PgLTPath::new("test.sql"), Some(input.to_string())); - - assert_eq!(d.statement_ranges.len(), 2); - - let stmt_1_range = d.statement_ranges[0]; - let stmt_2_range = d.statement_ranges[1]; - - let update_text = " contacts;"; - - let update_range = TextRange::new(14.into(), 14.into()); - - let update_text_len = u32::try_from(update_text.chars().count()).unwrap(); - let update_addition = update_text_len - u32::from(update_range.len()); - - let mut change = DocumentChange::new( - 1, - vec![Change { - text: update_text.to_string(), - range: Some(update_range), - }], - ); - - change.apply(&mut d); - - assert_eq!("select id from contacts;\nselect * from contacts;", d.text); - assert_eq!(d.statement_ranges.len(), 2); - assert_eq!(d.statement_ranges[0].start(), stmt_1_range.start()); - assert_eq!( - u32::from(d.statement_ranges[0].end()), - u32::from(stmt_1_range.end()) + update_addition - ); - assert_eq!( - u32::from(d.statement_ranges[1].start()), - u32::from(stmt_2_range.start()) + update_addition - ); - assert_eq!( - u32::from(d.statement_ranges[1].end()), - u32::from(stmt_2_range.end()) + update_addition - ); - } - - #[test] - fn test_document_apply_changes_replacement() { - let path = PgLTPath::new("test.sql"); - - let mut doc = Document::new(path, None); - - let mut c = DocumentChange::new( - 1, - vec![Change { - range: None, - text: "select 1;\nselect 2;".to_string(), - }], - ); - - c.apply(&mut doc); - - assert_eq!(doc.statement_ref(0).text, "select 1;".to_string()); - assert_eq!(doc.statement_ref(1).text, "select 2;".to_string()); - assert_eq!( - doc.statement_ranges[0], - TextRange::new(TextSize::new(0), TextSize::new(9)) - ); - assert_eq!( - doc.statement_ranges[1], - TextRange::new(TextSize::new(10), TextSize::new(19)) - ); - - let mut c = DocumentChange::new( - 2, - vec![Change { - range: Some(TextRange::new(7.into(), 8.into())), - text: "".to_string(), - }], - ); - - c.apply(&mut doc); - - assert_eq!(doc.text, "select ;\nselect 2;"); - assert_eq!(doc.statement_refs().len(), 2); - assert_eq!(doc.statement_ref(0).text, "select ;".to_string()); - assert_eq!(doc.statement_ref(1).text, "select 2;".to_string()); - assert_eq!( - doc.statement_ranges[0], - TextRange::new(TextSize::new(0), TextSize::new(8)) - ); - assert_eq!( - doc.statement_ranges[1], - TextRange::new(TextSize::new(9), TextSize::new(18)) - ); - - let mut c = DocumentChange::new( - 3, - vec![Change { - range: Some(TextRange::new(7.into(), 7.into())), - text: "!".to_string(), - }], - ); - - c.apply(&mut doc); - - assert_eq!(doc.text, "select !;\nselect 2;"); - assert_eq!(doc.statement_refs().len(), 2); - assert_eq!( - doc.statement_ranges[0], - TextRange::new(TextSize::new(0), TextSize::new(9)) - ); - assert_eq!( - doc.statement_ranges[1], - TextRange::new(TextSize::new(10), TextSize::new(19)) - ); - - let mut c = DocumentChange::new( - 4, - vec![Change { - range: Some(TextRange::new(7.into(), 8.into())), - text: "".to_string(), - }], - ); - - c.apply(&mut doc); - - assert_eq!(doc.text, "select ;\nselect 2;"); - assert_eq!(doc.statement_refs().len(), 2); - assert_eq!( - doc.statement_ranges[0], - TextRange::new(TextSize::new(0), TextSize::new(8)) - ); - assert_eq!( - doc.statement_ranges[1], - TextRange::new(TextSize::new(9), TextSize::new(18)) - ); - - let mut c = DocumentChange::new( - 5, - vec![Change { - range: Some(TextRange::new(7.into(), 7.into())), - text: "1".to_string(), - }], - ); - c.apply(&mut doc); - - assert_eq!(doc.text, "select 1;\nselect 2;"); - assert_eq!(doc.statement_refs().len(), 2); - assert_eq!( - doc.statement_ranges[0], - TextRange::new(TextSize::new(0), TextSize::new(9)) - ); - assert_eq!( - doc.statement_ranges[1], - TextRange::new(TextSize::new(10), TextSize::new(19)) - ); - } - - #[test] - fn test_document_apply_changes_within_statement() { - let input = "select id from users;\nselect * from contacts;"; - - let mut d = Document::new(PgLTPath::new("test.sql"), Some(input.to_string())); - - assert_eq!(d.statement_ranges.len(), 2); - - let stmt_1_range = d.statement_ranges[0]; - let stmt_2_range = d.statement_ranges[1]; - - let update_text = ",test"; - - let update_range = TextRange::new(9.into(), 10.into()); - - let update_text_len = u32::try_from(update_text.chars().count()).unwrap(); - let update_addition = update_text_len - u32::from(update_range.len()); - - let mut change = DocumentChange::new( - 1, - vec![Change { - text: update_text.to_string(), - range: Some(update_range), - }], - ); - - change.apply(&mut d); - - assert_eq!( - "select id,test from users;\nselect * from contacts;", - d.text - ); - assert_eq!(d.statement_ranges.len(), 2); - assert_eq!(d.statement_ranges[0].start(), stmt_1_range.start()); - assert_eq!( - u32::from(d.statement_ranges[0].end()), - u32::from(stmt_1_range.end()) + update_addition - ); - assert_eq!( - u32::from(d.statement_ranges[1].start()), - u32::from(stmt_2_range.start()) + update_addition - ); - assert_eq!( - u32::from(d.statement_ranges[1].end()), - u32::from(stmt_2_range.end()) + update_addition - ); - } -} diff --git a/crates/pglt_base_db/src/document.rs b/crates/pglt_base_db/src/document.rs deleted file mode 100644 index 2610dc049..000000000 --- a/crates/pglt_base_db/src/document.rs +++ /dev/null @@ -1,186 +0,0 @@ -use std::{hash::Hash, hash::Hasher, ops::RangeBounds}; - -use line_index::LineIndex; -use text_size::{TextRange, TextSize}; - -use pglt_fs::PgLTPath; - -extern crate test; - -/// Represents a sql source file, and contains a list of statements represented by their ranges -pub struct Document { - /// The url of the document - pub url: PgLTPath, - /// The text of the document - pub text: String, - /// The version of the document - pub version: i32, - /// List of statements sorted by range.start() - pub statement_ranges: Vec, - /// Line index for the document - pub line_index: LineIndex, -} - -impl Hash for Document { - fn hash(&self, state: &mut H) { - self.url.hash(state); - } -} - -/// Represents a reference to a sql statement. This is the primary data structure that is used by higher-level crates to save and retrieve information about a statement. -/// This needs to be optimised by removing the text from the struct and making it a reference to the text in the document. -/// -/// Note that the ref must include all information needed to uniquely identify the statement, so that it can be used as a key in a hashmap. -#[derive(Debug, Hash, PartialEq, Eq, Clone)] -pub struct StatementRef { - pub document_url: PgLTPath, - // TODO use string interner for text - pub text: String, - pub idx: usize, -} - -impl Document { - /// Create a new document - pub fn new(url: PgLTPath, text: Option) -> Document { - Document { - version: 0, - line_index: LineIndex::new(text.as_ref().unwrap_or(&"".to_string())), - // TODO: use errors returned by split - statement_ranges: text.as_ref().map_or_else(Vec::new, |f| { - pglt_statement_splitter::split(f).ranges.to_vec() - }), - text: text.unwrap_or("".to_string()), - url, - } - } - - /// Returns the statement at the given offset - pub fn statement_at_offset(&self, offset: &TextSize) -> Option { - self.statement_ranges - .iter() - .position(|r| r.contains(offset)) - .map(|idx| self.statement_ref(idx)) - } - - /// Returns the statements at the given range - pub fn statements_at_range(&self, range: &TextRange) -> Vec { - self.statement_ranges - .iter() - .enumerate() - .filter(|(_, r)| { - range.contains_range(r.to_owned().to_owned()) || r.contains_range(range.to_owned()) - }) - .map(|(idx, _)| self.statement_ref(idx)) - .collect() - } - - /// Returns the statement at the given offset with its range in the document - pub fn statement_at_offset_with_range( - &self, - offset: &TextSize, - ) -> Option<(TextRange, StatementRef)> { - self.statement_ranges - .iter() - .position(|r| r.contains(offset)) - .map(|idx| self.statement_ref_with_range(idx)) - } - - /// Drains the statements from the document - pub(crate) fn drain_statements(&mut self) -> Vec { - self.statement_ranges - .drain(..) - .enumerate() - .map(|(idx, range)| StatementRef { - document_url: self.url.clone(), - text: self.text[range].to_string(), - idx, - }) - .collect() - } - - /// Returns all statements with their ranges in the document - pub fn statement_refs_with_range(&self) -> Vec<(TextRange, StatementRef)> { - self.statement_ranges - .iter() - .enumerate() - .map(|(idx, range)| { - ( - *range, - StatementRef { - document_url: self.url.clone(), - text: self.text[*range].to_string(), - idx, - }, - ) - }) - .collect() - } - - /// Returns all statements in the document - pub fn statement_refs(&self) -> Vec { - self.statement_ranges - .iter() - .enumerate() - .map(|(idx, range)| StatementRef { - document_url: self.url.clone(), - text: self.text[*range].to_string(), - idx, - }) - .collect() - } - - /// Returns the statement with the given index, throws an error if the index is out of bounds - pub fn statement_ref(&self, pos: usize) -> StatementRef { - self.statement_ranges - .get(pos) - .map(|range| StatementRef { - document_url: self.url.clone(), - text: self.text[*range].to_string(), - idx: pos, - }) - .unwrap() - } - - /// Returns the statement with the given index and its range in the document - pub fn statement_ref_with_range(&self, pos: usize) -> (TextRange, StatementRef) { - self.statement_ranges - .get(pos) - .map(|range| { - ( - *range, - StatementRef { - document_url: self.url.clone(), - text: self.text[*range].to_string(), - idx: pos, - }, - ) - }) - .unwrap() - } -} - -#[cfg(test)] -mod tests { - - use pglt_fs::PgLTPath; - use text_size::{TextRange, TextSize}; - - use crate::Document; - - #[test] - fn test_statements_at_range() { - let url = PgLTPath::new("test.sql"); - - let doc = Document::new( - url, - Some("select unknown from contact;\n\nselect 12345;\n\nalter table test drop column id;\n" - .to_string()), - ); - - let x = doc.statements_at_range(&TextRange::new(TextSize::from(2), TextSize::from(5))); - - assert_eq!(x.len(), 1); - - assert_eq!(x[0].text, "select unknown from contact;"); - } -} diff --git a/crates/pglt_base_db/src/lib.rs b/crates/pglt_base_db/src/lib.rs deleted file mode 100644 index 16b7ab98a..000000000 --- a/crates/pglt_base_db/src/lib.rs +++ /dev/null @@ -1,20 +0,0 @@ -//! # pglt_base_db -//! -//! This crate implements the basic data structures and implements efficient change management. The idea is to make sure we only recompute what is necessary when a change occurs, and this is done by cutting a sql source into statements, and then applying the changes to individual statements. This way, we can avoid re-parsing the entire sql source when a change occurs. -//! -//! The main data structures are: -//! - `Document`: Represents a sql source file, and contains a list of statements represented by their ranges -//! - `StatementRef`: Represents a reference to a sql statement. This is the primary data structure that is used by higher-level crates to save and retrieve information about a statement. -//! - `DocumentChange`: Contains a list of individual `Change`s, and represents a change to a sql source file. This is used to update a `Document` with a new version of the sql source. -//! - `StatementChange` and `ChangedStatement` are results of applying the change and represent references to the changed statements, including information about the changes that were applied. This is used to invalidate caches and recompute information about the changed statements in higher-level crates. -//! -//! I am not yet 100% happy with this, because when we create a `StatementRef`, the text is cloned from `Document` and included in the Hash. This must be improved by leaving the text in the `Document`, and making the `StatementRef` and actual reference to the text in the `Document`. This will make the `StatementRef` smaller and faster to compare. -//! Additionally, the errors returned by the `pglt_statement_splitter::split` are not exposed yet. This must be done to show syntax errors to the user. - -#![feature(extract_if, test)] - -mod change; -mod document; - -pub use change::{Change, ChangedStatement, DocumentChange, StatementChange}; -pub use document::{Document, StatementRef}; diff --git a/crates/pglt_lexer/src/diagnostics.rs b/crates/pglt_lexer/src/diagnostics.rs new file mode 100644 index 000000000..285d5e566 --- /dev/null +++ b/crates/pglt_lexer/src/diagnostics.rs @@ -0,0 +1,63 @@ +use pglt_diagnostics::{Diagnostic, MessageAndDescription}; +use text_size::TextRange; + +/// A specialized diagnostic for scan errors. +/// +/// Scan diagnostics are always **fatal errors**. +#[derive(Clone, Debug, Diagnostic, PartialEq)] +#[diagnostic(category = "syntax", severity = Fatal)] +pub struct ScanError { + /// The location where the error is occurred + #[location(span)] + span: Option, + #[message] + #[description] + pub message: MessageAndDescription, +} + +impl ScanError { + pub fn from_pg_query_err(err: pg_query::Error, input: &str) -> Vec { + let err_msg = err.to_string(); + let re = regex::Regex::new(r#"at or near "(.*?)""#).unwrap(); + let mut diagnostics = Vec::new(); + + for captures in re.captures_iter(&err_msg) { + if let Some(matched) = captures.get(1) { + let search_term = matched.as_str(); + for (idx, _) in input.match_indices(search_term) { + let from = idx; + let to = from + search_term.len(); + diagnostics.push(ScanError { + span: Some(TextRange::new( + from.try_into().unwrap(), + to.try_into().unwrap(), + )), + message: MessageAndDescription::from(err_msg.clone()), + }); + } + } + } + + if diagnostics.is_empty() { + diagnostics.push(ScanError { + span: None, + message: MessageAndDescription::from(err_msg), + }); + } + + diagnostics + } +} + +#[cfg(test)] +mod tests { + use crate::lex; + + #[test] + fn finds_all_occurrences() { + let input = + "select 1443ddwwd33djwdkjw13331333333333; select 1443ddwwd33djwdkjw13331333333333;"; + let diagnostics = lex(input).unwrap_err(); + assert_eq!(diagnostics.len(), 2); + } +} diff --git a/crates/pglt_lexer/src/lib.rs b/crates/pglt_lexer/src/lib.rs index a1179769a..63abe365a 100644 --- a/crates/pglt_lexer/src/lib.rs +++ b/crates/pglt_lexer/src/lib.rs @@ -1,5 +1,7 @@ mod codegen; +pub mod diagnostics; +use diagnostics::ScanError; use pg_query::protobuf::{KeywordKind, ScanToken}; use regex::Regex; use std::{collections::VecDeque, sync::LazyLock}; @@ -107,14 +109,14 @@ fn whitespace_tokens(input: &str) -> VecDeque { /// Turn a string of potentially valid sql code into a list of tokens, including their range in the source text. /// /// The implementation is primarily using libpg_querys `scan` method, and fills in the gaps with tokens that are not parsed by the library, e.g. whitespace. -pub fn lex(text: &str) -> pg_query::Result> { +pub fn lex(text: &str) -> Result, Vec> { let mut whitespace_tokens = whitespace_tokens(text); // tokens from pg_query.rs - let mut pglt_query_tokens = pg_query::scan(text)? - .tokens - .into_iter() - .collect::>(); + let mut pglt_query_tokens = match pg_query::scan(text) { + Ok(r) => r.tokens.into_iter().collect::>(), + Err(err) => return Err(ScanError::from_pg_query_err(err, text)), + }; // merge the two token lists let mut tokens: Vec = Vec::new(); diff --git a/crates/pglt_statement_splitter/src/diagnostics.rs b/crates/pglt_statement_splitter/src/diagnostics.rs index 61b54a58c..5288e08df 100644 --- a/crates/pglt_statement_splitter/src/diagnostics.rs +++ b/crates/pglt_statement_splitter/src/diagnostics.rs @@ -6,75 +6,20 @@ use text_size::TextRange; /// Parser diagnostics are always **errors**. #[derive(Clone, Debug, Diagnostic, PartialEq)] #[diagnostic(category = "syntax", severity = Error)] -pub struct ParseDiagnostic { +pub struct SplitDiagnostic { /// The location where the error is occurred #[location(span)] span: Option, #[message] #[description] pub message: MessageAndDescription, - // if true, the error is fatal and the parsing should stop - pub is_fatal: bool, } -impl ParseDiagnostic { +impl SplitDiagnostic { pub fn new(message: impl Into, range: TextRange) -> Self { Self { span: Some(range), message: MessageAndDescription::from(message.into()), - is_fatal: false, } } - - pub fn from_pg_query_err(err: pglt_query_ext::Error, input: &str) -> Vec { - let err_msg = err.to_string(); - let re = regex::Regex::new(r#"at or near "(.*?)""#).unwrap(); - let mut diagnostics = Vec::new(); - - for captures in re.captures_iter(&err_msg) { - if let Some(matched) = captures.get(1) { - let search_term = matched.as_str(); - for (idx, _) in input.match_indices(search_term) { - let from = idx; - let to = from + search_term.len(); - diagnostics.push(ParseDiagnostic { - span: Some(TextRange::new( - from.try_into().unwrap(), - to.try_into().unwrap(), - )), - message: MessageAndDescription::from(err_msg.clone()), - is_fatal: true, - }); - } - } - } - - if diagnostics.is_empty() { - diagnostics.push(ParseDiagnostic { - span: None, - message: MessageAndDescription::from(err_msg), - is_fatal: true, - }); - } - - diagnostics - } -} - -#[cfg(test)] -mod tests { - use pglt_lexer::lex; - - use super::*; - - #[test] - fn failing_lexer() { - let input = - "select 1443ddwwd33djwdkjw13331333333333; select 1443ddwwd33djwdkjw13331333333333;"; - let err = lex(input).unwrap_err(); - - let diagnostics = ParseDiagnostic::from_pg_query_err(err, input); - assert_eq!(diagnostics.len(), 2); - assert!(diagnostics.iter().all(|d| d.is_fatal)); - } } diff --git a/crates/pglt_statement_splitter/src/lib.rs b/crates/pglt_statement_splitter/src/lib.rs index 33bd6ec0a..f546a9a0a 100644 --- a/crates/pglt_statement_splitter/src/lib.rs +++ b/crates/pglt_statement_splitter/src/lib.rs @@ -4,30 +4,22 @@ pub mod diagnostics; mod parser; -use diagnostics::ParseDiagnostic; use parser::{source, Parse, Parser}; +use pglt_lexer::diagnostics::ScanError; -pub fn split(sql: &str) -> Parse { - let tokens = match pglt_lexer::lex(sql) { - Ok(tokens) => tokens, - Err(e) => { - return Parse { - ranges: Vec::new(), - errors: ParseDiagnostic::from_pg_query_err(e, sql), - }; - } - }; +pub fn split(sql: &str) -> Result> { + let tokens = pglt_lexer::lex(sql)?; let mut parser = Parser::new(tokens); source(&mut parser); - parser.finish() + Ok(parser.finish()) } #[cfg(test)] mod tests { - use diagnostics::ParseDiagnostic; + use diagnostics::SplitDiagnostic; use ntest::timeout; use pglt_lexer::SyntaxKind; use text_size::TextRange; @@ -42,7 +34,7 @@ mod tests { impl From<&str> for Tester { fn from(input: &str) -> Self { Tester { - parse: split(input), + parse: split(input).expect("Failed to split"), input: input.to_string(), } } @@ -75,7 +67,7 @@ mod tests { self } - fn expect_errors(&self, expected: Vec) -> &Self { + fn expect_errors(&self, expected: Vec) -> &Self { assert_eq!( self.parse.errors.len(), expected.len(), @@ -132,7 +124,7 @@ mod tests { fn insert_expect_error() { Tester::from("\ninsert select 1\n\nselect 3") .expect_statements(vec!["insert select 1", "select 3"]) - .expect_errors(vec![ParseDiagnostic::new( + .expect_errors(vec![SplitDiagnostic::new( format!("Expected {:?}", SyntaxKind::Into), TextRange::new(8.into(), 14.into()), )]); diff --git a/crates/pglt_statement_splitter/src/parser.rs b/crates/pglt_statement_splitter/src/parser.rs index d6751c468..915c9ad24 100644 --- a/crates/pglt_statement_splitter/src/parser.rs +++ b/crates/pglt_statement_splitter/src/parser.rs @@ -8,7 +8,7 @@ pub use common::source; use pglt_lexer::{SyntaxKind, Token, WHITESPACE_TOKENS}; use text_size::{TextRange, TextSize}; -use crate::diagnostics::ParseDiagnostic; +use crate::diagnostics::SplitDiagnostic; /// Main parser that exposes the `cstree` api, and collects errors and statements /// It is modelled after a Pratt Parser. For a gentle introduction to Pratt Parsing, see https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html @@ -16,7 +16,7 @@ pub struct Parser { /// The ranges of the statements ranges: Vec<(usize, usize)>, /// The syntax errors accumulated during parsing - errors: Vec, + errors: Vec, /// The start of the current statement, if any current_stmt_start: Option, /// The tokens to parse @@ -33,7 +33,7 @@ pub struct Parse { /// The ranges of the errors pub ranges: Vec, /// The syntax errors accumulated during parsing - pub errors: Vec, + pub errors: Vec, } impl Parser { @@ -176,7 +176,7 @@ impl Parser { return; } - self.errors.push(ParseDiagnostic::new( + self.errors.push(SplitDiagnostic::new( format!("Expected {:#?}", kind), self.peek().span, )); diff --git a/crates/pglt_workspace/src/workspace/server/change.rs b/crates/pglt_workspace/src/workspace/server/change.rs index 435097bfd..0a7bc54c6 100644 --- a/crates/pglt_workspace/src/workspace/server/change.rs +++ b/crates/pglt_workspace/src/workspace/server/change.rs @@ -1,9 +1,10 @@ +use pglt_diagnostics::Diagnostic; use std::ops::{Add, Sub}; use text_size::{TextLen, TextRange, TextSize}; use crate::workspace::{ChangeFileParams, ChangeParams}; -use super::{Document, Statement}; +use super::{document, Document, Statement}; #[derive(Debug, PartialEq, Eq)] pub enum StatementChange { @@ -58,6 +59,11 @@ struct Affected { impl Document { /// Applies a file change to the document and returns the affected statements pub fn apply_file_change(&mut self, change: &ChangeFileParams) -> Vec { + // cleanup all diagnostics with every change because we cannot guarantee that they are still valid + // this is because we know their ranges only by finding slices within the content which is + // very much not guaranteed to result in correct ranges + self.diagnostics.clear(); + let changes = change .changes .iter() @@ -69,37 +75,55 @@ impl Document { changes } - /// Applies a full change to the document and returns the affected statements - fn apply_full_change(&mut self, text: &str) -> Vec { + /// Helper method to drain all positions and return them as deleted statements + fn drain_positions(&mut self) -> Vec { + self.positions + .drain(..) + .map(|(id, _)| { + StatementChange::Deleted(Statement { + id, + path: self.path.clone(), + }) + }) + .collect() + } + + /// Applies a change to the document and returns the affected statements + /// + /// Will always assume its a full change and reparse the whole document + fn apply_full_change(&mut self, change: &ChangeParams) -> Vec { let mut changes = Vec::new(); - changes.extend(self.positions.drain(..).map(|(id, _)| { - StatementChange::Deleted(Statement { - id, - path: self.path.clone(), - }) - })); + changes.extend(self.drain_positions()); - self.content = text.to_string(); + self.content = change.apply_to_text(&self.content); - changes.extend( - pglt_statement_splitter::split(&self.content) - .ranges - .into_iter() - .map(|range| { - let id = self.id_generator.next(); - let text = self.content[range].to_string(); - self.positions.push((id, range)); + let (ranges, diagnostics) = document::split_with_diagnostics(&self.content, None); - StatementChange::Added(AddedStatement { - stmt: Statement { - path: self.path.clone(), - id, - }, - text, - }) - }), - ); + self.diagnostics = diagnostics; + + // Do not add any statements if there is a fatal error + if self + .diagnostics + .iter() + .any(|d| d.severity() == pglt_diagnostics::Severity::Fatal) + { + return changes; + } + + changes.extend(ranges.into_iter().map(|range| { + let id = self.id_generator.next(); + let text = self.content[range].to_string(); + self.positions.push((id, range)); + + StatementChange::Added(AddedStatement { + stmt: Statement { + path: self.path.clone(), + id, + }, + text, + }) + })); changes } @@ -199,7 +223,7 @@ impl Document { fn apply_change(&mut self, change: &ChangeParams) -> Vec { // if range is none, we have a full change if change.range.is_none() { - return self.apply_full_change(&change.text); + return self.apply_full_change(change); } // i spent a relatively large amount of time thinking about how to handle range changes @@ -235,7 +259,22 @@ impl Document { .get(usize::from(affected_range.start())..usize::from(affected_range.end())) .unwrap(); - let new_ranges = pglt_statement_splitter::split(changed_content).ranges; + let (new_ranges, diags) = + document::split_with_diagnostics(changed_content, Some(affected_range.start())); + + self.diagnostics = diags; + + if self + .diagnostics + .iter() + .any(|d| d.severity() == pglt_diagnostics::Severity::Fatal) + { + // cleanup all positions if there is a fatal error + changed.extend(self.drain_positions()); + // still process text change + self.content = new_content; + return changed; + } if new_ranges.len() == 1 { if change.is_whitespace() { @@ -289,7 +328,22 @@ impl Document { .get(usize::from(full_affected_range.start())..usize::from(full_affected_range.end())) .unwrap(); - let new_ranges = pglt_statement_splitter::split(changed_content).ranges; + let (new_ranges, diags) = + document::split_with_diagnostics(changed_content, Some(full_affected_range.start())); + + self.diagnostics = diags; + + if self + .diagnostics + .iter() + .any(|d| d.severity() == pglt_diagnostics::Severity::Fatal) + { + // cleanup all positions if there is a fatal error + changed.extend(self.drain_positions()); + // still process text change + self.content = new_content; + return changed; + } // delete and add new ones if let Some(next_index) = next_index { @@ -401,7 +455,9 @@ mod tests { } fn assert_document_integrity(d: &Document) { - let ranges = pglt_statement_splitter::split(&d.content).ranges; + let ranges = pglt_statement_splitter::split(&d.content) + .expect("Unexpected scan error") + .ranges; assert!(ranges.len() == d.positions.len()); @@ -410,6 +466,159 @@ mod tests { .all(|r| { d.positions.iter().any(|(_, stmt_range)| stmt_range == r) })); } + #[test] + fn open_doc_with_scan_error() { + let input = "select id from users;\n\n\n\nselect 1443ddwwd33djwdkjw13331333333333;"; + + let d = Document::new(PgLTPath::new("test.sql"), input.to_string(), 0); + + assert_eq!(d.positions.len(), 0); + assert!(d.has_fatal_error()); + } + + #[test] + fn change_into_scan_error_within_statement() { + let path = PgLTPath::new("test.sql"); + let input = "select id from users;\n\n\n\nselect 1;"; + + let mut d = Document::new(PgLTPath::new("test.sql"), input.to_string(), 0); + + assert_eq!(d.positions.len(), 2); + assert!(!d.has_fatal_error()); + + let change = ChangeFileParams { + path: path.clone(), + version: 1, + changes: vec![ChangeParams { + text: "d".to_string(), + range: Some(TextRange::new(33.into(), 33.into())), + }], + }; + + let changed = d.apply_file_change(&change); + + assert_eq!(d.content, "select id from users;\n\n\n\nselect 1d;"); + assert!( + changed + .iter() + .all(|c| matches!(c, StatementChange::Deleted(_))), + "should delete all statements" + ); + assert!(d.positions.is_empty(), "should clear all positions"); + assert_eq!(d.diagnostics.len(), 1, "should return a scan error"); + assert_eq!( + d.diagnostics[0].location().span, + Some(TextRange::new(32.into(), 34.into())), + "should have correct span" + ); + assert!(d.has_fatal_error()); + } + + #[test] + fn change_into_scan_error_across_statements() { + let path = PgLTPath::new("test.sql"); + let input = "select id from users;\n\n\n\nselect 1;"; + + let mut d = Document::new(PgLTPath::new("test.sql"), input.to_string(), 0); + + assert_eq!(d.positions.len(), 2); + assert!(!d.has_fatal_error()); + + let change = ChangeFileParams { + path: path.clone(), + version: 1, + changes: vec![ChangeParams { + text: "1d".to_string(), + range: Some(TextRange::new(7.into(), 33.into())), + }], + }; + + let changed = d.apply_file_change(&change); + + assert_eq!(d.content, "select 1d;"); + assert!( + changed + .iter() + .all(|c| matches!(c, StatementChange::Deleted(_))), + "should delete all statements" + ); + assert!(d.positions.is_empty(), "should clear all positions"); + assert_eq!(d.diagnostics.len(), 1, "should return a scan error"); + assert_eq!( + d.diagnostics[0].location().span, + Some(TextRange::new(7.into(), 9.into())), + "should have correct span" + ); + assert!(d.has_fatal_error()); + } + + #[test] + fn change_from_invalid_to_invalid() { + let path = PgLTPath::new("test.sql"); + let input = "select 1d;"; + + let mut d = Document::new(PgLTPath::new("test.sql"), input.to_string(), 0); + + assert_eq!(d.positions.len(), 0); + assert!(d.has_fatal_error()); + assert_eq!(d.diagnostics.len(), 1); + + let change = ChangeFileParams { + path: path.clone(), + version: 1, + changes: vec![ChangeParams { + text: "2e".to_string(), + range: Some(TextRange::new(7.into(), 9.into())), + }], + }; + + let changed = d.apply_file_change(&change); + + assert_eq!(d.content, "select 2e;"); + assert!(changed.is_empty(), "should not emit any changes"); + assert!(d.positions.is_empty(), "should keep positions empty"); + assert_eq!(d.diagnostics.len(), 1, "should still have a scan error"); + assert_eq!( + d.diagnostics[0].location().span, + Some(TextRange::new(7.into(), 9.into())), + "should have updated span" + ); + assert!(d.has_fatal_error()); + } + + #[test] + fn change_from_invalid_to_valid() { + let path = PgLTPath::new("test.sql"); + let input = "select 1d;"; + + let mut d = Document::new(PgLTPath::new("test.sql"), input.to_string(), 0); + + assert_eq!(d.positions.len(), 0); + assert!(d.has_fatal_error()); + assert_eq!(d.diagnostics.len(), 1); + + let change = ChangeFileParams { + path: path.clone(), + version: 1, + changes: vec![ChangeParams { + text: "1".to_string(), + range: Some(TextRange::new(7.into(), 9.into())), + }], + }; + + let changed = d.apply_file_change(&change); + + assert_eq!(d.content, "select 1;"); + assert_eq!(changed.len(), 1, "should emit one change"); + assert!(matches!( + changed[0], + StatementChange::Added(AddedStatement { .. }) + )); + assert_eq!(d.positions.len(), 1, "should have one position"); + assert!(d.diagnostics.is_empty(), "should have no diagnostics"); + assert!(!d.has_fatal_error()); + } + #[test] fn within_statements() { let path = PgLTPath::new("test.sql"); diff --git a/crates/pglt_workspace/src/workspace/server/document.rs b/crates/pglt_workspace/src/workspace/server/document.rs index a49a1609e..e7a90ac17 100644 --- a/crates/pglt_workspace/src/workspace/server/document.rs +++ b/crates/pglt_workspace/src/workspace/server/document.rs @@ -1,5 +1,6 @@ +use pglt_diagnostics::{serde::Diagnostic as SDiagnostic, Diagnostic, DiagnosticExt, Severity}; use pglt_fs::PgLTPath; -use text_size::TextRange; +use text_size::{TextRange, TextSize}; /// Global unique identifier for a statement #[derive(Debug, Hash, Eq, PartialEq, Clone)] @@ -18,6 +19,8 @@ pub(crate) struct Document { pub(crate) path: PgLTPath, pub(crate) content: String, pub(crate) version: i32, + + pub(super) diagnostics: Vec, /// List of statements sorted by range.start() pub(super) positions: Vec, @@ -28,22 +31,35 @@ impl Document { pub(crate) fn new(path: PgLTPath, content: String, version: i32) -> Self { let mut id_generator = IdGenerator::new(); - let ranges: Vec = pglt_statement_splitter::split(&content) - .ranges - .iter() - .map(|r| (id_generator.next(), *r)) - .collect(); + let (ranges, diagnostics) = split_with_diagnostics(&content, None); Self { path, - positions: ranges, + positions: ranges + .into_iter() + .map(|range| (id_generator.next(), range)) + .collect(), content, version, + diagnostics, id_generator, } } + pub fn diagnostics(&self) -> &[SDiagnostic] { + &self.diagnostics + } + + /// Returns true if there is at least one fatal error in the diagnostics + /// + /// A fatal error is a scan error that prevents the document from being used + pub(super) fn has_fatal_error(&self) -> bool { + self.diagnostics + .iter() + .any(|d| d.severity() == Severity::Fatal) + } + pub fn iter_statements(&self) -> impl Iterator + '_ { self.positions.iter().map(move |(id, _)| Statement { id: *id, @@ -104,3 +120,38 @@ impl IdGenerator { id } } + +/// Helper function that wraps the statement splitter and returns the ranges with unified +/// diagnostics +pub(crate) fn split_with_diagnostics( + content: &str, + offset: Option, +) -> (Vec, Vec) { + let o = offset.unwrap_or_else(|| 0.into()); + match pglt_statement_splitter::split(content) { + Ok(parse) => ( + parse.ranges, + parse + .errors + .into_iter() + .map(|err| { + SDiagnostic::new( + err.clone() + .with_file_span(err.location().span.map(|r| r + o)), + ) + }) + .collect(), + ), + Err(errs) => ( + vec![], + errs.into_iter() + .map(|err| { + SDiagnostic::new( + err.clone() + .with_file_span(err.location().span.map(|r| r + o)), + ) + }) + .collect(), + ), + } +} From 97dd5a36f52e47ccdb2ab5bcd920d484c806c1f5 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Mon, 24 Feb 2025 09:41:22 +0100 Subject: [PATCH 3/8] fix: test --- crates/pglt_statement_splitter/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/pglt_statement_splitter/src/lib.rs b/crates/pglt_statement_splitter/src/lib.rs index f546a9a0a..24ca4c6ab 100644 --- a/crates/pglt_statement_splitter/src/lib.rs +++ b/crates/pglt_statement_splitter/src/lib.rs @@ -88,8 +88,8 @@ mod tests { #[test] fn failing_lexer() { let input = "select 1443ddwwd33djwdkjw13331333333333"; - let res = split(input); - assert!(res.errors.iter().any(|d| d.is_fatal)); + let res = split(input).unwrap_err(); + assert!(!res.is_empty()); } #[test] From 502ea28702550900b760b1cdaf24d196042a2c5a Mon Sep 17 00:00:00 2001 From: psteinroe Date: Mon, 24 Feb 2025 10:03:10 +0100 Subject: [PATCH 4/8] fix: rules check --- xtask/rules_check/src/lib.rs | 63 +++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/xtask/rules_check/src/lib.rs b/xtask/rules_check/src/lib.rs index 22c4a72aa..03ea6b7b4 100644 --- a/xtask/rules_check/src/lib.rs +++ b/xtask/rules_check/src/lib.rs @@ -127,34 +127,51 @@ fn assert_lint( }); // split and parse each statement - let stmts = pglt_statement_splitter::split(code); - for stmt in stmts.ranges { - match pglt_query_ext::parse(&code[stmt]) { - Ok(ast) => { - for rule_diag in analyser.run(pglt_analyser::AnalyserContext { root: &ast }) { - let diag = pglt_diagnostics::serde::Diagnostic::new(rule_diag); - - let category = diag.category().expect("linter diagnostic has no code"); - let severity = settings.get_severity_from_rule_code(category).expect( + match pglt_statement_splitter::split(code) { + Ok(stmts) => { + for stmt in stmts.ranges { + match pglt_query_ext::parse(&code[stmt]) { + Ok(ast) => { + for rule_diag in analyser.run(pglt_analyser::AnalyserContext { root: &ast }) + { + let diag = pglt_diagnostics::serde::Diagnostic::new(rule_diag); + + let category = diag.category().expect("linter diagnostic has no code"); + let severity = settings.get_severity_from_rule_code(category).expect( "If you see this error, it means you need to run cargo codegen-configuration", ); - let error = diag - .with_severity(severity) - .with_file_path(&file_path) - .with_file_source_code(code); - - write_diagnostic(code, error)?; - } + let error = diag + .with_severity(severity) + .with_file_path(&file_path) + .with_file_source_code(code); + + write_diagnostic(code, error)?; + } + } + Err(e) => { + let error = SyntaxDiagnostic::from(e) + .with_file_path(&file_path) + .with_file_source_code(code); + write_diagnostic(code, error)?; + } + }; } - Err(e) => { - let error = SyntaxDiagnostic::from(e) - .with_file_path(&file_path) - .with_file_source_code(code); - write_diagnostic(code, error)?; + } + Err(errs) => { + // Print all diagnostics to help the user + let mut console = pglt_console::EnvConsole::default(); + for err in errs { + console.println( + pglt_console::LogLevel::Error, + markup! { + {PrintDiagnostic::verbose(&err)} + }, + ); } - }; - } + bail!("Analysis of '{group}/{rule}' on the following code block returned a scan diagnostic.\n\n{code}"); + } + }; Ok(()) } From 36339e850d677a70e7b2553d7de033d9d86c4278 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Mon, 24 Feb 2025 10:10:06 +0100 Subject: [PATCH 5/8] fix: rules docs --- docs/codegen/src/rules_docs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/codegen/src/rules_docs.rs b/docs/codegen/src/rules_docs.rs index 12160699a..19a26839f 100644 --- a/docs/codegen/src/rules_docs.rs +++ b/docs/codegen/src/rules_docs.rs @@ -431,7 +431,7 @@ fn print_diagnostics( }); // split and parse each statement - let stmts = pglt_statement_splitter::split(code); + let stmts = pglt_statement_splitter::split(code).expect("unexpected parse error"); for stmt in stmts.ranges { match pglt_query_ext::parse(&code[stmt]) { Ok(ast) => { From 165b3517075b94f3b1ab3d495a63b2db0f9f5a72 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Mon, 24 Feb 2025 10:18:36 +0100 Subject: [PATCH 6/8] fix: tests --- .../pglt_statement_splitter/tests/statement_splitter_tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/pglt_statement_splitter/tests/statement_splitter_tests.rs b/crates/pglt_statement_splitter/tests/statement_splitter_tests.rs index 3bc9b2bc0..ca9c01b73 100644 --- a/crates/pglt_statement_splitter/tests/statement_splitter_tests.rs +++ b/crates/pglt_statement_splitter/tests/statement_splitter_tests.rs @@ -22,7 +22,7 @@ fn test_statement_splitter() { let contents = fs::read_to_string(&path).unwrap(); - let split = pglt_statement_splitter::split(&contents); + let split = pglt_statement_splitter::split(&contents).expect("Failed to split"); assert_eq!( split.ranges.len(), From df7b1b5271e2eddf402e82f37ea6faff629f154e Mon Sep 17 00:00:00 2001 From: psteinroe Date: Mon, 24 Feb 2025 10:56:47 +0100 Subject: [PATCH 7/8] feat: fetch doc diagnostics in workspace api --- crates/pglt_workspace/src/workspace/server.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/pglt_workspace/src/workspace/server.rs b/crates/pglt_workspace/src/workspace/server.rs index 45ffb1987..06cf83124 100644 --- a/crates/pglt_workspace/src/workspace/server.rs +++ b/crates/pglt_workspace/src/workspace/server.rs @@ -298,7 +298,7 @@ impl Workspace for WorkspaceServer { filter, }); - let mut diagnostics: Vec = vec![]; + let mut diagnostics: Vec = doc.diagnostics().to_vec(); if let Some(pool) = self .connection @@ -394,7 +394,7 @@ impl Workspace for WorkspaceServer { let errors = diagnostics .iter() - .filter(|d| d.severity() == Severity::Error) + .filter(|d| d.severity() == Severity::Error || d.severity() == Severity::Fatal) .count(); info!("Pulled {:?} diagnostic(s)", diagnostics.len()); From aab046b121ed96e624260b5d8a4f0175cabf5a24 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Tue, 25 Feb 2025 08:24:35 +0100 Subject: [PATCH 8/8] fix: review --- crates/pglt_lexer/src/diagnostics.rs | 4 ++++ .../src/workspace/server/change.rs | 18 +++--------------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/crates/pglt_lexer/src/diagnostics.rs b/crates/pglt_lexer/src/diagnostics.rs index 285d5e566..29cd72cd1 100644 --- a/crates/pglt_lexer/src/diagnostics.rs +++ b/crates/pglt_lexer/src/diagnostics.rs @@ -59,5 +59,9 @@ mod tests { "select 1443ddwwd33djwdkjw13331333333333; select 1443ddwwd33djwdkjw13331333333333;"; let diagnostics = lex(input).unwrap_err(); assert_eq!(diagnostics.len(), 2); + assert_eq!(diagnostics[0].span.unwrap().start(), 7.into()); + assert_eq!(diagnostics[0].span.unwrap().end(), 39.into()); + assert_eq!(diagnostics[1].span.unwrap().start(), 48.into()); + assert_eq!(diagnostics[1].span.unwrap().end(), 80.into()); } } diff --git a/crates/pglt_workspace/src/workspace/server/change.rs b/crates/pglt_workspace/src/workspace/server/change.rs index 0a7bc54c6..473563c83 100644 --- a/crates/pglt_workspace/src/workspace/server/change.rs +++ b/crates/pglt_workspace/src/workspace/server/change.rs @@ -103,11 +103,7 @@ impl Document { self.diagnostics = diagnostics; // Do not add any statements if there is a fatal error - if self - .diagnostics - .iter() - .any(|d| d.severity() == pglt_diagnostics::Severity::Fatal) - { + if self.has_fatal_error() { return changes; } @@ -264,11 +260,7 @@ impl Document { self.diagnostics = diags; - if self - .diagnostics - .iter() - .any(|d| d.severity() == pglt_diagnostics::Severity::Fatal) - { + if self.has_fatal_error() { // cleanup all positions if there is a fatal error changed.extend(self.drain_positions()); // still process text change @@ -333,11 +325,7 @@ impl Document { self.diagnostics = diags; - if self - .diagnostics - .iter() - .any(|d| d.severity() == pglt_diagnostics::Severity::Fatal) - { + if self.has_fatal_error() { // cleanup all positions if there is a fatal error changed.extend(self.drain_positions()); // still process text change